{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998007835845674,
  "eval_steps": 100,
  "global_step": 1882,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 2.6455026455026456e-08,
      "loss": 1.2347,
      "loss/mini_gap_loss": 1.2346683740615845,
      "loss/ori_loss": 1.3862943649291992,
      "loss/reward_entrophy": 0.15162594616413116,
      "mask/mask_ratio": 0.44552892446517944,
      "reward/A01_acc": 0.0,
      "reward/A02_acc": 0.0,
      "reward/A03_acc": 0.0,
      "reward/reward_A0": 0.0,
      "reward/reward_A1": 0.0,
      "reward/reward_A2": 0.0,
      "reward/reward_A3": 0.0,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.01,
      "learning_rate": 2.6455026455026455e-07,
      "loss": 1.1632,
      "loss/mini_gap_loss": 1.1631971597671509,
      "loss/ori_loss": 1.3863022327423096,
      "loss/reward_entrophy": 0.22310495376586914,
      "mask/mask_ratio": 0.4674115777015686,
      "reward/A01_acc": 0.46875,
      "reward/A02_acc": 0.4652777910232544,
      "reward/A03_acc": 0.4375,
      "reward/reward_A0": -0.00010829935490619391,
      "reward/reward_A1": 0.00018974825798068196,
      "reward/reward_A2": -0.00016076747851911932,
      "reward/reward_A3": 0.00010651136108208448,
      "rewards/accuracies": 0.45717132091522217,
      "rewards/chosen": -0.00010829935490619391,
      "rewards/margins": -0.0001534629554953426,
      "rewards/rejected": 4.516359695116989e-05,
      "step": 10
    },
    {
      "epoch": 0.01,
      "learning_rate": 5.291005291005291e-07,
      "loss": 1.1919,
      "loss/mini_gap_loss": 1.1919147968292236,
      "loss/ori_loss": 1.38637375831604,
      "loss/reward_entrophy": 0.19445905089378357,
      "mask/mask_ratio": 0.46323472261428833,
      "reward/A01_acc": 0.4906249940395355,
      "reward/A02_acc": 0.5,
      "reward/A03_acc": 0.4937500059604645,
      "reward/reward_A0": -3.231215669075027e-05,
      "reward/reward_A1": -7.370363164227456e-05,
      "reward/reward_A2": 0.0002596504637040198,
      "reward/reward_A3": -0.0001572092587593943,
      "rewards/accuracies": 0.49478673934936523,
      "rewards/chosen": -3.231215669075027e-05,
      "rewards/margins": -4.189123137621209e-05,
      "rewards/rejected": 9.579091965861153e-06,
      "step": 20
    },
    {
      "epoch": 0.02,
      "learning_rate": 7.936507936507937e-07,
      "loss": 1.1305,
      "loss/mini_gap_loss": 1.1305261850357056,
      "loss/ori_loss": 1.3863718509674072,
      "loss/reward_entrophy": 0.2558456063270569,
      "mask/mask_ratio": 0.44212430715560913,
      "reward/A01_acc": 0.5062500238418579,
      "reward/A02_acc": 0.515625,
      "reward/A03_acc": 0.5,
      "reward/reward_A0": 3.48491121258121e-05,
      "reward/reward_A1": 8.441967656835914e-05,
      "reward/reward_A2": -0.00020365572709124535,
      "reward/reward_A3": 9.05819033505395e-05,
      "rewards/accuracies": 0.507286548614502,
      "rewards/chosen": 3.48491121258121e-05,
      "rewards/margins": 4.44003744632937e-05,
      "rewards/rejected": -9.551285984343849e-06,
      "step": 30
    },
    {
      "epoch": 0.02,
      "learning_rate": 1.0582010582010582e-06,
      "loss": 1.1602,
      "loss/mini_gap_loss": 1.1601699590682983,
      "loss/ori_loss": 1.3860584497451782,
      "loss/reward_entrophy": 0.2258884459733963,
      "mask/mask_ratio": 0.4440450668334961,
      "reward/A01_acc": 0.503125011920929,
      "reward/A02_acc": 0.503125011920929,
      "reward/A03_acc": 0.49687498807907104,
      "reward/reward_A0": 6.791128544136882e-05,
      "reward/reward_A1": -0.000260756176430732,
      "reward/reward_A2": -0.0001289776264457032,
      "reward/reward_A3": -7.973484753165394e-05,
      "rewards/accuracies": 0.5010367035865784,
      "rewards/chosen": 6.791128544136882e-05,
      "rewards/margins": 0.00022439930762629956,
      "rewards/rejected": -0.00015648799308110029,
      "step": 40
    },
    {
      "epoch": 0.03,
      "learning_rate": 1.3227513227513228e-06,
      "loss": 1.153,
      "loss/mini_gap_loss": 1.1530485153198242,
      "loss/ori_loss": 1.385598063468933,
      "loss/reward_entrophy": 0.23254959285259247,
      "mask/mask_ratio": 0.4664740562438965,
      "reward/A01_acc": 0.5562499761581421,
      "reward/A02_acc": 0.5843750238418579,
      "reward/A03_acc": 0.596875011920929,
      "reward/reward_A0": 0.0005568187916651368,
      "reward/reward_A1": -0.00013656688679475337,
      "reward/reward_A2": -0.0003448982606641948,
      "reward/reward_A3": -0.0007238680263981223,
      "rewards/accuracies": 0.5791608691215515,
      "rewards/chosen": 0.0005568187916651368,
      "rewards/margins": 0.0009585924562998116,
      "rewards/rejected": -0.0004017737228423357,
      "step": 50
    },
    {
      "epoch": 0.03,
      "learning_rate": 1.5873015873015873e-06,
      "loss": 1.1605,
      "loss/mini_gap_loss": 1.160509467124939,
      "loss/ori_loss": 1.38529372215271,
      "loss/reward_entrophy": 0.224784255027771,
      "mask/mask_ratio": 0.46073460578918457,
      "reward/A01_acc": 0.528124988079071,
      "reward/A02_acc": 0.59375,
      "reward/A03_acc": 0.643750011920929,
      "reward/reward_A0": 0.0006132640992291272,
      "reward/reward_A1": 0.00015073490794748068,
      "reward/reward_A2": -0.0006917371647432446,
      "reward/reward_A3": -0.001151248929090798,
      "rewards/accuracies": 0.588535726070404,
      "rewards/chosen": 0.0006132640992291272,
      "rewards/margins": 0.0011773421429097652,
      "rewards/rejected": -0.000564078101888299,
      "step": 60
    },
    {
      "epoch": 0.04,
      "learning_rate": 1.8518518518518519e-06,
      "loss": 1.1255,
      "loss/mini_gap_loss": 1.1255247592926025,
      "loss/ori_loss": 1.3846409320831299,
      "loss/reward_entrophy": 0.25911587476730347,
      "mask/mask_ratio": 0.46946725249290466,
      "reward/A01_acc": 0.5687500238418579,
      "reward/A02_acc": 0.596875011920929,
      "reward/A03_acc": 0.643750011920929,
      "reward/reward_A0": 0.0010544664692133665,
      "reward/reward_A1": -2.7736085030483082e-05,
      "reward/reward_A2": -0.0011483042035251856,
      "reward/reward_A3": -0.0018421607092022896,
      "rewards/accuracies": 0.6031190156936646,
      "rewards/chosen": 0.0010544664692133665,
      "rewards/margins": 0.002060523722320795,
      "rewards/rejected": -0.001006056903861463,
      "step": 70
    },
    {
      "epoch": 0.04,
      "learning_rate": 2.1164021164021164e-06,
      "loss": 1.1428,
      "loss/mini_gap_loss": 1.1428346633911133,
      "loss/ori_loss": 1.3840487003326416,
      "loss/reward_entrophy": 0.24121394753456116,
      "mask/mask_ratio": 0.4525940418243408,
      "reward/A01_acc": 0.5531250238418579,
      "reward/A02_acc": 0.659375011920929,
      "reward/A03_acc": 0.675000011920929,
      "reward/reward_A0": 0.0015174144646152854,
      "reward/reward_A1": -0.0002732494322117418,
      "reward/reward_A2": -0.0014590247301384807,
      "reward/reward_A3": -0.002397050615400076,
      "rewards/accuracies": 0.6291602849960327,
      "rewards/chosen": 0.0015174144646152854,
      "rewards/margins": 0.002893842523917556,
      "rewards/rejected": -0.0013764279428869486,
      "step": 80
    },
    {
      "epoch": 0.05,
      "learning_rate": 2.380952380952381e-06,
      "loss": 1.1341,
      "loss/mini_gap_loss": 1.134113073348999,
      "loss/ori_loss": 1.3829294443130493,
      "loss/reward_entrophy": 0.24881640076637268,
      "mask/mask_ratio": 0.45713871717453003,
      "reward/A01_acc": 0.574999988079071,
      "reward/A02_acc": 0.640625,
      "reward/A03_acc": 0.690625011920929,
      "reward/reward_A0": 0.0024915661197155714,
      "reward/reward_A1": -3.262766404077411e-05,
      "reward/reward_A2": -0.001902287476696074,
      "reward/reward_A3": -0.003296253038570285,
      "rewards/accuracies": 0.6354103684425354,
      "rewards/chosen": 0.0024915661197155714,
      "rewards/margins": 0.004235271364450455,
      "rewards/rejected": -0.0017437052447348833,
      "step": 90
    },
    {
      "epoch": 0.05,
      "learning_rate": 2.6455026455026455e-06,
      "loss": 1.1592,
      "loss/mini_gap_loss": 1.1591534614562988,
      "loss/ori_loss": 1.381446123123169,
      "loss/reward_entrophy": 0.22229242324829102,
      "mask/mask_ratio": 0.4683295786380768,
      "reward/A01_acc": 0.6312500238418579,
      "reward/A02_acc": 0.668749988079071,
      "reward/A03_acc": 0.7281249761581421,
      "reward/reward_A0": 0.003342908574268222,
      "reward/reward_A1": -0.0012097046710550785,
      "reward/reward_A2": -0.0027560230810195208,
      "reward/reward_A3": -0.005116731859743595,
      "rewards/accuracies": 0.6760349869728088,
      "rewards/chosen": 0.003342908574268222,
      "rewards/margins": 0.006370364688336849,
      "rewards/rejected": -0.0030274561140686274,
      "step": 100
    },
    {
      "epoch": 0.05,
      "eval_loss": 1.1483122110366821,
      "eval_loss/mini_gap_loss": 1.1484355926513672,
      "eval_loss/ori_loss": 1.38108491897583,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 0.0007877232856117189,
      "eval_regularization/policy_data_loss": 1.2692722082138062,
      "eval_regularization/policy_ref_data_loss_gap": 0.0032685401383787394,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.0007838514866307378,
      "eval_reward/A01_acc": 0.5864388942718506,
      "eval_reward/A02_acc": 0.6666666865348816,
      "eval_reward/A03_acc": 0.7204968929290771,
      "eval_reward/reward_A0": 0.003096706001088023,
      "eval_reward/reward_A1": -0.0005257408483885229,
      "eval_reward/reward_A2": -0.0032208659686148167,
      "eval_reward/reward_A3": -0.0065715922974050045,
      "eval_rewards/accuracies": 0.6578609347343445,
      "eval_rewards/chosen": 0.003096706001088023,
      "eval_rewards/margins": 0.00653607165440917,
      "eval_rewards/rejected": -0.0034393654204905033,
      "eval_runtime": 1143.0508,
      "eval_samples_per_second": 1.689,
      "eval_steps_per_second": 0.423,
      "step": 100
    },
    {
      "epoch": 0.06,
      "learning_rate": 2.9100529100529103e-06,
      "loss": 1.1481,
      "loss/mini_gap_loss": 1.148089051246643,
      "loss/ori_loss": 1.3806387186050415,
      "loss/reward_entrophy": 0.23254959285259247,
      "mask/mask_ratio": 0.4441676735877991,
      "reward/A01_acc": 0.596875011920929,
      "reward/A02_acc": 0.668749988079071,
      "reward/A03_acc": 0.753125011920929,
      "reward/reward_A0": 0.003329743165522814,
      "reward/reward_A1": -0.0019971313886344433,
      "reward/reward_A2": -0.003069226397201419,
      "reward/reward_A3": -0.007622469216585159,
      "rewards/accuracies": 0.6729099154472351,
      "rewards/chosen": 0.003329743165522814,
      "rewards/margins": 0.007559309713542461,
      "rewards/rejected": -0.004229567013680935,
      "step": 110
    },
    {
      "epoch": 0.06,
      "learning_rate": 3.1746031746031746e-06,
      "loss": 1.1729,
      "loss/mini_gap_loss": 1.172925353050232,
      "loss/ori_loss": 1.378214955329895,
      "loss/reward_entrophy": 0.20528948307037354,
      "mask/mask_ratio": 0.44959086179733276,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.6625000238418579,
      "reward/A03_acc": 0.7593749761581421,
      "reward/reward_A0": 0.004024973139166832,
      "reward/reward_A1": -0.002336194971576333,
      "reward/reward_A2": -0.007239366415888071,
      "reward/reward_A3": -0.011439996771514416,
      "rewards/accuracies": 0.6749932765960693,
      "rewards/chosen": 0.004024973139166832,
      "rewards/margins": 0.011030088178813457,
      "rewards/rejected": -0.007005115505307913,
      "step": 120
    },
    {
      "epoch": 0.07,
      "learning_rate": 3.4391534391534394e-06,
      "loss": 1.1446,
      "loss/mini_gap_loss": 1.1445523500442505,
      "loss/ori_loss": 1.376039981842041,
      "loss/reward_entrophy": 0.2314877212047577,
      "mask/mask_ratio": 0.45006194710731506,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.668749988079071,
      "reward/A03_acc": 0.768750011920929,
      "reward/reward_A0": 0.004440720193088055,
      "reward/reward_A1": -0.0030044266022741795,
      "reward/reward_A2": -0.007617408875375986,
      "reward/reward_A3": -0.015090301632881165,
      "rewards/accuracies": 0.6812432408332825,
      "rewards/chosen": 0.004440720193088055,
      "rewards/margins": 0.013011346571147442,
      "rewards/rejected": -0.008570625446736813,
      "step": 130
    },
    {
      "epoch": 0.07,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 1.1457,
      "loss/mini_gap_loss": 1.1457185745239258,
      "loss/ori_loss": 1.372668981552124,
      "loss/reward_entrophy": 0.22695031762123108,
      "mask/mask_ratio": 0.4774394929409027,
      "reward/A01_acc": 0.6000000238418579,
      "reward/A02_acc": 0.684374988079071,
      "reward/A03_acc": 0.7437499761581421,
      "reward/reward_A0": 0.004974964540451765,
      "reward/reward_A1": -0.007659500930458307,
      "reward/reward_A2": -0.014985652640461922,
      "reward/reward_A3": -0.02094295620918274,
      "rewards/accuracies": 0.6760349273681641,
      "rewards/chosen": 0.004974964540451765,
      "rewards/margins": 0.019504185765981674,
      "rewards/rejected": -0.014529223553836346,
      "step": 140
    },
    {
      "epoch": 0.08,
      "learning_rate": 3.968253968253968e-06,
      "loss": 1.1192,
      "loss/mini_gap_loss": 1.119193434715271,
      "loss/ori_loss": 1.366742730140686,
      "loss/reward_entrophy": 0.24754932522773743,
      "mask/mask_ratio": 0.4677630364894867,
      "reward/A01_acc": 0.5625,
      "reward/A02_acc": 0.6781250238418579,
      "reward/A03_acc": 0.7250000238418579,
      "reward/reward_A0": 0.0006335077923722565,
      "reward/reward_A1": -0.010926964692771435,
      "reward/reward_A2": -0.025047313421964645,
      "reward/reward_A3": -0.03096495009958744,
      "rewards/accuracies": 0.6552018523216248,
      "rewards/chosen": 0.0006335077923722565,
      "rewards/margins": 0.02294636145234108,
      "rewards/rejected": -0.02231285534799099,
      "step": 150
    },
    {
      "epoch": 0.08,
      "learning_rate": 4.232804232804233e-06,
      "loss": 1.1292,
      "loss/mini_gap_loss": 1.1292277574539185,
      "loss/ori_loss": 1.3621454238891602,
      "loss/reward_entrophy": 0.2329176366329193,
      "mask/mask_ratio": 0.4471195340156555,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.6625000238418579,
      "reward/A03_acc": 0.746874988079071,
      "reward/reward_A0": -0.0030881259590387344,
      "reward/reward_A1": -0.02398960292339325,
      "reward/reward_A2": -0.03651643171906471,
      "reward/reward_A3": -0.05081651732325554,
      "rewards/accuracies": 0.6749932765960693,
      "rewards/chosen": -0.0030881259590387344,
      "rewards/margins": 0.034019019454717636,
      "rewards/rejected": -0.03710714355111122,
      "step": 160
    },
    {
      "epoch": 0.09,
      "learning_rate": 4.497354497354498e-06,
      "loss": 1.1019,
      "loss/mini_gap_loss": 1.1018766164779663,
      "loss/ori_loss": 1.3530219793319702,
      "loss/reward_entrophy": 0.2511453330516815,
      "mask/mask_ratio": 0.4702727198600769,
      "reward/A01_acc": 0.6343749761581421,
      "reward/A02_acc": 0.668749988079071,
      "reward/A03_acc": 0.7406250238418579,
      "reward/reward_A0": -0.01095401868224144,
      "reward/reward_A1": -0.04301467910408974,
      "reward/reward_A2": -0.05291104316711426,
      "reward/reward_A3": -0.07674823701381683,
      "rewards/accuracies": 0.6812432408332825,
      "rewards/chosen": -0.01095401868224144,
      "rewards/margins": 0.04660339280962944,
      "rewards/rejected": -0.05755741521716118,
      "step": 170
    },
    {
      "epoch": 0.1,
      "learning_rate": 4.761904761904762e-06,
      "loss": 1.1027,
      "loss/mini_gap_loss": 1.1027108430862427,
      "loss/ori_loss": 1.3415956497192383,
      "loss/reward_entrophy": 0.23888495564460754,
      "mask/mask_ratio": 0.458621084690094,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.6499999761581421,
      "reward/A03_acc": 0.7406250238418579,
      "reward/reward_A0": -0.030702512711286545,
      "reward/reward_A1": -0.06770393997430801,
      "reward/reward_A2": -0.08712705969810486,
      "reward/reward_A3": -0.11711319535970688,
      "rewards/accuracies": 0.6645767688751221,
      "rewards/chosen": -0.030702512711286545,
      "rewards/margins": 0.05994465947151184,
      "rewards/rejected": -0.09064716845750809,
      "step": 180
    },
    {
      "epoch": 0.1,
      "learning_rate": 4.999995695767548e-06,
      "loss": 1.1337,
      "loss/mini_gap_loss": 1.133699893951416,
      "loss/ori_loss": 1.3330219984054565,
      "loss/reward_entrophy": 0.1993221640586853,
      "mask/mask_ratio": 0.4486338198184967,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.721875011920929,
      "reward/reward_A0": -0.06198056414723396,
      "reward/reward_A1": -0.11920014768838882,
      "reward/reward_A2": -0.14645102620124817,
      "reward/reward_A3": -0.16733619570732117,
      "rewards/accuracies": 0.6781182289123535,
      "rewards/chosen": -0.06198056414723396,
      "rewards/margins": 0.08234710991382599,
      "rewards/rejected": -0.14432767033576965,
      "step": 190
    },
    {
      "epoch": 0.11,
      "learning_rate": 4.999479205806641e-06,
      "loss": 1.0838,
      "loss/mini_gap_loss": 1.0838396549224854,
      "loss/ori_loss": 1.3189235925674438,
      "loss/reward_entrophy": 0.23508372902870178,
      "mask/mask_ratio": 0.44241079688072205,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.6656249761581421,
      "reward/A03_acc": 0.762499988079071,
      "reward/reward_A0": -0.11501292884349823,
      "reward/reward_A1": -0.17026112973690033,
      "reward/reward_A2": -0.21682190895080566,
      "reward/reward_A3": -0.2764994204044342,
      "rewards/accuracies": 0.6812433004379272,
      "rewards/chosen": -0.11501292884349823,
      "rewards/margins": 0.10617899894714355,
      "rewards/rejected": -0.22119192779064178,
      "step": 200
    },
    {
      "epoch": 0.11,
      "eval_loss": 1.0771942138671875,
      "eval_loss/mini_gap_loss": 1.0773193836212158,
      "eval_loss/ori_loss": 1.3099685907363892,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 0.1509634405374527,
      "eval_regularization/policy_data_loss": 1.4842382669448853,
      "eval_regularization/policy_ref_data_loss_gap": 0.21823477745056152,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.12646323442459106,
      "eval_reward/A01_acc": 0.6040372848510742,
      "eval_reward/A02_acc": 0.6697722673416138,
      "eval_reward/A03_acc": 0.7080745100975037,
      "eval_reward/reward_A0": -0.1489766389131546,
      "eval_reward/reward_A1": -0.21975569427013397,
      "eval_reward/reward_A2": -0.26389676332473755,
      "eval_reward/reward_A3": -0.3184634745121002,
      "eval_rewards/accuracies": 0.6606214046478271,
      "eval_rewards/chosen": -0.1489766389131546,
      "eval_rewards/margins": 0.11839266866445541,
      "eval_rewards/rejected": -0.2673693001270294,
      "eval_runtime": 1141.8872,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 200
    },
    {
      "epoch": 0.11,
      "learning_rate": 4.998102073134384e-06,
      "loss": 1.0548,
      "loss/mini_gap_loss": 1.0548268556594849,
      "loss/ori_loss": 1.3079754114151,
      "loss/reward_entrophy": 0.2531485855579376,
      "mask/mask_ratio": 0.43034273386001587,
      "reward/A01_acc": 0.559374988079071,
      "reward/A02_acc": 0.684374988079071,
      "reward/A03_acc": 0.7437499761581421,
      "reward/reward_A0": -0.17067770659923553,
      "reward/reward_A1": -0.22951173782348633,
      "reward/reward_A2": -0.3054881691932678,
      "reward/reward_A3": -0.3474501967430115,
      "rewards/accuracies": 0.6624933481216431,
      "rewards/chosen": -0.17067770659923553,
      "rewards/margins": 0.12346938997507095,
      "rewards/rejected": -0.2941471040248871,
      "step": 210
    },
    {
      "epoch": 0.12,
      "learning_rate": 4.995864771937239e-06,
      "loss": 1.0244,
      "loss/mini_gap_loss": 1.0243996381759644,
      "loss/ori_loss": 1.2724800109863281,
      "loss/reward_entrophy": 0.24808025360107422,
      "mask/mask_ratio": 0.4437647759914398,
      "reward/A01_acc": 0.653124988079071,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.737500011920929,
      "reward/reward_A0": -0.23702342808246613,
      "reward/reward_A1": -0.3726288676261902,
      "reward/reward_A2": -0.4514033794403076,
      "reward/reward_A3": -0.48706427216529846,
      "rewards/accuracies": 0.7062430381774902,
      "rewards/chosen": -0.23702342808246613,
      "rewards/margins": 0.20000441372394562,
      "rewards/rejected": -0.43702784180641174,
      "step": 220
    },
    {
      "epoch": 0.12,
      "learning_rate": 4.992768072582473e-06,
      "loss": 1.083,
      "loss/mini_gap_loss": 1.0830028057098389,
      "loss/ori_loss": 1.2801588773727417,
      "loss/reward_entrophy": 0.19715605676174164,
      "mask/mask_ratio": 0.4560978412628174,
      "reward/A01_acc": 0.590624988079071,
      "reward/A02_acc": 0.6343749761581421,
      "reward/A03_acc": 0.699999988079071,
      "reward/reward_A0": -0.39179345965385437,
      "reward/reward_A1": -0.5065832138061523,
      "reward/reward_A2": -0.5875496864318848,
      "reward/reward_A3": -0.673936665058136,
      "rewards/accuracies": 0.6416603326797485,
      "rewards/chosen": -0.39179345965385437,
      "rewards/margins": 0.19755719602108002,
      "rewards/rejected": -0.5893506407737732,
      "step": 230
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.988813041352904e-06,
      "loss": 1.0493,
      "loss/mini_gap_loss": 1.0493232011795044,
      "loss/ori_loss": 1.3053315877914429,
      "loss/reward_entrophy": 0.25600844621658325,
      "mask/mask_ratio": 0.45394477248191833,
      "reward/A01_acc": 0.578125,
      "reward/A02_acc": 0.59375,
      "reward/A03_acc": 0.6937500238418579,
      "reward/reward_A0": -0.39473479986190796,
      "reward/reward_A1": -0.4833299517631531,
      "reward/reward_A2": -0.5630390048027039,
      "reward/reward_A3": -0.6872426867485046,
      "rewards/accuracies": 0.6218687295913696,
      "rewards/chosen": -0.39473479986190796,
      "rewards/margins": 0.18312998116016388,
      "rewards/rejected": -0.5778647661209106,
      "step": 240
    },
    {
      "epoch": 0.13,
      "learning_rate": 4.984001040079745e-06,
      "loss": 1.0656,
      "loss/mini_gap_loss": 1.0656225681304932,
      "loss/ori_loss": 1.2859117984771729,
      "loss/reward_entrophy": 0.2202892303466797,
      "mask/mask_ratio": 0.47816920280456543,
      "reward/A01_acc": 0.6187499761581421,
      "reward/A02_acc": 0.640625,
      "reward/A03_acc": 0.737500011920929,
      "reward/reward_A0": -0.3241721987724304,
      "reward/reward_A1": -0.4337772727012634,
      "reward/reward_A2": -0.519018292427063,
      "reward/reward_A3": -0.6774327158927917,
      "rewards/accuracies": 0.665618360042572,
      "rewards/chosen": -0.3241721987724304,
      "rewards/margins": 0.21923179924488068,
      "rewards/rejected": -0.5434039235115051,
      "step": 250
    },
    {
      "epoch": 0.14,
      "learning_rate": 4.978333725673691e-06,
      "loss": 1.0582,
      "loss/mini_gap_loss": 1.0581995248794556,
      "loss/ori_loss": 1.299250602722168,
      "loss/reward_entrophy": 0.24105104804039001,
      "mask/mask_ratio": 0.4484889507293701,
      "reward/A01_acc": 0.5843750238418579,
      "reward/A02_acc": 0.628125011920929,
      "reward/A03_acc": 0.737500011920929,
      "reward/reward_A0": -0.3335839807987213,
      "reward/reward_A1": -0.3942530155181885,
      "reward/reward_A2": -0.4768710732460022,
      "reward/reward_A3": -0.6270440816879272,
      "rewards/accuracies": 0.6499935388565063,
      "rewards/chosen": -0.3335839807987213,
      "rewards/margins": 0.16580040752887726,
      "rewards/rejected": -0.49938440322875977,
      "step": 260
    },
    {
      "epoch": 0.14,
      "learning_rate": 4.97181304955439e-06,
      "loss": 1.0417,
      "loss/mini_gap_loss": 1.0416896343231201,
      "loss/ori_loss": 1.2627148628234863,
      "loss/reward_entrophy": 0.22102534770965576,
      "mask/mask_ratio": 0.44621172547340393,
      "reward/A01_acc": 0.596875011920929,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.75,
      "reward/reward_A0": -0.3230935037136078,
      "reward/reward_A1": -0.4606807827949524,
      "reward/reward_A2": -0.5572239756584167,
      "reward/reward_A3": -0.6876312494277954,
      "rewards/accuracies": 0.6885348558425903,
      "rewards/chosen": -0.3230935037136078,
      "rewards/margins": 0.24541282653808594,
      "rewards/rejected": -0.5685063600540161,
      "step": 270
    },
    {
      "epoch": 0.15,
      "learning_rate": 4.964441256978517e-06,
      "loss": 1.017,
      "loss/mini_gap_loss": 1.0169792175292969,
      "loss/ori_loss": 1.2477308511734009,
      "loss/reward_entrophy": 0.23075155913829803,
      "mask/mask_ratio": 0.456037700176239,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.34196653962135315,
      "reward/reward_A1": -0.5081108212471008,
      "reward/reward_A2": -0.6040583848953247,
      "reward/reward_A3": -0.7474610805511475,
      "rewards/accuracies": 0.7093678712844849,
      "rewards/chosen": -0.34196653962135315,
      "rewards/margins": 0.27790406346321106,
      "rewards/rejected": -0.6198705434799194,
      "step": 280
    },
    {
      "epoch": 0.15,
      "learning_rate": 4.956220886266673e-06,
      "loss": 0.9907,
      "loss/mini_gap_loss": 0.9906512498855591,
      "loss/ori_loss": 1.2497670650482178,
      "loss/reward_entrophy": 0.25911587476730347,
      "mask/mask_ratio": 0.4533205032348633,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.671875,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.4096229672431946,
      "reward/reward_A1": -0.5659546852111816,
      "reward/reward_A2": -0.7472653388977051,
      "reward/reward_A3": -0.8127982020378113,
      "rewards/accuracies": 0.6885347962379456,
      "rewards/chosen": -0.4096229672431946,
      "rewards/margins": 0.2990427017211914,
      "rewards/rejected": -0.708665668964386,
      "step": 290
    },
    {
      "epoch": 0.16,
      "learning_rate": 4.947154767929356e-06,
      "loss": 1.0427,
      "loss/mini_gap_loss": 1.042747974395752,
      "loss/ori_loss": 1.2500406503677368,
      "loss/reward_entrophy": 0.20729270577430725,
      "mask/mask_ratio": 0.441511869430542,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.675000011920929,
      "reward/A03_acc": 0.7250000238418579,
      "reward/reward_A0": -0.4927302300930023,
      "reward/reward_A1": -0.6570430994033813,
      "reward/reward_A2": -0.8242027163505554,
      "reward/reward_A3": -0.9096619486808777,
      "rewards/accuracies": 0.6677016615867615,
      "rewards/chosen": -0.4927302300930023,
      "rewards/margins": 0.30423104763031006,
      "rewards/rejected": -0.7969613075256348,
      "step": 300
    },
    {
      "epoch": 0.16,
      "eval_loss": 1.0091421604156494,
      "eval_loss/mini_gap_loss": 1.0092347860336304,
      "eval_loss/ori_loss": 1.2418839931488037,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 0.5873395204544067,
      "eval_regularization/policy_data_loss": 1.8853754997253418,
      "eval_regularization/policy_ref_data_loss_gap": 0.6193717122077942,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.4077347218990326,
      "eval_reward/A01_acc": 0.6195651888847351,
      "eval_reward/A02_acc": 0.6744306683540344,
      "eval_reward/A03_acc": 0.7360248565673828,
      "eval_reward/reward_A0": -0.4752160906791687,
      "eval_reward/reward_A1": -0.6617422699928284,
      "eval_reward/reward_A2": -0.7888895869255066,
      "eval_reward/reward_A3": -0.949398934841156,
      "eval_rewards/accuracies": 0.6766667366027832,
      "eval_rewards/chosen": -0.4752160906791687,
      "eval_rewards/margins": 0.32478612661361694,
      "eval_rewards/rejected": -0.8000022768974304,
      "eval_runtime": 1142.7073,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 300
    },
    {
      "epoch": 0.16,
      "learning_rate": 4.937246023692343e-06,
      "loss": 1.0114,
      "loss/mini_gap_loss": 1.011406660079956,
      "loss/ori_loss": 1.2443242073059082,
      "loss/reward_entrophy": 0.2329176664352417,
      "mask/mask_ratio": 0.43483877182006836,
      "reward/A01_acc": 0.6312500238418579,
      "reward/A02_acc": 0.6468750238418579,
      "reward/A03_acc": 0.784375011920929,
      "reward/reward_A0": -0.4514276087284088,
      "reward/reward_A1": -0.621374249458313,
      "reward/reward_A2": -0.767790675163269,
      "reward/reward_A3": -0.9453509449958801,
      "rewards/accuracies": 0.687493085861206,
      "rewards/chosen": -0.4514276087284088,
      "rewards/margins": 0.32673656940460205,
      "rewards/rejected": -0.778164267539978,
      "step": 310
    },
    {
      "epoch": 0.17,
      "learning_rate": 4.926498065421791e-06,
      "loss": 0.9961,
      "loss/mini_gap_loss": 0.9960936307907104,
      "loss/ori_loss": 1.2488741874694824,
      "loss/reward_entrophy": 0.2527804970741272,
      "mask/mask_ratio": 0.4733213484287262,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.6781250238418579,
      "reward/A03_acc": 0.7593749761581421,
      "reward/reward_A0": -0.46273964643478394,
      "reward/reward_A1": -0.645828127861023,
      "reward/reward_A2": -0.7839781045913696,
      "reward/reward_A3": -0.9258670806884766,
      "rewards/accuracies": 0.6833265423774719,
      "rewards/chosen": -0.46273964643478394,
      "rewards/margins": 0.32247692346572876,
      "rewards/rejected": -0.7852166295051575,
      "step": 320
    },
    {
      "epoch": 0.18,
      "learning_rate": 4.914914593949426e-06,
      "loss": 0.9635,
      "loss/mini_gap_loss": 0.9634878039360046,
      "loss/ori_loss": 1.2068678140640259,
      "loss/reward_entrophy": 0.24337999522686005,
      "mask/mask_ratio": 0.45655718445777893,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.768750011920929,
      "reward/reward_A0": -0.42338424921035767,
      "reward/reward_A1": -0.654930055141449,
      "reward/reward_A2": -0.8409613370895386,
      "reward/reward_A3": -0.9505659341812134,
      "rewards/accuracies": 0.7156178951263428,
      "rewards/chosen": -0.42338424921035767,
      "rewards/margins": 0.3920934200286865,
      "rewards/rejected": -0.8154776692390442,
      "step": 330
    },
    {
      "epoch": 0.18,
      "learning_rate": 4.902499597798246e-06,
      "loss": 0.9766,
      "loss/mini_gap_loss": 0.9766014218330383,
      "loss/ori_loss": 1.215649127960205,
      "loss/reward_entrophy": 0.2390478104352951,
      "mask/mask_ratio": 0.45488548278808594,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.7906249761581421,
      "reward/reward_A0": -0.5466843843460083,
      "reward/reward_A1": -0.7560637593269348,
      "reward/reward_A2": -0.8990615010261536,
      "reward/reward_A3": -1.1720495223999023,
      "rewards/accuracies": 0.7166595458984375,
      "rewards/chosen": -0.5466843843460083,
      "rewards/margins": 0.3956977128982544,
      "rewards/rejected": -0.9423821568489075,
      "step": 340
    },
    {
      "epoch": 0.19,
      "learning_rate": 4.889257351809156e-06,
      "loss": 1.0241,
      "loss/mini_gap_loss": 1.0240620374679565,
      "loss/ori_loss": 1.228615403175354,
      "loss/reward_entrophy": 0.20455333590507507,
      "mask/mask_ratio": 0.46685218811035156,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.6812499761581421,
      "reward/A03_acc": 0.7562500238418579,
      "reward/reward_A0": -0.6056363582611084,
      "reward/reward_A1": -0.8125576972961426,
      "reward/reward_A2": -0.9080101847648621,
      "reward/reward_A3": -1.168592929840088,
      "rewards/accuracies": 0.6812432408332825,
      "rewards/chosen": -0.6056363582611084,
      "rewards/margins": 0.3574075400829315,
      "rewards/rejected": -0.9630439877510071,
      "step": 350
    },
    {
      "epoch": 0.19,
      "learning_rate": 4.875192415669014e-06,
      "loss": 1.009,
      "loss/mini_gap_loss": 1.009019136428833,
      "loss/ori_loss": 1.222973108291626,
      "loss/reward_entrophy": 0.21395382285118103,
      "mask/mask_ratio": 0.4303799271583557,
      "reward/A01_acc": 0.6000000238418579,
      "reward/A02_acc": 0.6875,
      "reward/A03_acc": 0.793749988079071,
      "reward/reward_A0": -0.5146493911743164,
      "reward/reward_A1": -0.6888226270675659,
      "reward/reward_A2": -0.8788517117500305,
      "reward/reward_A3": -1.1491509675979614,
      "rewards/accuracies": 0.693743109703064,
      "rewards/chosen": -0.5146493911743164,
      "rewards/margins": 0.3909500539302826,
      "rewards/rejected": -0.9055994153022766,
      "step": 360
    },
    {
      "epoch": 0.2,
      "learning_rate": 4.860309632340608e-06,
      "loss": 1.0358,
      "loss/mini_gap_loss": 1.0358043909072876,
      "loss/ori_loss": 1.2199641466140747,
      "loss/reward_entrophy": 0.1841595619916916,
      "mask/mask_ratio": 0.4581855833530426,
      "reward/A01_acc": 0.596875011920929,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.5426836609840393,
      "reward/reward_A1": -0.7493712306022644,
      "reward/reward_A2": -0.9741304516792297,
      "reward/reward_A3": -1.0966941118240356,
      "rewards/accuracies": 0.6947847604751587,
      "rewards/chosen": -0.5426836609840393,
      "rewards/margins": 0.3973722755908966,
      "rewards/rejected": -0.9400560259819031,
      "step": 370
    },
    {
      "epoch": 0.2,
      "learning_rate": 4.844614126395074e-06,
      "loss": 0.9982,
      "loss/mini_gap_loss": 0.998186469078064,
      "loss/ori_loss": 1.2242377996444702,
      "loss/reward_entrophy": 0.22605133056640625,
      "mask/mask_ratio": 0.4682645797729492,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.762499988079071,
      "reward/reward_A0": -0.515438973903656,
      "reward/reward_A1": -0.6721990704536438,
      "reward/reward_A2": -0.8983248472213745,
      "reward/reward_A3": -1.0949065685272217,
      "rewards/accuracies": 0.7114512324333191,
      "rewards/chosen": -0.515438973903656,
      "rewards/margins": 0.37302905321121216,
      "rewards/rejected": -0.8884679675102234,
      "step": 380
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.828111302247363e-06,
      "loss": 0.9829,
      "loss/mini_gap_loss": 0.9829089045524597,
      "loss/ori_loss": 1.199191689491272,
      "loss/reward_entrophy": 0.21628277003765106,
      "mask/mask_ratio": 0.4407065510749817,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.737500011920929,
      "reward/A03_acc": 0.784375011920929,
      "reward/reward_A0": -0.49949589371681213,
      "reward/reward_A1": -0.6741065382957458,
      "reward/reward_A2": -0.9367591142654419,
      "reward/reward_A3": -1.1603825092315674,
      "rewards/accuracies": 0.7093678712844849,
      "rewards/chosen": -0.49949589371681213,
      "rewards/margins": 0.424244225025177,
      "rewards/rejected": -0.9237400889396667,
      "step": 390
    },
    {
      "epoch": 0.21,
      "learning_rate": 4.810806842295349e-06,
      "loss": 0.9666,
      "loss/mini_gap_loss": 0.9666471481323242,
      "loss/ori_loss": 1.229196310043335,
      "loss/reward_entrophy": 0.2625490725040436,
      "mask/mask_ratio": 0.46905335783958435,
      "reward/A01_acc": 0.574999988079071,
      "reward/A02_acc": 0.699999988079071,
      "reward/A03_acc": 0.7406250238418579,
      "reward/reward_A0": -0.5871526598930359,
      "reward/reward_A1": -0.7332569360733032,
      "reward/reward_A2": -0.8946301341056824,
      "reward/reward_A3": -1.277956247329712,
      "rewards/accuracies": 0.6718683242797852,
      "rewards/chosen": -0.5871526598930359,
      "rewards/margins": 0.38145214319229126,
      "rewards/rejected": -0.9686048626899719,
      "step": 400
    },
    {
      "epoch": 0.21,
      "eval_loss": 0.9711907505989075,
      "eval_loss/mini_gap_loss": 0.9712932705879211,
      "eval_loss/ori_loss": 1.2039425373077393,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 0.7687075138092041,
      "eval_regularization/policy_data_loss": 2.236077308654785,
      "eval_regularization/policy_ref_data_loss_gap": 0.9700738191604614,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.4464285671710968,
      "eval_reward/A01_acc": 0.6288819909095764,
      "eval_reward/A02_acc": 0.7013457417488098,
      "eval_reward/A03_acc": 0.7675983309745789,
      "eval_reward/reward_A0": -0.5325741171836853,
      "eval_reward/reward_A1": -0.7467907667160034,
      "eval_reward/reward_A2": -0.9238122701644897,
      "eval_reward/reward_A3": -1.1649597883224487,
      "eval_rewards/accuracies": 0.6992684006690979,
      "eval_rewards/chosen": -0.5325741171836853,
      "eval_rewards/margins": 0.41260409355163574,
      "eval_rewards/rejected": -0.9451781511306763,
      "eval_runtime": 1142.4869,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 400
    },
    {
      "epoch": 0.22,
      "learning_rate": 4.792706704963207e-06,
      "loss": 0.9734,
      "loss/mini_gap_loss": 0.9734487533569336,
      "loss/ori_loss": 1.2020342350006104,
      "loss/reward_entrophy": 0.22858548164367676,
      "mask/mask_ratio": 0.45472821593284607,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.7593749761581421,
      "reward/reward_A0": -0.5522108674049377,
      "reward/reward_A1": -0.7848314046859741,
      "reward/reward_A2": -1.0006763935089111,
      "reward/reward_A3": -1.2082939147949219,
      "rewards/accuracies": 0.6958264112472534,
      "rewards/chosen": -0.5522108674049377,
      "rewards/margins": 0.4457131028175354,
      "rewards/rejected": -0.9979238510131836,
      "step": 410
    },
    {
      "epoch": 0.22,
      "learning_rate": 4.773817122649767e-06,
      "loss": 0.9739,
      "loss/mini_gap_loss": 0.9739271402359009,
      "loss/ori_loss": 1.183180809020996,
      "loss/reward_entrophy": 0.20925359427928925,
      "mask/mask_ratio": 0.4537140727043152,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.5377050638198853,
      "reward/reward_A1": -0.8125573992729187,
      "reward/reward_A2": -0.9663890600204468,
      "reward/reward_A3": -1.1753923892974854,
      "rewards/accuracies": 0.7197844386100769,
      "rewards/chosen": -0.5377050638198853,
      "rewards/margins": 0.4470647871494293,
      "rewards/rejected": -0.9847698211669922,
      "step": 420
    },
    {
      "epoch": 0.23,
      "learning_rate": 4.754144599582505e-06,
      "loss": 1.0089,
      "loss/mini_gap_loss": 1.008928656578064,
      "loss/ori_loss": 1.2391493320465088,
      "loss/reward_entrophy": 0.23022063076496124,
      "mask/mask_ratio": 0.44677695631980896,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.675000011920929,
      "reward/A03_acc": 0.793749988079071,
      "reward/reward_A0": -0.600826621055603,
      "reward/reward_A1": -0.8562310338020325,
      "reward/reward_A2": -0.9535056948661804,
      "reward/reward_A3": -1.2299911975860596,
      "rewards/accuracies": 0.6927014589309692,
      "rewards/chosen": -0.600826621055603,
      "rewards/margins": 0.4124060273170471,
      "rewards/rejected": -1.0132325887680054,
      "step": 430
    },
    {
      "epoch": 0.23,
      "learning_rate": 4.733695909577969e-06,
      "loss": 0.9918,
      "loss/mini_gap_loss": 0.9918249249458313,
      "loss/ori_loss": 1.2178761959075928,
      "loss/reward_entrophy": 0.22605130076408386,
      "mask/mask_ratio": 0.4528673589229584,
      "reward/A01_acc": 0.581250011920929,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.5735477209091187,
      "reward/reward_A1": -0.7092422246932983,
      "reward/reward_A2": -0.9832903146743774,
      "reward/reward_A3": -1.2372522354125977,
      "rewards/accuracies": 0.7020763754844666,
      "rewards/chosen": -0.5735477209091187,
      "rewards/margins": 0.4030374586582184,
      "rewards/rejected": -0.9765852093696594,
      "step": 440
    },
    {
      "epoch": 0.24,
      "learning_rate": 4.712478093709339e-06,
      "loss": 0.9844,
      "loss/mini_gap_loss": 0.9843851327896118,
      "loss/ori_loss": 1.2111725807189941,
      "loss/reward_entrophy": 0.2267874777317047,
      "mask/mask_ratio": 0.44743743538856506,
      "reward/A01_acc": 0.596875011920929,
      "reward/A02_acc": 0.690625011920929,
      "reward/A03_acc": 0.7749999761581421,
      "reward/reward_A0": -0.5790210962295532,
      "reward/reward_A1": -0.7799104452133179,
      "reward/reward_A2": -0.9879090189933777,
      "reward/reward_A3": -1.2731367349624634,
      "rewards/accuracies": 0.6874931454658508,
      "rewards/chosen": -0.5790210962295532,
      "rewards/margins": 0.4346209168434143,
      "rewards/rejected": -1.0136420726776123,
      "step": 450
    },
    {
      "epoch": 0.24,
      "learning_rate": 4.690498457881996e-06,
      "loss": 0.9781,
      "loss/mini_gap_loss": 0.9780756235122681,
      "loss/ori_loss": 1.215488076210022,
      "loss/reward_entrophy": 0.23741266131401062,
      "mask/mask_ratio": 0.45294028520584106,
      "reward/A01_acc": 0.581250011920929,
      "reward/A02_acc": 0.699999988079071,
      "reward/A03_acc": 0.78125,
      "reward/reward_A0": -0.6491990089416504,
      "reward/reward_A1": -0.8428533673286438,
      "reward/reward_A2": -1.0364949703216553,
      "reward/reward_A3": -1.3164191246032715,
      "rewards/accuracies": 0.6874932050704956,
      "rewards/chosen": -0.6491990089416504,
      "rewards/margins": 0.41604623198509216,
      "rewards/rejected": -1.065245270729065,
      "step": 460
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.667764570317885e-06,
      "loss": 0.9784,
      "loss/mini_gap_loss": 0.9783965349197388,
      "loss/ori_loss": 1.2214086055755615,
      "loss/reward_entrophy": 0.24301192164421082,
      "mask/mask_ratio": 0.437546968460083,
      "reward/A01_acc": 0.550000011920929,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.762499988079071,
      "reward/reward_A0": -0.537890613079071,
      "reward/reward_A1": -0.6226123571395874,
      "reward/reward_A2": -1.0026746988296509,
      "reward/reward_A3": -1.156337022781372,
      "rewards/accuracies": 0.6802015900611877,
      "rewards/chosen": -0.537890613079071,
      "rewards/margins": 0.3893081247806549,
      "rewards/rejected": -0.9271987676620483,
      "step": 470
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.6442842589495544e-06,
      "loss": 1.005,
      "loss/mini_gap_loss": 1.0050337314605713,
      "loss/ori_loss": 1.1964277029037476,
      "loss/reward_entrophy": 0.19139397144317627,
      "mask/mask_ratio": 0.45219412446022034,
      "reward/A01_acc": 0.6312500238418579,
      "reward/A02_acc": 0.6937500238418579,
      "reward/A03_acc": 0.78125,
      "reward/reward_A0": -0.5010525584220886,
      "reward/reward_A1": -0.7348469495773315,
      "reward/reward_A2": -0.876773476600647,
      "reward/reward_A3": -1.1304813623428345,
      "rewards/accuracies": 0.7020763158798218,
      "rewards/chosen": -0.5010525584220886,
      "rewards/margins": 0.4129721522331238,
      "rewards/rejected": -0.9140247106552124,
      "step": 480
    },
    {
      "epoch": 0.26,
      "learning_rate": 4.620065608724777e-06,
      "loss": 0.9983,
      "loss/mini_gap_loss": 0.998252272605896,
      "loss/ori_loss": 1.2089358568191528,
      "loss/reward_entrophy": 0.21068353950977325,
      "mask/mask_ratio": 0.45923057198524475,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.7718750238418579,
      "reward/reward_A0": -0.5660179257392883,
      "reward/reward_A1": -0.7702693343162537,
      "reward/reward_A2": -1.0262346267700195,
      "reward/reward_A3": -1.1961078643798828,
      "rewards/accuracies": 0.6979097127914429,
      "rewards/chosen": -0.5660179257392883,
      "rewards/margins": 0.431509405374527,
      "rewards/rejected": -0.9975274205207825,
      "step": 490
    },
    {
      "epoch": 0.27,
      "learning_rate": 4.595116958822672e-06,
      "loss": 0.984,
      "loss/mini_gap_loss": 0.9840449094772339,
      "loss/ori_loss": 1.218965768814087,
      "loss/reward_entrophy": 0.23492088913917542,
      "mask/mask_ratio": 0.4441341459751129,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.6625000238418579,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.5911238789558411,
      "reward/reward_A1": -0.8028348684310913,
      "reward/reward_A2": -0.9396981000900269,
      "reward/reward_A3": -1.3013564348220825,
      "rewards/accuracies": 0.6916597485542297,
      "rewards/chosen": -0.5911238789558411,
      "rewards/margins": 0.4234958589076996,
      "rewards/rejected": -1.0146197080612183,
      "step": 500
    },
    {
      "epoch": 0.27,
      "eval_loss": 0.9522699117660522,
      "eval_loss/mini_gap_loss": 0.9523714780807495,
      "eval_loss/ori_loss": 1.1850208044052124,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 0.8699195981025696,
      "eval_regularization/policy_data_loss": 2.4013259410858154,
      "eval_regularization/policy_ref_data_loss_gap": 1.1353222131729126,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.4758760631084442,
      "eval_reward/A01_acc": 0.6242235898971558,
      "eval_reward/A02_acc": 0.7034161686897278,
      "eval_reward/A03_acc": 0.7831262946128845,
      "eval_reward/reward_A0": -0.5792595744132996,
      "eval_reward/reward_A1": -0.8081175088882446,
      "eval_reward/reward_A2": -1.013391375541687,
      "eval_reward/reward_A3": -1.2918646335601807,
      "eval_rewards/accuracies": 0.7035816311836243,
      "eval_rewards/chosen": -0.5792595744132996,
      "eval_rewards/margins": 0.4585212469100952,
      "eval_rewards/rejected": -1.0377808809280396,
      "eval_runtime": 1142.3842,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 500
    },
    {
      "epoch": 0.27,
      "learning_rate": 4.569446899782275e-06,
      "loss": 0.9635,
      "loss/mini_gap_loss": 0.9634801745414734,
      "loss/ori_loss": 1.1855673789978027,
      "loss/reward_entrophy": 0.22208721935749054,
      "mask/mask_ratio": 0.4513009488582611,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8031250238418579,
      "reward/reward_A0": -0.5696190595626831,
      "reward/reward_A1": -0.7533131241798401,
      "reward/reward_A2": -1.0570814609527588,
      "reward/reward_A3": -1.2387502193450928,
      "rewards/accuracies": 0.7166595458984375,
      "rewards/chosen": -0.5696190595626831,
      "rewards/margins": 0.4467523694038391,
      "rewards/rejected": -1.016371488571167,
      "step": 510
    },
    {
      "epoch": 0.28,
      "learning_rate": 4.543064270544583e-06,
      "loss": 0.9045,
      "loss/mini_gap_loss": 0.9044593572616577,
      "loss/ori_loss": 1.1572397947311401,
      "loss/reward_entrophy": 0.2527804970741272,
      "mask/mask_ratio": 0.46794748306274414,
      "reward/A01_acc": 0.6812499761581421,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.840624988079071,
      "reward/reward_A0": -0.5594173669815063,
      "reward/reward_A1": -0.8432528376579285,
      "reward/reward_A2": -1.0961835384368896,
      "reward/reward_A3": -1.352468490600586,
      "rewards/accuracies": 0.7447842359542847,
      "rewards/chosen": -0.5594173669815063,
      "rewards/margins": 0.537873387336731,
      "rewards/rejected": -1.0972907543182373,
      "step": 520
    },
    {
      "epoch": 0.28,
      "learning_rate": 4.5159781554090366e-06,
      "loss": 0.9423,
      "loss/mini_gap_loss": 0.9423474073410034,
      "loss/ori_loss": 1.1835613250732422,
      "loss/reward_entrophy": 0.24121391773223877,
      "mask/mask_ratio": 0.441779762506485,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.7875000238418579,
      "reward/reward_A0": -0.6580714583396912,
      "reward/reward_A1": -0.905372142791748,
      "reward/reward_A2": -1.1648839712142944,
      "reward/reward_A3": -1.3159373998641968,
      "rewards/accuracies": 0.7093679308891296,
      "rewards/chosen": -0.6580714583396912,
      "rewards/margins": 0.47064852714538574,
      "rewards/rejected": -1.1287199258804321,
      "step": 530
    },
    {
      "epoch": 0.29,
      "learning_rate": 4.488197880905546e-06,
      "loss": 0.964,
      "loss/mini_gap_loss": 0.9639832377433777,
      "loss/ori_loss": 1.2140666246414185,
      "loss/reward_entrophy": 0.25008347630500793,
      "mask/mask_ratio": 0.4769526422023773,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.668749988079071,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.7397810816764832,
      "reward/reward_A1": -0.9682385325431824,
      "reward/reward_A2": -1.1592479944229126,
      "reward/reward_A3": -1.454517126083374,
      "rewards/accuracies": 0.6854099035263062,
      "rewards/chosen": -0.7397810816764832,
      "rewards/margins": 0.4542081952095032,
      "rewards/rejected": -1.1939892768859863,
      "step": 540
    },
    {
      "epoch": 0.29,
      "learning_rate": 4.459733012583094e-06,
      "loss": 0.9911,
      "loss/mini_gap_loss": 0.991104245185852,
      "loss/ori_loss": 1.1980289220809937,
      "loss/reward_entrophy": 0.20692463219165802,
      "mask/mask_ratio": 0.47903475165367126,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.6994279623031616,
      "reward/reward_A1": -0.9433349370956421,
      "reward/reward_A2": -1.1847262382507324,
      "reward/reward_A3": -1.453439474105835,
      "rewards/accuracies": 0.7104095816612244,
      "rewards/chosen": -0.6994279623031616,
      "rewards/margins": 0.4943936765193939,
      "rewards/rejected": -1.193821668624878,
      "step": 550
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.430593351716037e-06,
      "loss": 0.9446,
      "loss/mini_gap_loss": 0.9445670247077942,
      "loss/ori_loss": 1.172784447669983,
      "loss/reward_entrophy": 0.22821743786334991,
      "mask/mask_ratio": 0.45040836930274963,
      "reward/A01_acc": 0.6000000238418579,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.824999988079071,
      "reward/reward_A0": -0.678536593914032,
      "reward/reward_A1": -0.9556156396865845,
      "reward/reward_A2": -1.2308508157730103,
      "reward/reward_A3": -1.533501148223877,
      "rewards/accuracies": 0.7166595458984375,
      "rewards/chosen": -0.678536593914032,
      "rewards/margins": 0.5614401698112488,
      "rewards/rejected": -1.2399767637252808,
      "step": 560
    },
    {
      "epoch": 0.3,
      "learning_rate": 4.400788931929254e-06,
      "loss": 0.9842,
      "loss/mini_gap_loss": 0.9842472076416016,
      "loss/ori_loss": 1.2199041843414307,
      "loss/reward_entrophy": 0.23565702140331268,
      "mask/mask_ratio": 0.45239463448524475,
      "reward/A01_acc": 0.643750011920929,
      "reward/A02_acc": 0.6875,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.6992251873016357,
      "reward/reward_A1": -0.9362695813179016,
      "reward/reward_A2": -1.1078832149505615,
      "reward/reward_A3": -1.4563452005386353,
      "rewards/accuracies": 0.7145761847496033,
      "rewards/chosen": -0.6992251873016357,
      "rewards/margins": 0.46759581565856934,
      "rewards/rejected": -1.1668208837509155,
      "step": 570
    },
    {
      "epoch": 0.31,
      "learning_rate": 4.370330015743269e-06,
      "loss": 0.9258,
      "loss/mini_gap_loss": 0.9257532358169556,
      "loss/ori_loss": 1.189732313156128,
      "loss/reward_entrophy": 0.2639789879322052,
      "mask/mask_ratio": 0.46364179253578186,
      "reward/A01_acc": 0.581250011920929,
      "reward/A02_acc": 0.659375011920929,
      "reward/A03_acc": 0.800000011920929,
      "reward/reward_A0": -0.7443928718566895,
      "reward/reward_A1": -0.9076956510543823,
      "reward/reward_A2": -1.1452367305755615,
      "reward/reward_A3": -1.5802555084228516,
      "rewards/accuracies": 0.6802015900611877,
      "rewards/chosen": -0.7443928718566895,
      "rewards/margins": 0.4666576385498047,
      "rewards/rejected": -1.2110505104064941,
      "step": 580
    },
    {
      "epoch": 0.31,
      "learning_rate": 4.33922709104058e-06,
      "loss": 0.9339,
      "loss/mini_gap_loss": 0.9339002370834351,
      "loss/ori_loss": 1.1684529781341553,
      "loss/reward_entrophy": 0.23455281555652618,
      "mask/mask_ratio": 0.44735345244407654,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.675000011920929,
      "reward/A03_acc": 0.78125,
      "reward/reward_A0": -0.7243441939353943,
      "reward/reward_A1": -1.0683943033218384,
      "reward/reward_A2": -1.1853992938995361,
      "reward/reward_A3": -1.5182180404663086,
      "rewards/accuracies": 0.6979097127914429,
      "rewards/chosen": -0.7243441939353943,
      "rewards/margins": 0.5329803824424744,
      "rewards/rejected": -1.2573245763778687,
      "step": 590
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.3074908674543695e-06,
      "loss": 1.0017,
      "loss/mini_gap_loss": 1.0016663074493408,
      "loss/ori_loss": 1.20769202709198,
      "loss/reward_entrophy": 0.206025630235672,
      "mask/mask_ratio": 0.4590482711791992,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.6968749761581421,
      "reward/A03_acc": 0.78125,
      "reward/reward_A0": -0.8391634821891785,
      "reward/reward_A1": -1.068449854850769,
      "reward/reward_A2": -1.295798897743225,
      "reward/reward_A3": -1.6111853122711182,
      "rewards/accuracies": 0.6979097127914429,
      "rewards/chosen": -0.8391634821891785,
      "rewards/margins": 0.4859679639339447,
      "rewards/rejected": -1.3251314163208008,
      "step": 600
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.936708927154541,
      "eval_loss/mini_gap_loss": 0.9367876648902893,
      "eval_loss/ori_loss": 1.1694368124008179,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.0543571710586548,
      "eval_regularization/policy_data_loss": 2.6902899742126465,
      "eval_regularization/policy_ref_data_loss_gap": 1.4242863655090332,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.6108787059783936,
      "eval_reward/A01_acc": 0.6413043737411499,
      "eval_reward/A02_acc": 0.7091097235679626,
      "eval_reward/A03_acc": 0.783643901348114,
      "eval_reward/reward_A0": -0.7540619373321533,
      "eval_reward/reward_A1": -1.0241268873214722,
      "eval_reward/reward_A2": -1.2660417556762695,
      "eval_reward/reward_A3": -1.576935887336731,
      "eval_rewards/accuracies": 0.7113455533981323,
      "eval_rewards/chosen": -0.7540619373321533,
      "eval_rewards/margins": 0.5349600315093994,
      "eval_rewards/rejected": -1.2890218496322632,
      "eval_runtime": 1142.302,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 600
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.275132272680877e-06,
      "loss": 0.9619,
      "loss/mini_gap_loss": 0.9618844985961914,
      "loss/ori_loss": 1.1679099798202515,
      "loss/reward_entrophy": 0.206025630235672,
      "mask/mask_ratio": 0.4475005567073822,
      "reward/A01_acc": 0.5843750238418579,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8031250238418579,
      "reward/reward_A0": -0.7105655074119568,
      "reward/reward_A1": -0.9871571660041809,
      "reward/reward_A2": -1.220444679260254,
      "reward/reward_A3": -1.5417016744613647,
      "rewards/accuracies": 0.7072846293449402,
      "rewards/chosen": -0.7105655074119568,
      "rewards/margins": 0.5391899347305298,
      "rewards/rejected": -1.2497553825378418,
      "step": 610
    },
    {
      "epoch": 0.33,
      "learning_rate": 4.2421624487166745e-06,
      "loss": 0.9459,
      "loss/mini_gap_loss": 0.9459471702575684,
      "loss/ori_loss": 1.1736336946487427,
      "loss/reward_entrophy": 0.22768644988536835,
      "mask/mask_ratio": 0.45922961831092834,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.7807016372680664,
      "reward/reward_A1": -1.0496947765350342,
      "reward/reward_A2": -1.337281584739685,
      "reward/reward_A3": -1.6021606922149658,
      "rewards/accuracies": 0.7427009344100952,
      "rewards/chosen": -0.7807016372680664,
      "rewards/margins": 0.5489975214004517,
      "rewards/rejected": -1.329699158668518,
      "step": 620
    },
    {
      "epoch": 0.33,
      "learning_rate": 4.208592748022154e-06,
      "loss": 0.9032,
      "loss/mini_gap_loss": 0.9032374620437622,
      "loss/ori_loss": 1.1567538976669312,
      "loss/reward_entrophy": 0.25351664423942566,
      "mask/mask_ratio": 0.46603697538375854,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.7412405014038086,
      "reward/reward_A1": -1.0218275785446167,
      "reward/reward_A2": -1.2691974639892578,
      "reward/reward_A3": -1.6865675449371338,
      "rewards/accuracies": 0.7218677997589111,
      "rewards/chosen": -0.7412405014038086,
      "rewards/margins": 0.5846105813980103,
      "rewards/rejected": -1.3258510828018188,
      "step": 630
    },
    {
      "epoch": 0.34,
      "learning_rate": 4.174434729612555e-06,
      "loss": 0.9767,
      "loss/mini_gap_loss": 0.9767365455627441,
      "loss/ori_loss": 1.202625036239624,
      "loss/reward_entrophy": 0.22588849067687988,
      "mask/mask_ratio": 0.47255539894104004,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.675000011920929,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.764710009098053,
      "reward/reward_A1": -1.0162545442581177,
      "reward/reward_A2": -1.2282243967056274,
      "reward/reward_A3": -1.5697773694992065,
      "rewards/accuracies": 0.6885348558425903,
      "rewards/chosen": -0.764710009098053,
      "rewards/margins": 0.5066961050033569,
      "rewards/rejected": -1.2714060544967651,
      "step": 640
    },
    {
      "epoch": 0.35,
      "learning_rate": 4.139700155077855e-06,
      "loss": 0.9365,
      "loss/mini_gap_loss": 0.9365004301071167,
      "loss/ori_loss": 1.1780824661254883,
      "loss/reward_entrophy": 0.241581991314888,
      "mask/mask_ratio": 0.45620447397232056,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.684374988079071,
      "reward/A03_acc": 0.78125,
      "reward/reward_A0": -0.6695815324783325,
      "reward/reward_A1": -0.9447723627090454,
      "reward/reward_A2": -1.1493713855743408,
      "reward/reward_A3": -1.479016661643982,
      "rewards/accuracies": 0.6979097127914429,
      "rewards/chosen": -0.6695815324783325,
      "rewards/margins": 0.5214599370956421,
      "rewards/rejected": -1.1910417079925537,
      "step": 650
    },
    {
      "epoch": 0.35,
      "learning_rate": 4.1044009845329195e-06,
      "loss": 0.9211,
      "loss/mini_gap_loss": 0.9211176633834839,
      "loss/ori_loss": 1.172100305557251,
      "loss/reward_entrophy": 0.25098246335983276,
      "mask/mask_ratio": 0.4536631107330322,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.684374988079071,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.6658995151519775,
      "reward/reward_A1": -0.8940489888191223,
      "reward/reward_A2": -1.1372933387756348,
      "reward/reward_A3": -1.581298828125,
      "rewards/accuracies": 0.7072845697402954,
      "rewards/chosen": -0.6658995151519775,
      "rewards/margins": 0.5383021831512451,
      "rewards/rejected": -1.2042016983032227,
      "step": 660
    },
    {
      "epoch": 0.36,
      "learning_rate": 4.068549372499287e-06,
      "loss": 0.9323,
      "loss/mini_gap_loss": 0.9322719573974609,
      "loss/ori_loss": 1.1827235221862793,
      "loss/reward_entrophy": 0.25045156478881836,
      "mask/mask_ratio": 0.4734960198402405,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.690625011920929,
      "reward/A03_acc": 0.8062499761581421,
      "reward/reward_A0": -0.709541916847229,
      "reward/reward_A1": -0.9263612627983093,
      "reward/reward_A2": -1.2381455898284912,
      "reward/reward_A3": -1.5102349519729614,
      "rewards/accuracies": 0.7124929428100586,
      "rewards/chosen": -0.709541916847229,
      "rewards/margins": 0.5153599381446838,
      "rewards/rejected": -1.2249019145965576,
      "step": 670
    },
    {
      "epoch": 0.36,
      "learning_rate": 4.032157663720023e-06,
      "loss": 0.9432,
      "loss/mini_gap_loss": 0.9432209134101868,
      "loss/ori_loss": 1.1625688076019287,
      "loss/reward_entrophy": 0.21934787929058075,
      "mask/mask_ratio": 0.45433536171913147,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.746874988079071,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.7249696850776672,
      "reward/reward_A1": -0.9835321307182312,
      "reward/reward_A2": -1.3489134311676025,
      "reward/reward_A3": -1.6229822635650635,
      "rewards/accuracies": 0.7333260774612427,
      "rewards/chosen": -0.7249696850776672,
      "rewards/margins": 0.5934931039810181,
      "rewards/rejected": -1.318462610244751,
      "step": 680
    },
    {
      "epoch": 0.37,
      "learning_rate": 3.9952383889090605e-06,
      "loss": 0.9383,
      "loss/mini_gap_loss": 0.9383009672164917,
      "loss/ori_loss": 1.1479227542877197,
      "loss/reward_entrophy": 0.20962166786193848,
      "mask/mask_ratio": 0.45981112122535706,
      "reward/A01_acc": 0.6656249761581421,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.8494369387626648,
      "reward/reward_A1": -1.1669069528579712,
      "reward/reward_A2": -1.4780070781707764,
      "reward/reward_A3": -1.805368185043335,
      "rewards/accuracies": 0.7374926805496216,
      "rewards/chosen": -0.8494369387626648,
      "rewards/margins": 0.633975625038147,
      "rewards/rejected": -1.483412742614746,
      "step": 690
    },
    {
      "epoch": 0.37,
      "learning_rate": 3.957804260436522e-06,
      "loss": 0.9615,
      "loss/mini_gap_loss": 0.9614984393119812,
      "loss/ori_loss": 1.1971131563186646,
      "loss/reward_entrophy": 0.23561465740203857,
      "mask/mask_ratio": 0.44703227281570435,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.7718750238418579,
      "reward/reward_A0": -0.9121298789978027,
      "reward/reward_A1": -1.1535695791244507,
      "reward/reward_A2": -1.4155457019805908,
      "reward/reward_A3": -1.7736327648162842,
      "rewards/accuracies": 0.7062429785728455,
      "rewards/chosen": -0.9121298789978027,
      "rewards/margins": 0.5354383587837219,
      "rewards/rejected": -1.4475681781768799,
      "step": 700
    },
    {
      "epoch": 0.37,
      "eval_loss": 0.9337851405143738,
      "eval_loss/mini_gap_loss": 0.9338251352310181,
      "eval_loss/ori_loss": 1.166474461555481,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.2767223119735718,
      "eval_regularization/policy_data_loss": 3.0577688217163086,
      "eval_regularization/policy_ref_data_loss_gap": 1.7917649745941162,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.7017377614974976,
      "eval_reward/A01_acc": 0.6288819909095764,
      "eval_reward/A02_acc": 0.7132505178451538,
      "eval_reward/A03_acc": 0.7867494821548462,
      "eval_reward/reward_A0": -0.9159491062164307,
      "eval_reward/reward_A1": -1.204805850982666,
      "eval_reward/reward_A2": -1.4643090963363647,
      "eval_reward/reward_A3": -1.7939122915267944,
      "eval_rewards/accuracies": 0.7096202373504639,
      "eval_rewards/chosen": -0.9159491062164307,
      "eval_rewards/margins": 0.5717117786407471,
      "eval_rewards/rejected": -1.4876607656478882,
      "eval_runtime": 1141.7438,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 700
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.919868167951479e-06,
      "loss": 0.9394,
      "loss/mini_gap_loss": 0.9393788576126099,
      "loss/ori_loss": 1.1782639026641846,
      "loss/reward_entrophy": 0.23888497054576874,
      "mask/mask_ratio": 0.4431493282318115,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.7875000238418579,
      "reward/reward_A0": -0.9162033796310425,
      "reward/reward_A1": -1.1286146640777588,
      "reward/reward_A2": -1.4869550466537476,
      "reward/reward_A3": -1.7891355752944946,
      "rewards/accuracies": 0.7052013278007507,
      "rewards/chosen": -0.9162033796310425,
      "rewards/margins": 0.552017092704773,
      "rewards/rejected": -1.4682204723358154,
      "step": 710
    },
    {
      "epoch": 0.38,
      "learning_rate": 3.8814431739436765e-06,
      "loss": 0.9907,
      "loss/mini_gap_loss": 0.9906998872756958,
      "loss/ori_loss": 1.1900221109390259,
      "loss/reward_entrophy": 0.1993221640586853,
      "mask/mask_ratio": 0.4513615667819977,
      "reward/A01_acc": 0.5843750238418579,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.784375011920929,
      "reward/reward_A0": -0.8162211179733276,
      "reward/reward_A1": -0.9965826272964478,
      "reward/reward_A2": -1.278685450553894,
      "reward/reward_A3": -1.6541904211044312,
      "rewards/accuracies": 0.6947847604751587,
      "rewards/chosen": -0.8162211179733276,
      "rewards/margins": 0.4935851991176605,
      "rewards/rejected": -1.3098063468933105,
      "step": 720
    },
    {
      "epoch": 0.39,
      "learning_rate": 3.842542509245742e-06,
      "loss": 0.9035,
      "loss/mini_gap_loss": 0.9035048484802246,
      "loss/ori_loss": 1.1467220783233643,
      "loss/reward_entrophy": 0.24321715533733368,
      "mask/mask_ratio": 0.4677800238132477,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.7749999761581421,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.7537440061569214,
      "reward/reward_A1": -1.056485891342163,
      "reward/reward_A2": -1.318361520767212,
      "reward/reward_A3": -1.7329498529434204,
      "rewards/accuracies": 0.7468675374984741,
      "rewards/chosen": -0.7537440061569214,
      "rewards/margins": 0.6155081391334534,
      "rewards/rejected": -1.369252324104309,
      "step": 730
    },
    {
      "epoch": 0.39,
      "learning_rate": 3.8031795684774266e-06,
      "loss": 0.9598,
      "loss/mini_gap_loss": 0.9597532153129578,
      "loss/ori_loss": 1.1970031261444092,
      "loss/reward_entrophy": 0.23724982142448425,
      "mask/mask_ratio": 0.4842701852321625,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.690625011920929,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.7912360429763794,
      "reward/reward_A1": -1.0588842630386353,
      "reward/reward_A2": -1.3089988231658936,
      "reward/reward_A3": -1.6169288158416748,
      "rewards/accuracies": 0.6968680620193481,
      "rewards/chosen": -0.7912360429763794,
      "rewards/margins": 0.537021279335022,
      "rewards/rejected": -1.328257441520691,
      "step": 740
    },
    {
      "epoch": 0.4,
      "learning_rate": 3.7633679054334528e-06,
      "loss": 0.9067,
      "loss/mini_gap_loss": 0.9066831469535828,
      "loss/ori_loss": 1.1809617280960083,
      "loss/reward_entrophy": 0.274278461933136,
      "mask/mask_ratio": 0.4637815058231354,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.6781250238418579,
      "reward/A03_acc": 0.7875000238418579,
      "reward/reward_A0": -0.6895176768302917,
      "reward/reward_A1": -0.9718774557113647,
      "reward/reward_A2": -1.2514431476593018,
      "reward/reward_A3": -1.5459508895874023,
      "rewards/accuracies": 0.701034665107727,
      "rewards/chosen": -0.6895176768302917,
      "rewards/margins": 0.5668935775756836,
      "rewards/rejected": -1.2564113140106201,
      "step": 750
    },
    {
      "epoch": 0.4,
      "learning_rate": 3.7231212284165533e-06,
      "loss": 0.9268,
      "loss/mini_gap_loss": 0.9267603754997253,
      "loss/ori_loss": 1.1577171087265015,
      "loss/reward_entrophy": 0.2309567928314209,
      "mask/mask_ratio": 0.4639623761177063,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.696740448474884,
      "reward/reward_A1": -0.9329498410224915,
      "reward/reward_A2": -1.2429828643798828,
      "reward/reward_A3": -1.5924853086471558,
      "rewards/accuracies": 0.72603440284729,
      "rewards/chosen": -0.696740448474884,
      "rewards/margins": 0.5593864321708679,
      "rewards/rejected": -1.256126880645752,
      "step": 760
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.682453395517306e-06,
      "loss": 0.9626,
      "loss/mini_gap_loss": 0.9626052975654602,
      "loss/ori_loss": 1.1821585893630981,
      "loss/reward_entrophy": 0.21955308318138123,
      "mask/mask_ratio": 0.4511083960533142,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.703125,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.7466509342193604,
      "reward/reward_A1": -1.0403249263763428,
      "reward/reward_A2": -1.2753461599349976,
      "reward/reward_A3": -1.5596883296966553,
      "rewards/accuracies": 0.6968680620193481,
      "rewards/chosen": -0.7466509342193604,
      "rewards/margins": 0.5451226234436035,
      "rewards/rejected": -1.2917735576629639,
      "step": 770
    },
    {
      "epoch": 0.41,
      "learning_rate": 3.641378409842392e-06,
      "loss": 0.9363,
      "loss/mini_gap_loss": 0.9362820386886597,
      "loss/ori_loss": 1.1596362590789795,
      "loss/reward_entrophy": 0.2233542948961258,
      "mask/mask_ratio": 0.45404618978500366,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.800000011920929,
      "reward/reward_A0": -0.6884819269180298,
      "reward/reward_A1": -0.9051922559738159,
      "reward/reward_A2": -1.1688793897628784,
      "reward/reward_A3": -1.5154519081115723,
      "rewards/accuracies": 0.7114512324333191,
      "rewards/chosen": -0.6884819269180298,
      "rewards/margins": 0.5080140829086304,
      "rewards/rejected": -1.1964961290359497,
      "step": 780
    },
    {
      "epoch": 0.42,
      "learning_rate": 3.5999104146929296e-06,
      "loss": 0.9358,
      "loss/mini_gap_loss": 0.9357892870903015,
      "loss/ori_loss": 1.1808044910430908,
      "loss/reward_entrophy": 0.24501514434814453,
      "mask/mask_ratio": 0.4853687286376953,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.7647777199745178,
      "reward/reward_A1": -0.9949877858161926,
      "reward/reward_A2": -1.234678864479065,
      "reward/reward_A3": -1.5675617456436157,
      "rewards/accuracies": 0.7166594862937927,
      "rewards/chosen": -0.7647777199745178,
      "rewards/margins": 0.500952422618866,
      "rewards/rejected": -1.2657301425933838,
      "step": 790
    },
    {
      "epoch": 0.42,
      "learning_rate": 3.55806368869452e-06,
      "loss": 0.9292,
      "loss/mini_gap_loss": 0.9291973114013672,
      "loss/ori_loss": 1.169512152671814,
      "loss/reward_entrophy": 0.24031491577625275,
      "mask/mask_ratio": 0.45235228538513184,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.746874988079071,
      "reward/A03_acc": 0.784375011920929,
      "reward/reward_A0": -0.7771322727203369,
      "reward/reward_A1": -1.0771121978759766,
      "reward/reward_A2": -1.384040355682373,
      "reward/reward_A3": -1.6630605459213257,
      "rewards/accuracies": 0.7145761847496033,
      "rewards/chosen": -0.7771322727203369,
      "rewards/margins": 0.5975915789604187,
      "rewards/rejected": -1.3747239112854004,
      "step": 800
    },
    {
      "epoch": 0.42,
      "eval_loss": 0.9236516952514648,
      "eval_loss/mini_gap_loss": 0.9237271547317505,
      "eval_loss/ori_loss": 1.1563763618469238,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.3185492753982544,
      "eval_regularization/policy_data_loss": 3.1224536895751953,
      "eval_regularization/policy_ref_data_loss_gap": 1.856450080871582,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.7645561695098877,
      "eval_reward/A01_acc": 0.6283643841743469,
      "eval_reward/A02_acc": 0.7168737053871155,
      "eval_reward/A03_acc": 0.804347813129425,
      "eval_reward/reward_A0": -0.8569299578666687,
      "eval_reward/reward_A1": -1.1332881450653076,
      "eval_reward/reward_A2": -1.408140778541565,
      "eval_reward/reward_A3": -1.7546687126159668,
      "eval_rewards/accuracies": 0.7165215015411377,
      "eval_rewards/chosen": -0.8569299578666687,
      "eval_rewards/margins": 0.575088381767273,
      "eval_rewards/rejected": -1.4320181608200073,
      "eval_runtime": 1142.1285,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 800
    },
    {
      "epoch": 0.43,
      "learning_rate": 3.515852640880707e-06,
      "loss": 0.934,
      "loss/mini_gap_loss": 0.9340019226074219,
      "loss/ori_loss": 1.190909504890442,
      "loss/reward_entrophy": 0.25690746307373047,
      "mask/mask_ratio": 0.47213101387023926,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.690625011920929,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.9069668650627136,
      "reward/reward_A1": -1.1855518817901611,
      "reward/reward_A2": -1.4043241739273071,
      "reward/reward_A3": -1.744106650352478,
      "rewards/accuracies": 0.6989513635635376,
      "rewards/chosen": -0.9069668650627136,
      "rewards/margins": 0.5376794934272766,
      "rewards/rejected": -1.4446464776992798,
      "step": 810
    },
    {
      "epoch": 0.44,
      "learning_rate": 3.473291805731538e-06,
      "loss": 0.9281,
      "loss/mini_gap_loss": 0.928051769733429,
      "loss/ori_loss": 1.1340773105621338,
      "loss/reward_entrophy": 0.206025630235672,
      "mask/mask_ratio": 0.4593987464904785,
      "reward/A01_acc": 0.690625011920929,
      "reward/A02_acc": 0.784375011920929,
      "reward/A03_acc": 0.8187500238418579,
      "reward/reward_A0": -0.8016375303268433,
      "reward/reward_A1": -1.1721004247665405,
      "reward/reward_A2": -1.4409055709838867,
      "reward/reward_A3": -1.688701868057251,
      "rewards/accuracies": 0.7645756602287292,
      "rewards/chosen": -0.8016375303268433,
      "rewards/margins": 0.6322507262229919,
      "rewards/rejected": -1.4338881969451904,
      "step": 820
    },
    {
      "epoch": 0.44,
      "learning_rate": 3.4303958381689163e-06,
      "loss": 0.9395,
      "loss/mini_gap_loss": 0.9394693374633789,
      "loss/ori_loss": 1.1711199283599854,
      "loss/reward_entrophy": 0.23165059089660645,
      "mask/mask_ratio": 0.46329689025878906,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.7593749761581421,
      "reward/A03_acc": 0.8187500238418579,
      "reward/reward_A0": -0.7665198445320129,
      "reward/reward_A1": -1.0698915719985962,
      "reward/reward_A2": -1.302970290184021,
      "reward/reward_A3": -1.6561956405639648,
      "rewards/accuracies": 0.7395759224891663,
      "rewards/chosen": -0.7665198445320129,
      "rewards/margins": 0.5764859914779663,
      "rewards/rejected": -1.343005657196045,
      "step": 830
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.3871795085104895e-06,
      "loss": 0.9027,
      "loss/mini_gap_loss": 0.9027311205863953,
      "loss/ori_loss": 1.136016845703125,
      "loss/reward_entrophy": 0.23328574001789093,
      "mask/mask_ratio": 0.43526506423950195,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.7562500238418579,
      "reward/A03_acc": 0.824999988079071,
      "reward/reward_A0": -0.7623356580734253,
      "reward/reward_A1": -1.0283563137054443,
      "reward/reward_A2": -1.4091527462005615,
      "reward/reward_A3": -1.7750015258789062,
      "rewards/accuracies": 0.7354093194007874,
      "rewards/chosen": -0.7623356580734253,
      "rewards/margins": 0.6418204307556152,
      "rewards/rejected": -1.4041563272476196,
      "step": 840
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.343657697383811e-06,
      "loss": 0.8854,
      "loss/mini_gap_loss": 0.885395884513855,
      "loss/ori_loss": 1.151540994644165,
      "loss/reward_entrophy": 0.26614508032798767,
      "mask/mask_ratio": 0.45245495438575745,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.800000011920929,
      "reward/reward_A0": -0.7817342877388,
      "reward/reward_A1": -1.0502495765686035,
      "reward/reward_A2": -1.3685169219970703,
      "reward/reward_A3": -1.7655839920043945,
      "rewards/accuracies": 0.7135345339775085,
      "rewards/chosen": -0.7817342877388,
      "rewards/margins": 0.6130353212356567,
      "rewards/rejected": -1.394769549369812,
      "step": 850
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.299845390602501e-06,
      "loss": 0.9604,
      "loss/mini_gap_loss": 0.9604137539863586,
      "loss/ori_loss": 1.1882628202438354,
      "loss/reward_entrophy": 0.2278493344783783,
      "mask/mask_ratio": 0.47633543610572815,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.737500011920929,
      "reward/A03_acc": 0.7875000238418579,
      "reward/reward_A0": -0.7442869544029236,
      "reward/reward_A1": -0.9653006792068481,
      "reward/reward_A2": -1.3038464784622192,
      "reward/reward_A3": -1.6213366985321045,
      "rewards/accuracies": 0.715617835521698,
      "rewards/chosen": -0.7442869544029236,
      "rewards/margins": 0.5525280237197876,
      "rewards/rejected": -1.2968151569366455,
      "step": 860
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.2557576740062073e-06,
      "loss": 0.9292,
      "loss/mini_gap_loss": 0.929183304309845,
      "loss/ori_loss": 1.182168960571289,
      "loss/reward_entrophy": 0.25298571586608887,
      "mask/mask_ratio": 0.461022287607193,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.7272266149520874,
      "reward/reward_A1": -1.0132129192352295,
      "reward/reward_A2": -1.18190598487854,
      "reward/reward_A3": -1.6023107767105103,
      "rewards/accuracies": 0.7208261489868164,
      "rewards/chosen": -0.7272266149520874,
      "rewards/margins": 0.5385707020759583,
      "rewards/rejected": -1.2657973766326904,
      "step": 870
    },
    {
      "epoch": 0.47,
      "learning_rate": 3.2114097282661106e-06,
      "loss": 0.8837,
      "loss/mini_gap_loss": 0.8837278485298157,
      "loss/ori_loss": 1.1419023275375366,
      "loss/reward_entrophy": 0.25817450881004333,
      "mask/mask_ratio": 0.4674050211906433,
      "reward/A01_acc": 0.6656249761581421,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.7092471122741699,
      "reward/reward_A1": -1.0785386562347412,
      "reward/reward_A2": -1.314321517944336,
      "reward/reward_A3": -1.7553882598876953,
      "rewards/accuracies": 0.739575982093811,
      "rewards/chosen": -0.7092471122741699,
      "rewards/margins": 0.6734884977340698,
      "rewards/rejected": -1.3827357292175293,
      "step": 880
    },
    {
      "epoch": 0.47,
      "learning_rate": 3.1668168236577855e-06,
      "loss": 0.915,
      "loss/mini_gap_loss": 0.9150179028511047,
      "loss/ori_loss": 1.1683294773101807,
      "loss/reward_entrophy": 0.2533114552497864,
      "mask/mask_ratio": 0.4523714482784271,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.7885862588882446,
      "reward/reward_A1": -1.0429341793060303,
      "reward/reward_A2": -1.4531290531158447,
      "reward/reward_A3": -1.7511851787567139,
      "rewards/accuracies": 0.728117823600769,
      "rewards/chosen": -0.7885862588882446,
      "rewards/margins": 0.627149224281311,
      "rewards/rejected": -1.4157354831695557,
      "step": 890
    },
    {
      "epoch": 0.48,
      "learning_rate": 3.1219943148032022e-06,
      "loss": 0.9366,
      "loss/mini_gap_loss": 0.9365940093994141,
      "loss/ori_loss": 1.1319520473480225,
      "loss/reward_entrophy": 0.1953580677509308,
      "mask/mask_ratio": 0.44088372588157654,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.8132621645927429,
      "reward/reward_A1": -1.1268073320388794,
      "reward/reward_A2": -1.4510248899459839,
      "reward/reward_A3": -1.9299644231796265,
      "rewards/accuracies": 0.7156178951263428,
      "rewards/chosen": -0.8132621645927429,
      "rewards/margins": 0.689321756362915,
      "rewards/rejected": -1.5025837421417236,
      "step": 900
    },
    {
      "epoch": 0.48,
      "eval_loss": 0.9098740816116333,
      "eval_loss/mini_gap_loss": 0.9099754095077515,
      "eval_loss/ori_loss": 1.1426246166229248,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.3334044218063354,
      "eval_regularization/policy_data_loss": 2.9612393379211426,
      "eval_regularization/policy_ref_data_loss_gap": 1.6952359676361084,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.7448984980583191,
      "eval_reward/A01_acc": 0.6387163400650024,
      "eval_reward/A02_acc": 0.7220497131347656,
      "eval_reward/A03_acc": 0.8121117949485779,
      "eval_reward/reward_A0": -0.8157702684402466,
      "eval_reward/reward_A1": -1.1198359727859497,
      "eval_reward/reward_A2": -1.4145647287368774,
      "eval_reward/reward_A3": -1.8110640048980713,
      "eval_rewards/accuracies": 0.724285364151001,
      "eval_rewards/chosen": -0.8157702684402466,
      "eval_rewards/margins": 0.6327034831047058,
      "eval_rewards/rejected": -1.4484738111495972,
      "eval_runtime": 1142.4398,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 900
    },
    {
      "epoch": 0.48,
      "learning_rate": 3.076957635383691e-06,
      "loss": 0.8833,
      "loss/mini_gap_loss": 0.8832573890686035,
      "loss/ori_loss": 1.1234095096588135,
      "loss/reward_entrophy": 0.240152046084404,
      "mask/mask_ratio": 0.4638892114162445,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.7949765920639038,
      "reward/reward_A1": -1.1488959789276123,
      "reward/reward_A2": -1.4088515043258667,
      "reward/reward_A3": -1.8714975118637085,
      "rewards/accuracies": 0.7374926209449768,
      "rewards/chosen": -0.7949765920639038,
      "rewards/margins": 0.6814237236976624,
      "rewards/rejected": -1.4764002561569214,
      "step": 910
    },
    {
      "epoch": 0.49,
      "learning_rate": 3.0317222928256755e-06,
      "loss": 0.907,
      "loss/mini_gap_loss": 0.9070010185241699,
      "loss/ori_loss": 1.1238571405410767,
      "loss/reward_entrophy": 0.21685604751110077,
      "mask/mask_ratio": 0.4666944444179535,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.8587416410446167,
      "reward/reward_A1": -1.2039604187011719,
      "reward/reward_A2": -1.457226037979126,
      "reward/reward_A3": -1.9418308734893799,
      "rewards/accuracies": 0.7281177043914795,
      "rewards/chosen": -0.8587416410446167,
      "rewards/margins": 0.6755821108818054,
      "rewards/rejected": -1.5343239307403564,
      "step": 920
    },
    {
      "epoch": 0.49,
      "learning_rate": 2.986303862961024e-06,
      "loss": 0.9562,
      "loss/mini_gap_loss": 0.9562448263168335,
      "loss/ori_loss": 1.1525018215179443,
      "loss/reward_entrophy": 0.1962570697069168,
      "mask/mask_ratio": 0.463174432516098,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.7906249761581421,
      "reward/reward_A0": -0.8547403216362,
      "reward/reward_A1": -1.1244045495986938,
      "reward/reward_A2": -1.4881595373153687,
      "reward/reward_A3": -1.8364604711532593,
      "rewards/accuracies": 0.7114512324333191,
      "rewards/chosen": -0.8547403216362,
      "rewards/margins": 0.6282529830932617,
      "rewards/rejected": -1.4829933643341064,
      "step": 930
    },
    {
      "epoch": 0.5,
      "learning_rate": 2.9407179846638423e-06,
      "loss": 0.9502,
      "loss/mini_gap_loss": 0.9502479434013367,
      "loss/ori_loss": 1.1891329288482666,
      "loss/reward_entrophy": 0.23888497054576874,
      "mask/mask_ratio": 0.4388662278652191,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.7906249761581421,
      "reward/reward_A0": -0.9266487956047058,
      "reward/reward_A1": -1.163091778755188,
      "reward/reward_A2": -1.4372966289520264,
      "reward/reward_A3": -1.9497992992401123,
      "rewards/accuracies": 0.7124928832054138,
      "rewards/chosen": -0.9266487956047058,
      "rewards/margins": 0.5900651812553406,
      "rewards/rejected": -1.5167139768600464,
      "step": 940
    },
    {
      "epoch": 0.5,
      "learning_rate": 2.8949803544655512e-06,
      "loss": 0.9171,
      "loss/mini_gap_loss": 0.917148768901825,
      "loss/ori_loss": 1.1569328308105469,
      "loss/reward_entrophy": 0.23978395760059357,
      "mask/mask_ratio": 0.47027429938316345,
      "reward/A01_acc": 0.609375,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.8031250238418579,
      "reward/reward_A0": -0.9046562910079956,
      "reward/reward_A1": -1.1435497999191284,
      "reward/reward_A2": -1.5251405239105225,
      "reward/reward_A3": -1.8601748943328857,
      "rewards/accuracies": 0.7093679308891296,
      "rewards/chosen": -0.9046562910079956,
      "rewards/margins": 0.6049503087997437,
      "rewards/rejected": -1.5096065998077393,
      "step": 950
    },
    {
      "epoch": 0.51,
      "learning_rate": 2.849106721150128e-06,
      "loss": 0.86,
      "loss/mini_gap_loss": 0.8600271344184875,
      "loss/ori_loss": 1.1276021003723145,
      "loss/reward_entrophy": 0.2675749957561493,
      "mask/mask_ratio": 0.43062344193458557,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.8564362525939941,
      "reward/reward_A1": -1.2477037906646729,
      "reward/reward_A2": -1.4472278356552124,
      "reward/reward_A3": -1.8734315633773804,
      "rewards/accuracies": 0.7270760536193848,
      "rewards/chosen": -0.8564362525939941,
      "rewards/margins": 0.6663362979888916,
      "rewards/rejected": -1.5227725505828857,
      "step": 960
    },
    {
      "epoch": 0.52,
      "learning_rate": 2.8031128803313407e-06,
      "loss": 0.9102,
      "loss/mini_gap_loss": 0.9101539850234985,
      "loss/ori_loss": 1.1680028438568115,
      "loss/reward_entrophy": 0.2578487992286682,
      "mask/mask_ratio": 0.4641999304294586,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.703125,
      "reward/A03_acc": 0.809374988079071,
      "reward/reward_A0": -0.8775478601455688,
      "reward/reward_A1": -1.19893217086792,
      "reward/reward_A2": -1.4316117763519287,
      "reward/reward_A3": -1.945580244064331,
      "rewards/accuracies": 0.7083262801170349,
      "rewards/chosen": -0.8775478601455688,
      "rewards/margins": 0.6478115916252136,
      "rewards/rejected": -1.5253595113754272,
      "step": 970
    },
    {
      "epoch": 0.52,
      "learning_rate": 2.7570146690138644e-06,
      "loss": 0.8871,
      "loss/mini_gap_loss": 0.887065589427948,
      "loss/ori_loss": 1.1329796314239502,
      "loss/reward_entrophy": 0.24591417610645294,
      "mask/mask_ratio": 0.4476150572299957,
      "reward/A01_acc": 0.65625,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.840624988079071,
      "reward/reward_A0": -0.851311206817627,
      "reward/reward_A1": -1.251677393913269,
      "reward/reward_A2": -1.6142154932022095,
      "reward/reward_A3": -2.0174622535705566,
      "rewards/accuracies": 0.7468675374984741,
      "rewards/chosen": -0.851311206817627,
      "rewards/margins": 0.7764576077461243,
      "rewards/rejected": -1.627768874168396,
      "step": 980
    },
    {
      "epoch": 0.53,
      "learning_rate": 2.710827960140144e-06,
      "loss": 0.8905,
      "loss/mini_gap_loss": 0.8904505968093872,
      "loss/ori_loss": 1.1195671558380127,
      "loss/reward_entrophy": 0.22911641001701355,
      "mask/mask_ratio": 0.46204042434692383,
      "reward/A01_acc": 0.653124988079071,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.859375,
      "reward/reward_A0": -0.908622145652771,
      "reward/reward_A1": -1.3006603717803955,
      "reward/reward_A2": -1.565114140510559,
      "reward/reward_A3": -2.004376173019409,
      "rewards/accuracies": 0.7427009344100952,
      "rewards/chosen": -0.908622145652771,
      "rewards/margins": 0.7147451043128967,
      "rewards/rejected": -1.6233673095703125,
      "step": 990
    },
    {
      "epoch": 0.53,
      "learning_rate": 2.664568657124883e-06,
      "loss": 0.8746,
      "loss/mini_gap_loss": 0.8746147155761719,
      "loss/ori_loss": 1.1367957592010498,
      "loss/reward_entrophy": 0.26218098402023315,
      "mask/mask_ratio": 0.45259198546409607,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.703125,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.9181060791015625,
      "reward/reward_A1": -1.2174785137176514,
      "reward/reward_A2": -1.6367425918579102,
      "reward/reward_A3": -1.924608588218689,
      "rewards/accuracies": 0.7177011370658875,
      "rewards/chosen": -0.9181060791015625,
      "rewards/margins": 0.674821138381958,
      "rewards/rejected": -1.5929272174835205,
      "step": 1000
    },
    {
      "epoch": 0.53,
      "eval_loss": 0.9004649519920349,
      "eval_loss/mini_gap_loss": 0.9005493521690369,
      "eval_loss/ori_loss": 1.1331984996795654,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.4734618663787842,
      "eval_regularization/policy_data_loss": 3.0807888507843018,
      "eval_regularization/policy_ref_data_loss_gap": 1.8147852420806885,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.852311372756958,
      "eval_reward/A01_acc": 0.6392339468002319,
      "eval_reward/A02_acc": 0.7318840622901917,
      "eval_reward/A03_acc": 0.8079710006713867,
      "eval_reward/reward_A0": -0.8930760025978088,
      "eval_reward/reward_A1": -1.2235256433486938,
      "eval_reward/reward_A2": -1.5379818677902222,
      "eval_reward/reward_A3": -1.9732606410980225,
      "eval_rewards/accuracies": 0.7263556718826294,
      "eval_rewards/chosen": -0.8930760025978088,
      "eval_rewards/margins": 0.6851643919944763,
      "eval_rewards/rejected": -1.5782402753829956,
      "eval_runtime": 1142.1037,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 1000
    },
    {
      "epoch": 0.54,
      "learning_rate": 2.6182526883790404e-06,
      "loss": 0.9417,
      "loss/mini_gap_loss": 0.9417101144790649,
      "loss/ori_loss": 1.1336348056793213,
      "loss/reward_entrophy": 0.19192489981651306,
      "mask/mask_ratio": 0.4625304639339447,
      "reward/A01_acc": 0.668749988079071,
      "reward/A02_acc": 0.7718750238418579,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.8729017972946167,
      "reward/reward_A1": -1.2057321071624756,
      "reward/reward_A2": -1.5405256748199463,
      "reward/reward_A3": -2.004790782928467,
      "rewards/accuracies": 0.751034140586853,
      "rewards/chosen": -0.8729017972946167,
      "rewards/margins": 0.7107653021812439,
      "rewards/rejected": -1.5836670398712158,
      "step": 1010
    },
    {
      "epoch": 0.54,
      "learning_rate": 2.5718960018252186e-06,
      "loss": 0.9431,
      "loss/mini_gap_loss": 0.9431403875350952,
      "loss/ori_loss": 1.2024190425872803,
      "loss/reward_entrophy": 0.25927871465682983,
      "mask/mask_ratio": 0.47696346044540405,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.6781250238418579,
      "reward/A03_acc": 0.7749999761581421,
      "reward/reward_A0": -0.8945455551147461,
      "reward/reward_A1": -1.2518880367279053,
      "reward/reward_A2": -1.4263898134231567,
      "reward/reward_A3": -1.794471025466919,
      "rewards/accuracies": 0.6916598081588745,
      "rewards/chosen": -0.8945455551147461,
      "rewards/margins": 0.5963557958602905,
      "rewards/rejected": -1.4909013509750366,
      "step": 1020
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.525514559406334e-06,
      "loss": 0.9057,
      "loss/mini_gap_loss": 0.9056817293167114,
      "loss/ori_loss": 1.1118700504302979,
      "loss/reward_entrophy": 0.20618848502635956,
      "mask/mask_ratio": 0.446524053812027,
      "reward/A01_acc": 0.659375011920929,
      "reward/A02_acc": 0.768750011920929,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.8822765350341797,
      "reward/reward_A1": -1.224003791809082,
      "reward/reward_A2": -1.6232621669769287,
      "reward/reward_A3": -2.0317912101745605,
      "rewards/accuracies": 0.7499925494194031,
      "rewards/chosen": -0.8822765350341797,
      "rewards/margins": 0.744059681892395,
      "rewards/rejected": -1.6263360977172852,
      "step": 1030
    },
    {
      "epoch": 0.55,
      "learning_rate": 2.4791243315894608e-06,
      "loss": 0.9061,
      "loss/mini_gap_loss": 0.9061107635498047,
      "loss/ori_loss": 1.1155694723129272,
      "loss/reward_entrophy": 0.20945878326892853,
      "mask/mask_ratio": 0.4651219844818115,
      "reward/A01_acc": 0.6812499761581421,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.9464297294616699,
      "reward/reward_A1": -1.3999968767166138,
      "reward/reward_A2": -1.6003210544586182,
      "reward/reward_A3": -1.9525985717773438,
      "rewards/accuracies": 0.7364509701728821,
      "rewards/chosen": -0.9464297294616699,
      "rewards/margins": 0.7045261263847351,
      "rewards/rejected": -1.6509557962417603,
      "step": 1040
    },
    {
      "epoch": 0.56,
      "learning_rate": 2.43274129186674e-06,
      "loss": 0.914,
      "loss/mini_gap_loss": 0.9140174984931946,
      "loss/ori_loss": 1.1452999114990234,
      "loss/reward_entrophy": 0.23128250241279602,
      "mask/mask_ratio": 0.4726598858833313,
      "reward/A01_acc": 0.6343749761581421,
      "reward/A02_acc": 0.75,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.905637264251709,
      "reward/reward_A1": -1.2303783893585205,
      "reward/reward_A2": -1.5758672952651978,
      "reward/reward_A3": -1.958164930343628,
      "rewards/accuracies": 0.732284426689148,
      "rewards/chosen": -0.905637264251709,
      "rewards/margins": 0.6824837923049927,
      "rewards/rejected": -1.5881210565567017,
      "step": 1050
    },
    {
      "epoch": 0.56,
      "learning_rate": 2.3863814112552425e-06,
      "loss": 0.9166,
      "loss/mini_gap_loss": 0.9165714383125305,
      "loss/ori_loss": 1.1487529277801514,
      "loss/reward_entrophy": 0.23218150436878204,
      "mask/mask_ratio": 0.44937458634376526,
      "reward/A01_acc": 0.659375011920929,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.824999988079071,
      "reward/reward_A0": -0.824482798576355,
      "reward/reward_A1": -1.198451042175293,
      "reward/reward_A2": -1.506230115890503,
      "reward/reward_A3": -1.8324158191680908,
      "rewards/accuracies": 0.7343677282333374,
      "rewards/chosen": -0.824482798576355,
      "rewards/margins": 0.6878676414489746,
      "rewards/rejected": -1.5123504400253296,
      "step": 1060
    },
    {
      "epoch": 0.57,
      "learning_rate": 2.3400606527976927e-06,
      "loss": 0.8788,
      "loss/mini_gap_loss": 0.8788288831710815,
      "loss/ori_loss": 1.1242121458053589,
      "loss/reward_entrophy": 0.24538321793079376,
      "mask/mask_ratio": 0.4519672393798828,
      "reward/A01_acc": 0.659375011920929,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.856249988079071,
      "reward/reward_A0": -0.8667643666267395,
      "reward/reward_A1": -1.2134991884231567,
      "reward/reward_A2": -1.509170413017273,
      "reward/reward_A3": -1.9185794591903687,
      "rewards/accuracies": 0.7531174421310425,
      "rewards/chosen": -0.8667643666267395,
      "rewards/margins": 0.6803032755851746,
      "rewards/rejected": -1.5470675230026245,
      "step": 1070
    },
    {
      "epoch": 0.57,
      "learning_rate": 2.2937949660659277e-06,
      "loss": 0.9528,
      "loss/mini_gap_loss": 0.9527662992477417,
      "loss/ori_loss": 1.196514368057251,
      "loss/reward_entrophy": 0.24374809861183167,
      "mask/mask_ratio": 0.46035680174827576,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.703125,
      "reward/A03_acc": 0.778124988079071,
      "reward/reward_A0": -0.8692795038223267,
      "reward/reward_A1": -1.1266334056854248,
      "reward/reward_A2": -1.4968044757843018,
      "reward/reward_A3": -1.7552311420440674,
      "rewards/accuracies": 0.6958263516426086,
      "rewards/chosen": -0.8692795038223267,
      "rewards/margins": 0.5902623534202576,
      "rewards/rejected": -1.45954167842865,
      "step": 1080
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.2476002816690048e-06,
      "loss": 0.882,
      "loss/mini_gap_loss": 0.8819801211357117,
      "loss/ori_loss": 1.1286306381225586,
      "loss/reward_entrophy": 0.24665026366710663,
      "mask/mask_ratio": 0.4457703232765198,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.8124328851699829,
      "reward/reward_A1": -1.1674379110336304,
      "reward/reward_A2": -1.4054027795791626,
      "reward/reward_A3": -1.8508962392807007,
      "rewards/accuracies": 0.7312427163124084,
      "rewards/chosen": -0.8124328851699829,
      "rewards/margins": 0.6621314287185669,
      "rewards/rejected": -1.4745643138885498,
      "step": 1090
    },
    {
      "epoch": 0.58,
      "learning_rate": 2.201492505767828e-06,
      "loss": 0.8941,
      "loss/mini_gap_loss": 0.8941013216972351,
      "loss/ori_loss": 1.1418135166168213,
      "loss/reward_entrophy": 0.247712180018425,
      "mask/mask_ratio": 0.4859614372253418,
      "reward/A01_acc": 0.6343749761581421,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.9167166948318481,
      "reward/reward_A1": -1.2421965599060059,
      "reward/reward_A2": -1.6455814838409424,
      "reward/reward_A3": -1.9309985637664795,
      "rewards/accuracies": 0.7239510416984558,
      "rewards/chosen": -0.9167166948318481,
      "rewards/margins": 0.6895262002944946,
      "rewards/rejected": -1.6062428951263428,
      "step": 1100
    },
    {
      "epoch": 0.58,
      "eval_loss": 0.89515620470047,
      "eval_loss/mini_gap_loss": 0.8952183127403259,
      "eval_loss/ori_loss": 1.1278674602508545,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.4775267839431763,
      "eval_regularization/policy_data_loss": 3.1270034313201904,
      "eval_regularization/policy_ref_data_loss_gap": 1.8609998226165771,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.8425523042678833,
      "eval_reward/A01_acc": 0.6413043737411499,
      "eval_reward/A02_acc": 0.7339544296264648,
      "eval_reward/A03_acc": 0.8110765814781189,
      "eval_reward/reward_A0": -0.9340749979019165,
      "eval_reward/reward_A1": -1.2736340761184692,
      "eval_reward/reward_A2": -1.6024119853973389,
      "eval_reward/reward_A3": -2.041482448577881,
      "eval_rewards/accuracies": 0.7287711501121521,
      "eval_rewards/chosen": -0.9340749979019165,
      "eval_rewards/margins": 0.7050848007202148,
      "eval_rewards/rejected": -1.6391597986221313,
      "eval_runtime": 1142.0583,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 1100
    },
    {
      "epoch": 0.59,
      "learning_rate": 2.1554875145982053e-06,
      "loss": 0.9879,
      "loss/mini_gap_loss": 0.9878588914871216,
      "loss/ori_loss": 1.1844841241836548,
      "loss/reward_entrophy": 0.19662514328956604,
      "mask/mask_ratio": 0.4767111837863922,
      "reward/A01_acc": 0.574999988079071,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.809374988079071,
      "reward/reward_A0": -1.0013420581817627,
      "reward/reward_A1": -1.2072036266326904,
      "reward/reward_A2": -1.645656943321228,
      "reward/reward_A3": -2.0107064247131348,
      "rewards/accuracies": 0.6989513635635376,
      "rewards/chosen": -1.0013420581817627,
      "rewards/margins": 0.6198307871818542,
      "rewards/rejected": -1.6211726665496826,
      "step": 1110
    },
    {
      "epoch": 0.59,
      "learning_rate": 2.1096011490041987e-06,
      "loss": 0.8865,
      "loss/mini_gap_loss": 0.886489748954773,
      "loss/ori_loss": 1.1224725246429443,
      "loss/reward_entrophy": 0.235982745885849,
      "mask/mask_ratio": 0.4587629437446594,
      "reward/A01_acc": 0.671875,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.8895727396011353,
      "reward/reward_A1": -1.3164061307907104,
      "reward/reward_A2": -1.5560072660446167,
      "reward/reward_A3": -2.0833237171173096,
      "rewards/accuracies": 0.7364510297775269,
      "rewards/chosen": -0.8895727396011353,
      "rewards/margins": 0.7623232007026672,
      "rewards/rejected": -1.6518958806991577,
      "step": 1120
    },
    {
      "epoch": 0.6,
      "learning_rate": 2.0638492089836672e-06,
      "loss": 0.9059,
      "loss/mini_gap_loss": 0.9058830142021179,
      "loss/ori_loss": 1.1084754467010498,
      "loss/reward_entrophy": 0.20259246230125427,
      "mask/mask_ratio": 0.46282655000686646,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.7562500238418579,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.8400664329528809,
      "reward/reward_A1": -1.1908605098724365,
      "reward/reward_A2": -1.6160781383514404,
      "reward/reward_A3": -2.012190818786621,
      "rewards/accuracies": 0.751034140586853,
      "rewards/chosen": -0.8400664329528809,
      "rewards/margins": 0.7662941217422485,
      "rewards/rejected": -1.6063604354858398,
      "step": 1130
    },
    {
      "epoch": 0.61,
      "learning_rate": 2.018247448247871e-06,
      "loss": 0.929,
      "loss/mini_gap_loss": 0.9290445446968079,
      "loss/ori_loss": 1.1265686750411987,
      "loss/reward_entrophy": 0.19752416014671326,
      "mask/mask_ratio": 0.4490880072116852,
      "reward/A01_acc": 0.653124988079071,
      "reward/A02_acc": 0.7749999761581421,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.8943448066711426,
      "reward/reward_A1": -1.2431347370147705,
      "reward/reward_A2": -1.601015329360962,
      "reward/reward_A3": -2.0108237266540527,
      "rewards/accuracies": 0.7499924898147583,
      "rewards/chosen": -0.8943448066711426,
      "rewards/margins": 0.7239636182785034,
      "rewards/rejected": -1.618308424949646,
      "step": 1140
    },
    {
      "epoch": 0.61,
      "learning_rate": 1.9728115687970136e-06,
      "loss": 0.9065,
      "loss/mini_gap_loss": 0.9064540863037109,
      "loss/ori_loss": 1.1289093494415283,
      "loss/reward_entrophy": 0.22245530784130096,
      "mask/mask_ratio": 0.4666585922241211,
      "reward/A01_acc": 0.668749988079071,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.8837703466415405,
      "reward/reward_A1": -1.3084309101104736,
      "reward/reward_A2": -1.5557196140289307,
      "reward/reward_A3": -2.070192813873291,
      "rewards/accuracies": 0.7458258867263794,
      "rewards/chosen": -0.8837703466415405,
      "rewards/margins": 0.7609941363334656,
      "rewards/rejected": -1.6447645425796509,
      "step": 1150
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.9275572155135953e-06,
      "loss": 0.9119,
      "loss/mini_gap_loss": 0.9118515849113464,
      "loss/ori_loss": 1.1319355964660645,
      "loss/reward_entrophy": 0.22008399665355682,
      "mask/mask_ratio": 0.45845308899879456,
      "reward/A01_acc": 0.643750011920929,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.8838861584663391,
      "reward/reward_A1": -1.2567625045776367,
      "reward/reward_A2": -1.468165636062622,
      "reward/reward_A3": -2.0357277393341064,
      "rewards/accuracies": 0.7302010655403137,
      "rewards/chosen": -0.8838861584663391,
      "rewards/margins": 0.7029833197593689,
      "rewards/rejected": -1.5868693590164185,
      "step": 1160
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.8824999707754232e-06,
      "loss": 0.9094,
      "loss/mini_gap_loss": 0.9093992114067078,
      "loss/ori_loss": 1.118121862411499,
      "loss/reward_entrophy": 0.20872266590595245,
      "mask/mask_ratio": 0.4621976912021637,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.8938802480697632,
      "reward/reward_A1": -1.2617824077606201,
      "reward/reward_A2": -1.5379282236099243,
      "reward/reward_A3": -1.9680677652359009,
      "rewards/accuracies": 0.739575982093811,
      "rewards/chosen": -0.8938802480697632,
      "rewards/margins": 0.6953632831573486,
      "rewards/rejected": -1.5892435312271118,
      "step": 1170
    },
    {
      "epoch": 0.63,
      "learning_rate": 1.8376553490901566e-06,
      "loss": 0.8502,
      "loss/mini_gap_loss": 0.8502097129821777,
      "loss/ori_loss": 1.0977588891983032,
      "loss/reward_entrophy": 0.24754929542541504,
      "mask/mask_ratio": 0.47455301880836487,
      "reward/A01_acc": 0.671875,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.8576068878173828,
      "reward/reward_A1": -1.2621508836746216,
      "reward/reward_A2": -1.605719804763794,
      "reward/reward_A3": -2.1159207820892334,
      "rewards/accuracies": 0.7468675971031189,
      "rewards/chosen": -0.8576068878173828,
      "rewards/margins": 0.8036403656005859,
      "rewards/rejected": -1.6612474918365479,
      "step": 1180
    },
    {
      "epoch": 0.63,
      "learning_rate": 1.7930387917532086e-06,
      "loss": 0.8999,
      "loss/mini_gap_loss": 0.8998895883560181,
      "loss/ori_loss": 1.1347682476043701,
      "loss/reward_entrophy": 0.2348785400390625,
      "mask/mask_ratio": 0.46207195520401,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.7875000238418579,
      "reward/reward_A0": -0.9528508186340332,
      "reward/reward_A1": -1.2940700054168701,
      "reward/reward_A2": -1.6676479578018188,
      "reward/reward_A3": -1.965710997581482,
      "rewards/accuracies": 0.7239511609077454,
      "rewards/chosen": -0.9528508186340332,
      "rewards/margins": 0.6896090507507324,
      "rewards/rejected": -1.6424598693847656,
      "step": 1190
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.7486656615308647e-06,
      "loss": 0.9201,
      "loss/mini_gap_loss": 0.9200838208198547,
      "loss/ori_loss": 1.1349366903305054,
      "loss/reward_entrophy": 0.21485285460948944,
      "mask/mask_ratio": 0.4597243368625641,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.75,
      "reward/A03_acc": 0.8656250238418579,
      "reward/reward_A0": -0.8614175915718079,
      "reward/reward_A1": -1.1403881311416626,
      "reward/reward_A2": -1.4737049341201782,
      "reward/reward_A3": -2.1031506061553955,
      "rewards/accuracies": 0.739575982093811,
      "rewards/chosen": -0.8614175915718079,
      "rewards/margins": 0.7109813094139099,
      "rewards/rejected": -1.5723989009857178,
      "step": 1200
    },
    {
      "epoch": 0.64,
      "eval_loss": 0.8890694975852966,
      "eval_loss/mini_gap_loss": 0.8891425132751465,
      "eval_loss/ori_loss": 1.1217918395996094,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.5022636651992798,
      "eval_regularization/policy_data_loss": 3.2583136558532715,
      "eval_regularization/policy_ref_data_loss_gap": 1.9923101663589478,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.8384832143783569,
      "eval_reward/A01_acc": 0.6335403919219971,
      "eval_reward/A02_acc": 0.7329192757606506,
      "eval_reward/A03_acc": 0.8245341777801514,
      "eval_reward/reward_A0": -0.9362310767173767,
      "eval_reward/reward_A1": -1.2763676643371582,
      "eval_reward/reward_A2": -1.609952449798584,
      "eval_reward/reward_A3": -2.0559723377227783,
      "eval_rewards/accuracies": 0.7303239703178406,
      "eval_rewards/chosen": -0.9362310767173767,
      "eval_rewards/margins": 0.7111834287643433,
      "eval_rewards/rejected": -1.6474144458770752,
      "eval_runtime": 1142.9954,
      "eval_samples_per_second": 1.689,
      "eval_steps_per_second": 0.423,
      "step": 1200
    },
    {
      "epoch": 0.64,
      "learning_rate": 1.7045512373704426e-06,
      "loss": 0.9269,
      "loss/mini_gap_loss": 0.9269148111343384,
      "loss/ori_loss": 1.1737279891967773,
      "loss/reward_entrophy": 0.24681314826011658,
      "mask/mask_ratio": 0.4646865725517273,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.75,
      "reward/A03_acc": 0.809374988079071,
      "reward/reward_A0": -0.979504406452179,
      "reward/reward_A1": -1.2192656993865967,
      "reward/reward_A2": -1.6894992589950562,
      "reward/reward_A3": -1.9739471673965454,
      "rewards/accuracies": 0.7218677997589111,
      "rewards/chosen": -0.979504406452179,
      "rewards/margins": 0.64805006980896,
      "rewards/rejected": -1.6275545358657837,
      "step": 1210
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.660710709139308e-06,
      "loss": 0.8986,
      "loss/mini_gap_loss": 0.8986064195632935,
      "loss/ori_loss": 1.1445205211639404,
      "loss/reward_entrophy": 0.24591414630413055,
      "mask/mask_ratio": 0.45599421858787537,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.796875,
      "reward/reward_A0": -1.043963074684143,
      "reward/reward_A1": -1.344058871269226,
      "reward/reward_A2": -1.7295551300048828,
      "reward/reward_A3": -2.0795583724975586,
      "rewards/accuracies": 0.72603440284729,
      "rewards/chosen": -1.043963074684143,
      "rewards/margins": 0.6737439036369324,
      "rewards/rejected": -1.7177069187164307,
      "step": 1220
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.6171591723945652e-06,
      "loss": 0.9189,
      "loss/mini_gap_loss": 0.9188982844352722,
      "loss/ori_loss": 1.116790533065796,
      "loss/reward_entrophy": 0.1978922337293625,
      "mask/mask_ratio": 0.4362107217311859,
      "reward/A01_acc": 0.643750011920929,
      "reward/A02_acc": 0.75,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.8978436589241028,
      "reward/reward_A1": -1.2273656129837036,
      "reward/reward_A2": -1.6464773416519165,
      "reward/reward_A3": -2.0276336669921875,
      "rewards/accuracies": 0.7406176328659058,
      "rewards/chosen": -0.8978436589241028,
      "rewards/margins": 0.7359654903411865,
      "rewards/rejected": -1.6338093280792236,
      "step": 1230
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.5739116231852239e-06,
      "loss": 0.9446,
      "loss/mini_gap_loss": 0.9445958137512207,
      "loss/ori_loss": 1.1735492944717407,
      "loss/reward_entrophy": 0.228953555226326,
      "mask/mask_ratio": 0.46198320388793945,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8062499761581421,
      "reward/reward_A0": -0.93732088804245,
      "reward/reward_A1": -1.230878233909607,
      "reward/reward_A2": -1.5421321392059326,
      "reward/reward_A3": -1.9741131067276,
      "rewards/accuracies": 0.7187429070472717,
      "rewards/chosen": -0.93732088804245,
      "rewards/margins": 0.645037829875946,
      "rewards/rejected": -1.5823585987091064,
      "step": 1240
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.5309829528886255e-06,
      "loss": 0.8707,
      "loss/mini_gap_loss": 0.8707369565963745,
      "loss/ori_loss": 1.118286371231079,
      "loss/reward_entrophy": 0.24754932522773743,
      "mask/mask_ratio": 0.4562970995903015,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.84375,
      "reward/reward_A0": -0.8604240417480469,
      "reward/reward_A1": -1.1385290622711182,
      "reward/reward_A2": -1.5266456604003906,
      "reward/reward_A3": -2.008594512939453,
      "rewards/accuracies": 0.7312427163124084,
      "rewards/chosen": -0.8604240417480469,
      "rewards/margins": 0.6974835395812988,
      "rewards/rejected": -1.5579074621200562,
      "step": 1250
    },
    {
      "epoch": 0.67,
      "learning_rate": 1.4883879430829135e-06,
      "loss": 0.9288,
      "loss/mini_gap_loss": 0.928776741027832,
      "loss/ori_loss": 1.1125682592391968,
      "loss/reward_entrophy": 0.18379148840904236,
      "mask/mask_ratio": 0.4460979104042053,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.737500011920929,
      "reward/A03_acc": 0.8656250238418579,
      "reward/reward_A0": -0.8459585905075073,
      "reward/reward_A1": -1.2186377048492432,
      "reward/reward_A2": -1.5286105871200562,
      "reward/reward_A3": -1.9998916387557983,
      "rewards/accuracies": 0.751034140586853,
      "rewards/chosen": -0.8459585905075073,
      "rewards/margins": 0.7364055514335632,
      "rewards/rejected": -1.5823643207550049,
      "step": 1260
    },
    {
      "epoch": 0.67,
      "learning_rate": 1.4461412604573103e-06,
      "loss": 0.8805,
      "loss/mini_gap_loss": 0.8805146217346191,
      "loss/ori_loss": 1.1255297660827637,
      "loss/reward_entrophy": 0.24501517415046692,
      "mask/mask_ratio": 0.46336379647254944,
      "reward/A01_acc": 0.643750011920929,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.9264041781425476,
      "reward/reward_A1": -1.2304786443710327,
      "reward/reward_A2": -1.56497323513031,
      "reward/reward_A3": -2.1068522930145264,
      "rewards/accuracies": 0.7354093790054321,
      "rewards/chosen": -0.9264041781425476,
      "rewards/margins": 0.7076807022094727,
      "rewards/rejected": -1.634084939956665,
      "step": 1270
    },
    {
      "epoch": 0.68,
      "learning_rate": 1.4042574517619523e-06,
      "loss": 0.9097,
      "loss/mini_gap_loss": 0.9096649289131165,
      "loss/ori_loss": 1.129055142402649,
      "loss/reward_entrophy": 0.21939019858837128,
      "mask/mask_ratio": 0.46221208572387695,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.7593749761581421,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -1.0107618570327759,
      "reward/reward_A1": -1.393139362335205,
      "reward/reward_A2": -1.6921437978744507,
      "reward/reward_A3": -2.176147937774658,
      "rewards/accuracies": 0.7447842359542847,
      "rewards/chosen": -1.0107618570327759,
      "rewards/margins": 0.7430309653282166,
      "rewards/rejected": -1.7537930011749268,
      "step": 1280
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.3627509387990267e-06,
      "loss": 0.8854,
      "loss/mini_gap_loss": 0.8854067921638489,
      "loss/ori_loss": 1.1134612560272217,
      "loss/reward_entrophy": 0.22805452346801758,
      "mask/mask_ratio": 0.4789578914642334,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.7562500238418579,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.8993616104125977,
      "reward/reward_A1": -1.259871006011963,
      "reward/reward_A2": -1.6670808792114258,
      "reward/reward_A3": -2.148637056350708,
      "rewards/accuracies": 0.7520757913589478,
      "rewards/chosen": -0.8993616104125977,
      "rewards/margins": 0.7924845814704895,
      "rewards/rejected": -1.6918461322784424,
      "step": 1290
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.3216360134569303e-06,
      "loss": 0.8358,
      "loss/mini_gap_loss": 0.835830807685852,
      "loss/ori_loss": 1.0756146907806396,
      "loss/reward_entrophy": 0.23978397250175476,
      "mask/mask_ratio": 0.45455822348594666,
      "reward/A01_acc": 0.65625,
      "reward/A02_acc": 0.765625,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.9166932106018066,
      "reward/reward_A1": -1.3082512617111206,
      "reward/reward_A2": -1.6767066717147827,
      "reward/reward_A3": -2.2413954734802246,
      "rewards/accuracies": 0.7583257555961609,
      "rewards/chosen": -0.9166932106018066,
      "rewards/margins": 0.8254071474075317,
      "rewards/rejected": -1.7421003580093384,
      "step": 1300
    },
    {
      "epoch": 0.69,
      "eval_loss": 0.8859832286834717,
      "eval_loss/mini_gap_loss": 0.8860694169998169,
      "eval_loss/ori_loss": 1.1187187433242798,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.653988242149353,
      "eval_regularization/policy_data_loss": 3.2861666679382324,
      "eval_regularization/policy_ref_data_loss_gap": 2.0201632976531982,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.9301286935806274,
      "eval_reward/A01_acc": 0.6423395276069641,
      "eval_reward/A02_acc": 0.7329192757606506,
      "eval_reward/A03_acc": 0.8214285969734192,
      "eval_reward/reward_A0": -0.9350094795227051,
      "eval_reward/reward_A1": -1.284964680671692,
      "eval_reward/reward_A2": -1.6318646669387817,
      "eval_reward/reward_A3": -2.1210973262786865,
      "eval_rewards/accuracies": 0.7322218418121338,
      "eval_rewards/chosen": -0.9350094795227051,
      "eval_rewards/margins": 0.7442826628684998,
      "eval_rewards/rejected": -1.67929208278656,
      "eval_runtime": 1142.1034,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 1300
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.2809268327891558e-06,
      "loss": 0.8944,
      "loss/mini_gap_loss": 0.89441978931427,
      "loss/ori_loss": 1.1589298248291016,
      "loss/reward_entrophy": 0.264509916305542,
      "mask/mask_ratio": 0.44963616132736206,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.8062499761581421,
      "reward/reward_A0": -0.9925732612609863,
      "reward/reward_A1": -1.288104772567749,
      "reward/reward_A2": -1.6785694360733032,
      "reward/reward_A3": -2.110100030899048,
      "rewards/accuracies": 0.7062429189682007,
      "rewards/chosen": -0.9925732612609863,
      "rewards/margins": 0.6996678709983826,
      "rewards/rejected": -1.6922409534454346,
      "step": 1310
    },
    {
      "epoch": 0.7,
      "learning_rate": 1.2406374141396154e-06,
      "loss": 0.9314,
      "loss/mini_gap_loss": 0.931443989276886,
      "loss/ori_loss": 1.1307661533355713,
      "loss/reward_entrophy": 0.1993221640586853,
      "mask/mask_ratio": 0.46222686767578125,
      "reward/A01_acc": 0.65625,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.9944862127304077,
      "reward/reward_A1": -1.385481595993042,
      "reward/reward_A2": -1.6123449802398682,
      "reward/reward_A3": -2.115051507949829,
      "rewards/accuracies": 0.7385343313217163,
      "rewards/chosen": -0.9944862127304077,
      "rewards/margins": 0.7097894549369812,
      "rewards/rejected": -1.7042754888534546,
      "step": 1320
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.2007816303160605e-06,
      "loss": 0.8815,
      "loss/mini_gap_loss": 0.8815375566482544,
      "loss/ori_loss": 1.0860908031463623,
      "loss/reward_entrophy": 0.20455333590507507,
      "mask/mask_ratio": 0.4566218852996826,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.765625,
      "reward/A03_acc": 0.878125011920929,
      "reward/reward_A0": -0.8948361277580261,
      "reward/reward_A1": -1.226264238357544,
      "reward/reward_A2": -1.6552765369415283,
      "reward/reward_A3": -2.2163288593292236,
      "rewards/accuracies": 0.7552008032798767,
      "rewards/chosen": -0.8948361277580261,
      "rewards/margins": 0.8044368028640747,
      "rewards/rejected": -1.6992727518081665,
      "step": 1330
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.1613732048132795e-06,
      "loss": 0.9296,
      "loss/mini_gap_loss": 0.929604709148407,
      "loss/ori_loss": 1.1592520475387573,
      "loss/reward_entrophy": 0.22964735329151154,
      "mask/mask_ratio": 0.45150741934776306,
      "reward/A01_acc": 0.6343749761581421,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.7906249761581421,
      "reward/reward_A0": -1.1357390880584717,
      "reward/reward_A1": -1.5163072347640991,
      "reward/reward_A2": -1.8698927164077759,
      "reward/reward_A3": -2.270881175994873,
      "rewards/accuracies": 0.7197844386100769,
      "rewards/chosen": -1.1357390880584717,
      "rewards/margins": 0.7499358654022217,
      "rewards/rejected": -1.8856747150421143,
      "step": 1340
    },
    {
      "epoch": 0.72,
      "learning_rate": 1.1224257070876993e-06,
      "loss": 0.8884,
      "loss/mini_gap_loss": 0.888393759727478,
      "loss/ori_loss": 1.1388452053070068,
      "loss/reward_entrophy": 0.25045153498649597,
      "mask/mask_ratio": 0.4380703866481781,
      "reward/A01_acc": 0.637499988079071,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.8787404894828796,
      "reward/reward_A1": -1.1871328353881836,
      "reward/reward_A2": -1.6211423873901367,
      "reward/reward_A3": -2.1545729637145996,
      "rewards/accuracies": 0.7354093790054321,
      "rewards/chosen": -0.8787404894828796,
      "rewards/margins": 0.7755255699157715,
      "rewards/rejected": -1.654266119003296,
      "step": 1350
    },
    {
      "epoch": 0.72,
      "learning_rate": 1.0839525478850339e-06,
      "loss": 0.9522,
      "loss/mini_gap_loss": 0.9521796107292175,
      "loss/ori_loss": 1.2004649639129639,
      "loss/reward_entrophy": 0.2482854574918747,
      "mask/mask_ratio": 0.4756375849246979,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.793749988079071,
      "reward/reward_A0": -0.9776943325996399,
      "reward/reward_A1": -1.2448749542236328,
      "reward/reward_A2": -1.5346852540969849,
      "reward/reward_A3": -1.9824388027191162,
      "rewards/accuracies": 0.7166594862937927,
      "rewards/chosen": -0.9776943325996399,
      "rewards/margins": 0.6096227765083313,
      "rewards/rejected": -1.5873172283172607,
      "step": 1360
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.045966974622574e-06,
      "loss": 0.941,
      "loss/mini_gap_loss": 0.940959095954895,
      "loss/ori_loss": 1.1677465438842773,
      "loss/reward_entrophy": 0.2267874777317047,
      "mask/mask_ratio": 0.4549782872200012,
      "reward/A01_acc": 0.59375,
      "reward/A02_acc": 0.6656249761581421,
      "reward/A03_acc": 0.784375011920929,
      "reward/reward_A0": -0.9361907839775085,
      "reward/reward_A1": -1.181770920753479,
      "reward/reward_A2": -1.556579351425171,
      "reward/reward_A3": -1.9789737462997437,
      "rewards/accuracies": 0.6812431812286377,
      "rewards/chosen": -0.9361907839775085,
      "rewards/margins": 0.6362348198890686,
      "rewards/rejected": -1.5724256038665771,
      "step": 1370
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.0084820668277224e-06,
      "loss": 0.9201,
      "loss/mini_gap_loss": 0.9200908541679382,
      "loss/ori_loss": 1.1488392353057861,
      "loss/reward_entrophy": 0.2287483513355255,
      "mask/mask_ratio": 0.4529235363006592,
      "reward/A01_acc": 0.5874999761581421,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.793749988079071,
      "reward/reward_A0": -0.854448139667511,
      "reward/reward_A1": -1.1523164510726929,
      "reward/reward_A2": -1.4604136943817139,
      "reward/reward_A3": -1.8696391582489014,
      "rewards/accuracies": 0.701034665107727,
      "rewards/chosen": -0.854448139667511,
      "rewards/margins": 0.6396600604057312,
      "rewards/rejected": -1.4941082000732422,
      "step": 1380
    },
    {
      "epoch": 0.74,
      "learning_rate": 9.715107316343345e-07,
      "loss": 0.9411,
      "loss/mini_gap_loss": 0.9411381483078003,
      "loss/ori_loss": 1.1554601192474365,
      "loss/reward_entrophy": 0.21432189643383026,
      "mask/mask_ratio": 0.4415750503540039,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.8770395517349243,
      "reward/reward_A1": -1.2095171213150024,
      "reward/reward_A2": -1.4473973512649536,
      "reward/reward_A3": -1.9267619848251343,
      "rewards/accuracies": 0.729159414768219,
      "rewards/chosen": -0.8770395517349243,
      "rewards/margins": 0.6508374214172363,
      "rewards/rejected": -1.527876853942871,
      "step": 1390
    },
    {
      "epoch": 0.74,
      "learning_rate": 9.350656993384224e-07,
      "loss": 0.8829,
      "loss/mini_gap_loss": 0.882941722869873,
      "loss/ori_loss": 1.1279569864273071,
      "loss/reward_entrophy": 0.24501517415046692,
      "mask/mask_ratio": 0.47556072473526,
      "reward/A01_acc": 0.574999988079071,
      "reward/A02_acc": 0.737500011920929,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.8187419176101685,
      "reward/reward_A1": -1.070815086364746,
      "reward/reward_A2": -1.4848848581314087,
      "reward/reward_A3": -1.9091180562973022,
      "rewards/accuracies": 0.7218678593635559,
      "rewards/chosen": -0.8187419176101685,
      "rewards/margins": 0.6695159673690796,
      "rewards/rejected": -1.488257884979248,
      "step": 1400
    },
    {
      "epoch": 0.74,
      "eval_loss": 0.8846410512924194,
      "eval_loss/mini_gap_loss": 0.8847437500953674,
      "eval_loss/ori_loss": 1.1173930168151855,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.417441725730896,
      "eval_regularization/policy_data_loss": 3.0759572982788086,
      "eval_regularization/policy_ref_data_loss_gap": 1.8099538087844849,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.8464220762252808,
      "eval_reward/A01_acc": 0.6392339468002319,
      "eval_reward/A02_acc": 0.738095223903656,
      "eval_reward/A03_acc": 0.8297101259231567,
      "eval_reward/reward_A0": -0.8118953108787537,
      "eval_reward/reward_A1": -1.1349323987960815,
      "eval_reward/reward_A2": -1.4591352939605713,
      "eval_reward/reward_A3": -1.9229153394699097,
      "eval_rewards/accuracies": 0.7356724143028259,
      "eval_rewards/chosen": -0.8118953108787537,
      "eval_rewards/margins": 0.6937506198883057,
      "eval_rewards/rejected": -1.505645990371704,
      "eval_runtime": 1142.8512,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 1400
    },
    {
      "epoch": 0.75,
      "learning_rate": 8.991595190147418e-07,
      "loss": 0.9243,
      "loss/mini_gap_loss": 0.9242815971374512,
      "loss/ori_loss": 1.133004069328308,
      "loss/reward_entrophy": 0.20872263610363007,
      "mask/mask_ratio": 0.4475019872188568,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.820052444934845,
      "reward/reward_A1": -1.1284749507904053,
      "reward/reward_A2": -1.50020170211792,
      "reward/reward_A3": -1.8475959300994873,
      "rewards/accuracies": 0.7270760536193848,
      "rewards/chosen": -0.820052444934845,
      "rewards/margins": 0.672023594379425,
      "rewards/rejected": -1.49207603931427,
      "step": 1410
    },
    {
      "epoch": 0.75,
      "learning_rate": 8.638045541957926e-07,
      "loss": 0.9371,
      "loss/mini_gap_loss": 0.9371197819709778,
      "loss/ori_loss": 1.1472723484039307,
      "loss/reward_entrophy": 0.21015258133411407,
      "mask/mask_ratio": 0.4662766456604004,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.840624988079071,
      "reward/reward_A0": -0.858025074005127,
      "reward/reward_A1": -1.1401804685592651,
      "reward/reward_A2": -1.4315658807754517,
      "reward/reward_A3": -1.9185640811920166,
      "rewards/accuracies": 0.7229094505310059,
      "rewards/chosen": -0.858025074005127,
      "rewards/margins": 0.6387301683425903,
      "rewards/rejected": -1.4967553615570068,
      "step": 1420
    },
    {
      "epoch": 0.76,
      "learning_rate": 8.290129786146905e-07,
      "loss": 0.8694,
      "loss/mini_gap_loss": 0.8693562746047974,
      "loss/ori_loss": 1.1125733852386475,
      "loss/reward_entrophy": 0.24321714043617249,
      "mask/mask_ratio": 0.4436076283454895,
      "reward/A01_acc": 0.668749988079071,
      "reward/A02_acc": 0.7718750238418579,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.8094648122787476,
      "reward/reward_A1": -1.1851192712783813,
      "reward/reward_A2": -1.47697114944458,
      "reward/reward_A3": -1.96657395362854,
      "rewards/accuracies": 0.7562424540519714,
      "rewards/chosen": -0.8094648122787476,
      "rewards/margins": 0.7334078550338745,
      "rewards/rejected": -1.542872667312622,
      "step": 1430
    },
    {
      "epoch": 0.76,
      "learning_rate": 7.947967720134034e-07,
      "loss": 0.8902,
      "loss/mini_gap_loss": 0.8901891708374023,
      "loss/ori_loss": 1.094416856765747,
      "loss/reward_entrophy": 0.20422761142253876,
      "mask/mask_ratio": 0.4529925286769867,
      "reward/A01_acc": 0.6812499761581421,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.7843989729881287,
      "reward/reward_A1": -1.1927213668823242,
      "reward/reward_A2": -1.4969167709350586,
      "reward/reward_A3": -1.8997853994369507,
      "rewards/accuracies": 0.7437425851821899,
      "rewards/chosen": -0.7843989729881287,
      "rewards/margins": 0.7453936338424683,
      "rewards/rejected": -1.5297926664352417,
      "step": 1440
    },
    {
      "epoch": 0.77,
      "learning_rate": 7.61167716017781e-07,
      "loss": 0.8761,
      "loss/mini_gap_loss": 0.8761194944381714,
      "loss/ori_loss": 1.0499794483184814,
      "loss/reward_entrophy": 0.1738600730895996,
      "mask/mask_ratio": 0.46990475058555603,
      "reward/A01_acc": 0.671875,
      "reward/A02_acc": 0.7562500238418579,
      "reward/A03_acc": 0.871874988079071,
      "reward/reward_A0": -0.7830844521522522,
      "reward/reward_A1": -1.1805198192596436,
      "reward/reward_A2": -1.5168288946151733,
      "reward/reward_A3": -2.102269172668457,
      "rewards/accuracies": 0.7666589617729187,
      "rewards/chosen": -0.7830844521522522,
      "rewards/margins": 0.8167723417282104,
      "rewards/rejected": -1.5998566150665283,
      "step": 1450
    },
    {
      "epoch": 0.78,
      "learning_rate": 7.281373900808012e-07,
      "loss": 0.8882,
      "loss/mini_gap_loss": 0.8881914019584656,
      "loss/ori_loss": 1.1453039646148682,
      "loss/reward_entrophy": 0.25711265206336975,
      "mask/mask_ratio": 0.4698655605316162,
      "reward/A01_acc": 0.6187499761581421,
      "reward/A02_acc": 0.7093750238418579,
      "reward/A03_acc": 0.8062499761581421,
      "reward/reward_A0": -0.951356053352356,
      "reward/reward_A1": -1.2747620344161987,
      "reward/reward_A2": -1.4963126182556152,
      "reward/reward_A3": -2.16728138923645,
      "rewards/accuracies": 0.7114511728286743,
      "rewards/chosen": -0.951356053352356,
      "rewards/margins": 0.6947463154792786,
      "rewards/rejected": -1.6461021900177002,
      "step": 1460
    },
    {
      "epoch": 0.78,
      "learning_rate": 6.9571716749543e-07,
      "loss": 0.9163,
      "loss/mini_gap_loss": 0.9163480997085571,
      "loss/ori_loss": 1.162630319595337,
      "loss/reward_entrophy": 0.24628224968910217,
      "mask/mask_ratio": 0.45205968618392944,
      "reward/A01_acc": 0.5687500238418579,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.765625,
      "reward/reward_A0": -0.9504269361495972,
      "reward/reward_A1": -1.1640957593917847,
      "reward/reward_A2": -1.6603755950927734,
      "reward/reward_A3": -2.0087687969207764,
      "rewards/accuracies": 0.6895765066146851,
      "rewards/chosen": -0.9504269361495972,
      "rewards/margins": 0.6606370806694031,
      "rewards/rejected": -1.6110641956329346,
      "step": 1470
    },
    {
      "epoch": 0.79,
      "learning_rate": 6.639182114784701e-07,
      "loss": 0.9021,
      "loss/mini_gap_loss": 0.9020618200302124,
      "loss/ori_loss": 1.127582311630249,
      "loss/reward_entrophy": 0.22552040219306946,
      "mask/mask_ratio": 0.47272396087646484,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.9277932047843933,
      "reward/reward_A1": -1.296461820602417,
      "reward/reward_A2": -1.6383081674575806,
      "reward/reward_A3": -2.140859365463257,
      "rewards/accuracies": 0.7416592836380005,
      "rewards/chosen": -0.9277932047843933,
      "rewards/margins": 0.7640663385391235,
      "rewards/rejected": -1.691859483718872,
      "step": 1480
    },
    {
      "epoch": 0.79,
      "learning_rate": 6.327514713267435e-07,
      "loss": 0.9353,
      "loss/mini_gap_loss": 0.9352778196334839,
      "loss/ori_loss": 1.1595309972763062,
      "loss/reward_entrophy": 0.2242533266544342,
      "mask/mask_ratio": 0.4589906632900238,
      "reward/A01_acc": 0.612500011920929,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.8963180780410767,
      "reward/reward_A1": -1.2176826000213623,
      "reward/reward_A2": -1.6295543909072876,
      "reward/reward_A3": -2.0022239685058594,
      "rewards/accuracies": 0.7239511609077454,
      "rewards/chosen": -0.8963180780410767,
      "rewards/margins": 0.720152735710144,
      "rewards/rejected": -1.6164706945419312,
      "step": 1490
    },
    {
      "epoch": 0.8,
      "learning_rate": 6.02227678646933e-07,
      "loss": 0.8779,
      "loss/mini_gap_loss": 0.8779279589653015,
      "loss/ori_loss": 1.103243112564087,
      "loss/reward_entrophy": 0.2253151834011078,
      "mask/mask_ratio": 0.42910391092300415,
      "reward/A01_acc": 0.65625,
      "reward/A02_acc": 0.746874988079071,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.8973411321640015,
      "reward/reward_A1": -1.2726609706878662,
      "reward/reward_A2": -1.6699680089950562,
      "reward/reward_A3": -2.0736217498779297,
      "rewards/accuracies": 0.7447842955589294,
      "rewards/chosen": -0.8973411321640015,
      "rewards/margins": 0.7747256755828857,
      "rewards/rejected": -1.6720669269561768,
      "step": 1500
    },
    {
      "epoch": 0.8,
      "eval_loss": 0.8822488188743591,
      "eval_loss/mini_gap_loss": 0.8823315501213074,
      "eval_loss/ori_loss": 1.114980936050415,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.6182937622070312,
      "eval_regularization/policy_data_loss": 3.3052010536193848,
      "eval_regularization/policy_ref_data_loss_gap": 2.0391972064971924,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.9325066208839417,
      "eval_reward/A01_acc": 0.6387163400650024,
      "eval_reward/A02_acc": 0.7344720363616943,
      "eval_reward/A03_acc": 0.8260869383811951,
      "eval_reward/reward_A0": -0.9158169627189636,
      "eval_reward/reward_A1": -1.2610524892807007,
      "eval_reward/reward_A2": -1.610992193222046,
      "eval_reward/reward_A3": -2.1029934883117676,
      "eval_rewards/accuracies": 0.733084499835968,
      "eval_rewards/chosen": -0.9158169627189636,
      "eval_rewards/margins": 0.7425126433372498,
      "eval_rewards/rejected": -1.6583294868469238,
      "eval_runtime": 1142.5598,
      "eval_samples_per_second": 1.69,
      "eval_steps_per_second": 0.423,
      "step": 1500
    },
    {
      "epoch": 0.8,
      "learning_rate": 5.723573436603802e-07,
      "loss": 0.9304,
      "loss/mini_gap_loss": 0.9304397702217102,
      "loss/ori_loss": 1.170223593711853,
      "loss/reward_entrophy": 0.23978397250175476,
      "mask/mask_ratio": 0.45080581307411194,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.9868305325508118,
      "reward/reward_A1": -1.265616774559021,
      "reward/reward_A2": -1.5993990898132324,
      "reward/reward_A3": -2.0582656860351562,
      "rewards/accuracies": 0.7177011370658875,
      "rewards/chosen": -0.9868305325508118,
      "rewards/margins": 0.6542468667030334,
      "rewards/rejected": -1.6410773992538452,
      "step": 1510
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.431507515841141e-07,
      "loss": 0.8605,
      "loss/mini_gap_loss": 0.8605340719223022,
      "loss/ori_loss": 1.1236140727996826,
      "loss/reward_entrophy": 0.263079971075058,
      "mask/mask_ratio": 0.469203382730484,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.7749999761581421,
      "reward/A03_acc": 0.809374988079071,
      "reward/reward_A0": -0.8981224894523621,
      "reward/reward_A1": -1.2626953125,
      "reward/reward_A2": -1.5538547039031982,
      "reward/reward_A3": -2.0344815254211426,
      "rewards/accuracies": 0.7489508986473083,
      "rewards/chosen": -0.8981224894523621,
      "rewards/margins": 0.7188718914985657,
      "rewards/rejected": -1.6169942617416382,
      "step": 1520
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.146179590893563e-07,
      "loss": 0.858,
      "loss/mini_gap_loss": 0.8580056428909302,
      "loss/ori_loss": 1.0985257625579834,
      "loss/reward_entrophy": 0.24052011966705322,
      "mask/mask_ratio": 0.48628073930740356,
      "reward/A01_acc": 0.6812499761581421,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.84375,
      "reward/reward_A0": -0.8191559910774231,
      "reward/reward_A1": -1.1968119144439697,
      "reward/reward_A2": -1.5552462339401245,
      "reward/reward_A3": -2.053729295730591,
      "rewards/accuracies": 0.751034140586853,
      "rewards/chosen": -0.8191559910774231,
      "rewards/margins": 0.7827571630477905,
      "rewards/rejected": -1.6019132137298584,
      "step": 1530
    },
    {
      "epoch": 0.82,
      "learning_rate": 4.867687908387139e-07,
      "loss": 0.9011,
      "loss/mini_gap_loss": 0.9010723829269409,
      "loss/ori_loss": 1.1189903020858765,
      "loss/reward_entrophy": 0.21791791915893555,
      "mask/mask_ratio": 0.4643673002719879,
      "reward/A01_acc": 0.653124988079071,
      "reward/A02_acc": 0.784375011920929,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.8661721348762512,
      "reward/reward_A1": -1.2660331726074219,
      "reward/reward_A2": -1.6548726558685303,
      "reward/reward_A3": -2.024789333343506,
      "rewards/accuracies": 0.751034140586853,
      "rewards/chosen": -0.8661721348762512,
      "rewards/margins": 0.7823765277862549,
      "rewards/rejected": -1.6485488414764404,
      "step": 1540
    },
    {
      "epoch": 0.82,
      "learning_rate": 4.596128361032709e-07,
      "loss": 0.9171,
      "loss/mini_gap_loss": 0.9171171188354492,
      "loss/ori_loss": 1.1401032209396362,
      "loss/reward_entrophy": 0.22298625111579895,
      "mask/mask_ratio": 0.46076661348342896,
      "reward/A01_acc": 0.6031249761581421,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.9855461120605469,
      "reward/reward_A1": -1.250284194946289,
      "reward/reward_A2": -1.7197529077529907,
      "reward/reward_A3": -2.1052405834198,
      "rewards/accuracies": 0.714576244354248,
      "rewards/chosen": -0.9855461120605469,
      "rewards/margins": 0.7061963677406311,
      "rewards/rejected": -1.6917425394058228,
      "step": 1550
    },
    {
      "epoch": 0.83,
      "learning_rate": 4.3315944546072297e-07,
      "loss": 0.8674,
      "loss/mini_gap_loss": 0.867353618144989,
      "loss/ori_loss": 1.1201342344284058,
      "loss/reward_entrophy": 0.2527805268764496,
      "mask/mask_ratio": 0.44891557097435,
      "reward/A01_acc": 0.65625,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.8031250238418579,
      "reward/reward_A0": -0.9152956008911133,
      "reward/reward_A1": -1.3079079389572144,
      "reward/reward_A2": -1.6341779232025146,
      "reward/reward_A3": -2.1775622367858887,
      "rewards/accuracies": 0.7249928116798401,
      "rewards/chosen": -0.9152956008911133,
      "rewards/margins": 0.7912366986274719,
      "rewards/rejected": -1.7065322399139404,
      "step": 1560
    },
    {
      "epoch": 0.83,
      "learning_rate": 4.0741772757570494e-07,
      "loss": 0.8675,
      "loss/mini_gap_loss": 0.8675341606140137,
      "loss/ori_loss": 1.099552869796753,
      "loss/reward_entrophy": 0.23201866447925568,
      "mask/mask_ratio": 0.46192407608032227,
      "reward/A01_acc": 0.6312500238418579,
      "reward/A02_acc": 0.746874988079071,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.9678158760070801,
      "reward/reward_A1": -1.3131967782974243,
      "reward/reward_A2": -1.7389236688613892,
      "reward/reward_A3": -2.141085386276245,
      "rewards/accuracies": 0.7385343313217163,
      "rewards/chosen": -0.9678158760070801,
      "rewards/margins": 0.7632354497909546,
      "rewards/rejected": -1.7310512065887451,
      "step": 1570
    },
    {
      "epoch": 0.84,
      "learning_rate": 3.823965460634141e-07,
      "loss": 0.8389,
      "loss/mini_gap_loss": 0.8389045000076294,
      "loss/ori_loss": 1.059356451034546,
      "loss/reward_entrophy": 0.22045207023620605,
      "mask/mask_ratio": 0.4694506525993347,
      "reward/A01_acc": 0.671875,
      "reward/A02_acc": 0.762499988079071,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.862488865852356,
      "reward/reward_A1": -1.2190959453582764,
      "reward/reward_A2": -1.7338390350341797,
      "reward/reward_A3": -2.200566530227661,
      "rewards/accuracies": 0.7572841644287109,
      "rewards/chosen": -0.862488865852356,
      "rewards/margins": 0.8553277850151062,
      "rewards/rejected": -1.7178165912628174,
      "step": 1580
    },
    {
      "epoch": 0.84,
      "learning_rate": 3.581045164376143e-07,
      "loss": 0.903,
      "loss/mini_gap_loss": 0.9029655456542969,
      "loss/ori_loss": 1.1366193294525146,
      "loss/reward_entrophy": 0.23365378379821777,
      "mask/mask_ratio": 0.46125784516334534,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.8062499761581421,
      "reward/reward_A0": -0.9809072613716125,
      "reward/reward_A1": -1.3575327396392822,
      "reward/reward_A2": -1.6387054920196533,
      "reward/reward_A3": -2.1882338523864746,
      "rewards/accuracies": 0.7208261489868164,
      "rewards/chosen": -0.9809072613716125,
      "rewards/margins": 0.7472329139709473,
      "rewards/rejected": -1.7281402349472046,
      "step": 1590
    },
    {
      "epoch": 0.85,
      "learning_rate": 3.345500031440638e-07,
      "loss": 0.9388,
      "loss/mini_gap_loss": 0.9387799501419067,
      "loss/ori_loss": 1.1855933666229248,
      "loss/reward_entrophy": 0.24681314826011658,
      "mask/mask_ratio": 0.44629794359207153,
      "reward/A01_acc": 0.6000000238418579,
      "reward/A02_acc": 0.6968749761581421,
      "reward/A03_acc": 0.859375,
      "reward/reward_A0": -0.9063900709152222,
      "reward/reward_A1": -1.2032688856124878,
      "reward/reward_A2": -1.4673185348510742,
      "reward/reward_A3": -2.1756975650787354,
      "rewards/accuracies": 0.7187429070472717,
      "rewards/chosen": -0.9063900709152222,
      "rewards/margins": 0.709022045135498,
      "rewards/rejected": -1.6154121160507202,
      "step": 1600
    },
    {
      "epoch": 0.85,
      "eval_loss": 0.8817759156227112,
      "eval_loss/mini_gap_loss": 0.8818590641021729,
      "eval_loss/ori_loss": 1.1145082712173462,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.6408650875091553,
      "eval_regularization/policy_data_loss": 3.3317787647247314,
      "eval_regularization/policy_ref_data_loss_gap": 2.065775156021118,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.938787579536438,
      "eval_reward/A01_acc": 0.6361283659934998,
      "eval_reward/A02_acc": 0.7318840622901917,
      "eval_reward/A03_acc": 0.827122151851654,
      "eval_reward/reward_A0": -0.9331848621368408,
      "eval_reward/reward_A1": -1.2822673320770264,
      "eval_reward/reward_A2": -1.6358609199523926,
      "eval_reward/reward_A3": -2.1321842670440674,
      "eval_rewards/accuracies": 0.7317042350769043,
      "eval_rewards/chosen": -0.9331848621368408,
      "eval_rewards/margins": 0.7502357959747314,
      "eval_rewards/rejected": -1.6834207773208618,
      "eval_runtime": 1142.1938,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 1600
    },
    {
      "epoch": 0.86,
      "learning_rate": 3.1174111668039714e-07,
      "loss": 0.9085,
      "loss/mini_gap_loss": 0.9084548950195312,
      "loss/ori_loss": 1.1186074018478394,
      "loss/reward_entrophy": 0.21015258133411407,
      "mask/mask_ratio": 0.48104602098464966,
      "reward/A01_acc": 0.643750011920929,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.8656250238418579,
      "reward/reward_A0": -0.880437970161438,
      "reward/reward_A1": -1.2374048233032227,
      "reward/reward_A2": -1.595609426498413,
      "reward/reward_A3": -2.170083999633789,
      "rewards/accuracies": 0.7499925494194031,
      "rewards/chosen": -0.880437970161438,
      "rewards/margins": 0.7872448563575745,
      "rewards/rejected": -1.6676826477050781,
      "step": 1610
    },
    {
      "epoch": 0.86,
      "learning_rate": 2.8968571080344537e-07,
      "loss": 0.9027,
      "loss/mini_gap_loss": 0.9026743173599243,
      "loss/ori_loss": 1.100035548210144,
      "loss/reward_entrophy": 0.1973612755537033,
      "mask/mask_ratio": 0.4396878778934479,
      "reward/A01_acc": 0.659375011920929,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.8187500238418579,
      "reward/reward_A0": -0.9400911331176758,
      "reward/reward_A1": -1.3367325067520142,
      "reward/reward_A2": -1.6663545370101929,
      "reward/reward_A3": -2.0657973289489746,
      "rewards/accuracies": 0.7302010655403137,
      "rewards/chosen": -0.9400911331176758,
      "rewards/margins": 0.7495201230049133,
      "rewards/rejected": -1.6896114349365234,
      "step": 1620
    },
    {
      "epoch": 0.87,
      "learning_rate": 2.683913798249638e-07,
      "loss": 0.9118,
      "loss/mini_gap_loss": 0.9118353724479675,
      "loss/ori_loss": 1.1180239915847778,
      "loss/reward_entrophy": 0.20618848502635956,
      "mask/mask_ratio": 0.48387041687965393,
      "reward/A01_acc": 0.628125011920929,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.840624988079071,
      "reward/reward_A0": -0.9405193328857422,
      "reward/reward_A1": -1.3408492803573608,
      "reward/reward_A2": -1.6669750213623047,
      "reward/reward_A3": -2.0900237560272217,
      "rewards/accuracies": 0.7322843670845032,
      "rewards/chosen": -0.9405193328857422,
      "rewards/margins": 0.7587462663650513,
      "rewards/rejected": -1.699265480041504,
      "step": 1630
    },
    {
      "epoch": 0.87,
      "learning_rate": 2.478654559966892e-07,
      "loss": 0.8761,
      "loss/mini_gap_loss": 0.876091480255127,
      "loss/ori_loss": 1.1586661338806152,
      "loss/reward_entrophy": 0.28257474303245544,
      "mask/mask_ratio": 0.4744124412536621,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.9844692945480347,
      "reward/reward_A1": -1.2477277517318726,
      "reward/reward_A2": -1.6280428171157837,
      "reward/reward_A3": -2.1211674213409424,
      "rewards/accuracies": 0.729159414768219,
      "rewards/chosen": -0.9844692945480347,
      "rewards/margins": 0.681160032749176,
      "rewards/rejected": -1.6656296253204346,
      "step": 1640
    },
    {
      "epoch": 0.88,
      "learning_rate": 2.2811500698563776e-07,
      "loss": 0.9036,
      "loss/mini_gap_loss": 0.9035654067993164,
      "loss/ori_loss": 1.1204215288162231,
      "loss/reward_entrophy": 0.21685604751110077,
      "mask/mask_ratio": 0.47866517305374146,
      "reward/A01_acc": 0.5687500238418579,
      "reward/A02_acc": 0.706250011920929,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.9517456293106079,
      "reward/reward_A1": -1.312117576599121,
      "reward/reward_A2": -1.695202112197876,
      "reward/reward_A3": -2.286611557006836,
      "rewards/accuracies": 0.6968680620193481,
      "rewards/chosen": -0.9517456293106079,
      "rewards/margins": 0.8128805160522461,
      "rewards/rejected": -1.764626145362854,
      "step": 1650
    },
    {
      "epoch": 0.88,
      "learning_rate": 2.0914683344049863e-07,
      "loss": 0.8993,
      "loss/mini_gap_loss": 0.8993347883224487,
      "loss/ori_loss": 1.117089867591858,
      "loss/reward_entrophy": 0.2177550494670868,
      "mask/mask_ratio": 0.47489672899246216,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.856249988079071,
      "reward/reward_A0": -0.9764219522476196,
      "reward/reward_A1": -1.2577083110809326,
      "reward/reward_A2": -1.6121628284454346,
      "reward/reward_A3": -2.149153470993042,
      "rewards/accuracies": 0.7322843670845032,
      "rewards/chosen": -0.9764219522476196,
      "rewards/margins": 0.6965696811676025,
      "rewards/rejected": -1.6729915142059326,
      "step": 1660
    },
    {
      "epoch": 0.89,
      "learning_rate": 1.909674666499789e-07,
      "loss": 0.89,
      "loss/mini_gap_loss": 0.8899775743484497,
      "loss/ori_loss": 1.140429139137268,
      "loss/reward_entrophy": 0.25045153498649597,
      "mask/mask_ratio": 0.4533194601535797,
      "reward/A01_acc": 0.6187499761581421,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.815625011920929,
      "reward/reward_A0": -0.9425910711288452,
      "reward/reward_A1": -1.289264440536499,
      "reward/reward_A2": -1.676578164100647,
      "reward/reward_A3": -2.0965018272399902,
      "rewards/accuracies": 0.7208260297775269,
      "rewards/chosen": -0.9425910711288452,
      "rewards/margins": 0.7448403239250183,
      "rewards/rejected": -1.6874313354492188,
      "step": 1670
    },
    {
      "epoch": 0.89,
      "learning_rate": 1.7358316629389054e-07,
      "loss": 0.9847,
      "loss/mini_gap_loss": 0.9846882820129395,
      "loss/ori_loss": 1.1786164045333862,
      "loss/reward_entrophy": 0.19392812252044678,
      "mask/mask_ratio": 0.44630661606788635,
      "reward/A01_acc": 0.6312500238418579,
      "reward/A02_acc": 0.7124999761581421,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -1.0414741039276123,
      "reward/reward_A1": -1.3088910579681396,
      "reward/reward_A2": -1.622057318687439,
      "reward/reward_A3": -2.1086015701293945,
      "rewards/accuracies": 0.7239511013031006,
      "rewards/chosen": -1.0414741039276123,
      "rewards/margins": 0.6383589506149292,
      "rewards/rejected": -1.6798330545425415,
      "step": 1680
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.569999182877624e-07,
      "loss": 0.9227,
      "loss/mini_gap_loss": 0.9227094650268555,
      "loss/ori_loss": 1.2148475646972656,
      "loss/reward_entrophy": 0.29213809967041016,
      "mask/mask_ratio": 0.44861000776290894,
      "reward/A01_acc": 0.59375,
      "reward/A02_acc": 0.6875,
      "reward/A03_acc": 0.7593749761581421,
      "reward/reward_A0": -0.9883913993835449,
      "reward/reward_A1": -1.2174708843231201,
      "reward/reward_A2": -1.5206449031829834,
      "reward/reward_A3": -2.062551975250244,
      "rewards/accuracies": 0.6802015900611877,
      "rewards/chosen": -0.9883913993835449,
      "rewards/margins": 0.6118153929710388,
      "rewards/rejected": -1.600206732749939,
      "step": 1690
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.4122343272171823e-07,
      "loss": 0.8319,
      "loss/mini_gap_loss": 0.8318880796432495,
      "loss/ori_loss": 1.1224333047866821,
      "loss/reward_entrophy": 0.2905452847480774,
      "mask/mask_ratio": 0.4644540250301361,
      "reward/A01_acc": 0.6343749761581421,
      "reward/A02_acc": 0.71875,
      "reward/A03_acc": 0.8374999761581421,
      "reward/reward_A0": -0.9178289175033569,
      "reward/reward_A1": -1.1682894229888916,
      "reward/reward_A2": -1.6365363597869873,
      "reward/reward_A3": -2.071646213531494,
      "rewards/accuracies": 0.7302011251449585,
      "rewards/chosen": -0.9178289175033569,
      "rewards/margins": 0.7076454162597656,
      "rewards/rejected": -1.6254743337631226,
      "step": 1700
    },
    {
      "epoch": 0.9,
      "eval_loss": 0.8811317682266235,
      "eval_loss/mini_gap_loss": 0.8812221884727478,
      "eval_loss/ori_loss": 1.1138713359832764,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.5744651556015015,
      "eval_regularization/policy_data_loss": 3.2654638290405273,
      "eval_regularization/policy_ref_data_loss_gap": 1.9994598627090454,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.9075831770896912,
      "eval_reward/A01_acc": 0.6356107592582703,
      "eval_reward/A02_acc": 0.7349896430969238,
      "eval_reward/A03_acc": 0.8307453393936157,
      "eval_reward/reward_A0": -0.8984208106994629,
      "eval_reward/reward_A1": -1.2426929473876953,
      "eval_reward/reward_A2": -1.590885043144226,
      "eval_reward/reward_A3": -2.080568790435791,
      "eval_rewards/accuracies": 0.7337745428085327,
      "eval_rewards/chosen": -0.8984208106994629,
      "eval_rewards/margins": 0.7396116256713867,
      "eval_rewards/rejected": -1.6380324363708496,
      "eval_runtime": 1141.8445,
      "eval_samples_per_second": 1.691,
      "eval_steps_per_second": 0.423,
      "step": 1700
    },
    {
      "epoch": 0.91,
      "learning_rate": 1.2625914189432709e-07,
      "loss": 0.8875,
      "loss/mini_gap_loss": 0.8875478506088257,
      "loss/ori_loss": 1.1338300704956055,
      "loss/reward_entrophy": 0.24628224968910217,
      "mask/mask_ratio": 0.4597649574279785,
      "reward/A01_acc": 0.6499999761581421,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.824999988079071,
      "reward/reward_A0": -0.9026660919189453,
      "reward/reward_A1": -1.233636736869812,
      "reward/reward_A2": -1.6044059991836548,
      "reward/reward_A3": -2.0627338886260986,
      "rewards/accuracies": 0.7385342717170715,
      "rewards/chosen": -0.9026660919189453,
      "rewards/margins": 0.7309099435806274,
      "rewards/rejected": -1.6335760354995728,
      "step": 1710
    },
    {
      "epoch": 0.91,
      "learning_rate": 1.1211219844210696e-07,
      "loss": 0.8673,
      "loss/mini_gap_loss": 0.867332935333252,
      "loss/ori_loss": 1.1029475927352905,
      "loss/reward_entrophy": 0.23561468720436096,
      "mask/mask_ratio": 0.44793859124183655,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.7281249761581421,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.9328775405883789,
      "reward/reward_A1": -1.2483123540878296,
      "reward/reward_A2": -1.6282415390014648,
      "reward/reward_A3": -2.211362838745117,
      "rewards/accuracies": 0.7354092597961426,
      "rewards/chosen": -0.9328775405883789,
      "rewards/margins": 0.7630778551101685,
      "rewards/rejected": -1.6959552764892578,
      "step": 1720
    },
    {
      "epoch": 0.92,
      "learning_rate": 9.878747356532298e-08,
      "loss": 0.9126,
      "loss/mini_gap_loss": 0.9126413464546204,
      "loss/ori_loss": 1.1166636943817139,
      "loss/reward_entrophy": 0.20402240753173828,
      "mask/mask_ratio": 0.4471747875213623,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.753125011920929,
      "reward/A03_acc": 0.8500000238418579,
      "reward/reward_A0": -0.9075021743774414,
      "reward/reward_A1": -1.2915842533111572,
      "reward/reward_A2": -1.6443061828613281,
      "reward/reward_A3": -2.2075138092041016,
      "rewards/accuracies": 0.7479091882705688,
      "rewards/chosen": -0.9075021743774414,
      "rewards/margins": 0.8069487810134888,
      "rewards/rejected": -1.7144508361816406,
      "step": 1730
    },
    {
      "epoch": 0.92,
      "learning_rate": 8.62895553506926e-08,
      "loss": 0.8903,
      "loss/mini_gap_loss": 0.8902907371520996,
      "loss/ori_loss": 1.1257424354553223,
      "loss/reward_entrophy": 0.2354518175125122,
      "mask/mask_ratio": 0.46470707654953003,
      "reward/A01_acc": 0.606249988079071,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.8218749761581421,
      "reward/reward_A0": -0.9436232447624207,
      "reward/reward_A1": -1.1962757110595703,
      "reward/reward_A2": -1.6372013092041016,
      "reward/reward_A3": -2.147691249847412,
      "rewards/accuracies": 0.7177011370658875,
      "rewards/chosen": -0.9436232447624207,
      "rewards/margins": 0.7167496681213379,
      "rewards/rejected": -1.6603729724884033,
      "step": 1740
    },
    {
      "epoch": 0.93,
      "learning_rate": 7.46227471915767e-08,
      "loss": 0.8939,
      "loss/mini_gap_loss": 0.8939388394355774,
      "loss/ori_loss": 1.1315568685531616,
      "loss/reward_entrophy": 0.23761789500713348,
      "mask/mask_ratio": 0.45197755098342896,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.828125,
      "reward/reward_A0": -0.9276655912399292,
      "reward/reward_A1": -1.2928975820541382,
      "reward/reward_A2": -1.628234624862671,
      "reward/reward_A3": -2.0726821422576904,
      "rewards/accuracies": 0.739575982093811,
      "rewards/chosen": -0.9276655912399292,
      "rewards/margins": 0.7369223833084106,
      "rewards/rejected": -1.6645879745483398,
      "step": 1750
    },
    {
      "epoch": 0.93,
      "learning_rate": 6.379106630619524e-08,
      "loss": 0.9406,
      "loss/mini_gap_loss": 0.9405549764633179,
      "loss/ori_loss": 1.1530787944793701,
      "loss/reward_entrophy": 0.21252386271953583,
      "mask/mask_ratio": 0.4601779878139496,
      "reward/A01_acc": 0.6156250238418579,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.8125,
      "reward/reward_A0": -0.9357322454452515,
      "reward/reward_A1": -1.230397343635559,
      "reward/reward_A2": -1.5878775119781494,
      "reward/reward_A3": -2.054211139678955,
      "rewards/accuracies": 0.7229094505310059,
      "rewards/chosen": -0.9357322454452515,
      "rewards/margins": 0.6884135007858276,
      "rewards/rejected": -1.624145746231079,
      "step": 1760
    },
    {
      "epoch": 0.94,
      "learning_rate": 5.3798242354384524e-08,
      "loss": 0.9078,
      "loss/mini_gap_loss": 0.9078313112258911,
      "loss/ori_loss": 1.1203553676605225,
      "loss/reward_entrophy": 0.21252389252185822,
      "mask/mask_ratio": 0.45732393860816956,
      "reward/A01_acc": 0.6812499761581421,
      "reward/A02_acc": 0.762499988079071,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.8432042002677917,
      "reward/reward_A1": -1.2736574411392212,
      "reward/reward_A2": -1.6133968830108643,
      "reward/reward_A3": -1.9856348037719727,
      "rewards/accuracies": 0.7593674659729004,
      "rewards/chosen": -0.8432042002677917,
      "rewards/margins": 0.7810092568397522,
      "rewards/rejected": -1.624213457107544,
      "step": 1770
    },
    {
      "epoch": 0.95,
      "learning_rate": 4.464771615336788e-08,
      "loss": 0.9011,
      "loss/mini_gap_loss": 0.9010864496231079,
      "loss/ori_loss": 1.101144790649414,
      "loss/reward_entrophy": 0.20005831122398376,
      "mask/mask_ratio": 0.4748764634132385,
      "reward/A01_acc": 0.621874988079071,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.809374988079071,
      "reward/reward_A0": -0.9245613217353821,
      "reward/reward_A1": -1.33144211769104,
      "reward/reward_A2": -1.6136871576309204,
      "reward/reward_A3": -2.071500301361084,
      "rewards/accuracies": 0.7239511609077454,
      "rewards/chosen": -0.9245613217353821,
      "rewards/margins": 0.7476319074630737,
      "rewards/rejected": -1.672193169593811,
      "step": 1780
    },
    {
      "epoch": 0.95,
      "learning_rate": 3.634263849298214e-08,
      "loss": 0.8899,
      "loss/mini_gap_loss": 0.8899188041687012,
      "loss/ori_loss": 1.091244101524353,
      "loss/reward_entrophy": 0.20132538676261902,
      "mask/mask_ratio": 0.45664745569229126,
      "reward/A01_acc": 0.671875,
      "reward/A02_acc": 0.7437499761581421,
      "reward/A03_acc": 0.846875011920929,
      "reward/reward_A0": -0.8805424571037292,
      "reward/reward_A1": -1.2967352867126465,
      "reward/reward_A2": -1.6369895935058594,
      "reward/reward_A3": -2.0665535926818848,
      "rewards/accuracies": 0.7541590929031372,
      "rewards/chosen": -0.8805424571037292,
      "rewards/margins": 0.786200225353241,
      "rewards/rejected": -1.6667426824569702,
      "step": 1790
    },
    {
      "epoch": 0.96,
      "learning_rate": 2.8885869050770952e-08,
      "loss": 0.8719,
      "loss/mini_gap_loss": 0.8718563318252563,
      "loss/ori_loss": 1.1172394752502441,
      "loss/reward_entrophy": 0.24538323283195496,
      "mask/mask_ratio": 0.45249858498573303,
      "reward/A01_acc": 0.6187499761581421,
      "reward/A02_acc": 0.731249988079071,
      "reward/A03_acc": 0.824999988079071,
      "reward/reward_A0": -0.8772226572036743,
      "reward/reward_A1": -1.181308627128601,
      "reward/reward_A2": -1.5706679821014404,
      "reward/reward_A3": -2.102947235107422,
      "rewards/accuracies": 0.7249927520751953,
      "rewards/chosen": -0.8772226572036743,
      "rewards/margins": 0.7410691380500793,
      "rewards/rejected": -1.6182918548583984,
      "step": 1800
    },
    {
      "epoch": 0.96,
      "eval_loss": 0.8809170722961426,
      "eval_loss/mini_gap_loss": 0.8810112476348877,
      "eval_loss/ori_loss": 1.1136603355407715,
      "eval_loss/reward_entrophy": 0.23264923691749573,
      "eval_mask/mask_ratio": 0.4576639235019684,
      "eval_regularization/forward_KL": 1.582701325416565,
      "eval_regularization/policy_data_loss": 3.2694504261016846,
      "eval_regularization/policy_ref_data_loss_gap": 2.003446578979492,
      "eval_regularization/reference_data_loss": 1.2660036087036133,
      "eval_regularization/reverse_KL": 0.9136151075363159,
      "eval_reward/A01_acc": 0.6371635794639587,
      "eval_reward/A02_acc": 0.7339544296264648,
      "eval_reward/A03_acc": 0.8307453393936157,
      "eval_reward/reward_A0": -0.8998152613639832,
      "eval_reward/reward_A1": -1.2451317310333252,
      "eval_reward/reward_A2": -1.594663381576538,
      "eval_reward/reward_A3": -2.087033748626709,
      "eval_rewards/accuracies": 0.7339470982551575,
      "eval_rewards/chosen": -0.8998152613639832,
      "eval_rewards/margins": 0.742444634437561,
      "eval_rewards/rejected": -1.642259955406189,
      "eval_runtime": 1140.6075,
      "eval_samples_per_second": 1.693,
      "eval_steps_per_second": 0.423,
      "step": 1800
    },
    {
      "epoch": 0.96,
      "learning_rate": 2.2279975407315245e-08,
      "loss": 0.8897,
      "loss/mini_gap_loss": 0.8896512985229492,
      "loss/ori_loss": 1.1126375198364258,
      "loss/reward_entrophy": 0.22298622131347656,
      "mask/mask_ratio": 0.4710591435432434,
      "reward/A01_acc": 0.6468750238418579,
      "reward/A02_acc": 0.7250000238418579,
      "reward/A03_acc": 0.8187500238418579,
      "reward/reward_A0": -0.9231008291244507,
      "reward/reward_A1": -1.3476731777191162,
      "reward/reward_A2": -1.6143853664398193,
      "reward/reward_A3": -2.078157663345337,
      "rewards/accuracies": 0.7302010655403137,
      "rewards/chosen": -0.9231008291244507,
      "rewards/margins": 0.7569543719291687,
      "rewards/rejected": -1.6800552606582642,
      "step": 1810
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.652723216214097e-08,
      "loss": 0.8792,
      "loss/mini_gap_loss": 0.8792353868484497,
      "loss/ori_loss": 1.1264166831970215,
      "loss/reward_entrophy": 0.2471812516450882,
      "mask/mask_ratio": 0.4425739347934723,
      "reward/A01_acc": 0.640625,
      "reward/A02_acc": 0.721875011920929,
      "reward/A03_acc": 0.831250011920929,
      "reward/reward_A0": -0.9693109393119812,
      "reward/reward_A1": -1.367462396621704,
      "reward/reward_A2": -1.5905808210372925,
      "reward/reward_A3": -2.1630115509033203,
      "rewards/accuracies": 0.7312427163124084,
      "rewards/chosen": -0.9693109393119812,
      "rewards/margins": 0.7376902103424072,
      "rewards/rejected": -1.7070010900497437,
      "step": 1820
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.1629620150508113e-08,
      "loss": 0.908,
      "loss/mini_gap_loss": 0.9080455899238586,
      "loss/ori_loss": 1.1140711307525635,
      "loss/reward_entrophy": 0.206025630235672,
      "mask/mask_ratio": 0.4901936650276184,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.7406250238418579,
      "reward/A03_acc": 0.856249988079071,
      "reward/reward_A0": -0.8302377462387085,
      "reward/reward_A1": -1.2290120124816895,
      "reward/reward_A2": -1.488166093826294,
      "reward/reward_A3": -2.043938636779785,
      "rewards/accuracies": 0.7531174421310425,
      "rewards/chosen": -0.8302377462387085,
      "rewards/margins": 0.7567852735519409,
      "rewards/rejected": -1.5870230197906494,
      "step": 1830
    },
    {
      "epoch": 0.98,
      "learning_rate": 7.588825761354335e-09,
      "loss": 0.8493,
      "loss/mini_gap_loss": 0.8493164777755737,
      "loss/ori_loss": 1.1187317371368408,
      "loss/reward_entrophy": 0.26941537857055664,
      "mask/mask_ratio": 0.45918259024620056,
      "reward/A01_acc": 0.668749988079071,
      "reward/A02_acc": 0.715624988079071,
      "reward/A03_acc": 0.856249988079071,
      "reward/reward_A0": -0.9326618313789368,
      "reward/reward_A1": -1.3947854042053223,
      "reward/reward_A2": -1.602805733680725,
      "reward/reward_A3": -2.1978847980499268,
      "rewards/accuracies": 0.7468675374984741,
      "rewards/chosen": -0.9326618313789368,
      "rewards/margins": 0.7991461753845215,
      "rewards/rejected": -1.731808066368103,
      "step": 1840
    },
    {
      "epoch": 0.98,
      "learning_rate": 4.406240356620017e-09,
      "loss": 0.8675,
      "loss/mini_gap_loss": 0.8675443530082703,
      "loss/ori_loss": 1.0809673070907593,
      "loss/reward_entrophy": 0.21342289447784424,
      "mask/mask_ratio": 0.4686746597290039,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.793749988079071,
      "reward/A03_acc": 0.856249988079071,
      "reward/reward_A0": -0.8745518922805786,
      "reward/reward_A1": -1.2898197174072266,
      "reward/reward_A2": -1.6990492343902588,
      "reward/reward_A3": -2.194479465484619,
      "rewards/accuracies": 0.7708256244659424,
      "rewards/chosen": -0.8745518922805786,
      "rewards/margins": 0.8532136678695679,
      "rewards/rejected": -1.727765440940857,
      "step": 1850
    },
    {
      "epoch": 0.99,
      "learning_rate": 2.082959792164274e-09,
      "loss": 0.8789,
      "loss/mini_gap_loss": 0.8789154291152954,
      "loss/ori_loss": 1.1148983240127563,
      "loss/reward_entrophy": 0.235982745885849,
      "mask/mask_ratio": 0.4585256576538086,
      "reward/A01_acc": 0.625,
      "reward/A02_acc": 0.75,
      "reward/A03_acc": 0.8531249761581421,
      "reward/reward_A0": -0.8795498013496399,
      "reward/reward_A1": -1.2011339664459229,
      "reward/reward_A2": -1.6046861410140991,
      "reward/reward_A3": -2.16829776763916,
      "rewards/accuracies": 0.7427009344100952,
      "rewards/chosen": -0.8795498013496399,
      "rewards/margins": 0.7784730195999146,
      "rewards/rejected": -1.6580226421356201,
      "step": 1860
    },
    {
      "epoch": 0.99,
      "learning_rate": 6.197840404292832e-10,
      "loss": 0.8888,
      "loss/mini_gap_loss": 0.8887971639633179,
      "loss/ori_loss": 1.0850541591644287,
      "loss/reward_entrophy": 0.1962570697069168,
      "mask/mask_ratio": 0.4427838921546936,
      "reward/A01_acc": 0.6625000238418579,
      "reward/A02_acc": 0.762499988079071,
      "reward/A03_acc": 0.8656250238418579,
      "reward/reward_A0": -0.8659710884094238,
      "reward/reward_A1": -1.2570902109146118,
      "reward/reward_A2": -1.667168378829956,
      "reward/reward_A3": -2.179504871368408,
      "rewards/accuracies": 0.7635340094566345,
      "rewards/chosen": -0.8659710884094238,
      "rewards/margins": 0.8352664709091187,
      "rewards/rejected": -1.701237440109253,
      "step": 1870
    },
    {
      "epoch": 1.0,
      "learning_rate": 1.721691498673961e-11,
      "loss": 0.8768,
      "loss/mini_gap_loss": 0.876836895942688,
      "loss/ori_loss": 1.1018263101577759,
      "loss/reward_entrophy": 0.22498945891857147,
      "mask/mask_ratio": 0.4616280496120453,
      "reward/A01_acc": 0.653124988079071,
      "reward/A02_acc": 0.734375,
      "reward/A03_acc": 0.8343750238418579,
      "reward/reward_A0": -0.889785885810852,
      "reward/reward_A1": -1.272937297821045,
      "reward/reward_A2": -1.6506683826446533,
      "reward/reward_A3": -2.0467886924743652,
      "rewards/accuracies": 0.7406176328659058,
      "rewards/chosen": -0.889785885810852,
      "rewards/margins": 0.7669956088066101,
      "rewards/rejected": -1.6567814350128174,
      "step": 1880
    },
    {
      "epoch": 1.0,
      "step": 1882,
      "total_flos": 0.0,
      "train_loss": 0.9523535225773972,
      "train_runtime": 91816.406,
      "train_samples_per_second": 0.656,
      "train_steps_per_second": 0.02
    }
  ],
  "logging_steps": 10,
  "max_steps": 1882,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 0.0,
  "trial_name": null,
  "trial_params": null
}
