{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1547,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006464124111182934,
      "grad_norm": 6.441478252410889,
      "learning_rate": 9.941822882999353e-07,
      "logits/chosen": -2.1328125,
      "logits/rejected": -2.0367188453674316,
      "logps/chosen": -39.625,
      "logps/rejected": -44.54999923706055,
      "loss": 0.6918,
      "rewards/accuracies": 0.38749998807907104,
      "rewards/chosen": 0.00375022878870368,
      "rewards/margins": 0.0022705078590661287,
      "rewards/rejected": 0.0014846802223473787,
      "step": 10
    },
    {
      "epoch": 0.012928248222365869,
      "grad_norm": 6.379936695098877,
      "learning_rate": 9.877181641887525e-07,
      "logits/chosen": -2.2601561546325684,
      "logits/rejected": -1.967187523841858,
      "logps/chosen": -33.01250076293945,
      "logps/rejected": -47.125,
      "loss": 0.6934,
      "rewards/accuracies": 0.42500001192092896,
      "rewards/chosen": -0.0035984038840979338,
      "rewards/margins": 0.0002197265566792339,
      "rewards/rejected": -0.003833770751953125,
      "step": 20
    },
    {
      "epoch": 0.019392372333548805,
      "grad_norm": 8.188169479370117,
      "learning_rate": 9.812540400775693e-07,
      "logits/chosen": -2.1328125,
      "logits/rejected": -2.000781297683716,
      "logps/chosen": -37.212501525878906,
      "logps/rejected": -46.474998474121094,
      "loss": 0.6898,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -0.0014892577892169356,
      "rewards/margins": 0.0067199706099927425,
      "rewards/rejected": -0.008213805966079235,
      "step": 30
    },
    {
      "epoch": 0.025856496444731737,
      "grad_norm": 6.802555084228516,
      "learning_rate": 9.747899159663866e-07,
      "logits/chosen": -2.0062499046325684,
      "logits/rejected": -1.8781249523162842,
      "logps/chosen": -41.1875,
      "logps/rejected": -49.650001525878906,
      "loss": 0.6934,
      "rewards/accuracies": 0.38749998807907104,
      "rewards/chosen": -0.0035942078102380037,
      "rewards/margins": -0.0016548156272619963,
      "rewards/rejected": -0.0019653320778161287,
      "step": 40
    },
    {
      "epoch": 0.03232062055591468,
      "grad_norm": 4.646665573120117,
      "learning_rate": 9.683257918552036e-07,
      "logits/chosen": -2.1929688453674316,
      "logits/rejected": -2.077343702316284,
      "logps/chosen": -32.32500076293945,
      "logps/rejected": -42.79999923706055,
      "loss": 0.6859,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.006103515625,
      "rewards/margins": 0.01368865929543972,
      "rewards/rejected": -0.00757942209020257,
      "step": 50
    },
    {
      "epoch": 0.03878474466709761,
      "grad_norm": 7.005685329437256,
      "learning_rate": 9.618616677440206e-07,
      "logits/chosen": -2.323437452316284,
      "logits/rejected": -1.932031273841858,
      "logps/chosen": -29.112499237060547,
      "logps/rejected": -50.07500076293945,
      "loss": 0.6914,
      "rewards/accuracies": 0.44999998807907104,
      "rewards/chosen": 0.00258636474609375,
      "rewards/margins": 0.0028999329078942537,
      "rewards/rejected": -0.0003005981561727822,
      "step": 60
    },
    {
      "epoch": 0.04524886877828054,
      "grad_norm": 6.807264804840088,
      "learning_rate": 9.553975436328377e-07,
      "logits/chosen": -2.4375,
      "logits/rejected": -2.1812500953674316,
      "logps/chosen": -27.5625,
      "logps/rejected": -39.625,
      "loss": 0.6906,
      "rewards/accuracies": 0.44999998807907104,
      "rewards/chosen": 0.004772949032485485,
      "rewards/margins": 0.00579833984375,
      "rewards/rejected": -0.0010135651100426912,
      "step": 70
    },
    {
      "epoch": 0.051712992889463474,
      "grad_norm": 5.546220302581787,
      "learning_rate": 9.489334195216547e-07,
      "logits/chosen": -2.1968750953674316,
      "logits/rejected": -2.190624952316284,
      "logps/chosen": -32.63750076293945,
      "logps/rejected": -37.974998474121094,
      "loss": 0.6844,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.008374023251235485,
      "rewards/margins": 0.01659545861184597,
      "rewards/rejected": -0.008229064755141735,
      "step": 80
    },
    {
      "epoch": 0.058177117000646414,
      "grad_norm": 5.227822303771973,
      "learning_rate": 9.424692954104718e-07,
      "logits/chosen": -2.2406249046325684,
      "logits/rejected": -2.038281202316284,
      "logps/chosen": -34.974998474121094,
      "logps/rejected": -46.375,
      "loss": 0.6902,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": 0.0033546448685228825,
      "rewards/margins": 0.007107543759047985,
      "rewards/rejected": -0.0037490844260901213,
      "step": 90
    },
    {
      "epoch": 0.06464124111182935,
      "grad_norm": 7.3963823318481445,
      "learning_rate": 9.36005171299289e-07,
      "logits/chosen": -2.3765625953674316,
      "logits/rejected": -2.0570311546325684,
      "logps/chosen": -29.712499618530273,
      "logps/rejected": -47.287498474121094,
      "loss": 0.6875,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": 0.004231643863022327,
      "rewards/margins": 0.012052154168486595,
      "rewards/rejected": -0.007823944091796875,
      "step": 100
    },
    {
      "epoch": 0.07110536522301228,
      "grad_norm": 6.5707197189331055,
      "learning_rate": 9.29541047188106e-07,
      "logits/chosen": -2.1953125,
      "logits/rejected": -1.967187523841858,
      "logps/chosen": -32.88750076293945,
      "logps/rejected": -48.712501525878906,
      "loss": 0.6852,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": 0.0073493956588208675,
      "rewards/margins": 0.014874267391860485,
      "rewards/rejected": -0.0075126648880541325,
      "step": 110
    },
    {
      "epoch": 0.07756948933419522,
      "grad_norm": 6.013023853302002,
      "learning_rate": 9.230769230769231e-07,
      "logits/chosen": -2.0640625953674316,
      "logits/rejected": -2.1312499046325684,
      "logps/chosen": -39.54999923706055,
      "logps/rejected": -40.287498474121094,
      "loss": 0.6875,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": 0.0046249390579760075,
      "rewards/margins": 0.012362671084702015,
      "rewards/rejected": -0.00774459820240736,
      "step": 120
    },
    {
      "epoch": 0.08403361344537816,
      "grad_norm": 6.274867057800293,
      "learning_rate": 9.1661279896574e-07,
      "logits/chosen": -2.231250047683716,
      "logits/rejected": -2.0796875953674316,
      "logps/chosen": -32.625,
      "logps/rejected": -44.70000076293945,
      "loss": 0.6867,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.001331329345703125,
      "rewards/margins": 0.01182479877024889,
      "rewards/rejected": -0.01046600379049778,
      "step": 130
    },
    {
      "epoch": 0.09049773755656108,
      "grad_norm": 6.701501369476318,
      "learning_rate": 9.101486748545572e-07,
      "logits/chosen": -2.2757811546325684,
      "logits/rejected": -1.9890625476837158,
      "logps/chosen": -30.587499618530273,
      "logps/rejected": -51.724998474121094,
      "loss": 0.682,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.010884094052016735,
      "rewards/margins": 0.02325439453125,
      "rewards/rejected": -0.01236572302877903,
      "step": 140
    },
    {
      "epoch": 0.09696186166774402,
      "grad_norm": 6.838586807250977,
      "learning_rate": 9.036845507433742e-07,
      "logits/chosen": -2.182812452316284,
      "logits/rejected": -2.004687547683716,
      "logps/chosen": -32.537498474121094,
      "logps/rejected": -46.82500076293945,
      "loss": 0.682,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": 0.00593910226598382,
      "rewards/margins": 0.02400360070168972,
      "rewards/rejected": -0.01805419847369194,
      "step": 150
    },
    {
      "epoch": 0.10342598577892695,
      "grad_norm": 5.635846138000488,
      "learning_rate": 8.972204266321913e-07,
      "logits/chosen": -2.3203125,
      "logits/rejected": -2.374218702316284,
      "logps/chosen": -29.424999237060547,
      "logps/rejected": -33.9375,
      "loss": 0.6891,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.0063377381302416325,
      "rewards/margins": 0.009060668759047985,
      "rewards/rejected": -0.002735137939453125,
      "step": 160
    },
    {
      "epoch": 0.10989010989010989,
      "grad_norm": 7.251461505889893,
      "learning_rate": 8.907563025210084e-07,
      "logits/chosen": -2.4507813453674316,
      "logits/rejected": -2.221874952316284,
      "logps/chosen": -26.912500381469727,
      "logps/rejected": -39.349998474121094,
      "loss": 0.684,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": 0.006022262386977673,
      "rewards/margins": 0.01661071740090847,
      "rewards/rejected": -0.0105743408203125,
      "step": 170
    },
    {
      "epoch": 0.11635423400129283,
      "grad_norm": 5.773430347442627,
      "learning_rate": 8.842921784098254e-07,
      "logits/chosen": -2.1859374046325684,
      "logits/rejected": -2.1234374046325684,
      "logps/chosen": -28.8125,
      "logps/rejected": -41.375,
      "loss": 0.6781,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.007189941592514515,
      "rewards/margins": 0.03049926832318306,
      "rewards/rejected": -0.023311614990234375,
      "step": 180
    },
    {
      "epoch": 0.12281835811247575,
      "grad_norm": 6.394359588623047,
      "learning_rate": 8.778280542986425e-07,
      "logits/chosen": -2.246875047683716,
      "logits/rejected": -2.049999952316284,
      "logps/chosen": -35.38750076293945,
      "logps/rejected": -43.73749923706055,
      "loss": 0.6785,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": 0.009860992431640625,
      "rewards/margins": 0.03095092810690403,
      "rewards/rejected": -0.0211334228515625,
      "step": 190
    },
    {
      "epoch": 0.1292824822236587,
      "grad_norm": 6.146031379699707,
      "learning_rate": 8.713639301874596e-07,
      "logits/chosen": -2.1500000953674316,
      "logits/rejected": -2.1226563453674316,
      "logps/chosen": -32.4375,
      "logps/rejected": -37.912498474121094,
      "loss": 0.6828,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": 0.00611190777271986,
      "rewards/margins": 0.020227814093232155,
      "rewards/rejected": -0.0141754150390625,
      "step": 200
    },
    {
      "epoch": 0.13574660633484162,
      "grad_norm": 6.2582268714904785,
      "learning_rate": 8.648998060762767e-07,
      "logits/chosen": -2.2367186546325684,
      "logits/rejected": -1.9421875476837158,
      "logps/chosen": -31.274999618530273,
      "logps/rejected": -49.6875,
      "loss": 0.6789,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": 0.011743927374482155,
      "rewards/margins": 0.03179015964269638,
      "rewards/rejected": -0.02000579796731472,
      "step": 210
    },
    {
      "epoch": 0.14221073044602456,
      "grad_norm": 6.393710136413574,
      "learning_rate": 8.584356819650936e-07,
      "logits/chosen": -2.186718702316284,
      "logits/rejected": -1.971093773841858,
      "logps/chosen": -32.73749923706055,
      "logps/rejected": -43.29999923706055,
      "loss": 0.6816,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.0061817169189453125,
      "rewards/margins": 0.02687835693359375,
      "rewards/rejected": -0.02064209058880806,
      "step": 220
    },
    {
      "epoch": 0.1486748545572075,
      "grad_norm": 7.510876655578613,
      "learning_rate": 8.519715578539108e-07,
      "logits/chosen": -2.2992186546325684,
      "logits/rejected": -2.164843797683716,
      "logps/chosen": -30.362499237060547,
      "logps/rejected": -37.099998474121094,
      "loss": 0.682,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.0007904052617959678,
      "rewards/margins": 0.02447509765625,
      "rewards/rejected": -0.023685073480010033,
      "step": 230
    },
    {
      "epoch": 0.15513897866839044,
      "grad_norm": 9.3156156539917,
      "learning_rate": 8.455074337427278e-07,
      "logits/chosen": -2.2484374046325684,
      "logits/rejected": -2.07421875,
      "logps/chosen": -33.849998474121094,
      "logps/rejected": -45.6875,
      "loss": 0.6773,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.0067993164993822575,
      "rewards/margins": 0.03448181226849556,
      "rewards/rejected": -0.02765350416302681,
      "step": 240
    },
    {
      "epoch": 0.16160310277957338,
      "grad_norm": 6.761559009552002,
      "learning_rate": 8.390433096315449e-07,
      "logits/chosen": -2.19921875,
      "logits/rejected": -2.000781297683716,
      "logps/chosen": -33.29999923706055,
      "logps/rejected": -48.8125,
      "loss": 0.6801,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.007586670108139515,
      "rewards/margins": 0.02846984937787056,
      "rewards/rejected": -0.0208587646484375,
      "step": 250
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 5.626691818237305,
      "learning_rate": 8.32579185520362e-07,
      "logits/chosen": -2.13671875,
      "logits/rejected": -2.0843749046325684,
      "logps/chosen": -36.650001525878906,
      "logps/rejected": -45.537498474121094,
      "loss": 0.6738,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.010319518856704235,
      "rewards/margins": 0.03958282619714737,
      "rewards/rejected": -0.02925720252096653,
      "step": 260
    },
    {
      "epoch": 0.17453135100193923,
      "grad_norm": 5.977700233459473,
      "learning_rate": 8.26115061409179e-07,
      "logits/chosen": -2.20703125,
      "logits/rejected": -2.210156202316284,
      "logps/chosen": -35.23749923706055,
      "logps/rejected": -39.70000076293945,
      "loss": 0.6766,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.0086822509765625,
      "rewards/margins": 0.03570060804486275,
      "rewards/rejected": -0.0270538330078125,
      "step": 270
    },
    {
      "epoch": 0.18099547511312217,
      "grad_norm": 4.738009929656982,
      "learning_rate": 8.19650937297996e-07,
      "logits/chosen": -2.2953124046325684,
      "logits/rejected": -2.016406297683716,
      "logps/chosen": -30.299999237060547,
      "logps/rejected": -43.29999923706055,
      "loss": 0.6777,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.007350921630859375,
      "rewards/margins": 0.0331878662109375,
      "rewards/rejected": -0.02580566331744194,
      "step": 280
    },
    {
      "epoch": 0.1874595992243051,
      "grad_norm": 7.8707194328308105,
      "learning_rate": 8.131868131868131e-07,
      "logits/chosen": -2.328125,
      "logits/rejected": -2.1703124046325684,
      "logps/chosen": -32.20000076293945,
      "logps/rejected": -42.51250076293945,
      "loss": 0.6711,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.009305572137236595,
      "rewards/margins": 0.04731445387005806,
      "rewards/rejected": -0.03800048679113388,
      "step": 290
    },
    {
      "epoch": 0.19392372333548805,
      "grad_norm": 5.525120735168457,
      "learning_rate": 8.067226890756303e-07,
      "logits/chosen": -2.00390625,
      "logits/rejected": -2.008593797683716,
      "logps/chosen": -36.79999923706055,
      "logps/rejected": -46.974998474121094,
      "loss": 0.6797,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": -0.0029144287109375,
      "rewards/margins": 0.0306396484375,
      "rewards/rejected": -0.033496856689453125,
      "step": 300
    },
    {
      "epoch": 0.20038784744667099,
      "grad_norm": 6.92855167388916,
      "learning_rate": 8.002585649644473e-07,
      "logits/chosen": -2.1039061546325684,
      "logits/rejected": -1.8093750476837158,
      "logps/chosen": -34.04999923706055,
      "logps/rejected": -55.13750076293945,
      "loss": 0.6766,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.0077453614212572575,
      "rewards/margins": 0.0352935791015625,
      "rewards/rejected": -0.02752990648150444,
      "step": 310
    },
    {
      "epoch": 0.2068519715578539,
      "grad_norm": 7.399740695953369,
      "learning_rate": 7.937944408532643e-07,
      "logits/chosen": -2.1039061546325684,
      "logits/rejected": -2.04296875,
      "logps/chosen": -34.275001525878906,
      "logps/rejected": -45.92499923706055,
      "loss": 0.6719,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.0064140320755541325,
      "rewards/margins": 0.04375610500574112,
      "rewards/rejected": -0.037353515625,
      "step": 320
    },
    {
      "epoch": 0.21331609566903684,
      "grad_norm": 5.498190879821777,
      "learning_rate": 7.873303167420814e-07,
      "logits/chosen": -2.2593750953674316,
      "logits/rejected": -2.1468749046325684,
      "logps/chosen": -28.75,
      "logps/rejected": -37.974998474121094,
      "loss": 0.675,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.014706420712172985,
      "rewards/margins": 0.03831176832318306,
      "rewards/rejected": -0.023575592786073685,
      "step": 330
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 7.076873302459717,
      "learning_rate": 7.808661926308985e-07,
      "logits/chosen": -2.18359375,
      "logits/rejected": -2.008593797683716,
      "logps/chosen": -34.38750076293945,
      "logps/rejected": -48.61249923706055,
      "loss": 0.6777,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": 0.0021118163131177425,
      "rewards/margins": 0.03334350511431694,
      "rewards/rejected": -0.031247710809111595,
      "step": 340
    },
    {
      "epoch": 0.22624434389140272,
      "grad_norm": 5.878056526184082,
      "learning_rate": 7.744020685197155e-07,
      "logits/chosen": -2.1328125,
      "logits/rejected": -2.109375,
      "logps/chosen": -34.67499923706055,
      "logps/rejected": -40.48749923706055,
      "loss": 0.6684,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.005421447567641735,
      "rewards/margins": 0.05188598483800888,
      "rewards/rejected": -0.04656372219324112,
      "step": 350
    },
    {
      "epoch": 0.23270846800258566,
      "grad_norm": 6.397393226623535,
      "learning_rate": 7.679379444085327e-07,
      "logits/chosen": -2.282031297683716,
      "logits/rejected": -1.9929687976837158,
      "logps/chosen": -32.63750076293945,
      "logps/rejected": -48.599998474121094,
      "loss": 0.6762,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.00634765625,
      "rewards/margins": 0.036905668675899506,
      "rewards/rejected": -0.03057861328125,
      "step": 360
    },
    {
      "epoch": 0.2391725921137686,
      "grad_norm": 6.895285606384277,
      "learning_rate": 7.614738202973496e-07,
      "logits/chosen": -2.210156202316284,
      "logits/rejected": -2.0250000953674316,
      "logps/chosen": -31.899999618530273,
      "logps/rejected": -42.625,
      "loss": 0.6664,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.00459976214915514,
      "rewards/margins": 0.05656738206744194,
      "rewards/rejected": -0.05199585109949112,
      "step": 370
    },
    {
      "epoch": 0.2456367162249515,
      "grad_norm": 6.152177810668945,
      "learning_rate": 7.550096961861667e-07,
      "logits/chosen": -2.1937499046325684,
      "logits/rejected": -2.0687499046325684,
      "logps/chosen": -32.51250076293945,
      "logps/rejected": -42.5625,
      "loss": 0.6691,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.012201691046357155,
      "rewards/margins": 0.05018310621380806,
      "rewards/rejected": -0.03803100436925888,
      "step": 380
    },
    {
      "epoch": 0.25210084033613445,
      "grad_norm": 6.210917949676514,
      "learning_rate": 7.485455720749839e-07,
      "logits/chosen": -2.125781297683716,
      "logits/rejected": -1.8289062976837158,
      "logps/chosen": -35.125,
      "logps/rejected": -50.07500076293945,
      "loss": 0.6586,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.014538574032485485,
      "rewards/margins": 0.07254638522863388,
      "rewards/rejected": -0.05802764743566513,
      "step": 390
    },
    {
      "epoch": 0.2585649644473174,
      "grad_norm": 6.9216389656066895,
      "learning_rate": 7.420814479638009e-07,
      "logits/chosen": -2.0914063453674316,
      "logits/rejected": -2.059375047683716,
      "logps/chosen": -37.57500076293945,
      "logps/rejected": -43.86249923706055,
      "loss": 0.6699,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.0035232543013989925,
      "rewards/margins": 0.04853515699505806,
      "rewards/rejected": -0.04498291015625,
      "step": 400
    },
    {
      "epoch": 0.2650290885585003,
      "grad_norm": 5.222481727600098,
      "learning_rate": 7.35617323852618e-07,
      "logits/chosen": -2.2632813453674316,
      "logits/rejected": -1.9460937976837158,
      "logps/chosen": -31.5,
      "logps/rejected": -43.70000076293945,
      "loss": 0.6645,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.0067138671875,
      "rewards/margins": 0.06016235426068306,
      "rewards/rejected": -0.05344238132238388,
      "step": 410
    },
    {
      "epoch": 0.27149321266968324,
      "grad_norm": 6.641375541687012,
      "learning_rate": 7.291531997414349e-07,
      "logits/chosen": -2.20703125,
      "logits/rejected": -2.184375047683716,
      "logps/chosen": -33.875,
      "logps/rejected": -42.32500076293945,
      "loss": 0.6672,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.01611480675637722,
      "rewards/margins": 0.05540771409869194,
      "rewards/rejected": -0.03934326022863388,
      "step": 420
    },
    {
      "epoch": 0.2779573367808662,
      "grad_norm": 6.896761894226074,
      "learning_rate": 7.226890756302521e-07,
      "logits/chosen": -2.296093702316284,
      "logits/rejected": -2.024218797683716,
      "logps/chosen": -30.0625,
      "logps/rejected": -46.98749923706055,
      "loss": 0.6645,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": 0.0027313232421875,
      "rewards/margins": 0.06123046949505806,
      "rewards/rejected": -0.05858154222369194,
      "step": 430
    },
    {
      "epoch": 0.2844214608920491,
      "grad_norm": 7.057956218719482,
      "learning_rate": 7.162249515190691e-07,
      "logits/chosen": -2.2484374046325684,
      "logits/rejected": -1.953125,
      "logps/chosen": -31.487499237060547,
      "logps/rejected": -47.57500076293945,
      "loss": 0.6656,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.00907974224537611,
      "rewards/margins": 0.06015624850988388,
      "rewards/rejected": -0.05110473558306694,
      "step": 440
    },
    {
      "epoch": 0.2908855850032321,
      "grad_norm": 5.408035755157471,
      "learning_rate": 7.097608274078862e-07,
      "logits/chosen": -2.2992186546325684,
      "logits/rejected": -2.0062499046325684,
      "logps/chosen": -31.975000381469727,
      "logps/rejected": -46.662498474121094,
      "loss": 0.6598,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.013701247982680798,
      "rewards/margins": 0.06833495944738388,
      "rewards/rejected": -0.05463867262005806,
      "step": 450
    },
    {
      "epoch": 0.297349709114415,
      "grad_norm": 5.76138162612915,
      "learning_rate": 7.032967032967034e-07,
      "logits/chosen": -2.29296875,
      "logits/rejected": -1.947656273841858,
      "logps/chosen": -31.087499618530273,
      "logps/rejected": -46.724998474121094,
      "loss": 0.6547,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.013378906063735485,
      "rewards/margins": 0.08332519233226776,
      "rewards/rejected": -0.06996764987707138,
      "step": 460
    },
    {
      "epoch": 0.3038138332255979,
      "grad_norm": 7.115036964416504,
      "learning_rate": 6.968325791855203e-07,
      "logits/chosen": -2.270312547683716,
      "logits/rejected": -1.92578125,
      "logps/chosen": -31.075000762939453,
      "logps/rejected": -43.400001525878906,
      "loss": 0.6582,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.008691024966537952,
      "rewards/margins": 0.07414398342370987,
      "rewards/rejected": -0.06532974541187286,
      "step": 470
    },
    {
      "epoch": 0.3102779573367809,
      "grad_norm": 8.682717323303223,
      "learning_rate": 6.903684550743374e-07,
      "logits/chosen": -2.168750047683716,
      "logits/rejected": -1.928125023841858,
      "logps/chosen": -36.88750076293945,
      "logps/rejected": -46.912498474121094,
      "loss": 0.6711,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.0021110535599291325,
      "rewards/margins": 0.04788513109087944,
      "rewards/rejected": -0.045745849609375,
      "step": 480
    },
    {
      "epoch": 0.3167420814479638,
      "grad_norm": 5.5581207275390625,
      "learning_rate": 6.839043309631545e-07,
      "logits/chosen": -2.085156202316284,
      "logits/rejected": -2.0914063453674316,
      "logps/chosen": -35.337501525878906,
      "logps/rejected": -38.849998474121094,
      "loss": 0.6676,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.012737656012177467,
      "rewards/margins": 0.054290771484375,
      "rewards/rejected": -0.04152526706457138,
      "step": 490
    },
    {
      "epoch": 0.32320620555914675,
      "grad_norm": 5.94589900970459,
      "learning_rate": 6.774402068519716e-07,
      "logits/chosen": -2.325000047683716,
      "logits/rejected": -1.939062476158142,
      "logps/chosen": -28.412500381469727,
      "logps/rejected": -43.900001525878906,
      "loss": 0.6609,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.0011001586681231856,
      "rewards/margins": 0.06931762397289276,
      "rewards/rejected": -0.07047729194164276,
      "step": 500
    },
    {
      "epoch": 0.32320620555914675,
      "eval_logits/chosen": -2.2455790042877197,
      "eval_logits/rejected": -2.039123058319092,
      "eval_logps/chosen": -32.371124267578125,
      "eval_logps/rejected": -44.32170486450195,
      "eval_loss": 0.6643579602241516,
      "eval_rewards/accuracies": 0.7016581296920776,
      "eval_rewards/chosen": 0.0063747139647603035,
      "eval_rewards/margins": 0.06211565062403679,
      "eval_rewards/rejected": -0.05573521926999092,
      "eval_runtime": 247.326,
      "eval_samples_per_second": 12.51,
      "eval_steps_per_second": 1.565,
      "step": 500
    },
    {
      "epoch": 0.32967032967032966,
      "grad_norm": 5.689755916595459,
      "learning_rate": 6.709760827407885e-07,
      "logits/chosen": -2.2484374046325684,
      "logits/rejected": -2.0640625953674316,
      "logps/chosen": -30.274999618530273,
      "logps/rejected": -42.837501525878906,
      "loss": 0.6547,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.0027290345169603825,
      "rewards/margins": 0.0814208984375,
      "rewards/rejected": -0.07871093600988388,
      "step": 510
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 6.042952537536621,
      "learning_rate": 6.645119586296056e-07,
      "logits/chosen": -2.258593797683716,
      "logits/rejected": -2.0132813453674316,
      "logps/chosen": -32.07500076293945,
      "logps/rejected": -44.82500076293945,
      "loss": 0.6559,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.0064849853515625,
      "rewards/margins": 0.08304443210363388,
      "rewards/rejected": -0.07655028998851776,
      "step": 520
    },
    {
      "epoch": 0.34259857789269554,
      "grad_norm": 4.668971061706543,
      "learning_rate": 6.580478345184227e-07,
      "logits/chosen": -2.234375,
      "logits/rejected": -1.982812523841858,
      "logps/chosen": -33.11249923706055,
      "logps/rejected": -45.375,
      "loss": 0.6625,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.01325378380715847,
      "rewards/margins": 0.064453125,
      "rewards/rejected": -0.05126953125,
      "step": 530
    },
    {
      "epoch": 0.34906270200387846,
      "grad_norm": 6.762983322143555,
      "learning_rate": 6.515837104072398e-07,
      "logits/chosen": -2.1624999046325684,
      "logits/rejected": -2.2210936546325684,
      "logps/chosen": -34.5,
      "logps/rejected": -38.599998474121094,
      "loss": 0.6676,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.0068145752884447575,
      "rewards/margins": 0.05534057691693306,
      "rewards/rejected": -0.04851684719324112,
      "step": 540
    },
    {
      "epoch": 0.3555268261150614,
      "grad_norm": 6.561522960662842,
      "learning_rate": 6.451195862960569e-07,
      "logits/chosen": -2.190624952316284,
      "logits/rejected": -2.198437452316284,
      "logps/chosen": -34.51250076293945,
      "logps/rejected": -37.95000076293945,
      "loss": 0.6687,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.008846282958984375,
      "rewards/margins": 0.05479583889245987,
      "rewards/rejected": -0.06376762688159943,
      "step": 550
    },
    {
      "epoch": 0.36199095022624433,
      "grad_norm": 5.780110836029053,
      "learning_rate": 6.386554621848739e-07,
      "logits/chosen": -2.1734375953674316,
      "logits/rejected": -2.01171875,
      "logps/chosen": -30.725000381469727,
      "logps/rejected": -45.1875,
      "loss": 0.659,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.0052543641068041325,
      "rewards/margins": 0.07680664211511612,
      "rewards/rejected": -0.08203735202550888,
      "step": 560
    },
    {
      "epoch": 0.3684550743374273,
      "grad_norm": 6.217379093170166,
      "learning_rate": 6.32191338073691e-07,
      "logits/chosen": -2.4429688453674316,
      "logits/rejected": -2.174999952316284,
      "logps/chosen": -30.981250762939453,
      "logps/rejected": -41.25,
      "loss": 0.6793,
      "rewards/accuracies": 0.5874999761581421,
      "rewards/chosen": -0.0036605834029614925,
      "rewards/margins": 0.029883574694395065,
      "rewards/rejected": -0.03357391431927681,
      "step": 570
    },
    {
      "epoch": 0.3749191984486102,
      "grad_norm": 7.391123294830322,
      "learning_rate": 6.25727213962508e-07,
      "logits/chosen": -2.15625,
      "logits/rejected": -1.9500000476837158,
      "logps/chosen": -35.61249923706055,
      "logps/rejected": -46.087501525878906,
      "loss": 0.6602,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.007740020751953125,
      "rewards/margins": 0.07353057712316513,
      "rewards/rejected": -0.0814361572265625,
      "step": 580
    },
    {
      "epoch": 0.3813833225597931,
      "grad_norm": 6.871866226196289,
      "learning_rate": 6.192630898513252e-07,
      "logits/chosen": -2.264843702316284,
      "logits/rejected": -2.0179686546325684,
      "logps/chosen": -32.775001525878906,
      "logps/rejected": -46.537498474121094,
      "loss": 0.6695,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.0055633545853197575,
      "rewards/margins": 0.04864501953125,
      "rewards/rejected": -0.04310913011431694,
      "step": 590
    },
    {
      "epoch": 0.3878474466709761,
      "grad_norm": 5.9584221839904785,
      "learning_rate": 6.127989657401422e-07,
      "logits/chosen": -2.321093797683716,
      "logits/rejected": -2.2289061546325684,
      "logps/chosen": -30.837499618530273,
      "logps/rejected": -35.087501525878906,
      "loss": 0.6641,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.0007110595470294356,
      "rewards/margins": 0.06364135444164276,
      "rewards/rejected": -0.06436767429113388,
      "step": 600
    },
    {
      "epoch": 0.394311570782159,
      "grad_norm": 5.761967658996582,
      "learning_rate": 6.063348416289592e-07,
      "logits/chosen": -2.4453125,
      "logits/rejected": -2.124218702316284,
      "logps/chosen": -26.912500381469727,
      "logps/rejected": -43.099998474121094,
      "loss": 0.6508,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.01220092736184597,
      "rewards/margins": 0.09335937350988388,
      "rewards/rejected": -0.08112182468175888,
      "step": 610
    },
    {
      "epoch": 0.40077569489334197,
      "grad_norm": 6.2318010330200195,
      "learning_rate": 5.998707175177762e-07,
      "logits/chosen": -2.4296875,
      "logits/rejected": -2.030468702316284,
      "logps/chosen": -27.887500762939453,
      "logps/rejected": -41.07500076293945,
      "loss": 0.6535,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.01733245886862278,
      "rewards/margins": 0.08664550632238388,
      "rewards/rejected": -0.06934051215648651,
      "step": 620
    },
    {
      "epoch": 0.4072398190045249,
      "grad_norm": 6.487030506134033,
      "learning_rate": 5.934065934065934e-07,
      "logits/chosen": -2.246875047683716,
      "logits/rejected": -1.774999976158142,
      "logps/chosen": -33.54999923706055,
      "logps/rejected": -55.587501525878906,
      "loss": 0.6566,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.0071121216751635075,
      "rewards/margins": 0.07373046875,
      "rewards/rejected": -0.06658630073070526,
      "step": 630
    },
    {
      "epoch": 0.4137039431157078,
      "grad_norm": 5.164629936218262,
      "learning_rate": 5.869424692954105e-07,
      "logits/chosen": -2.159374952316284,
      "logits/rejected": -2.106250047683716,
      "logps/chosen": -34.962501525878906,
      "logps/rejected": -41.962501525878906,
      "loss": 0.6621,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.00933227501809597,
      "rewards/margins": 0.06761779636144638,
      "rewards/rejected": -0.05841064453125,
      "step": 640
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 6.9007649421691895,
      "learning_rate": 5.804783451842275e-07,
      "logits/chosen": -2.1351561546325684,
      "logits/rejected": -1.7609374523162842,
      "logps/chosen": -33.73749923706055,
      "logps/rejected": -55.92499923706055,
      "loss": 0.6465,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.010708617977797985,
      "rewards/margins": 0.09979248046875,
      "rewards/rejected": -0.08903808891773224,
      "step": 650
    },
    {
      "epoch": 0.4266321913380737,
      "grad_norm": 5.711326599121094,
      "learning_rate": 5.740142210730446e-07,
      "logits/chosen": -2.233593702316284,
      "logits/rejected": -2.23046875,
      "logps/chosen": -31.774999618530273,
      "logps/rejected": -36.42499923706055,
      "loss": 0.6574,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.01540527306497097,
      "rewards/margins": 0.076171875,
      "rewards/rejected": -0.06080322340130806,
      "step": 660
    },
    {
      "epoch": 0.43309631544925664,
      "grad_norm": 6.149769306182861,
      "learning_rate": 5.675500969618616e-07,
      "logits/chosen": -1.94140625,
      "logits/rejected": -1.908593773841858,
      "logps/chosen": -38.73749923706055,
      "logps/rejected": -49.962501525878906,
      "loss": 0.6559,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.0010879517067223787,
      "rewards/margins": 0.083526611328125,
      "rewards/rejected": -0.08238525688648224,
      "step": 670
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 6.878147125244141,
      "learning_rate": 5.610859728506787e-07,
      "logits/chosen": -2.24609375,
      "logits/rejected": -1.9734375476837158,
      "logps/chosen": -30.887500762939453,
      "logps/rejected": -45.599998474121094,
      "loss": 0.6508,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.0025054931174963713,
      "rewards/margins": 0.09371338039636612,
      "rewards/rejected": -0.09129638969898224,
      "step": 680
    },
    {
      "epoch": 0.4460245636716225,
      "grad_norm": 6.543976306915283,
      "learning_rate": 5.546218487394958e-07,
      "logits/chosen": -2.387500047683716,
      "logits/rejected": -2.2281250953674316,
      "logps/chosen": -29.149999618530273,
      "logps/rejected": -36.650001525878906,
      "loss": 0.6512,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.01100997906178236,
      "rewards/margins": 0.093994140625,
      "rewards/rejected": -0.08299560844898224,
      "step": 690
    },
    {
      "epoch": 0.45248868778280543,
      "grad_norm": 8.325157165527344,
      "learning_rate": 5.481577246283129e-07,
      "logits/chosen": -2.077343702316284,
      "logits/rejected": -1.9835937023162842,
      "logps/chosen": -35.162498474121094,
      "logps/rejected": -45.375,
      "loss": 0.6672,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": -0.009403991512954235,
      "rewards/margins": 0.059539794921875,
      "rewards/rejected": -0.06892700493335724,
      "step": 700
    },
    {
      "epoch": 0.45895281189398834,
      "grad_norm": 7.477227687835693,
      "learning_rate": 5.416936005171298e-07,
      "logits/chosen": -2.2281250953674316,
      "logits/rejected": -1.904687523841858,
      "logps/chosen": -33.20000076293945,
      "logps/rejected": -47.099998474121094,
      "loss": 0.6488,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.0029663085006177425,
      "rewards/margins": 0.09665527194738388,
      "rewards/rejected": -0.09373779594898224,
      "step": 710
    },
    {
      "epoch": 0.4654169360051713,
      "grad_norm": 9.157503128051758,
      "learning_rate": 5.35229476405947e-07,
      "logits/chosen": -2.225781202316284,
      "logits/rejected": -1.994531273841858,
      "logps/chosen": -32.162498474121094,
      "logps/rejected": -44.11249923706055,
      "loss": 0.6488,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.012927246280014515,
      "rewards/margins": 0.0994873046875,
      "rewards/rejected": -0.08662720024585724,
      "step": 720
    },
    {
      "epoch": 0.4718810601163542,
      "grad_norm": 6.541794776916504,
      "learning_rate": 5.287653522947641e-07,
      "logits/chosen": -2.137500047683716,
      "logits/rejected": -2.112499952316284,
      "logps/chosen": -34.98749923706055,
      "logps/rejected": -44.88750076293945,
      "loss": 0.652,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.0025970458518713713,
      "rewards/margins": 0.09162597358226776,
      "rewards/rejected": -0.09426269680261612,
      "step": 730
    },
    {
      "epoch": 0.4783451842275372,
      "grad_norm": 7.045079708099365,
      "learning_rate": 5.223012281835811e-07,
      "logits/chosen": -2.211718797683716,
      "logits/rejected": -2.049999952316284,
      "logps/chosen": -33.287498474121094,
      "logps/rejected": -42.6875,
      "loss": 0.6492,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.02159881591796875,
      "rewards/margins": 0.09956054389476776,
      "rewards/rejected": -0.0779876708984375,
      "step": 740
    },
    {
      "epoch": 0.4848093083387201,
      "grad_norm": 6.84136962890625,
      "learning_rate": 5.158371040723983e-07,
      "logits/chosen": -2.25,
      "logits/rejected": -2.008593797683716,
      "logps/chosen": -32.900001525878906,
      "logps/rejected": -44.13750076293945,
      "loss": 0.648,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.009143066592514515,
      "rewards/margins": 0.10205078125,
      "rewards/rejected": -0.09302673488855362,
      "step": 750
    },
    {
      "epoch": 0.491273432449903,
      "grad_norm": 5.395453929901123,
      "learning_rate": 5.093729799612152e-07,
      "logits/chosen": -2.0093750953674316,
      "logits/rejected": -1.975000023841858,
      "logps/chosen": -39.775001525878906,
      "logps/rejected": -49.837501525878906,
      "loss": 0.666,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.008969116024672985,
      "rewards/margins": 0.05938110500574112,
      "rewards/rejected": -0.06843261420726776,
      "step": 760
    },
    {
      "epoch": 0.497737556561086,
      "grad_norm": 6.489656448364258,
      "learning_rate": 5.029088558500323e-07,
      "logits/chosen": -2.284374952316284,
      "logits/rejected": -2.137500047683716,
      "logps/chosen": -29.424999237060547,
      "logps/rejected": -40.26250076293945,
      "loss": 0.6418,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.0011688232189044356,
      "rewards/margins": 0.11923827975988388,
      "rewards/rejected": -0.120661161839962,
      "step": 770
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 6.831184387207031,
      "learning_rate": 4.964447317388493e-07,
      "logits/chosen": -2.0250000953674316,
      "logits/rejected": -1.974218726158142,
      "logps/chosen": -37.36249923706055,
      "logps/rejected": -47.82500076293945,
      "loss": 0.6539,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.006091308780014515,
      "rewards/margins": 0.09051208198070526,
      "rewards/rejected": -0.0965423583984375,
      "step": 780
    },
    {
      "epoch": 0.5106658047834518,
      "grad_norm": 5.434061050415039,
      "learning_rate": 4.899806076276664e-07,
      "logits/chosen": -2.3218750953674316,
      "logits/rejected": -1.903906226158142,
      "logps/chosen": -30.8125,
      "logps/rejected": -47.537498474121094,
      "loss": 0.6344,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.0058044432662427425,
      "rewards/margins": 0.13640746474266052,
      "rewards/rejected": -0.13059082627296448,
      "step": 790
    },
    {
      "epoch": 0.5171299288946348,
      "grad_norm": 6.1551408767700195,
      "learning_rate": 4.835164835164835e-07,
      "logits/chosen": -2.1656250953674316,
      "logits/rejected": -1.9874999523162842,
      "logps/chosen": -34.45000076293945,
      "logps/rejected": -46.287498474121094,
      "loss": 0.65,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.00888214074075222,
      "rewards/margins": 0.09798584133386612,
      "rewards/rejected": -0.10687255859375,
      "step": 800
    },
    {
      "epoch": 0.5235940530058177,
      "grad_norm": 6.931482315063477,
      "learning_rate": 4.770523594053005e-07,
      "logits/chosen": -2.0921874046325684,
      "logits/rejected": -1.974218726158142,
      "logps/chosen": -36.11249923706055,
      "logps/rejected": -45.86249923706055,
      "loss": 0.6461,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.005218505859375,
      "rewards/margins": 0.10900268703699112,
      "rewards/rejected": -0.10369873046875,
      "step": 810
    },
    {
      "epoch": 0.5300581771170007,
      "grad_norm": 5.439358234405518,
      "learning_rate": 4.705882352941176e-07,
      "logits/chosen": -2.0601563453674316,
      "logits/rejected": -1.8984375,
      "logps/chosen": -38.349998474121094,
      "logps/rejected": -50.849998474121094,
      "loss": 0.6516,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.0036155700217932463,
      "rewards/margins": 0.09121093899011612,
      "rewards/rejected": -0.087646484375,
      "step": 820
    },
    {
      "epoch": 0.5365223012281836,
      "grad_norm": 6.197802543640137,
      "learning_rate": 4.641241111829347e-07,
      "logits/chosen": -2.360156297683716,
      "logits/rejected": -1.974218726158142,
      "logps/chosen": -29.399999618530273,
      "logps/rejected": -46.0,
      "loss": 0.6492,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.00822372455149889,
      "rewards/margins": 0.09587402641773224,
      "rewards/rejected": -0.08759765326976776,
      "step": 830
    },
    {
      "epoch": 0.5429864253393665,
      "grad_norm": 6.332528114318848,
      "learning_rate": 4.5765998707175173e-07,
      "logits/chosen": -2.2835936546325684,
      "logits/rejected": -2.094531297683716,
      "logps/chosen": -30.799999237060547,
      "logps/rejected": -42.17499923706055,
      "loss": 0.657,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.00872802734375,
      "rewards/margins": 0.08310546725988388,
      "rewards/rejected": -0.07445068657398224,
      "step": 840
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 6.556399345397949,
      "learning_rate": 4.511958629605688e-07,
      "logits/chosen": -2.3609375953674316,
      "logits/rejected": -2.1265625953674316,
      "logps/chosen": -29.375,
      "logps/rejected": -43.3125,
      "loss": 0.6465,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.003979492001235485,
      "rewards/margins": 0.10551758110523224,
      "rewards/rejected": -0.10952148586511612,
      "step": 850
    },
    {
      "epoch": 0.5559146735617324,
      "grad_norm": 6.173341751098633,
      "learning_rate": 4.4473173884938584e-07,
      "logits/chosen": -2.229687452316284,
      "logits/rejected": -2.092968702316284,
      "logps/chosen": -32.76250076293945,
      "logps/rejected": -43.82500076293945,
      "loss": 0.6371,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.0005409240839071572,
      "rewards/margins": 0.1268310546875,
      "rewards/rejected": -0.1260986328125,
      "step": 860
    },
    {
      "epoch": 0.5623787976729153,
      "grad_norm": 4.50367546081543,
      "learning_rate": 4.3826761473820297e-07,
      "logits/chosen": -2.258593797683716,
      "logits/rejected": -2.0625,
      "logps/chosen": -31.4375,
      "logps/rejected": -46.95000076293945,
      "loss": 0.6543,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.016404341906309128,
      "rewards/margins": 0.09392090141773224,
      "rewards/rejected": -0.11020507663488388,
      "step": 870
    },
    {
      "epoch": 0.5688429217840982,
      "grad_norm": 6.905920028686523,
      "learning_rate": 4.3180349062702005e-07,
      "logits/chosen": -1.975000023841858,
      "logits/rejected": -1.84765625,
      "logps/chosen": -39.23749923706055,
      "logps/rejected": -49.400001525878906,
      "loss": 0.6676,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.02604370191693306,
      "rewards/margins": 0.06517334282398224,
      "rewards/rejected": -0.09122314304113388,
      "step": 880
    },
    {
      "epoch": 0.5753070458952811,
      "grad_norm": 7.6789021492004395,
      "learning_rate": 4.253393665158371e-07,
      "logits/chosen": -2.1617188453674316,
      "logits/rejected": -1.9617187976837158,
      "logps/chosen": -32.6875,
      "logps/rejected": -47.849998474121094,
      "loss": 0.6324,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.006512450985610485,
      "rewards/margins": 0.14787597954273224,
      "rewards/rejected": -0.15424804389476776,
      "step": 890
    },
    {
      "epoch": 0.5817711700064642,
      "grad_norm": 5.422806739807129,
      "learning_rate": 4.1887524240465416e-07,
      "logits/chosen": -2.10546875,
      "logits/rejected": -2.0,
      "logps/chosen": -34.525001525878906,
      "logps/rejected": -41.6875,
      "loss": 0.6551,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.00641632080078125,
      "rewards/margins": 0.08454589545726776,
      "rewards/rejected": -0.09105224907398224,
      "step": 900
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 5.118630886077881,
      "learning_rate": 4.124111182934712e-07,
      "logits/chosen": -2.1796875,
      "logits/rejected": -2.196093797683716,
      "logps/chosen": -32.92499923706055,
      "logps/rejected": -38.837501525878906,
      "loss": 0.6449,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.00493621826171875,
      "rewards/margins": 0.10678710788488388,
      "rewards/rejected": -0.10203857719898224,
      "step": 910
    },
    {
      "epoch": 0.59469941822883,
      "grad_norm": 4.897077560424805,
      "learning_rate": 4.0594699418228827e-07,
      "logits/chosen": -2.356250047683716,
      "logits/rejected": -2.08984375,
      "logps/chosen": -29.25,
      "logps/rejected": -38.0,
      "loss": 0.643,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.0014099121326580644,
      "rewards/margins": 0.11846618354320526,
      "rewards/rejected": -0.11980590969324112,
      "step": 920
    },
    {
      "epoch": 0.6011635423400129,
      "grad_norm": 6.72301721572876,
      "learning_rate": 3.9948287007110535e-07,
      "logits/chosen": -2.178906202316284,
      "logits/rejected": -2.0640625953674316,
      "logps/chosen": -33.224998474121094,
      "logps/rejected": -45.88750076293945,
      "loss": 0.6547,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.006317138671875,
      "rewards/margins": 0.08646545559167862,
      "rewards/rejected": -0.09287109225988388,
      "step": 930
    },
    {
      "epoch": 0.6076276664511958,
      "grad_norm": 7.7329888343811035,
      "learning_rate": 3.930187459599224e-07,
      "logits/chosen": -2.0015625953674316,
      "logits/rejected": -1.87109375,
      "logps/chosen": -37.099998474121094,
      "logps/rejected": -46.599998474121094,
      "loss": 0.6363,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.013926696963608265,
      "rewards/margins": 0.1342422515153885,
      "rewards/rejected": -0.14792481064796448,
      "step": 940
    },
    {
      "epoch": 0.6140917905623788,
      "grad_norm": 7.24464750289917,
      "learning_rate": 3.865546218487395e-07,
      "logits/chosen": -2.24609375,
      "logits/rejected": -2.0093750953674316,
      "logps/chosen": -34.849998474121094,
      "logps/rejected": -47.61249923706055,
      "loss": 0.6352,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.00203704833984375,
      "rewards/margins": 0.13595275580883026,
      "rewards/rejected": -0.133982852101326,
      "step": 950
    },
    {
      "epoch": 0.6205559146735617,
      "grad_norm": 6.148697376251221,
      "learning_rate": 3.8009049773755655e-07,
      "logits/chosen": -2.297656297683716,
      "logits/rejected": -1.9835937023162842,
      "logps/chosen": -29.399999618530273,
      "logps/rejected": -46.025001525878906,
      "loss": 0.6402,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.011855697259306908,
      "rewards/margins": 0.12172851711511612,
      "rewards/rejected": -0.10989990085363388,
      "step": 960
    },
    {
      "epoch": 0.6270200387847447,
      "grad_norm": 6.6240715980529785,
      "learning_rate": 3.7362637362637363e-07,
      "logits/chosen": -2.12890625,
      "logits/rejected": -2.0648436546325684,
      "logps/chosen": -32.57500076293945,
      "logps/rejected": -48.07500076293945,
      "loss": 0.6293,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 0.01114578265696764,
      "rewards/margins": 0.14631347358226776,
      "rewards/rejected": -0.13518066704273224,
      "step": 970
    },
    {
      "epoch": 0.6334841628959276,
      "grad_norm": 7.20724630355835,
      "learning_rate": 3.6716224951519066e-07,
      "logits/chosen": -2.1890625953674316,
      "logits/rejected": -1.8984375,
      "logps/chosen": -34.125,
      "logps/rejected": -54.54999923706055,
      "loss": 0.6203,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.00470657367259264,
      "rewards/margins": 0.17182616889476776,
      "rewards/rejected": -0.17666015028953552,
      "step": 980
    },
    {
      "epoch": 0.6399482870071105,
      "grad_norm": 5.279712677001953,
      "learning_rate": 3.6069812540400774e-07,
      "logits/chosen": -2.150390625,
      "logits/rejected": -1.9796874523162842,
      "logps/chosen": -36.537498474121094,
      "logps/rejected": -45.150001525878906,
      "loss": 0.6594,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.01039733923971653,
      "rewards/margins": 0.07333068549633026,
      "rewards/rejected": -0.08380737155675888,
      "step": 990
    },
    {
      "epoch": 0.6464124111182935,
      "grad_norm": 6.097612380981445,
      "learning_rate": 3.542340012928248e-07,
      "logits/chosen": -2.374218702316284,
      "logits/rejected": -2.194531202316284,
      "logps/chosen": -27.712499618530273,
      "logps/rejected": -38.787498474121094,
      "loss": 0.6387,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.01126709021627903,
      "rewards/margins": 0.12513427436351776,
      "rewards/rejected": -0.11369476467370987,
      "step": 1000
    },
    {
      "epoch": 0.6464124111182935,
      "eval_logits/chosen": -2.2388362884521484,
      "eval_logits/rejected": -2.0223474502563477,
      "eval_logps/chosen": -32.45478057861328,
      "eval_logps/rejected": -44.88953399658203,
      "eval_loss": 0.6451347470283508,
      "eval_rewards/accuracies": 0.7355728149414062,
      "eval_rewards/chosen": -0.00194389873649925,
      "eval_rewards/margins": 0.10936161130666733,
      "eval_rewards/rejected": -0.11126767843961716,
      "eval_runtime": 223.0965,
      "eval_samples_per_second": 13.868,
      "eval_steps_per_second": 1.735,
      "step": 1000
    },
    {
      "epoch": 0.6528765352294764,
      "grad_norm": 6.603030681610107,
      "learning_rate": 3.4776987718164185e-07,
      "logits/chosen": -2.117968797683716,
      "logits/rejected": -1.967187523841858,
      "logps/chosen": -36.07500076293945,
      "logps/rejected": -43.3125,
      "loss": 0.6434,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.011035156436264515,
      "rewards/margins": 0.11629638820886612,
      "rewards/rejected": -0.10539551079273224,
      "step": 1010
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 5.5277204513549805,
      "learning_rate": 3.4130575307045893e-07,
      "logits/chosen": -2.125,
      "logits/rejected": -2.0679688453674316,
      "logps/chosen": -34.04999923706055,
      "logps/rejected": -42.962501525878906,
      "loss": 0.6492,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.017289351671934128,
      "rewards/margins": 0.10797119140625,
      "rewards/rejected": -0.12529906630516052,
      "step": 1020
    },
    {
      "epoch": 0.6658047834518422,
      "grad_norm": 7.359653949737549,
      "learning_rate": 3.3484162895927596e-07,
      "logits/chosen": -2.2125000953674316,
      "logits/rejected": -1.94140625,
      "logps/chosen": -33.162498474121094,
      "logps/rejected": -48.775001525878906,
      "loss": 0.643,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 5.493163916980848e-05,
      "rewards/margins": 0.11522217094898224,
      "rewards/rejected": -0.115234375,
      "step": 1030
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 6.227341651916504,
      "learning_rate": 3.283775048480931e-07,
      "logits/chosen": -2.2265625,
      "logits/rejected": -2.1695313453674316,
      "logps/chosen": -31.887500762939453,
      "logps/rejected": -40.775001525878906,
      "loss": 0.6559,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": 0.0040954588912427425,
      "rewards/margins": 0.07955322414636612,
      "rewards/rejected": -0.07546386867761612,
      "step": 1040
    },
    {
      "epoch": 0.6787330316742082,
      "grad_norm": 7.087686538696289,
      "learning_rate": 3.219133807369102e-07,
      "logits/chosen": -2.092968702316284,
      "logits/rejected": -1.93359375,
      "logps/chosen": -35.3125,
      "logps/rejected": -51.974998474121094,
      "loss": 0.6543,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.008679199032485485,
      "rewards/margins": 0.09174804389476776,
      "rewards/rejected": -0.10050048679113388,
      "step": 1050
    },
    {
      "epoch": 0.6851971557853911,
      "grad_norm": 6.2360968589782715,
      "learning_rate": 3.154492566257272e-07,
      "logits/chosen": -2.167187452316284,
      "logits/rejected": -2.0445313453674316,
      "logps/chosen": -33.98749923706055,
      "logps/rejected": -41.349998474121094,
      "loss": 0.618,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": 0.01287231408059597,
      "rewards/margins": 0.17495116591453552,
      "rewards/rejected": -0.16208496689796448,
      "step": 1060
    },
    {
      "epoch": 0.691661279896574,
      "grad_norm": 8.29593276977539,
      "learning_rate": 3.089851325145443e-07,
      "logits/chosen": -2.1812500953674316,
      "logits/rejected": -2.0289063453674316,
      "logps/chosen": -32.38750076293945,
      "logps/rejected": -44.025001525878906,
      "loss": 0.6367,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.0015502929454669356,
      "rewards/margins": 0.13422851264476776,
      "rewards/rejected": -0.1358642578125,
      "step": 1070
    },
    {
      "epoch": 0.6981254040077569,
      "grad_norm": 6.664228916168213,
      "learning_rate": 3.025210084033613e-07,
      "logits/chosen": -2.3460936546325684,
      "logits/rejected": -2.0179686546325684,
      "logps/chosen": -27.1875,
      "logps/rejected": -40.974998474121094,
      "loss": 0.6266,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 0.01611480675637722,
      "rewards/margins": 0.15510253608226776,
      "rewards/rejected": -0.13925781846046448,
      "step": 1080
    },
    {
      "epoch": 0.7045895281189399,
      "grad_norm": 6.836609363555908,
      "learning_rate": 2.960568842921784e-07,
      "logits/chosen": -2.19140625,
      "logits/rejected": -1.9015624523162842,
      "logps/chosen": -32.76250076293945,
      "logps/rejected": -49.17499923706055,
      "loss": 0.6246,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.006195068359375,
      "rewards/margins": 0.16258545219898224,
      "rewards/rejected": -0.15664061903953552,
      "step": 1090
    },
    {
      "epoch": 0.7110536522301228,
      "grad_norm": 7.155961990356445,
      "learning_rate": 2.895927601809955e-07,
      "logits/chosen": -2.19140625,
      "logits/rejected": -1.8390624523162842,
      "logps/chosen": -32.5625,
      "logps/rejected": -52.525001525878906,
      "loss": 0.6367,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.00170135498046875,
      "rewards/margins": 0.12583008408546448,
      "rewards/rejected": -0.12739257514476776,
      "step": 1100
    },
    {
      "epoch": 0.7175177763413058,
      "grad_norm": 5.750825881958008,
      "learning_rate": 2.831286360698125e-07,
      "logits/chosen": -2.0562500953674316,
      "logits/rejected": -1.911718726158142,
      "logps/chosen": -36.462501525878906,
      "logps/rejected": -47.8125,
      "loss": 0.623,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.000988006591796875,
      "rewards/margins": 0.16801758110523224,
      "rewards/rejected": -0.16921386122703552,
      "step": 1110
    },
    {
      "epoch": 0.7239819004524887,
      "grad_norm": 8.863003730773926,
      "learning_rate": 2.766645119586296e-07,
      "logits/chosen": -2.132031202316284,
      "logits/rejected": -1.8273437023162842,
      "logps/chosen": -35.5625,
      "logps/rejected": -51.29999923706055,
      "loss": 0.6469,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.0003807067987509072,
      "rewards/margins": 0.10054931789636612,
      "rewards/rejected": -0.10020752251148224,
      "step": 1120
    },
    {
      "epoch": 0.7304460245636716,
      "grad_norm": 9.2312593460083,
      "learning_rate": 2.7020038784744667e-07,
      "logits/chosen": -2.2085938453674316,
      "logits/rejected": -1.993749976158142,
      "logps/chosen": -32.70000076293945,
      "logps/rejected": -45.07500076293945,
      "loss": 0.6074,
      "rewards/accuracies": 0.862500011920929,
      "rewards/chosen": 0.0004028320254292339,
      "rewards/margins": 0.21123047173023224,
      "rewards/rejected": -0.210693359375,
      "step": 1130
    },
    {
      "epoch": 0.7369101486748546,
      "grad_norm": 6.017390251159668,
      "learning_rate": 2.6373626373626375e-07,
      "logits/chosen": -2.0335936546325684,
      "logits/rejected": -1.7531249523162842,
      "logps/chosen": -38.26250076293945,
      "logps/rejected": -52.73749923706055,
      "loss": 0.6434,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.01358795166015625,
      "rewards/margins": 0.12060546875,
      "rewards/rejected": -0.13437500596046448,
      "step": 1140
    },
    {
      "epoch": 0.7433742727860375,
      "grad_norm": 5.903501987457275,
      "learning_rate": 2.572721396250808e-07,
      "logits/chosen": -2.367968797683716,
      "logits/rejected": -1.9375,
      "logps/chosen": -29.5,
      "logps/rejected": -49.625,
      "loss": 0.623,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 0.01497802697122097,
      "rewards/margins": 0.16400146484375,
      "rewards/rejected": -0.14877930283546448,
      "step": 1150
    },
    {
      "epoch": 0.7498383968972204,
      "grad_norm": 7.522356986999512,
      "learning_rate": 2.5080801551389786e-07,
      "logits/chosen": -2.0999999046325684,
      "logits/rejected": -1.853906273841858,
      "logps/chosen": -36.07500076293945,
      "logps/rejected": -49.73749923706055,
      "loss": 0.6422,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.02725830115377903,
      "rewards/margins": 0.11488647758960724,
      "rewards/rejected": -0.1419677734375,
      "step": 1160
    },
    {
      "epoch": 0.7563025210084033,
      "grad_norm": 7.662413597106934,
      "learning_rate": 2.443438914027149e-07,
      "logits/chosen": -2.268749952316284,
      "logits/rejected": -1.9812500476837158,
      "logps/chosen": -30.200000762939453,
      "logps/rejected": -47.962501525878906,
      "loss": 0.6145,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.01032409630715847,
      "rewards/margins": 0.18391112983226776,
      "rewards/rejected": -0.17351074516773224,
      "step": 1170
    },
    {
      "epoch": 0.7627666451195863,
      "grad_norm": 6.924587726593018,
      "learning_rate": 2.3787976729153197e-07,
      "logits/chosen": -2.07421875,
      "logits/rejected": -1.8781249523162842,
      "logps/chosen": -37.599998474121094,
      "logps/rejected": -50.8125,
      "loss": 0.6211,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.00626449566334486,
      "rewards/margins": 0.17497558891773224,
      "rewards/rejected": -0.18135985732078552,
      "step": 1180
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 7.191421031951904,
      "learning_rate": 2.3141564318034905e-07,
      "logits/chosen": -2.2281250953674316,
      "logits/rejected": -2.0718750953674316,
      "logps/chosen": -30.325000762939453,
      "logps/rejected": -42.04999923706055,
      "loss": 0.6461,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.012211608700454235,
      "rewards/margins": 0.11051025241613388,
      "rewards/rejected": -0.12265624850988388,
      "step": 1190
    },
    {
      "epoch": 0.7756948933419522,
      "grad_norm": 5.949921607971191,
      "learning_rate": 2.2495151906916613e-07,
      "logits/chosen": -2.03515625,
      "logits/rejected": -1.9617187976837158,
      "logps/chosen": -38.25,
      "logps/rejected": -48.92499923706055,
      "loss": 0.6441,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.01292266882956028,
      "rewards/margins": 0.112060546875,
      "rewards/rejected": -0.12486572563648224,
      "step": 1200
    },
    {
      "epoch": 0.7821590174531351,
      "grad_norm": 5.949769020080566,
      "learning_rate": 2.184873949579832e-07,
      "logits/chosen": -2.2640624046325684,
      "logits/rejected": -2.140625,
      "logps/chosen": -30.962499618530273,
      "logps/rejected": -40.625,
      "loss": 0.648,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 0.0062622069381177425,
      "rewards/margins": 0.09993896633386612,
      "rewards/rejected": -0.09376220405101776,
      "step": 1210
    },
    {
      "epoch": 0.788623141564318,
      "grad_norm": 5.67306661605835,
      "learning_rate": 2.1202327084680024e-07,
      "logits/chosen": -2.2367186546325684,
      "logits/rejected": -2.1695313453674316,
      "logps/chosen": -32.79999923706055,
      "logps/rejected": -40.625,
      "loss": 0.6523,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 0.0029235840775072575,
      "rewards/margins": 0.09202881157398224,
      "rewards/rejected": -0.08903808891773224,
      "step": 1220
    },
    {
      "epoch": 0.7950872656755009,
      "grad_norm": 6.161100387573242,
      "learning_rate": 2.055591467356173e-07,
      "logits/chosen": -2.139843702316284,
      "logits/rejected": -2.0179686546325684,
      "logps/chosen": -36.07500076293945,
      "logps/rejected": -44.57500076293945,
      "loss": 0.6195,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 0.00421218853443861,
      "rewards/margins": 0.18037109076976776,
      "rewards/rejected": -0.17604979872703552,
      "step": 1230
    },
    {
      "epoch": 0.8015513897866839,
      "grad_norm": 8.060186386108398,
      "learning_rate": 1.9909502262443435e-07,
      "logits/chosen": -2.082812547683716,
      "logits/rejected": -1.762109398841858,
      "logps/chosen": -35.76250076293945,
      "logps/rejected": -55.0625,
      "loss": 0.6238,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.01748046837747097,
      "rewards/margins": 0.16681213676929474,
      "rewards/rejected": -0.18450622260570526,
      "step": 1240
    },
    {
      "epoch": 0.8080155138978669,
      "grad_norm": 5.548961162567139,
      "learning_rate": 1.9263089851325146e-07,
      "logits/chosen": -2.0718750953674316,
      "logits/rejected": -1.853906273841858,
      "logps/chosen": -36.775001525878906,
      "logps/rejected": -51.025001525878906,
      "loss": 0.6488,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.02834472618997097,
      "rewards/margins": 0.1006927490234375,
      "rewards/rejected": -0.12929686903953552,
      "step": 1250
    },
    {
      "epoch": 0.8144796380090498,
      "grad_norm": 8.184466361999512,
      "learning_rate": 1.8616677440206852e-07,
      "logits/chosen": -2.2421875,
      "logits/rejected": -1.94140625,
      "logps/chosen": -34.125,
      "logps/rejected": -49.349998474121094,
      "loss": 0.6328,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.0075225830078125,
      "rewards/margins": 0.14026489853858948,
      "rewards/rejected": -0.14777831733226776,
      "step": 1260
    },
    {
      "epoch": 0.8209437621202327,
      "grad_norm": 5.949737071990967,
      "learning_rate": 1.7970265029088557e-07,
      "logits/chosen": -2.26953125,
      "logits/rejected": -2.030468702316284,
      "logps/chosen": -31.274999618530273,
      "logps/rejected": -41.212501525878906,
      "loss": 0.6555,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.009674072265625,
      "rewards/margins": 0.08385773003101349,
      "rewards/rejected": -0.09356689453125,
      "step": 1270
    },
    {
      "epoch": 0.8274078862314156,
      "grad_norm": 5.359708309173584,
      "learning_rate": 1.7323852617970263e-07,
      "logits/chosen": -2.4046874046325684,
      "logits/rejected": -1.876562476158142,
      "logps/chosen": -27.774999618530273,
      "logps/rejected": -50.849998474121094,
      "loss": 0.6371,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.01033935509622097,
      "rewards/margins": 0.12919005751609802,
      "rewards/rejected": -0.1396484375,
      "step": 1280
    },
    {
      "epoch": 0.8338720103425986,
      "grad_norm": 5.995463848114014,
      "learning_rate": 1.667744020685197e-07,
      "logits/chosen": -2.12109375,
      "logits/rejected": -1.94921875,
      "logps/chosen": -35.0625,
      "logps/rejected": -49.32500076293945,
      "loss": 0.6379,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.014133453369140625,
      "rewards/margins": 0.12868651747703552,
      "rewards/rejected": -0.142822265625,
      "step": 1290
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 5.987330913543701,
      "learning_rate": 1.603102779573368e-07,
      "logits/chosen": -2.106250047683716,
      "logits/rejected": -1.677343726158142,
      "logps/chosen": -34.82500076293945,
      "logps/rejected": -57.224998474121094,
      "loss": 0.6105,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.000461578369140625,
      "rewards/margins": 0.19721679389476776,
      "rewards/rejected": -0.19736328721046448,
      "step": 1300
    },
    {
      "epoch": 0.8468002585649644,
      "grad_norm": 6.187958717346191,
      "learning_rate": 1.5384615384615385e-07,
      "logits/chosen": -2.028125047683716,
      "logits/rejected": -1.9031250476837158,
      "logps/chosen": -39.29999923706055,
      "logps/rejected": -51.125,
      "loss": 0.65,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.0266265869140625,
      "rewards/margins": 0.11011657863855362,
      "rewards/rejected": -0.13673095405101776,
      "step": 1310
    },
    {
      "epoch": 0.8532643826761473,
      "grad_norm": 7.16326904296875,
      "learning_rate": 1.473820297349709e-07,
      "logits/chosen": -2.0765624046325684,
      "logits/rejected": -1.8820312023162842,
      "logps/chosen": -38.375,
      "logps/rejected": -46.474998474121094,
      "loss": 0.6543,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.013995361514389515,
      "rewards/margins": 0.09053955227136612,
      "rewards/rejected": -0.10441894829273224,
      "step": 1320
    },
    {
      "epoch": 0.8597285067873304,
      "grad_norm": 6.134576797485352,
      "learning_rate": 1.4091790562378798e-07,
      "logits/chosen": -2.094531297683716,
      "logits/rejected": -1.9070312976837158,
      "logps/chosen": -39.01250076293945,
      "logps/rejected": -48.67499923706055,
      "loss": 0.6406,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.0069671631790697575,
      "rewards/margins": 0.12123413383960724,
      "rewards/rejected": -0.12824706733226776,
      "step": 1330
    },
    {
      "epoch": 0.8661926308985133,
      "grad_norm": 6.233573913574219,
      "learning_rate": 1.3445378151260504e-07,
      "logits/chosen": -2.3984375,
      "logits/rejected": -2.202343702316284,
      "logps/chosen": -27.524999618530273,
      "logps/rejected": -36.98749923706055,
      "loss": 0.6465,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.003215789794921875,
      "rewards/margins": 0.104400634765625,
      "rewards/rejected": -0.10121460258960724,
      "step": 1340
    },
    {
      "epoch": 0.8726567550096962,
      "grad_norm": 6.2733845710754395,
      "learning_rate": 1.279896574014221e-07,
      "logits/chosen": -2.242968797683716,
      "logits/rejected": -1.982812523841858,
      "logps/chosen": -30.537500381469727,
      "logps/rejected": -45.900001525878906,
      "loss": 0.6402,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.00023193359083961695,
      "rewards/margins": 0.12075195461511612,
      "rewards/rejected": -0.12027587741613388,
      "step": 1350
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 8.090156555175781,
      "learning_rate": 1.2152553329023917e-07,
      "logits/chosen": -2.1390624046325684,
      "logits/rejected": -1.87890625,
      "logps/chosen": -34.32500076293945,
      "logps/rejected": -49.5,
      "loss": 0.634,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.01145324669778347,
      "rewards/margins": 0.14055785536766052,
      "rewards/rejected": -0.15200194716453552,
      "step": 1360
    },
    {
      "epoch": 0.885585003232062,
      "grad_norm": 7.740165710449219,
      "learning_rate": 1.1506140917905623e-07,
      "logits/chosen": -2.03515625,
      "logits/rejected": -2.01171875,
      "logps/chosen": -37.712501525878906,
      "logps/rejected": -45.5,
      "loss": 0.6484,
      "rewards/accuracies": 0.8374999761581421,
      "rewards/chosen": -0.02341003343462944,
      "rewards/margins": 0.09877929836511612,
      "rewards/rejected": -0.1221923828125,
      "step": 1370
    },
    {
      "epoch": 0.892049127343245,
      "grad_norm": 7.002925395965576,
      "learning_rate": 1.085972850678733e-07,
      "logits/chosen": -2.0999999046325684,
      "logits/rejected": -1.799218773841858,
      "logps/chosen": -35.775001525878906,
      "logps/rejected": -54.45000076293945,
      "loss": 0.6383,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.023548126220703125,
      "rewards/margins": 0.142822265625,
      "rewards/rejected": -0.1663818359375,
      "step": 1380
    },
    {
      "epoch": 0.898513251454428,
      "grad_norm": 7.787292957305908,
      "learning_rate": 1.0213316095669037e-07,
      "logits/chosen": -2.153125047683716,
      "logits/rejected": -2.010937452316284,
      "logps/chosen": -35.82500076293945,
      "logps/rejected": -43.0,
      "loss": 0.652,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.017242431640625,
      "rewards/margins": 0.09310302883386612,
      "rewards/rejected": -0.11036376655101776,
      "step": 1390
    },
    {
      "epoch": 0.9049773755656109,
      "grad_norm": 7.6334228515625,
      "learning_rate": 9.566903684550742e-08,
      "logits/chosen": -2.280468702316284,
      "logits/rejected": -2.05078125,
      "logps/chosen": -29.9375,
      "logps/rejected": -43.23749923706055,
      "loss": 0.641,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.0021903992164880037,
      "rewards/margins": 0.12362060695886612,
      "rewards/rejected": -0.12136230617761612,
      "step": 1400
    },
    {
      "epoch": 0.9114414996767938,
      "grad_norm": 6.017730712890625,
      "learning_rate": 8.92049127343245e-08,
      "logits/chosen": -2.3382811546325684,
      "logits/rejected": -1.939843773841858,
      "logps/chosen": -28.674999237060547,
      "logps/rejected": -47.587501525878906,
      "loss": 0.6441,
      "rewards/accuracies": 0.8500000238418579,
      "rewards/chosen": -0.0043960572220385075,
      "rewards/margins": 0.10791015625,
      "rewards/rejected": -0.1123046875,
      "step": 1410
    },
    {
      "epoch": 0.9179056237879767,
      "grad_norm": 5.560482501983643,
      "learning_rate": 8.274078862314156e-08,
      "logits/chosen": -2.167187452316284,
      "logits/rejected": -1.9484374523162842,
      "logps/chosen": -32.587501525878906,
      "logps/rejected": -44.6875,
      "loss": 0.6434,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.018343353644013405,
      "rewards/margins": 0.12397460639476776,
      "rewards/rejected": -0.14223632216453552,
      "step": 1420
    },
    {
      "epoch": 0.9243697478991597,
      "grad_norm": 6.859267234802246,
      "learning_rate": 7.627666451195864e-08,
      "logits/chosen": -2.0445313453674316,
      "logits/rejected": -1.8515625,
      "logps/chosen": -38.099998474121094,
      "logps/rejected": -50.462501525878906,
      "loss": 0.6477,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.02138366736471653,
      "rewards/margins": 0.109527587890625,
      "rewards/rejected": -0.13093261420726776,
      "step": 1430
    },
    {
      "epoch": 0.9308338720103426,
      "grad_norm": 6.36416482925415,
      "learning_rate": 6.98125404007757e-08,
      "logits/chosen": -2.071093797683716,
      "logits/rejected": -2.1156249046325684,
      "logps/chosen": -33.150001525878906,
      "logps/rejected": -42.212501525878906,
      "loss": 0.6445,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.0130767822265625,
      "rewards/margins": 0.11459960788488388,
      "rewards/rejected": -0.12770995497703552,
      "step": 1440
    },
    {
      "epoch": 0.9372979961215255,
      "grad_norm": 6.67123556137085,
      "learning_rate": 6.334841628959275e-08,
      "logits/chosen": -2.356250047683716,
      "logits/rejected": -2.1078124046325684,
      "logps/chosen": -28.112499237060547,
      "logps/rejected": -42.962501525878906,
      "loss": 0.632,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.005261230282485485,
      "rewards/margins": 0.14548340439796448,
      "rewards/rejected": -0.1402488648891449,
      "step": 1450
    },
    {
      "epoch": 0.9437621202327084,
      "grad_norm": 7.510457992553711,
      "learning_rate": 5.6884292178409824e-08,
      "logits/chosen": -2.174999952316284,
      "logits/rejected": -2.077343702316284,
      "logps/chosen": -33.76250076293945,
      "logps/rejected": -42.79999923706055,
      "loss": 0.6555,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.0086669921875,
      "rewards/margins": 0.08434448391199112,
      "rewards/rejected": -0.09311523288488388,
      "step": 1460
    },
    {
      "epoch": 0.9502262443438914,
      "grad_norm": 5.84100341796875,
      "learning_rate": 5.042016806722689e-08,
      "logits/chosen": -2.265625,
      "logits/rejected": -2.1976561546325684,
      "logps/chosen": -33.412498474121094,
      "logps/rejected": -35.837501525878906,
      "loss": 0.6523,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.0003379821719136089,
      "rewards/margins": 0.09195556491613388,
      "rewards/rejected": -0.09157714992761612,
      "step": 1470
    },
    {
      "epoch": 0.9566903684550744,
      "grad_norm": 6.280606746673584,
      "learning_rate": 4.395604395604396e-08,
      "logits/chosen": -2.057812452316284,
      "logits/rejected": -2.0687499046325684,
      "logps/chosen": -37.587501525878906,
      "logps/rejected": -45.11249923706055,
      "loss": 0.6418,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 0.0013267516624182463,
      "rewards/margins": 0.1134033203125,
      "rewards/rejected": -0.11204834282398224,
      "step": 1480
    },
    {
      "epoch": 0.9631544925662573,
      "grad_norm": 6.138069152832031,
      "learning_rate": 3.7491919844861016e-08,
      "logits/chosen": -2.3023438453674316,
      "logits/rejected": -1.865625023841858,
      "logps/chosen": -30.712499618530273,
      "logps/rejected": -48.20000076293945,
      "loss": 0.6441,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.012371825985610485,
      "rewards/margins": 0.10634765774011612,
      "rewards/rejected": -0.11879882961511612,
      "step": 1490
    },
    {
      "epoch": 0.9696186166774402,
      "grad_norm": 7.02135705947876,
      "learning_rate": 3.1027795733678084e-08,
      "logits/chosen": -2.0101561546325684,
      "logits/rejected": -2.004687547683716,
      "logps/chosen": -34.75,
      "logps/rejected": -48.275001525878906,
      "loss": 0.643,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.008056640625,
      "rewards/margins": 0.11821899563074112,
      "rewards/rejected": -0.12647704780101776,
      "step": 1500
    },
    {
      "epoch": 0.9696186166774402,
      "eval_logits/chosen": -2.2356467247009277,
      "eval_logits/rejected": -2.0164527893066406,
      "eval_logps/chosen": -32.4964485168457,
      "eval_logps/rejected": -45.10820388793945,
      "eval_loss": 0.6378322839736938,
      "eval_rewards/accuracies": 0.7512919902801514,
      "eval_rewards/chosen": -0.005878813099116087,
      "eval_rewards/margins": 0.128102645277977,
      "eval_rewards/rejected": -0.13398845493793488,
      "eval_runtime": 208.5178,
      "eval_samples_per_second": 14.838,
      "eval_steps_per_second": 1.856,
      "step": 1500
    },
    {
      "epoch": 0.9760827407886231,
      "grad_norm": 6.11538553237915,
      "learning_rate": 2.456367162249515e-08,
      "logits/chosen": -2.163281202316284,
      "logits/rejected": -1.8875000476837158,
      "logps/chosen": -33.837501525878906,
      "logps/rejected": -44.48749923706055,
      "loss": 0.6379,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.0071197510696947575,
      "rewards/margins": 0.12338867038488388,
      "rewards/rejected": -0.13054199516773224,
      "step": 1510
    },
    {
      "epoch": 0.982546864899806,
      "grad_norm": 5.675445079803467,
      "learning_rate": 1.8099547511312217e-08,
      "logits/chosen": -2.3460936546325684,
      "logits/rejected": -2.0101561546325684,
      "logps/chosen": -29.100000381469727,
      "logps/rejected": -44.1875,
      "loss": 0.6227,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.0009937286376953125,
      "rewards/margins": 0.168701171875,
      "rewards/rejected": -0.16994628310203552,
      "step": 1520
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 5.527551651000977,
      "learning_rate": 1.1635423400129282e-08,
      "logits/chosen": -2.0953125953674316,
      "logits/rejected": -1.850000023841858,
      "logps/chosen": -35.224998474121094,
      "logps/rejected": -55.04999923706055,
      "loss": 0.6371,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.01041259802877903,
      "rewards/margins": 0.12872314453125,
      "rewards/rejected": -0.13935546576976776,
      "step": 1530
    },
    {
      "epoch": 0.995475113122172,
      "grad_norm": 7.402287006378174,
      "learning_rate": 5.171299288946347e-09,
      "logits/chosen": -2.2484374046325684,
      "logits/rejected": -1.9914062023162842,
      "logps/chosen": -30.299999237060547,
      "logps/rejected": -42.224998474121094,
      "loss": 0.634,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 0.01038436871021986,
      "rewards/margins": 0.1387939453125,
      "rewards/rejected": -0.12841796875,
      "step": 1540
    }
  ],
  "logging_steps": 10,
  "max_steps": 1547,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
