{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 6250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00016,
      "grad_norm": 77.7119369506836,
      "learning_rate": 0.0001,
      "loss": 10.5881,
      "step": 1
    },
    {
      "epoch": 0.00032,
      "grad_norm": 15.86559772491455,
      "learning_rate": 0.0001,
      "loss": 6.8609,
      "step": 2
    },
    {
      "epoch": 0.00048,
      "grad_norm": 9.723411560058594,
      "learning_rate": 0.0001,
      "loss": 5.4352,
      "step": 3
    },
    {
      "epoch": 0.00064,
      "grad_norm": 100.80152130126953,
      "learning_rate": 0.0001,
      "loss": 8.2291,
      "step": 4
    },
    {
      "epoch": 0.0008,
      "grad_norm": 10.081633567810059,
      "learning_rate": 0.0001,
      "loss": 5.0525,
      "step": 5
    },
    {
      "epoch": 0.00096,
      "grad_norm": 11.842557907104492,
      "learning_rate": 0.0001,
      "loss": 4.7868,
      "step": 6
    },
    {
      "epoch": 0.00112,
      "grad_norm": 3.860222101211548,
      "learning_rate": 0.0001,
      "loss": 4.4586,
      "step": 7
    },
    {
      "epoch": 0.00128,
      "grad_norm": 3.542466640472412,
      "learning_rate": 0.0001,
      "loss": 4.3474,
      "step": 8
    },
    {
      "epoch": 0.00144,
      "grad_norm": 3.531780958175659,
      "learning_rate": 0.0001,
      "loss": 4.2549,
      "step": 9
    },
    {
      "epoch": 0.0016,
      "grad_norm": 3.7748703956604004,
      "learning_rate": 0.0001,
      "loss": 4.295,
      "step": 10
    },
    {
      "epoch": 0.00176,
      "grad_norm": 4.071692943572998,
      "learning_rate": 0.0001,
      "loss": 4.0235,
      "step": 11
    },
    {
      "epoch": 0.00192,
      "grad_norm": 4.653055191040039,
      "learning_rate": 0.0001,
      "loss": 3.7414,
      "step": 12
    },
    {
      "epoch": 0.00208,
      "grad_norm": 10.538986206054688,
      "learning_rate": 0.0001,
      "loss": 3.4663,
      "step": 13
    },
    {
      "epoch": 0.00224,
      "grad_norm": 8.532798767089844,
      "learning_rate": 0.0001,
      "loss": 3.2978,
      "step": 14
    },
    {
      "epoch": 0.0024,
      "grad_norm": 10.74067497253418,
      "learning_rate": 0.0001,
      "loss": 3.198,
      "step": 15
    },
    {
      "epoch": 0.00256,
      "grad_norm": 5.774070739746094,
      "learning_rate": 0.0001,
      "loss": 3.0728,
      "step": 16
    },
    {
      "epoch": 0.00272,
      "grad_norm": 12.785757064819336,
      "learning_rate": 0.0001,
      "loss": 2.9902,
      "step": 17
    },
    {
      "epoch": 0.00288,
      "grad_norm": 9.58934497833252,
      "learning_rate": 0.0001,
      "loss": 2.7384,
      "step": 18
    },
    {
      "epoch": 0.00304,
      "grad_norm": 2.961435556411743,
      "learning_rate": 0.0001,
      "loss": 2.6972,
      "step": 19
    },
    {
      "epoch": 0.0032,
      "grad_norm": 8.383098602294922,
      "learning_rate": 0.0001,
      "loss": 2.6113,
      "step": 20
    },
    {
      "epoch": 0.00336,
      "grad_norm": 5.6838297843933105,
      "learning_rate": 0.0001,
      "loss": 2.5338,
      "step": 21
    },
    {
      "epoch": 0.00352,
      "grad_norm": 2.959404706954956,
      "learning_rate": 0.0001,
      "loss": 2.3756,
      "step": 22
    },
    {
      "epoch": 0.00368,
      "grad_norm": 2.901047706604004,
      "learning_rate": 0.0001,
      "loss": 2.3105,
      "step": 23
    },
    {
      "epoch": 0.00384,
      "grad_norm": 3.3924520015716553,
      "learning_rate": 0.0001,
      "loss": 2.2806,
      "step": 24
    },
    {
      "epoch": 0.004,
      "grad_norm": 2.2880380153656006,
      "learning_rate": 0.0001,
      "loss": 2.1177,
      "step": 25
    },
    {
      "epoch": 0.00416,
      "grad_norm": 2.8509774208068848,
      "learning_rate": 0.0001,
      "loss": 2.0728,
      "step": 26
    },
    {
      "epoch": 0.00432,
      "grad_norm": 2.7400662899017334,
      "learning_rate": 0.0001,
      "loss": 1.8753,
      "step": 27
    },
    {
      "epoch": 0.00448,
      "grad_norm": 2.0912115573883057,
      "learning_rate": 0.0001,
      "loss": 1.827,
      "step": 28
    },
    {
      "epoch": 0.00464,
      "grad_norm": 2.0836615562438965,
      "learning_rate": 0.0001,
      "loss": 1.8239,
      "step": 29
    },
    {
      "epoch": 0.0048,
      "grad_norm": 1.6743824481964111,
      "learning_rate": 0.0001,
      "loss": 1.7586,
      "step": 30
    },
    {
      "epoch": 0.00496,
      "grad_norm": 1.358473539352417,
      "learning_rate": 0.0001,
      "loss": 1.5803,
      "step": 31
    },
    {
      "epoch": 0.00512,
      "grad_norm": 1.4722394943237305,
      "learning_rate": 0.0001,
      "loss": 1.6553,
      "step": 32
    },
    {
      "epoch": 0.00528,
      "grad_norm": 1.0680358409881592,
      "learning_rate": 0.0001,
      "loss": 1.6019,
      "step": 33
    },
    {
      "epoch": 0.00544,
      "grad_norm": 1.357996940612793,
      "learning_rate": 0.0001,
      "loss": 1.4502,
      "step": 34
    },
    {
      "epoch": 0.0056,
      "grad_norm": 0.9529079794883728,
      "learning_rate": 0.0001,
      "loss": 1.3946,
      "step": 35
    },
    {
      "epoch": 0.00576,
      "grad_norm": 1.1684993505477905,
      "learning_rate": 0.0001,
      "loss": 1.4712,
      "step": 36
    },
    {
      "epoch": 0.00592,
      "grad_norm": 1.0638447999954224,
      "learning_rate": 0.0001,
      "loss": 1.4008,
      "step": 37
    },
    {
      "epoch": 0.00608,
      "grad_norm": 0.7582200169563293,
      "learning_rate": 0.0001,
      "loss": 1.2997,
      "step": 38
    },
    {
      "epoch": 0.00624,
      "grad_norm": 0.8854861855506897,
      "learning_rate": 0.0001,
      "loss": 1.3443,
      "step": 39
    },
    {
      "epoch": 0.0064,
      "grad_norm": 0.7972013354301453,
      "learning_rate": 0.0001,
      "loss": 1.3093,
      "step": 40
    },
    {
      "epoch": 0.00656,
      "grad_norm": 0.7303720116615295,
      "learning_rate": 0.0001,
      "loss": 1.2234,
      "step": 41
    },
    {
      "epoch": 0.00672,
      "grad_norm": 1.2769593000411987,
      "learning_rate": 0.0001,
      "loss": 1.2003,
      "step": 42
    },
    {
      "epoch": 0.00688,
      "grad_norm": 2.0403404235839844,
      "learning_rate": 0.0001,
      "loss": 1.1858,
      "step": 43
    },
    {
      "epoch": 0.00704,
      "grad_norm": 0.6624970436096191,
      "learning_rate": 0.0001,
      "loss": 1.126,
      "step": 44
    },
    {
      "epoch": 0.0072,
      "grad_norm": 1.6160118579864502,
      "learning_rate": 0.0001,
      "loss": 1.0596,
      "step": 45
    },
    {
      "epoch": 0.00736,
      "grad_norm": 1.3531368970870972,
      "learning_rate": 0.0001,
      "loss": 1.0658,
      "step": 46
    },
    {
      "epoch": 0.00752,
      "grad_norm": 0.9369664192199707,
      "learning_rate": 0.0001,
      "loss": 1.0678,
      "step": 47
    },
    {
      "epoch": 0.00768,
      "grad_norm": 1.981000542640686,
      "learning_rate": 0.0001,
      "loss": 1.0655,
      "step": 48
    },
    {
      "epoch": 0.00784,
      "grad_norm": 1.3617008924484253,
      "learning_rate": 0.0001,
      "loss": 1.0614,
      "step": 49
    },
    {
      "epoch": 0.008,
      "grad_norm": 1.27870512008667,
      "learning_rate": 0.0001,
      "loss": 0.9828,
      "step": 50
    },
    {
      "epoch": 0.00816,
      "grad_norm": 1.2370675802230835,
      "learning_rate": 0.0001,
      "loss": 0.9785,
      "step": 51
    },
    {
      "epoch": 0.00832,
      "grad_norm": 1.4587010145187378,
      "learning_rate": 0.0001,
      "loss": 0.9059,
      "step": 52
    },
    {
      "epoch": 0.00848,
      "grad_norm": 1.5812320709228516,
      "learning_rate": 0.0001,
      "loss": 0.9644,
      "step": 53
    },
    {
      "epoch": 0.00864,
      "grad_norm": 0.6766594052314758,
      "learning_rate": 0.0001,
      "loss": 0.9155,
      "step": 54
    },
    {
      "epoch": 0.0088,
      "grad_norm": 1.7678676843643188,
      "learning_rate": 0.0001,
      "loss": 0.893,
      "step": 55
    },
    {
      "epoch": 0.00896,
      "grad_norm": 0.41813820600509644,
      "learning_rate": 0.0001,
      "loss": 0.8552,
      "step": 56
    },
    {
      "epoch": 0.00912,
      "grad_norm": 1.6992897987365723,
      "learning_rate": 0.0001,
      "loss": 0.8895,
      "step": 57
    },
    {
      "epoch": 0.00928,
      "grad_norm": 1.0383503437042236,
      "learning_rate": 0.0001,
      "loss": 0.8954,
      "step": 58
    },
    {
      "epoch": 0.00944,
      "grad_norm": 0.6433092951774597,
      "learning_rate": 0.0001,
      "loss": 0.8418,
      "step": 59
    },
    {
      "epoch": 0.0096,
      "grad_norm": 1.8909327983856201,
      "learning_rate": 0.0001,
      "loss": 0.879,
      "step": 60
    },
    {
      "epoch": 0.00976,
      "grad_norm": 1.5047078132629395,
      "learning_rate": 0.0001,
      "loss": 0.8275,
      "step": 61
    },
    {
      "epoch": 0.00992,
      "grad_norm": 1.5957250595092773,
      "learning_rate": 0.0001,
      "loss": 0.8338,
      "step": 62
    },
    {
      "epoch": 0.01008,
      "grad_norm": 1.385420560836792,
      "learning_rate": 0.0001,
      "loss": 0.8452,
      "step": 63
    },
    {
      "epoch": 0.01024,
      "grad_norm": 1.4886198043823242,
      "learning_rate": 0.0001,
      "loss": 0.8035,
      "step": 64
    },
    {
      "epoch": 0.0104,
      "grad_norm": 1.568930745124817,
      "learning_rate": 0.0001,
      "loss": 0.7795,
      "step": 65
    },
    {
      "epoch": 0.01056,
      "grad_norm": 1.506390929222107,
      "learning_rate": 0.0001,
      "loss": 0.7932,
      "step": 66
    },
    {
      "epoch": 0.01072,
      "grad_norm": 2.009774923324585,
      "learning_rate": 0.0001,
      "loss": 0.7885,
      "step": 67
    },
    {
      "epoch": 0.01088,
      "grad_norm": 0.9761955738067627,
      "learning_rate": 0.0001,
      "loss": 0.7969,
      "step": 68
    },
    {
      "epoch": 0.01104,
      "grad_norm": 2.586148738861084,
      "learning_rate": 0.0001,
      "loss": 0.7803,
      "step": 69
    },
    {
      "epoch": 0.0112,
      "grad_norm": 1.2170474529266357,
      "learning_rate": 0.0001,
      "loss": 0.8018,
      "step": 70
    },
    {
      "epoch": 0.01136,
      "grad_norm": 3.1646761894226074,
      "learning_rate": 0.0001,
      "loss": 0.8205,
      "step": 71
    },
    {
      "epoch": 0.01152,
      "grad_norm": 2.6828222274780273,
      "learning_rate": 0.0001,
      "loss": 0.7882,
      "step": 72
    },
    {
      "epoch": 0.01168,
      "grad_norm": 1.6073148250579834,
      "learning_rate": 0.0001,
      "loss": 0.7753,
      "step": 73
    },
    {
      "epoch": 0.01184,
      "grad_norm": 2.013918161392212,
      "learning_rate": 0.0001,
      "loss": 0.7484,
      "step": 74
    },
    {
      "epoch": 0.012,
      "grad_norm": 2.136077642440796,
      "learning_rate": 0.0001,
      "loss": 0.7694,
      "step": 75
    },
    {
      "epoch": 0.01216,
      "grad_norm": 1.3678350448608398,
      "learning_rate": 0.0001,
      "loss": 0.7621,
      "step": 76
    },
    {
      "epoch": 0.01232,
      "grad_norm": 1.5675938129425049,
      "learning_rate": 0.0001,
      "loss": 0.7665,
      "step": 77
    },
    {
      "epoch": 0.01248,
      "grad_norm": 2.365593194961548,
      "learning_rate": 0.0001,
      "loss": 0.7469,
      "step": 78
    },
    {
      "epoch": 0.01264,
      "grad_norm": 0.9990292191505432,
      "learning_rate": 0.0001,
      "loss": 0.736,
      "step": 79
    },
    {
      "epoch": 0.0128,
      "grad_norm": 2.2617034912109375,
      "learning_rate": 0.0001,
      "loss": 0.7215,
      "step": 80
    },
    {
      "epoch": 0.01296,
      "grad_norm": 2.111541509628296,
      "learning_rate": 0.0001,
      "loss": 0.7305,
      "step": 81
    },
    {
      "epoch": 0.01312,
      "grad_norm": 1.3975051641464233,
      "learning_rate": 0.0001,
      "loss": 0.7407,
      "step": 82
    },
    {
      "epoch": 0.01328,
      "grad_norm": 2.347721815109253,
      "learning_rate": 0.0001,
      "loss": 0.7544,
      "step": 83
    },
    {
      "epoch": 0.01344,
      "grad_norm": 2.2298521995544434,
      "learning_rate": 0.0001,
      "loss": 0.779,
      "step": 84
    },
    {
      "epoch": 0.0136,
      "grad_norm": 2.0123291015625,
      "learning_rate": 0.0001,
      "loss": 0.7187,
      "step": 85
    },
    {
      "epoch": 0.01376,
      "grad_norm": 1.293844223022461,
      "learning_rate": 0.0001,
      "loss": 0.7053,
      "step": 86
    },
    {
      "epoch": 0.01392,
      "grad_norm": 2.00860333442688,
      "learning_rate": 0.0001,
      "loss": 0.7357,
      "step": 87
    },
    {
      "epoch": 0.01408,
      "grad_norm": 1.3900790214538574,
      "learning_rate": 0.0001,
      "loss": 0.681,
      "step": 88
    },
    {
      "epoch": 0.01424,
      "grad_norm": 2.1436264514923096,
      "learning_rate": 0.0001,
      "loss": 0.7306,
      "step": 89
    },
    {
      "epoch": 0.0144,
      "grad_norm": 1.786734700202942,
      "learning_rate": 0.0001,
      "loss": 0.7131,
      "step": 90
    },
    {
      "epoch": 0.01456,
      "grad_norm": 1.2161316871643066,
      "learning_rate": 0.0001,
      "loss": 0.7346,
      "step": 91
    },
    {
      "epoch": 0.01472,
      "grad_norm": 1.2491860389709473,
      "learning_rate": 0.0001,
      "loss": 0.7124,
      "step": 92
    },
    {
      "epoch": 0.01488,
      "grad_norm": 1.5050146579742432,
      "learning_rate": 0.0001,
      "loss": 0.7067,
      "step": 93
    },
    {
      "epoch": 0.01504,
      "grad_norm": 1.2530373334884644,
      "learning_rate": 0.0001,
      "loss": 0.7021,
      "step": 94
    },
    {
      "epoch": 0.0152,
      "grad_norm": 1.116498589515686,
      "learning_rate": 0.0001,
      "loss": 0.6947,
      "step": 95
    },
    {
      "epoch": 0.01536,
      "grad_norm": 0.9364747405052185,
      "learning_rate": 0.0001,
      "loss": 0.6917,
      "step": 96
    },
    {
      "epoch": 0.01552,
      "grad_norm": 1.7284201383590698,
      "learning_rate": 0.0001,
      "loss": 0.7175,
      "step": 97
    },
    {
      "epoch": 0.01568,
      "grad_norm": 1.5419800281524658,
      "learning_rate": 0.0001,
      "loss": 0.6925,
      "step": 98
    },
    {
      "epoch": 0.01584,
      "grad_norm": 0.927583634853363,
      "learning_rate": 0.0001,
      "loss": 0.6773,
      "step": 99
    },
    {
      "epoch": 0.016,
      "grad_norm": 1.190490961074829,
      "learning_rate": 0.0001,
      "loss": 0.6802,
      "step": 100
    },
    {
      "epoch": 0.016,
      "eval_train_accuracy": 0.4992,
      "eval_train_loss": 0.6721848845481873,
      "eval_train_runtime": 4.1442,
      "eval_train_samples_per_second": 1206.51,
      "eval_train_steps_per_second": 15.202,
      "step": 100
    },
    {
      "epoch": 0.016,
      "eval_test_accuracy": 0.5048,
      "eval_test_loss": 0.6693627238273621,
      "eval_test_runtime": 4.6816,
      "eval_test_samples_per_second": 1068.01,
      "eval_test_steps_per_second": 13.457,
      "step": 100
    },
    {
      "epoch": 0.01616,
      "grad_norm": 1.2775893211364746,
      "learning_rate": 0.0001,
      "loss": 0.6629,
      "step": 101
    },
    {
      "epoch": 0.01632,
      "grad_norm": 0.7754040360450745,
      "learning_rate": 0.0001,
      "loss": 0.6892,
      "step": 102
    },
    {
      "epoch": 0.01648,
      "grad_norm": 0.9723941087722778,
      "learning_rate": 0.0001,
      "loss": 0.6788,
      "step": 103
    },
    {
      "epoch": 0.01664,
      "grad_norm": 0.8939617872238159,
      "learning_rate": 0.0001,
      "loss": 0.6787,
      "step": 104
    },
    {
      "epoch": 0.0168,
      "grad_norm": 0.8440852761268616,
      "learning_rate": 0.0001,
      "loss": 0.6665,
      "step": 105
    },
    {
      "epoch": 0.01696,
      "grad_norm": 1.2258847951889038,
      "learning_rate": 0.0001,
      "loss": 0.6665,
      "step": 106
    },
    {
      "epoch": 0.01712,
      "grad_norm": 0.6886342763900757,
      "learning_rate": 0.0001,
      "loss": 0.6664,
      "step": 107
    },
    {
      "epoch": 0.01728,
      "grad_norm": 0.8991551995277405,
      "learning_rate": 0.0001,
      "loss": 0.6711,
      "step": 108
    },
    {
      "epoch": 0.01744,
      "grad_norm": 0.8286511898040771,
      "learning_rate": 0.0001,
      "loss": 0.6798,
      "step": 109
    },
    {
      "epoch": 0.0176,
      "grad_norm": 0.5575344562530518,
      "learning_rate": 0.0001,
      "loss": 0.6587,
      "step": 110
    },
    {
      "epoch": 0.01776,
      "grad_norm": 0.9119205474853516,
      "learning_rate": 0.0001,
      "loss": 0.6753,
      "step": 111
    },
    {
      "epoch": 0.01792,
      "grad_norm": 0.7863529920578003,
      "learning_rate": 0.0001,
      "loss": 0.6603,
      "step": 112
    },
    {
      "epoch": 0.01808,
      "grad_norm": 0.683083713054657,
      "learning_rate": 0.0001,
      "loss": 0.6465,
      "step": 113
    },
    {
      "epoch": 0.01824,
      "grad_norm": 0.7275103330612183,
      "learning_rate": 0.0001,
      "loss": 0.6089,
      "step": 114
    },
    {
      "epoch": 0.0184,
      "grad_norm": 0.6362513899803162,
      "learning_rate": 0.0001,
      "loss": 0.6605,
      "step": 115
    },
    {
      "epoch": 0.01856,
      "grad_norm": 0.7942511439323425,
      "learning_rate": 0.0001,
      "loss": 0.6344,
      "step": 116
    },
    {
      "epoch": 0.01872,
      "grad_norm": 0.9763580560684204,
      "learning_rate": 0.0001,
      "loss": 0.6668,
      "step": 117
    },
    {
      "epoch": 0.01888,
      "grad_norm": 0.636893093585968,
      "learning_rate": 0.0001,
      "loss": 0.6395,
      "step": 118
    },
    {
      "epoch": 0.01904,
      "grad_norm": 0.6480208039283752,
      "learning_rate": 0.0001,
      "loss": 0.6334,
      "step": 119
    },
    {
      "epoch": 0.0192,
      "grad_norm": 0.6813835501670837,
      "learning_rate": 0.0001,
      "loss": 0.6385,
      "step": 120
    },
    {
      "epoch": 0.01936,
      "grad_norm": 0.5524700284004211,
      "learning_rate": 0.0001,
      "loss": 0.6028,
      "step": 121
    },
    {
      "epoch": 0.01952,
      "grad_norm": 0.669031023979187,
      "learning_rate": 0.0001,
      "loss": 0.6546,
      "step": 122
    },
    {
      "epoch": 0.01968,
      "grad_norm": 0.471418172121048,
      "learning_rate": 0.0001,
      "loss": 0.6157,
      "step": 123
    },
    {
      "epoch": 0.01984,
      "grad_norm": 0.5270216464996338,
      "learning_rate": 0.0001,
      "loss": 0.6362,
      "step": 124
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.5022809505462646,
      "learning_rate": 0.0001,
      "loss": 0.6236,
      "step": 125
    },
    {
      "epoch": 0.02016,
      "grad_norm": 0.44652295112609863,
      "learning_rate": 0.0001,
      "loss": 0.6355,
      "step": 126
    },
    {
      "epoch": 0.02032,
      "grad_norm": 0.5041065812110901,
      "learning_rate": 0.0001,
      "loss": 0.6232,
      "step": 127
    },
    {
      "epoch": 0.02048,
      "grad_norm": 0.4019528925418854,
      "learning_rate": 0.0001,
      "loss": 0.6321,
      "step": 128
    },
    {
      "epoch": 0.02064,
      "grad_norm": 0.36395514011383057,
      "learning_rate": 0.0001,
      "loss": 0.6278,
      "step": 129
    },
    {
      "epoch": 0.0208,
      "grad_norm": 0.5264583826065063,
      "learning_rate": 0.0001,
      "loss": 0.6282,
      "step": 130
    },
    {
      "epoch": 0.02096,
      "grad_norm": 0.4085930585861206,
      "learning_rate": 0.0001,
      "loss": 0.6119,
      "step": 131
    },
    {
      "epoch": 0.02112,
      "grad_norm": 0.5390750765800476,
      "learning_rate": 0.0001,
      "loss": 0.6151,
      "step": 132
    },
    {
      "epoch": 0.02128,
      "grad_norm": 0.41967955231666565,
      "learning_rate": 0.0001,
      "loss": 0.6456,
      "step": 133
    },
    {
      "epoch": 0.02144,
      "grad_norm": 0.5566163063049316,
      "learning_rate": 0.0001,
      "loss": 0.605,
      "step": 134
    },
    {
      "epoch": 0.0216,
      "grad_norm": 0.4576062858104706,
      "learning_rate": 0.0001,
      "loss": 0.6408,
      "step": 135
    },
    {
      "epoch": 0.02176,
      "grad_norm": 0.378772109746933,
      "learning_rate": 0.0001,
      "loss": 0.5916,
      "step": 136
    },
    {
      "epoch": 0.02192,
      "grad_norm": 0.6053535342216492,
      "learning_rate": 0.0001,
      "loss": 0.6221,
      "step": 137
    },
    {
      "epoch": 0.02208,
      "grad_norm": 0.42460954189300537,
      "learning_rate": 0.0001,
      "loss": 0.6164,
      "step": 138
    },
    {
      "epoch": 0.02224,
      "grad_norm": 0.7706974148750305,
      "learning_rate": 0.0001,
      "loss": 0.6007,
      "step": 139
    },
    {
      "epoch": 0.0224,
      "grad_norm": 0.7699508666992188,
      "learning_rate": 0.0001,
      "loss": 0.6105,
      "step": 140
    },
    {
      "epoch": 0.02256,
      "grad_norm": 0.4954952597618103,
      "learning_rate": 0.0001,
      "loss": 0.6229,
      "step": 141
    },
    {
      "epoch": 0.02272,
      "grad_norm": 1.035778522491455,
      "learning_rate": 0.0001,
      "loss": 0.6086,
      "step": 142
    },
    {
      "epoch": 0.02288,
      "grad_norm": 0.6091610789299011,
      "learning_rate": 0.0001,
      "loss": 0.6095,
      "step": 143
    },
    {
      "epoch": 0.02304,
      "grad_norm": 0.5950079560279846,
      "learning_rate": 0.0001,
      "loss": 0.6191,
      "step": 144
    },
    {
      "epoch": 0.0232,
      "grad_norm": 0.4881584942340851,
      "learning_rate": 0.0001,
      "loss": 0.6389,
      "step": 145
    },
    {
      "epoch": 0.02336,
      "grad_norm": 0.5702329874038696,
      "learning_rate": 0.0001,
      "loss": 0.6145,
      "step": 146
    },
    {
      "epoch": 0.02352,
      "grad_norm": 0.399149090051651,
      "learning_rate": 0.0001,
      "loss": 0.5661,
      "step": 147
    },
    {
      "epoch": 0.02368,
      "grad_norm": 0.5372491478919983,
      "learning_rate": 0.0001,
      "loss": 0.6006,
      "step": 148
    },
    {
      "epoch": 0.02384,
      "grad_norm": 0.47538551688194275,
      "learning_rate": 0.0001,
      "loss": 0.6095,
      "step": 149
    },
    {
      "epoch": 0.024,
      "grad_norm": 0.47533848881721497,
      "learning_rate": 0.0001,
      "loss": 0.5854,
      "step": 150
    },
    {
      "epoch": 0.02416,
      "grad_norm": 0.48898449540138245,
      "learning_rate": 0.0001,
      "loss": 0.6094,
      "step": 151
    },
    {
      "epoch": 0.02432,
      "grad_norm": 0.557315468788147,
      "learning_rate": 0.0001,
      "loss": 0.5923,
      "step": 152
    },
    {
      "epoch": 0.02448,
      "grad_norm": 0.6038573980331421,
      "learning_rate": 0.0001,
      "loss": 0.5963,
      "step": 153
    },
    {
      "epoch": 0.02464,
      "grad_norm": 0.3396874964237213,
      "learning_rate": 0.0001,
      "loss": 0.5912,
      "step": 154
    },
    {
      "epoch": 0.0248,
      "grad_norm": 0.6142090559005737,
      "learning_rate": 0.0001,
      "loss": 0.5984,
      "step": 155
    },
    {
      "epoch": 0.02496,
      "grad_norm": 0.33893388509750366,
      "learning_rate": 0.0001,
      "loss": 0.6013,
      "step": 156
    },
    {
      "epoch": 0.02512,
      "grad_norm": 0.48939049243927,
      "learning_rate": 0.0001,
      "loss": 0.5918,
      "step": 157
    },
    {
      "epoch": 0.02528,
      "grad_norm": 0.8800159692764282,
      "learning_rate": 0.0001,
      "loss": 0.6019,
      "step": 158
    },
    {
      "epoch": 0.02544,
      "grad_norm": 0.9279545545578003,
      "learning_rate": 0.0001,
      "loss": 0.558,
      "step": 159
    },
    {
      "epoch": 0.0256,
      "grad_norm": 0.4139571189880371,
      "learning_rate": 0.0001,
      "loss": 0.5766,
      "step": 160
    },
    {
      "epoch": 0.02576,
      "grad_norm": 0.965855062007904,
      "learning_rate": 0.0001,
      "loss": 0.5865,
      "step": 161
    },
    {
      "epoch": 0.02592,
      "grad_norm": 0.9661456346511841,
      "learning_rate": 0.0001,
      "loss": 0.608,
      "step": 162
    },
    {
      "epoch": 0.02608,
      "grad_norm": 0.41104766726493835,
      "learning_rate": 0.0001,
      "loss": 0.588,
      "step": 163
    },
    {
      "epoch": 0.02624,
      "grad_norm": 0.7149668335914612,
      "learning_rate": 0.0001,
      "loss": 0.5884,
      "step": 164
    },
    {
      "epoch": 0.0264,
      "grad_norm": 0.41353386640548706,
      "learning_rate": 0.0001,
      "loss": 0.5655,
      "step": 165
    },
    {
      "epoch": 0.02656,
      "grad_norm": 0.5854697227478027,
      "learning_rate": 0.0001,
      "loss": 0.5937,
      "step": 166
    },
    {
      "epoch": 0.02672,
      "grad_norm": 0.5155587792396545,
      "learning_rate": 0.0001,
      "loss": 0.555,
      "step": 167
    },
    {
      "epoch": 0.02688,
      "grad_norm": 0.4549197554588318,
      "learning_rate": 0.0001,
      "loss": 0.5876,
      "step": 168
    },
    {
      "epoch": 0.02704,
      "grad_norm": 0.44297659397125244,
      "learning_rate": 0.0001,
      "loss": 0.5818,
      "step": 169
    },
    {
      "epoch": 0.0272,
      "grad_norm": 0.4329194724559784,
      "learning_rate": 0.0001,
      "loss": 0.6108,
      "step": 170
    },
    {
      "epoch": 0.02736,
      "grad_norm": 0.5197380781173706,
      "learning_rate": 0.0001,
      "loss": 0.5643,
      "step": 171
    },
    {
      "epoch": 0.02752,
      "grad_norm": 0.33782222867012024,
      "learning_rate": 0.0001,
      "loss": 0.5891,
      "step": 172
    },
    {
      "epoch": 0.02768,
      "grad_norm": 0.3903028070926666,
      "learning_rate": 0.0001,
      "loss": 0.5947,
      "step": 173
    },
    {
      "epoch": 0.02784,
      "grad_norm": 0.37192443013191223,
      "learning_rate": 0.0001,
      "loss": 0.5825,
      "step": 174
    },
    {
      "epoch": 0.028,
      "grad_norm": 0.6242554783821106,
      "learning_rate": 0.0001,
      "loss": 0.598,
      "step": 175
    },
    {
      "epoch": 0.02816,
      "grad_norm": 0.4836231470108032,
      "learning_rate": 0.0001,
      "loss": 0.5747,
      "step": 176
    },
    {
      "epoch": 0.02832,
      "grad_norm": 0.46862614154815674,
      "learning_rate": 0.0001,
      "loss": 0.5677,
      "step": 177
    },
    {
      "epoch": 0.02848,
      "grad_norm": 0.4777337312698364,
      "learning_rate": 0.0001,
      "loss": 0.5791,
      "step": 178
    },
    {
      "epoch": 0.02864,
      "grad_norm": 0.4687139093875885,
      "learning_rate": 0.0001,
      "loss": 0.5629,
      "step": 179
    },
    {
      "epoch": 0.0288,
      "grad_norm": 0.564816415309906,
      "learning_rate": 0.0001,
      "loss": 0.5838,
      "step": 180
    },
    {
      "epoch": 0.02896,
      "grad_norm": 0.28897732496261597,
      "learning_rate": 0.0001,
      "loss": 0.5731,
      "step": 181
    },
    {
      "epoch": 0.02912,
      "grad_norm": 0.5985177159309387,
      "learning_rate": 0.0001,
      "loss": 0.5676,
      "step": 182
    },
    {
      "epoch": 0.02928,
      "grad_norm": 0.335464209318161,
      "learning_rate": 0.0001,
      "loss": 0.562,
      "step": 183
    },
    {
      "epoch": 0.02944,
      "grad_norm": 0.5098849534988403,
      "learning_rate": 0.0001,
      "loss": 0.577,
      "step": 184
    },
    {
      "epoch": 0.0296,
      "grad_norm": 0.5112779140472412,
      "learning_rate": 0.0001,
      "loss": 0.5706,
      "step": 185
    },
    {
      "epoch": 0.02976,
      "grad_norm": 0.6316503882408142,
      "learning_rate": 0.0001,
      "loss": 0.5662,
      "step": 186
    },
    {
      "epoch": 0.02992,
      "grad_norm": 0.546904981136322,
      "learning_rate": 0.0001,
      "loss": 0.5689,
      "step": 187
    },
    {
      "epoch": 0.03008,
      "grad_norm": 0.49590352177619934,
      "learning_rate": 0.0001,
      "loss": 0.5944,
      "step": 188
    },
    {
      "epoch": 0.03024,
      "grad_norm": 0.6453245878219604,
      "learning_rate": 0.0001,
      "loss": 0.5832,
      "step": 189
    },
    {
      "epoch": 0.0304,
      "grad_norm": 0.4027453660964966,
      "learning_rate": 0.0001,
      "loss": 0.5554,
      "step": 190
    },
    {
      "epoch": 0.03056,
      "grad_norm": 0.5426005721092224,
      "learning_rate": 0.0001,
      "loss": 0.5435,
      "step": 191
    },
    {
      "epoch": 0.03072,
      "grad_norm": 0.3381921350955963,
      "learning_rate": 0.0001,
      "loss": 0.5419,
      "step": 192
    },
    {
      "epoch": 0.03088,
      "grad_norm": 0.5136387944221497,
      "learning_rate": 0.0001,
      "loss": 0.5569,
      "step": 193
    },
    {
      "epoch": 0.03104,
      "grad_norm": 0.3990335762500763,
      "learning_rate": 0.0001,
      "loss": 0.5422,
      "step": 194
    },
    {
      "epoch": 0.0312,
      "grad_norm": 0.6370740532875061,
      "learning_rate": 0.0001,
      "loss": 0.5786,
      "step": 195
    },
    {
      "epoch": 0.03136,
      "grad_norm": 0.551914632320404,
      "learning_rate": 0.0001,
      "loss": 0.5608,
      "step": 196
    },
    {
      "epoch": 0.03152,
      "grad_norm": 0.8292807936668396,
      "learning_rate": 0.0001,
      "loss": 0.5654,
      "step": 197
    },
    {
      "epoch": 0.03168,
      "grad_norm": 0.6143330335617065,
      "learning_rate": 0.0001,
      "loss": 0.5527,
      "step": 198
    },
    {
      "epoch": 0.03184,
      "grad_norm": 0.4807484745979309,
      "learning_rate": 0.0001,
      "loss": 0.5548,
      "step": 199
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.4877353012561798,
      "learning_rate": 0.0001,
      "loss": 0.554,
      "step": 200
    },
    {
      "epoch": 0.032,
      "eval_train_accuracy": 0.4988,
      "eval_train_loss": 0.5432412028312683,
      "eval_train_runtime": 4.0556,
      "eval_train_samples_per_second": 1232.849,
      "eval_train_steps_per_second": 15.534,
      "step": 200
    },
    {
      "epoch": 0.032,
      "eval_test_accuracy": 0.5008,
      "eval_test_loss": 0.5414249897003174,
      "eval_test_runtime": 4.7119,
      "eval_test_samples_per_second": 1061.15,
      "eval_test_steps_per_second": 13.37,
      "step": 200
    },
    {
      "epoch": 0.03216,
      "grad_norm": 0.3755268454551697,
      "learning_rate": 0.0001,
      "loss": 0.5163,
      "step": 201
    },
    {
      "epoch": 0.03232,
      "grad_norm": 0.49885591864585876,
      "learning_rate": 0.0001,
      "loss": 0.5595,
      "step": 202
    },
    {
      "epoch": 0.03248,
      "grad_norm": 0.43722066283226013,
      "learning_rate": 0.0001,
      "loss": 0.5394,
      "step": 203
    },
    {
      "epoch": 0.03264,
      "grad_norm": 0.36117738485336304,
      "learning_rate": 0.0001,
      "loss": 0.5888,
      "step": 204
    },
    {
      "epoch": 0.0328,
      "grad_norm": 0.6061204671859741,
      "learning_rate": 0.0001,
      "loss": 0.5465,
      "step": 205
    },
    {
      "epoch": 0.03296,
      "grad_norm": 0.35624366998672485,
      "learning_rate": 0.0001,
      "loss": 0.5376,
      "step": 206
    },
    {
      "epoch": 0.03312,
      "grad_norm": 0.6602376699447632,
      "learning_rate": 0.0001,
      "loss": 0.5752,
      "step": 207
    },
    {
      "epoch": 0.03328,
      "grad_norm": 0.4121297001838684,
      "learning_rate": 0.0001,
      "loss": 0.5426,
      "step": 208
    },
    {
      "epoch": 0.03344,
      "grad_norm": 0.48896127939224243,
      "learning_rate": 0.0001,
      "loss": 0.5442,
      "step": 209
    },
    {
      "epoch": 0.0336,
      "grad_norm": 0.4125860929489136,
      "learning_rate": 0.0001,
      "loss": 0.5395,
      "step": 210
    },
    {
      "epoch": 0.03376,
      "grad_norm": 0.6232761740684509,
      "learning_rate": 0.0001,
      "loss": 0.5475,
      "step": 211
    },
    {
      "epoch": 0.03392,
      "grad_norm": 0.36049842834472656,
      "learning_rate": 0.0001,
      "loss": 0.5485,
      "step": 212
    },
    {
      "epoch": 0.03408,
      "grad_norm": 0.5414941906929016,
      "learning_rate": 0.0001,
      "loss": 0.4943,
      "step": 213
    },
    {
      "epoch": 0.03424,
      "grad_norm": 0.4565263092517853,
      "learning_rate": 0.0001,
      "loss": 0.5518,
      "step": 214
    },
    {
      "epoch": 0.0344,
      "grad_norm": 0.44907113909721375,
      "learning_rate": 0.0001,
      "loss": 0.5475,
      "step": 215
    },
    {
      "epoch": 0.03456,
      "grad_norm": 0.3874342143535614,
      "learning_rate": 0.0001,
      "loss": 0.5522,
      "step": 216
    },
    {
      "epoch": 0.03472,
      "grad_norm": 0.3551444709300995,
      "learning_rate": 0.0001,
      "loss": 0.5409,
      "step": 217
    },
    {
      "epoch": 0.03488,
      "grad_norm": 0.35579901933670044,
      "learning_rate": 0.0001,
      "loss": 0.5188,
      "step": 218
    },
    {
      "epoch": 0.03504,
      "grad_norm": 0.3388150930404663,
      "learning_rate": 0.0001,
      "loss": 0.5311,
      "step": 219
    },
    {
      "epoch": 0.0352,
      "grad_norm": 0.37688785791397095,
      "learning_rate": 0.0001,
      "loss": 0.5355,
      "step": 220
    },
    {
      "epoch": 0.03536,
      "grad_norm": 0.3148072063922882,
      "learning_rate": 0.0001,
      "loss": 0.5276,
      "step": 221
    },
    {
      "epoch": 0.03552,
      "grad_norm": 0.3065742552280426,
      "learning_rate": 0.0001,
      "loss": 0.5218,
      "step": 222
    },
    {
      "epoch": 0.03568,
      "grad_norm": 0.34451037645339966,
      "learning_rate": 0.0001,
      "loss": 0.5292,
      "step": 223
    },
    {
      "epoch": 0.03584,
      "grad_norm": 0.3338187038898468,
      "learning_rate": 0.0001,
      "loss": 0.538,
      "step": 224
    },
    {
      "epoch": 0.036,
      "grad_norm": 0.36436742544174194,
      "learning_rate": 0.0001,
      "loss": 0.512,
      "step": 225
    },
    {
      "epoch": 0.03616,
      "grad_norm": 0.3347264528274536,
      "learning_rate": 0.0001,
      "loss": 0.5453,
      "step": 226
    },
    {
      "epoch": 0.03632,
      "grad_norm": 0.4188240170478821,
      "learning_rate": 0.0001,
      "loss": 0.5366,
      "step": 227
    },
    {
      "epoch": 0.03648,
      "grad_norm": 0.3283935785293579,
      "learning_rate": 0.0001,
      "loss": 0.5234,
      "step": 228
    },
    {
      "epoch": 0.03664,
      "grad_norm": 0.41834187507629395,
      "learning_rate": 0.0001,
      "loss": 0.516,
      "step": 229
    },
    {
      "epoch": 0.0368,
      "grad_norm": 0.381434828042984,
      "learning_rate": 0.0001,
      "loss": 0.5384,
      "step": 230
    },
    {
      "epoch": 0.03696,
      "grad_norm": 0.5387043952941895,
      "learning_rate": 0.0001,
      "loss": 0.5207,
      "step": 231
    },
    {
      "epoch": 0.03712,
      "grad_norm": 0.4515240490436554,
      "learning_rate": 0.0001,
      "loss": 0.525,
      "step": 232
    },
    {
      "epoch": 0.03728,
      "grad_norm": 0.43782278895378113,
      "learning_rate": 0.0001,
      "loss": 0.547,
      "step": 233
    },
    {
      "epoch": 0.03744,
      "grad_norm": 0.5283610820770264,
      "learning_rate": 0.0001,
      "loss": 0.5322,
      "step": 234
    },
    {
      "epoch": 0.0376,
      "grad_norm": 0.34199196100234985,
      "learning_rate": 0.0001,
      "loss": 0.5035,
      "step": 235
    },
    {
      "epoch": 0.03776,
      "grad_norm": 0.48576855659484863,
      "learning_rate": 0.0001,
      "loss": 0.5021,
      "step": 236
    },
    {
      "epoch": 0.03792,
      "grad_norm": 0.3454415500164032,
      "learning_rate": 0.0001,
      "loss": 0.5002,
      "step": 237
    },
    {
      "epoch": 0.03808,
      "grad_norm": 0.5075095295906067,
      "learning_rate": 0.0001,
      "loss": 0.5008,
      "step": 238
    },
    {
      "epoch": 0.03824,
      "grad_norm": 0.4239567220211029,
      "learning_rate": 0.0001,
      "loss": 0.5353,
      "step": 239
    },
    {
      "epoch": 0.0384,
      "grad_norm": 0.3588399291038513,
      "learning_rate": 0.0001,
      "loss": 0.5403,
      "step": 240
    },
    {
      "epoch": 0.03856,
      "grad_norm": 0.37843596935272217,
      "learning_rate": 0.0001,
      "loss": 0.5186,
      "step": 241
    },
    {
      "epoch": 0.03872,
      "grad_norm": 0.3191050589084625,
      "learning_rate": 0.0001,
      "loss": 0.5129,
      "step": 242
    },
    {
      "epoch": 0.03888,
      "grad_norm": 0.4018246531486511,
      "learning_rate": 0.0001,
      "loss": 0.5204,
      "step": 243
    },
    {
      "epoch": 0.03904,
      "grad_norm": 0.3591548800468445,
      "learning_rate": 0.0001,
      "loss": 0.4992,
      "step": 244
    },
    {
      "epoch": 0.0392,
      "grad_norm": 0.34374547004699707,
      "learning_rate": 0.0001,
      "loss": 0.5097,
      "step": 245
    },
    {
      "epoch": 0.03936,
      "grad_norm": 0.3351617455482483,
      "learning_rate": 0.0001,
      "loss": 0.5058,
      "step": 246
    },
    {
      "epoch": 0.03952,
      "grad_norm": 0.32702386379241943,
      "learning_rate": 0.0001,
      "loss": 0.5456,
      "step": 247
    },
    {
      "epoch": 0.03968,
      "grad_norm": 0.3822413384914398,
      "learning_rate": 0.0001,
      "loss": 0.4986,
      "step": 248
    },
    {
      "epoch": 0.03984,
      "grad_norm": 0.3359142243862152,
      "learning_rate": 0.0001,
      "loss": 0.5154,
      "step": 249
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.3820008337497711,
      "learning_rate": 0.0001,
      "loss": 0.5316,
      "step": 250
    },
    {
      "epoch": 0.04016,
      "grad_norm": 0.35951468348503113,
      "learning_rate": 0.0001,
      "loss": 0.4896,
      "step": 251
    },
    {
      "epoch": 0.04032,
      "grad_norm": 0.3882177472114563,
      "learning_rate": 0.0001,
      "loss": 0.5212,
      "step": 252
    },
    {
      "epoch": 0.04048,
      "grad_norm": 0.3037460148334503,
      "learning_rate": 0.0001,
      "loss": 0.5066,
      "step": 253
    },
    {
      "epoch": 0.04064,
      "grad_norm": 0.35402750968933105,
      "learning_rate": 0.0001,
      "loss": 0.5057,
      "step": 254
    },
    {
      "epoch": 0.0408,
      "grad_norm": 0.3829970061779022,
      "learning_rate": 0.0001,
      "loss": 0.5196,
      "step": 255
    },
    {
      "epoch": 0.04096,
      "grad_norm": 0.3765171766281128,
      "learning_rate": 0.0001,
      "loss": 0.5007,
      "step": 256
    },
    {
      "epoch": 0.04112,
      "grad_norm": 0.3552708625793457,
      "learning_rate": 0.0001,
      "loss": 0.5046,
      "step": 257
    },
    {
      "epoch": 0.04128,
      "grad_norm": 0.5145670175552368,
      "learning_rate": 0.0001,
      "loss": 0.5115,
      "step": 258
    },
    {
      "epoch": 0.04144,
      "grad_norm": 0.6586034893989563,
      "learning_rate": 0.0001,
      "loss": 0.5091,
      "step": 259
    },
    {
      "epoch": 0.0416,
      "grad_norm": 0.489167720079422,
      "learning_rate": 0.0001,
      "loss": 0.5139,
      "step": 260
    },
    {
      "epoch": 0.04176,
      "grad_norm": 0.4770561456680298,
      "learning_rate": 0.0001,
      "loss": 0.4959,
      "step": 261
    },
    {
      "epoch": 0.04192,
      "grad_norm": 0.5904169678688049,
      "learning_rate": 0.0001,
      "loss": 0.4995,
      "step": 262
    },
    {
      "epoch": 0.04208,
      "grad_norm": 0.4262777864933014,
      "learning_rate": 0.0001,
      "loss": 0.5046,
      "step": 263
    },
    {
      "epoch": 0.04224,
      "grad_norm": 0.8543452620506287,
      "learning_rate": 0.0001,
      "loss": 0.479,
      "step": 264
    },
    {
      "epoch": 0.0424,
      "grad_norm": 0.3875999450683594,
      "learning_rate": 0.0001,
      "loss": 0.4979,
      "step": 265
    },
    {
      "epoch": 0.04256,
      "grad_norm": 0.5917110443115234,
      "learning_rate": 0.0001,
      "loss": 0.5085,
      "step": 266
    },
    {
      "epoch": 0.04272,
      "grad_norm": 0.42489662766456604,
      "learning_rate": 0.0001,
      "loss": 0.4686,
      "step": 267
    },
    {
      "epoch": 0.04288,
      "grad_norm": 0.5693848729133606,
      "learning_rate": 0.0001,
      "loss": 0.4919,
      "step": 268
    },
    {
      "epoch": 0.04304,
      "grad_norm": 0.34706440567970276,
      "learning_rate": 0.0001,
      "loss": 0.4937,
      "step": 269
    },
    {
      "epoch": 0.0432,
      "grad_norm": 0.48547571897506714,
      "learning_rate": 0.0001,
      "loss": 0.5029,
      "step": 270
    },
    {
      "epoch": 0.04336,
      "grad_norm": 0.37143486738204956,
      "learning_rate": 0.0001,
      "loss": 0.4623,
      "step": 271
    },
    {
      "epoch": 0.04352,
      "grad_norm": 0.3676338195800781,
      "learning_rate": 0.0001,
      "loss": 0.4733,
      "step": 272
    },
    {
      "epoch": 0.04368,
      "grad_norm": 0.4830535352230072,
      "learning_rate": 0.0001,
      "loss": 0.5096,
      "step": 273
    },
    {
      "epoch": 0.04384,
      "grad_norm": 0.4128039479255676,
      "learning_rate": 0.0001,
      "loss": 0.4909,
      "step": 274
    },
    {
      "epoch": 0.044,
      "grad_norm": 0.3555804193019867,
      "learning_rate": 0.0001,
      "loss": 0.4895,
      "step": 275
    },
    {
      "epoch": 0.04416,
      "grad_norm": 0.3873211443424225,
      "learning_rate": 0.0001,
      "loss": 0.4848,
      "step": 276
    },
    {
      "epoch": 0.04432,
      "grad_norm": 0.4238971471786499,
      "learning_rate": 0.0001,
      "loss": 0.505,
      "step": 277
    },
    {
      "epoch": 0.04448,
      "grad_norm": 0.4663321077823639,
      "learning_rate": 0.0001,
      "loss": 0.4925,
      "step": 278
    },
    {
      "epoch": 0.04464,
      "grad_norm": 0.3038024604320526,
      "learning_rate": 0.0001,
      "loss": 0.478,
      "step": 279
    },
    {
      "epoch": 0.0448,
      "grad_norm": 0.48346683382987976,
      "learning_rate": 0.0001,
      "loss": 0.4922,
      "step": 280
    },
    {
      "epoch": 0.04496,
      "grad_norm": 0.29926714301109314,
      "learning_rate": 0.0001,
      "loss": 0.4712,
      "step": 281
    },
    {
      "epoch": 0.04512,
      "grad_norm": 0.3806879222393036,
      "learning_rate": 0.0001,
      "loss": 0.4697,
      "step": 282
    },
    {
      "epoch": 0.04528,
      "grad_norm": 0.4127981662750244,
      "learning_rate": 0.0001,
      "loss": 0.4903,
      "step": 283
    },
    {
      "epoch": 0.04544,
      "grad_norm": 0.45918676257133484,
      "learning_rate": 0.0001,
      "loss": 0.5065,
      "step": 284
    },
    {
      "epoch": 0.0456,
      "grad_norm": 0.4743536114692688,
      "learning_rate": 0.0001,
      "loss": 0.4776,
      "step": 285
    },
    {
      "epoch": 0.04576,
      "grad_norm": 0.3789817988872528,
      "learning_rate": 0.0001,
      "loss": 0.4765,
      "step": 286
    },
    {
      "epoch": 0.04592,
      "grad_norm": 0.44132158160209656,
      "learning_rate": 0.0001,
      "loss": 0.5111,
      "step": 287
    },
    {
      "epoch": 0.04608,
      "grad_norm": 0.4020986557006836,
      "learning_rate": 0.0001,
      "loss": 0.4775,
      "step": 288
    },
    {
      "epoch": 0.04624,
      "grad_norm": 0.38996630907058716,
      "learning_rate": 0.0001,
      "loss": 0.4805,
      "step": 289
    },
    {
      "epoch": 0.0464,
      "grad_norm": 0.3833138346672058,
      "learning_rate": 0.0001,
      "loss": 0.49,
      "step": 290
    },
    {
      "epoch": 0.04656,
      "grad_norm": 0.3879009783267975,
      "learning_rate": 0.0001,
      "loss": 0.4562,
      "step": 291
    },
    {
      "epoch": 0.04672,
      "grad_norm": 0.6891469955444336,
      "learning_rate": 0.0001,
      "loss": 0.4772,
      "step": 292
    },
    {
      "epoch": 0.04688,
      "grad_norm": 0.5672962665557861,
      "learning_rate": 0.0001,
      "loss": 0.5062,
      "step": 293
    },
    {
      "epoch": 0.04704,
      "grad_norm": 0.6070660948753357,
      "learning_rate": 0.0001,
      "loss": 0.4902,
      "step": 294
    },
    {
      "epoch": 0.0472,
      "grad_norm": 0.3896338939666748,
      "learning_rate": 0.0001,
      "loss": 0.4744,
      "step": 295
    },
    {
      "epoch": 0.04736,
      "grad_norm": 0.7501013278961182,
      "learning_rate": 0.0001,
      "loss": 0.4818,
      "step": 296
    },
    {
      "epoch": 0.04752,
      "grad_norm": 0.4156897962093353,
      "learning_rate": 0.0001,
      "loss": 0.4816,
      "step": 297
    },
    {
      "epoch": 0.04768,
      "grad_norm": 0.9513557553291321,
      "learning_rate": 0.0001,
      "loss": 0.4868,
      "step": 298
    },
    {
      "epoch": 0.04784,
      "grad_norm": 0.4488682448863983,
      "learning_rate": 0.0001,
      "loss": 0.4673,
      "step": 299
    },
    {
      "epoch": 0.048,
      "grad_norm": 1.348981261253357,
      "learning_rate": 0.0001,
      "loss": 0.5072,
      "step": 300
    },
    {
      "epoch": 0.048,
      "eval_train_accuracy": 0.5,
      "eval_train_loss": 0.46834784746170044,
      "eval_train_runtime": 4.3215,
      "eval_train_samples_per_second": 1157.007,
      "eval_train_steps_per_second": 14.578,
      "step": 300
    },
    {
      "epoch": 0.048,
      "eval_test_accuracy": 0.4988,
      "eval_test_loss": 0.46679985523223877,
      "eval_test_runtime": 4.593,
      "eval_test_samples_per_second": 1088.609,
      "eval_test_steps_per_second": 13.716,
      "step": 300
    },
    {
      "epoch": 0.04816,
      "grad_norm": 0.794299840927124,
      "learning_rate": 0.0001,
      "loss": 0.4848,
      "step": 301
    },
    {
      "epoch": 0.04832,
      "grad_norm": 1.170288324356079,
      "learning_rate": 0.0001,
      "loss": 0.4928,
      "step": 302
    },
    {
      "epoch": 0.04848,
      "grad_norm": 0.8093296885490417,
      "learning_rate": 0.0001,
      "loss": 0.4919,
      "step": 303
    },
    {
      "epoch": 0.04864,
      "grad_norm": 0.4708687663078308,
      "learning_rate": 0.0001,
      "loss": 0.4627,
      "step": 304
    },
    {
      "epoch": 0.0488,
      "grad_norm": 0.5951588749885559,
      "learning_rate": 0.0001,
      "loss": 0.4444,
      "step": 305
    },
    {
      "epoch": 0.04896,
      "grad_norm": 0.9573937058448792,
      "learning_rate": 0.0001,
      "loss": 0.4598,
      "step": 306
    },
    {
      "epoch": 0.04912,
      "grad_norm": 0.6763418316841125,
      "learning_rate": 0.0001,
      "loss": 0.4903,
      "step": 307
    },
    {
      "epoch": 0.04928,
      "grad_norm": 0.5225142240524292,
      "learning_rate": 0.0001,
      "loss": 0.44,
      "step": 308
    },
    {
      "epoch": 0.04944,
      "grad_norm": 0.6147029995918274,
      "learning_rate": 0.0001,
      "loss": 0.4715,
      "step": 309
    },
    {
      "epoch": 0.0496,
      "grad_norm": 0.6542117595672607,
      "learning_rate": 0.0001,
      "loss": 0.4658,
      "step": 310
    },
    {
      "epoch": 0.04976,
      "grad_norm": 0.6548803448677063,
      "learning_rate": 0.0001,
      "loss": 0.4821,
      "step": 311
    },
    {
      "epoch": 0.04992,
      "grad_norm": 0.623757004737854,
      "learning_rate": 0.0001,
      "loss": 0.4846,
      "step": 312
    },
    {
      "epoch": 0.05008,
      "grad_norm": 0.8719553351402283,
      "learning_rate": 0.0001,
      "loss": 0.4554,
      "step": 313
    },
    {
      "epoch": 0.05024,
      "grad_norm": 0.6090419292449951,
      "learning_rate": 0.0001,
      "loss": 0.4779,
      "step": 314
    },
    {
      "epoch": 0.0504,
      "grad_norm": 1.2410907745361328,
      "learning_rate": 0.0001,
      "loss": 0.4811,
      "step": 315
    },
    {
      "epoch": 0.05056,
      "grad_norm": 0.6994481086730957,
      "learning_rate": 0.0001,
      "loss": 0.4655,
      "step": 316
    },
    {
      "epoch": 0.05072,
      "grad_norm": 0.5580395460128784,
      "learning_rate": 0.0001,
      "loss": 0.4694,
      "step": 317
    },
    {
      "epoch": 0.05088,
      "grad_norm": 0.8156932592391968,
      "learning_rate": 0.0001,
      "loss": 0.4782,
      "step": 318
    },
    {
      "epoch": 0.05104,
      "grad_norm": 0.4314369261264801,
      "learning_rate": 0.0001,
      "loss": 0.4664,
      "step": 319
    },
    {
      "epoch": 0.0512,
      "grad_norm": 0.5237566828727722,
      "learning_rate": 0.0001,
      "loss": 0.4558,
      "step": 320
    },
    {
      "epoch": 0.05136,
      "grad_norm": 0.36575940251350403,
      "learning_rate": 0.0001,
      "loss": 0.4629,
      "step": 321
    },
    {
      "epoch": 0.05152,
      "grad_norm": 0.5027133226394653,
      "learning_rate": 0.0001,
      "loss": 0.4668,
      "step": 322
    },
    {
      "epoch": 0.05168,
      "grad_norm": 0.44950106739997864,
      "learning_rate": 0.0001,
      "loss": 0.4708,
      "step": 323
    },
    {
      "epoch": 0.05184,
      "grad_norm": 0.6495047807693481,
      "learning_rate": 0.0001,
      "loss": 0.4537,
      "step": 324
    },
    {
      "epoch": 0.052,
      "grad_norm": 0.40843212604522705,
      "learning_rate": 0.0001,
      "loss": 0.461,
      "step": 325
    },
    {
      "epoch": 0.05216,
      "grad_norm": 0.5901216268539429,
      "learning_rate": 0.0001,
      "loss": 0.4599,
      "step": 326
    },
    {
      "epoch": 0.05232,
      "grad_norm": 0.3544059097766876,
      "learning_rate": 0.0001,
      "loss": 0.447,
      "step": 327
    },
    {
      "epoch": 0.05248,
      "grad_norm": 0.5903146862983704,
      "learning_rate": 0.0001,
      "loss": 0.4768,
      "step": 328
    },
    {
      "epoch": 0.05264,
      "grad_norm": 0.32640138268470764,
      "learning_rate": 0.0001,
      "loss": 0.5017,
      "step": 329
    },
    {
      "epoch": 0.0528,
      "grad_norm": 0.573591411113739,
      "learning_rate": 0.0001,
      "loss": 0.4626,
      "step": 330
    },
    {
      "epoch": 0.05296,
      "grad_norm": 0.31379422545433044,
      "learning_rate": 0.0001,
      "loss": 0.4729,
      "step": 331
    },
    {
      "epoch": 0.05312,
      "grad_norm": 0.47715359926223755,
      "learning_rate": 0.0001,
      "loss": 0.4629,
      "step": 332
    },
    {
      "epoch": 0.05328,
      "grad_norm": 0.35618188977241516,
      "learning_rate": 0.0001,
      "loss": 0.4628,
      "step": 333
    },
    {
      "epoch": 0.05344,
      "grad_norm": 0.41478994488716125,
      "learning_rate": 0.0001,
      "loss": 0.4603,
      "step": 334
    },
    {
      "epoch": 0.0536,
      "grad_norm": 0.343889057636261,
      "learning_rate": 0.0001,
      "loss": 0.464,
      "step": 335
    },
    {
      "epoch": 0.05376,
      "grad_norm": 0.6014657020568848,
      "learning_rate": 0.0001,
      "loss": 0.4464,
      "step": 336
    },
    {
      "epoch": 0.05392,
      "grad_norm": 0.307510107755661,
      "learning_rate": 0.0001,
      "loss": 0.4541,
      "step": 337
    },
    {
      "epoch": 0.05408,
      "grad_norm": 0.6213024854660034,
      "learning_rate": 0.0001,
      "loss": 0.452,
      "step": 338
    },
    {
      "epoch": 0.05424,
      "grad_norm": 0.35168927907943726,
      "learning_rate": 0.0001,
      "loss": 0.4532,
      "step": 339
    },
    {
      "epoch": 0.0544,
      "grad_norm": 0.6678427457809448,
      "learning_rate": 0.0001,
      "loss": 0.4268,
      "step": 340
    },
    {
      "epoch": 0.05456,
      "grad_norm": 0.3538016974925995,
      "learning_rate": 0.0001,
      "loss": 0.459,
      "step": 341
    },
    {
      "epoch": 0.05472,
      "grad_norm": 0.47895485162734985,
      "learning_rate": 0.0001,
      "loss": 0.4201,
      "step": 342
    },
    {
      "epoch": 0.05488,
      "grad_norm": 0.443058043718338,
      "learning_rate": 0.0001,
      "loss": 0.4606,
      "step": 343
    },
    {
      "epoch": 0.05504,
      "grad_norm": 0.3782863914966583,
      "learning_rate": 0.0001,
      "loss": 0.4244,
      "step": 344
    },
    {
      "epoch": 0.0552,
      "grad_norm": 0.5741382241249084,
      "learning_rate": 0.0001,
      "loss": 0.4601,
      "step": 345
    },
    {
      "epoch": 0.05536,
      "grad_norm": 0.5513052940368652,
      "learning_rate": 0.0001,
      "loss": 0.4578,
      "step": 346
    },
    {
      "epoch": 0.05552,
      "grad_norm": 0.35069555044174194,
      "learning_rate": 0.0001,
      "loss": 0.4578,
      "step": 347
    },
    {
      "epoch": 0.05568,
      "grad_norm": 0.39294037222862244,
      "learning_rate": 0.0001,
      "loss": 0.442,
      "step": 348
    },
    {
      "epoch": 0.05584,
      "grad_norm": 0.4873962998390198,
      "learning_rate": 0.0001,
      "loss": 0.4512,
      "step": 349
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.34100794792175293,
      "learning_rate": 0.0001,
      "loss": 0.4388,
      "step": 350
    },
    {
      "epoch": 0.05616,
      "grad_norm": 0.5273870825767517,
      "learning_rate": 0.0001,
      "loss": 0.4332,
      "step": 351
    },
    {
      "epoch": 0.05632,
      "grad_norm": 0.2894707918167114,
      "learning_rate": 0.0001,
      "loss": 0.4513,
      "step": 352
    },
    {
      "epoch": 0.05648,
      "grad_norm": 0.4726313352584839,
      "learning_rate": 0.0001,
      "loss": 0.4487,
      "step": 353
    },
    {
      "epoch": 0.05664,
      "grad_norm": 0.39385077357292175,
      "learning_rate": 0.0001,
      "loss": 0.4622,
      "step": 354
    },
    {
      "epoch": 0.0568,
      "grad_norm": 0.6300116777420044,
      "learning_rate": 0.0001,
      "loss": 0.4816,
      "step": 355
    },
    {
      "epoch": 0.05696,
      "grad_norm": 0.35728704929351807,
      "learning_rate": 0.0001,
      "loss": 0.4559,
      "step": 356
    },
    {
      "epoch": 0.05712,
      "grad_norm": 0.5249121785163879,
      "learning_rate": 0.0001,
      "loss": 0.4403,
      "step": 357
    },
    {
      "epoch": 0.05728,
      "grad_norm": 0.5596709847450256,
      "learning_rate": 0.0001,
      "loss": 0.4457,
      "step": 358
    },
    {
      "epoch": 0.05744,
      "grad_norm": 0.6956215500831604,
      "learning_rate": 0.0001,
      "loss": 0.4526,
      "step": 359
    },
    {
      "epoch": 0.0576,
      "grad_norm": 0.5332053303718567,
      "learning_rate": 0.0001,
      "loss": 0.4538,
      "step": 360
    },
    {
      "epoch": 0.05776,
      "grad_norm": 0.9840513467788696,
      "learning_rate": 0.0001,
      "loss": 0.4724,
      "step": 361
    },
    {
      "epoch": 0.05792,
      "grad_norm": 0.35990574955940247,
      "learning_rate": 0.0001,
      "loss": 0.4465,
      "step": 362
    },
    {
      "epoch": 0.05808,
      "grad_norm": 0.867251992225647,
      "learning_rate": 0.0001,
      "loss": 0.4518,
      "step": 363
    },
    {
      "epoch": 0.05824,
      "grad_norm": 0.38044923543930054,
      "learning_rate": 0.0001,
      "loss": 0.449,
      "step": 364
    },
    {
      "epoch": 0.0584,
      "grad_norm": 0.8680050373077393,
      "learning_rate": 0.0001,
      "loss": 0.4626,
      "step": 365
    },
    {
      "epoch": 0.05856,
      "grad_norm": 0.35011574625968933,
      "learning_rate": 0.0001,
      "loss": 0.4463,
      "step": 366
    },
    {
      "epoch": 0.05872,
      "grad_norm": 0.6814441084861755,
      "learning_rate": 0.0001,
      "loss": 0.4428,
      "step": 367
    },
    {
      "epoch": 0.05888,
      "grad_norm": 0.42806461453437805,
      "learning_rate": 0.0001,
      "loss": 0.4467,
      "step": 368
    },
    {
      "epoch": 0.05904,
      "grad_norm": 0.6081996560096741,
      "learning_rate": 0.0001,
      "loss": 0.4466,
      "step": 369
    },
    {
      "epoch": 0.0592,
      "grad_norm": 0.4169359505176544,
      "learning_rate": 0.0001,
      "loss": 0.4369,
      "step": 370
    },
    {
      "epoch": 0.05936,
      "grad_norm": 0.44009384512901306,
      "learning_rate": 0.0001,
      "loss": 0.4337,
      "step": 371
    },
    {
      "epoch": 0.05952,
      "grad_norm": 0.3625560998916626,
      "learning_rate": 0.0001,
      "loss": 0.4304,
      "step": 372
    },
    {
      "epoch": 0.05968,
      "grad_norm": 0.3135546147823334,
      "learning_rate": 0.0001,
      "loss": 0.4429,
      "step": 373
    },
    {
      "epoch": 0.05984,
      "grad_norm": 0.45795169472694397,
      "learning_rate": 0.0001,
      "loss": 0.4259,
      "step": 374
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.3026619851589203,
      "learning_rate": 0.0001,
      "loss": 0.4475,
      "step": 375
    },
    {
      "epoch": 0.06016,
      "grad_norm": 0.4643794000148773,
      "learning_rate": 0.0001,
      "loss": 0.4338,
      "step": 376
    },
    {
      "epoch": 0.06032,
      "grad_norm": 0.3092113137245178,
      "learning_rate": 0.0001,
      "loss": 0.4151,
      "step": 377
    },
    {
      "epoch": 0.06048,
      "grad_norm": 0.321429044008255,
      "learning_rate": 0.0001,
      "loss": 0.4301,
      "step": 378
    },
    {
      "epoch": 0.06064,
      "grad_norm": 0.35766372084617615,
      "learning_rate": 0.0001,
      "loss": 0.4393,
      "step": 379
    },
    {
      "epoch": 0.0608,
      "grad_norm": 0.32079440355300903,
      "learning_rate": 0.0001,
      "loss": 0.439,
      "step": 380
    },
    {
      "epoch": 0.06096,
      "grad_norm": 0.3738339841365814,
      "learning_rate": 0.0001,
      "loss": 0.4446,
      "step": 381
    },
    {
      "epoch": 0.06112,
      "grad_norm": 0.4703870713710785,
      "learning_rate": 0.0001,
      "loss": 0.4545,
      "step": 382
    },
    {
      "epoch": 0.06128,
      "grad_norm": 0.5703818202018738,
      "learning_rate": 0.0001,
      "loss": 0.4421,
      "step": 383
    },
    {
      "epoch": 0.06144,
      "grad_norm": 0.40408891439437866,
      "learning_rate": 0.0001,
      "loss": 0.4416,
      "step": 384
    },
    {
      "epoch": 0.0616,
      "grad_norm": 0.2686176896095276,
      "learning_rate": 0.0001,
      "loss": 0.4264,
      "step": 385
    },
    {
      "epoch": 0.06176,
      "grad_norm": 0.36757513880729675,
      "learning_rate": 0.0001,
      "loss": 0.4475,
      "step": 386
    },
    {
      "epoch": 0.06192,
      "grad_norm": 0.3867359459400177,
      "learning_rate": 0.0001,
      "loss": 0.4324,
      "step": 387
    },
    {
      "epoch": 0.06208,
      "grad_norm": 0.3213803172111511,
      "learning_rate": 0.0001,
      "loss": 0.4215,
      "step": 388
    },
    {
      "epoch": 0.06224,
      "grad_norm": 0.36640051007270813,
      "learning_rate": 0.0001,
      "loss": 0.4318,
      "step": 389
    },
    {
      "epoch": 0.0624,
      "grad_norm": 0.31067487597465515,
      "learning_rate": 0.0001,
      "loss": 0.428,
      "step": 390
    },
    {
      "epoch": 0.06256,
      "grad_norm": 0.4344998598098755,
      "learning_rate": 0.0001,
      "loss": 0.4292,
      "step": 391
    },
    {
      "epoch": 0.06272,
      "grad_norm": 0.40031898021698,
      "learning_rate": 0.0001,
      "loss": 0.426,
      "step": 392
    },
    {
      "epoch": 0.06288,
      "grad_norm": 0.3194144666194916,
      "learning_rate": 0.0001,
      "loss": 0.4256,
      "step": 393
    },
    {
      "epoch": 0.06304,
      "grad_norm": 0.48375973105430603,
      "learning_rate": 0.0001,
      "loss": 0.4362,
      "step": 394
    },
    {
      "epoch": 0.0632,
      "grad_norm": 0.4062870740890503,
      "learning_rate": 0.0001,
      "loss": 0.4167,
      "step": 395
    },
    {
      "epoch": 0.06336,
      "grad_norm": 0.47817492485046387,
      "learning_rate": 0.0001,
      "loss": 0.4509,
      "step": 396
    },
    {
      "epoch": 0.06352,
      "grad_norm": 0.3198767900466919,
      "learning_rate": 0.0001,
      "loss": 0.4217,
      "step": 397
    },
    {
      "epoch": 0.06368,
      "grad_norm": 0.33142685890197754,
      "learning_rate": 0.0001,
      "loss": 0.4269,
      "step": 398
    },
    {
      "epoch": 0.06384,
      "grad_norm": 0.5914079546928406,
      "learning_rate": 0.0001,
      "loss": 0.4391,
      "step": 399
    },
    {
      "epoch": 0.064,
      "grad_norm": 1.0199495553970337,
      "learning_rate": 0.0001,
      "loss": 0.4316,
      "step": 400
    },
    {
      "epoch": 0.064,
      "eval_train_accuracy": 0.4992,
      "eval_train_loss": 0.42218729853630066,
      "eval_train_runtime": 4.3482,
      "eval_train_samples_per_second": 1149.898,
      "eval_train_steps_per_second": 14.489,
      "step": 400
    },
    {
      "epoch": 0.064,
      "eval_test_accuracy": 0.5084,
      "eval_test_loss": 0.4208557605743408,
      "eval_test_runtime": 4.8711,
      "eval_test_samples_per_second": 1026.454,
      "eval_test_steps_per_second": 12.933,
      "step": 400
    },
    {
      "epoch": 0.06416,
      "grad_norm": 1.16423499584198,
      "learning_rate": 0.0001,
      "loss": 0.4444,
      "step": 401
    },
    {
      "epoch": 0.06432,
      "grad_norm": 0.5907636284828186,
      "learning_rate": 0.0001,
      "loss": 0.4453,
      "step": 402
    },
    {
      "epoch": 0.06448,
      "grad_norm": 0.45127758383750916,
      "learning_rate": 0.0001,
      "loss": 0.4259,
      "step": 403
    },
    {
      "epoch": 0.06464,
      "grad_norm": 0.32561877369880676,
      "learning_rate": 0.0001,
      "loss": 0.4326,
      "step": 404
    },
    {
      "epoch": 0.0648,
      "grad_norm": 0.30277374386787415,
      "learning_rate": 0.0001,
      "loss": 0.4373,
      "step": 405
    },
    {
      "epoch": 0.06496,
      "grad_norm": 0.4011163115501404,
      "learning_rate": 0.0001,
      "loss": 0.4092,
      "step": 406
    },
    {
      "epoch": 0.06512,
      "grad_norm": 0.4905720353126526,
      "learning_rate": 0.0001,
      "loss": 0.4179,
      "step": 407
    },
    {
      "epoch": 0.06528,
      "grad_norm": 0.46652498841285706,
      "learning_rate": 0.0001,
      "loss": 0.4261,
      "step": 408
    },
    {
      "epoch": 0.06544,
      "grad_norm": 0.38418087363243103,
      "learning_rate": 0.0001,
      "loss": 0.4282,
      "step": 409
    },
    {
      "epoch": 0.0656,
      "grad_norm": 0.46664631366729736,
      "learning_rate": 0.0001,
      "loss": 0.4285,
      "step": 410
    },
    {
      "epoch": 0.06576,
      "grad_norm": 0.3861197531223297,
      "learning_rate": 0.0001,
      "loss": 0.4176,
      "step": 411
    },
    {
      "epoch": 0.06592,
      "grad_norm": 0.4584300220012665,
      "learning_rate": 0.0001,
      "loss": 0.4178,
      "step": 412
    },
    {
      "epoch": 0.06608,
      "grad_norm": 0.33803790807724,
      "learning_rate": 0.0001,
      "loss": 0.408,
      "step": 413
    },
    {
      "epoch": 0.06624,
      "grad_norm": 0.353893905878067,
      "learning_rate": 0.0001,
      "loss": 0.4015,
      "step": 414
    },
    {
      "epoch": 0.0664,
      "grad_norm": 0.37201905250549316,
      "learning_rate": 0.0001,
      "loss": 0.4097,
      "step": 415
    },
    {
      "epoch": 0.06656,
      "grad_norm": 0.33199092745780945,
      "learning_rate": 0.0001,
      "loss": 0.3934,
      "step": 416
    },
    {
      "epoch": 0.06672,
      "grad_norm": 0.6304075717926025,
      "learning_rate": 0.0001,
      "loss": 0.4174,
      "step": 417
    },
    {
      "epoch": 0.06688,
      "grad_norm": 1.0493485927581787,
      "learning_rate": 0.0001,
      "loss": 0.4088,
      "step": 418
    },
    {
      "epoch": 0.06704,
      "grad_norm": 1.2452231645584106,
      "learning_rate": 0.0001,
      "loss": 0.4394,
      "step": 419
    },
    {
      "epoch": 0.0672,
      "grad_norm": 0.6094799637794495,
      "learning_rate": 0.0001,
      "loss": 0.4231,
      "step": 420
    },
    {
      "epoch": 0.06736,
      "grad_norm": 0.4679381251335144,
      "learning_rate": 0.0001,
      "loss": 0.3883,
      "step": 421
    },
    {
      "epoch": 0.06752,
      "grad_norm": 0.4303785562515259,
      "learning_rate": 0.0001,
      "loss": 0.4061,
      "step": 422
    },
    {
      "epoch": 0.06768,
      "grad_norm": 0.39590713381767273,
      "learning_rate": 0.0001,
      "loss": 0.4153,
      "step": 423
    },
    {
      "epoch": 0.06784,
      "grad_norm": 0.41830217838287354,
      "learning_rate": 0.0001,
      "loss": 0.4289,
      "step": 424
    },
    {
      "epoch": 0.068,
      "grad_norm": 0.38808631896972656,
      "learning_rate": 0.0001,
      "loss": 0.4173,
      "step": 425
    },
    {
      "epoch": 0.06816,
      "grad_norm": 0.3669370412826538,
      "learning_rate": 0.0001,
      "loss": 0.404,
      "step": 426
    },
    {
      "epoch": 0.06832,
      "grad_norm": 0.40234705805778503,
      "learning_rate": 0.0001,
      "loss": 0.3867,
      "step": 427
    },
    {
      "epoch": 0.06848,
      "grad_norm": 0.6395906209945679,
      "learning_rate": 0.0001,
      "loss": 0.4222,
      "step": 428
    },
    {
      "epoch": 0.06864,
      "grad_norm": 0.6344679594039917,
      "learning_rate": 0.0001,
      "loss": 0.4117,
      "step": 429
    },
    {
      "epoch": 0.0688,
      "grad_norm": 0.7349144816398621,
      "learning_rate": 0.0001,
      "loss": 0.4102,
      "step": 430
    },
    {
      "epoch": 0.06896,
      "grad_norm": 0.808068573474884,
      "learning_rate": 0.0001,
      "loss": 0.3923,
      "step": 431
    },
    {
      "epoch": 0.06912,
      "grad_norm": 0.515073299407959,
      "learning_rate": 0.0001,
      "loss": 0.3986,
      "step": 432
    },
    {
      "epoch": 0.06928,
      "grad_norm": 0.9415417313575745,
      "learning_rate": 0.0001,
      "loss": 0.4075,
      "step": 433
    },
    {
      "epoch": 0.06944,
      "grad_norm": 0.39186564087867737,
      "learning_rate": 0.0001,
      "loss": 0.4065,
      "step": 434
    },
    {
      "epoch": 0.0696,
      "grad_norm": 1.2323294878005981,
      "learning_rate": 0.0001,
      "loss": 0.4225,
      "step": 435
    },
    {
      "epoch": 0.06976,
      "grad_norm": 0.40406543016433716,
      "learning_rate": 0.0001,
      "loss": 0.4314,
      "step": 436
    },
    {
      "epoch": 0.06992,
      "grad_norm": 0.4986034333705902,
      "learning_rate": 0.0001,
      "loss": 0.4051,
      "step": 437
    },
    {
      "epoch": 0.07008,
      "grad_norm": 0.3598973751068115,
      "learning_rate": 0.0001,
      "loss": 0.3935,
      "step": 438
    },
    {
      "epoch": 0.07024,
      "grad_norm": 0.45362260937690735,
      "learning_rate": 0.0001,
      "loss": 0.4137,
      "step": 439
    },
    {
      "epoch": 0.0704,
      "grad_norm": 0.3857596814632416,
      "learning_rate": 0.0001,
      "loss": 0.4002,
      "step": 440
    },
    {
      "epoch": 0.07056,
      "grad_norm": 0.27322402596473694,
      "learning_rate": 0.0001,
      "loss": 0.3932,
      "step": 441
    },
    {
      "epoch": 0.07072,
      "grad_norm": 0.43799445033073425,
      "learning_rate": 0.0001,
      "loss": 0.4134,
      "step": 442
    },
    {
      "epoch": 0.07088,
      "grad_norm": 0.3044508993625641,
      "learning_rate": 0.0001,
      "loss": 0.3983,
      "step": 443
    },
    {
      "epoch": 0.07104,
      "grad_norm": 0.3795570731163025,
      "learning_rate": 0.0001,
      "loss": 0.4022,
      "step": 444
    },
    {
      "epoch": 0.0712,
      "grad_norm": 0.5302649736404419,
      "learning_rate": 0.0001,
      "loss": 0.415,
      "step": 445
    },
    {
      "epoch": 0.07136,
      "grad_norm": 0.3931788504123688,
      "learning_rate": 0.0001,
      "loss": 0.4261,
      "step": 446
    },
    {
      "epoch": 0.07152,
      "grad_norm": 0.4514508843421936,
      "learning_rate": 0.0001,
      "loss": 0.4049,
      "step": 447
    },
    {
      "epoch": 0.07168,
      "grad_norm": 0.49725350737571716,
      "learning_rate": 0.0001,
      "loss": 0.3936,
      "step": 448
    },
    {
      "epoch": 0.07184,
      "grad_norm": 0.27177146077156067,
      "learning_rate": 0.0001,
      "loss": 0.4038,
      "step": 449
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.3323196470737457,
      "learning_rate": 0.0001,
      "loss": 0.3929,
      "step": 450
    },
    {
      "epoch": 0.07216,
      "grad_norm": 0.32708361744880676,
      "learning_rate": 0.0001,
      "loss": 0.4149,
      "step": 451
    },
    {
      "epoch": 0.07232,
      "grad_norm": 0.6556123495101929,
      "learning_rate": 0.0001,
      "loss": 0.3998,
      "step": 452
    },
    {
      "epoch": 0.07248,
      "grad_norm": 0.6271286606788635,
      "learning_rate": 0.0001,
      "loss": 0.4026,
      "step": 453
    },
    {
      "epoch": 0.07264,
      "grad_norm": 0.3493739366531372,
      "learning_rate": 0.0001,
      "loss": 0.3957,
      "step": 454
    },
    {
      "epoch": 0.0728,
      "grad_norm": 0.5657927989959717,
      "learning_rate": 0.0001,
      "loss": 0.398,
      "step": 455
    },
    {
      "epoch": 0.07296,
      "grad_norm": 0.3210761249065399,
      "learning_rate": 0.0001,
      "loss": 0.4104,
      "step": 456
    },
    {
      "epoch": 0.07312,
      "grad_norm": 0.4287351965904236,
      "learning_rate": 0.0001,
      "loss": 0.3972,
      "step": 457
    },
    {
      "epoch": 0.07328,
      "grad_norm": 0.2677478492259979,
      "learning_rate": 0.0001,
      "loss": 0.4016,
      "step": 458
    },
    {
      "epoch": 0.07344,
      "grad_norm": 0.45690470933914185,
      "learning_rate": 0.0001,
      "loss": 0.425,
      "step": 459
    },
    {
      "epoch": 0.0736,
      "grad_norm": 0.2560800611972809,
      "learning_rate": 0.0001,
      "loss": 0.3892,
      "step": 460
    },
    {
      "epoch": 0.07376,
      "grad_norm": 0.3919265866279602,
      "learning_rate": 0.0001,
      "loss": 0.4098,
      "step": 461
    },
    {
      "epoch": 0.07392,
      "grad_norm": 0.2604879140853882,
      "learning_rate": 0.0001,
      "loss": 0.4071,
      "step": 462
    },
    {
      "epoch": 0.07408,
      "grad_norm": 0.362783819437027,
      "learning_rate": 0.0001,
      "loss": 0.4215,
      "step": 463
    },
    {
      "epoch": 0.07424,
      "grad_norm": 0.33888348937034607,
      "learning_rate": 0.0001,
      "loss": 0.4025,
      "step": 464
    },
    {
      "epoch": 0.0744,
      "grad_norm": 0.250728577375412,
      "learning_rate": 0.0001,
      "loss": 0.4045,
      "step": 465
    },
    {
      "epoch": 0.07456,
      "grad_norm": 0.25229406356811523,
      "learning_rate": 0.0001,
      "loss": 0.3973,
      "step": 466
    },
    {
      "epoch": 0.07472,
      "grad_norm": 0.29204878211021423,
      "learning_rate": 0.0001,
      "loss": 0.3913,
      "step": 467
    },
    {
      "epoch": 0.07488,
      "grad_norm": 0.24995487928390503,
      "learning_rate": 0.0001,
      "loss": 0.3903,
      "step": 468
    },
    {
      "epoch": 0.07504,
      "grad_norm": 0.2756747901439667,
      "learning_rate": 0.0001,
      "loss": 0.3945,
      "step": 469
    },
    {
      "epoch": 0.0752,
      "grad_norm": 0.27267196774482727,
      "learning_rate": 0.0001,
      "loss": 0.3992,
      "step": 470
    },
    {
      "epoch": 0.07536,
      "grad_norm": 0.26545974612236023,
      "learning_rate": 0.0001,
      "loss": 0.3833,
      "step": 471
    },
    {
      "epoch": 0.07552,
      "grad_norm": 0.37887993454933167,
      "learning_rate": 0.0001,
      "loss": 0.3922,
      "step": 472
    },
    {
      "epoch": 0.07568,
      "grad_norm": 0.28686729073524475,
      "learning_rate": 0.0001,
      "loss": 0.4014,
      "step": 473
    },
    {
      "epoch": 0.07584,
      "grad_norm": 0.2957600951194763,
      "learning_rate": 0.0001,
      "loss": 0.3986,
      "step": 474
    },
    {
      "epoch": 0.076,
      "grad_norm": 0.2948113679885864,
      "learning_rate": 0.0001,
      "loss": 0.3881,
      "step": 475
    },
    {
      "epoch": 0.07616,
      "grad_norm": 0.2628563940525055,
      "learning_rate": 0.0001,
      "loss": 0.3978,
      "step": 476
    },
    {
      "epoch": 0.07632,
      "grad_norm": 0.444179892539978,
      "learning_rate": 0.0001,
      "loss": 0.4203,
      "step": 477
    },
    {
      "epoch": 0.07648,
      "grad_norm": 0.3236483633518219,
      "learning_rate": 0.0001,
      "loss": 0.393,
      "step": 478
    },
    {
      "epoch": 0.07664,
      "grad_norm": 0.32504644989967346,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 479
    },
    {
      "epoch": 0.0768,
      "grad_norm": 0.29062241315841675,
      "learning_rate": 0.0001,
      "loss": 0.3828,
      "step": 480
    },
    {
      "epoch": 0.07696,
      "grad_norm": 0.27614134550094604,
      "learning_rate": 0.0001,
      "loss": 0.3929,
      "step": 481
    },
    {
      "epoch": 0.07712,
      "grad_norm": 0.3359411656856537,
      "learning_rate": 0.0001,
      "loss": 0.4091,
      "step": 482
    },
    {
      "epoch": 0.07728,
      "grad_norm": 0.24516139924526215,
      "learning_rate": 0.0001,
      "loss": 0.4013,
      "step": 483
    },
    {
      "epoch": 0.07744,
      "grad_norm": 0.2679748237133026,
      "learning_rate": 0.0001,
      "loss": 0.3984,
      "step": 484
    },
    {
      "epoch": 0.0776,
      "grad_norm": 0.2994062304496765,
      "learning_rate": 0.0001,
      "loss": 0.3901,
      "step": 485
    },
    {
      "epoch": 0.07776,
      "grad_norm": 0.332659512758255,
      "learning_rate": 0.0001,
      "loss": 0.3984,
      "step": 486
    },
    {
      "epoch": 0.07792,
      "grad_norm": 0.42973971366882324,
      "learning_rate": 0.0001,
      "loss": 0.393,
      "step": 487
    },
    {
      "epoch": 0.07808,
      "grad_norm": 0.405840128660202,
      "learning_rate": 0.0001,
      "loss": 0.3886,
      "step": 488
    },
    {
      "epoch": 0.07824,
      "grad_norm": 0.32166972756385803,
      "learning_rate": 0.0001,
      "loss": 0.3977,
      "step": 489
    },
    {
      "epoch": 0.0784,
      "grad_norm": 0.3957519829273224,
      "learning_rate": 0.0001,
      "loss": 0.3935,
      "step": 490
    },
    {
      "epoch": 0.07856,
      "grad_norm": 0.8000646829605103,
      "learning_rate": 0.0001,
      "loss": 0.4107,
      "step": 491
    },
    {
      "epoch": 0.07872,
      "grad_norm": 0.751928448677063,
      "learning_rate": 0.0001,
      "loss": 0.3958,
      "step": 492
    },
    {
      "epoch": 0.07888,
      "grad_norm": 0.3278470039367676,
      "learning_rate": 0.0001,
      "loss": 0.3939,
      "step": 493
    },
    {
      "epoch": 0.07904,
      "grad_norm": 0.635106086730957,
      "learning_rate": 0.0001,
      "loss": 0.3998,
      "step": 494
    },
    {
      "epoch": 0.0792,
      "grad_norm": 0.3980269730091095,
      "learning_rate": 0.0001,
      "loss": 0.3923,
      "step": 495
    },
    {
      "epoch": 0.07936,
      "grad_norm": 0.45266464352607727,
      "learning_rate": 0.0001,
      "loss": 0.3856,
      "step": 496
    },
    {
      "epoch": 0.07952,
      "grad_norm": 0.5241808295249939,
      "learning_rate": 0.0001,
      "loss": 0.3945,
      "step": 497
    },
    {
      "epoch": 0.07968,
      "grad_norm": 0.4541044533252716,
      "learning_rate": 0.0001,
      "loss": 0.3809,
      "step": 498
    },
    {
      "epoch": 0.07984,
      "grad_norm": 0.4127626121044159,
      "learning_rate": 0.0001,
      "loss": 0.3823,
      "step": 499
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.38073694705963135,
      "learning_rate": 0.0001,
      "loss": 0.3837,
      "step": 500
    },
    {
      "epoch": 0.08,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3890226483345032,
      "eval_train_runtime": 4.2248,
      "eval_train_samples_per_second": 1183.495,
      "eval_train_steps_per_second": 14.912,
      "step": 500
    },
    {
      "epoch": 0.08,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.387285977602005,
      "eval_test_runtime": 4.5075,
      "eval_test_samples_per_second": 1109.271,
      "eval_test_steps_per_second": 13.977,
      "step": 500
    },
    {
      "epoch": 0.08016,
      "grad_norm": 0.4882318377494812,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 501
    },
    {
      "epoch": 0.08032,
      "grad_norm": 0.23361380398273468,
      "learning_rate": 0.0001,
      "loss": 0.3648,
      "step": 502
    },
    {
      "epoch": 0.08048,
      "grad_norm": 0.571707010269165,
      "learning_rate": 0.0001,
      "loss": 0.4096,
      "step": 503
    },
    {
      "epoch": 0.08064,
      "grad_norm": 0.2605324387550354,
      "learning_rate": 0.0001,
      "loss": 0.3878,
      "step": 504
    },
    {
      "epoch": 0.0808,
      "grad_norm": 0.3675616383552551,
      "learning_rate": 0.0001,
      "loss": 0.3941,
      "step": 505
    },
    {
      "epoch": 0.08096,
      "grad_norm": 0.26886308193206787,
      "learning_rate": 0.0001,
      "loss": 0.3939,
      "step": 506
    },
    {
      "epoch": 0.08112,
      "grad_norm": 0.3044634759426117,
      "learning_rate": 0.0001,
      "loss": 0.412,
      "step": 507
    },
    {
      "epoch": 0.08128,
      "grad_norm": 0.2670927345752716,
      "learning_rate": 0.0001,
      "loss": 0.3915,
      "step": 508
    },
    {
      "epoch": 0.08144,
      "grad_norm": 0.2887152433395386,
      "learning_rate": 0.0001,
      "loss": 0.3954,
      "step": 509
    },
    {
      "epoch": 0.0816,
      "grad_norm": 0.29956769943237305,
      "learning_rate": 0.0001,
      "loss": 0.3887,
      "step": 510
    },
    {
      "epoch": 0.08176,
      "grad_norm": 0.2298635095357895,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 511
    },
    {
      "epoch": 0.08192,
      "grad_norm": 0.28352630138397217,
      "learning_rate": 0.0001,
      "loss": 0.3685,
      "step": 512
    },
    {
      "epoch": 0.08208,
      "grad_norm": 0.26997068524360657,
      "learning_rate": 0.0001,
      "loss": 0.405,
      "step": 513
    },
    {
      "epoch": 0.08224,
      "grad_norm": 0.2953471839427948,
      "learning_rate": 0.0001,
      "loss": 0.3928,
      "step": 514
    },
    {
      "epoch": 0.0824,
      "grad_norm": 0.23671114444732666,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 515
    },
    {
      "epoch": 0.08256,
      "grad_norm": 0.38417738676071167,
      "learning_rate": 0.0001,
      "loss": 0.4004,
      "step": 516
    },
    {
      "epoch": 0.08272,
      "grad_norm": 0.2520388960838318,
      "learning_rate": 0.0001,
      "loss": 0.3829,
      "step": 517
    },
    {
      "epoch": 0.08288,
      "grad_norm": 0.3937288820743561,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 518
    },
    {
      "epoch": 0.08304,
      "grad_norm": 0.2534836232662201,
      "learning_rate": 0.0001,
      "loss": 0.4045,
      "step": 519
    },
    {
      "epoch": 0.0832,
      "grad_norm": 0.24774260818958282,
      "learning_rate": 0.0001,
      "loss": 0.3707,
      "step": 520
    },
    {
      "epoch": 0.08336,
      "grad_norm": 0.29582321643829346,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 521
    },
    {
      "epoch": 0.08352,
      "grad_norm": 0.27917900681495667,
      "learning_rate": 0.0001,
      "loss": 0.3917,
      "step": 522
    },
    {
      "epoch": 0.08368,
      "grad_norm": 0.2832454442977905,
      "learning_rate": 0.0001,
      "loss": 0.3743,
      "step": 523
    },
    {
      "epoch": 0.08384,
      "grad_norm": 0.2669735848903656,
      "learning_rate": 0.0001,
      "loss": 0.384,
      "step": 524
    },
    {
      "epoch": 0.084,
      "grad_norm": 0.2653855085372925,
      "learning_rate": 0.0001,
      "loss": 0.395,
      "step": 525
    },
    {
      "epoch": 0.08416,
      "grad_norm": 0.22626416385173798,
      "learning_rate": 0.0001,
      "loss": 0.383,
      "step": 526
    },
    {
      "epoch": 0.08432,
      "grad_norm": 0.23681418597698212,
      "learning_rate": 0.0001,
      "loss": 0.3819,
      "step": 527
    },
    {
      "epoch": 0.08448,
      "grad_norm": 0.3067173957824707,
      "learning_rate": 0.0001,
      "loss": 0.3961,
      "step": 528
    },
    {
      "epoch": 0.08464,
      "grad_norm": 0.2483433187007904,
      "learning_rate": 0.0001,
      "loss": 0.3982,
      "step": 529
    },
    {
      "epoch": 0.0848,
      "grad_norm": 0.26915442943573,
      "learning_rate": 0.0001,
      "loss": 0.3882,
      "step": 530
    },
    {
      "epoch": 0.08496,
      "grad_norm": 0.3178536295890808,
      "learning_rate": 0.0001,
      "loss": 0.3975,
      "step": 531
    },
    {
      "epoch": 0.08512,
      "grad_norm": 0.28550124168395996,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 532
    },
    {
      "epoch": 0.08528,
      "grad_norm": 0.29499706625938416,
      "learning_rate": 0.0001,
      "loss": 0.3939,
      "step": 533
    },
    {
      "epoch": 0.08544,
      "grad_norm": 0.22985677421092987,
      "learning_rate": 0.0001,
      "loss": 0.3802,
      "step": 534
    },
    {
      "epoch": 0.0856,
      "grad_norm": 0.256203293800354,
      "learning_rate": 0.0001,
      "loss": 0.3762,
      "step": 535
    },
    {
      "epoch": 0.08576,
      "grad_norm": 0.25480690598487854,
      "learning_rate": 0.0001,
      "loss": 0.3948,
      "step": 536
    },
    {
      "epoch": 0.08592,
      "grad_norm": 0.26410341262817383,
      "learning_rate": 0.0001,
      "loss": 0.3912,
      "step": 537
    },
    {
      "epoch": 0.08608,
      "grad_norm": 0.2080579698085785,
      "learning_rate": 0.0001,
      "loss": 0.385,
      "step": 538
    },
    {
      "epoch": 0.08624,
      "grad_norm": 0.22037474811077118,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 539
    },
    {
      "epoch": 0.0864,
      "grad_norm": 0.2505066990852356,
      "learning_rate": 0.0001,
      "loss": 0.3876,
      "step": 540
    },
    {
      "epoch": 0.08656,
      "grad_norm": 0.22637759149074554,
      "learning_rate": 0.0001,
      "loss": 0.3805,
      "step": 541
    },
    {
      "epoch": 0.08672,
      "grad_norm": 0.24287280440330505,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 542
    },
    {
      "epoch": 0.08688,
      "grad_norm": 0.30230289697647095,
      "learning_rate": 0.0001,
      "loss": 0.3808,
      "step": 543
    },
    {
      "epoch": 0.08704,
      "grad_norm": 0.23359639942646027,
      "learning_rate": 0.0001,
      "loss": 0.3881,
      "step": 544
    },
    {
      "epoch": 0.0872,
      "grad_norm": 0.25520214438438416,
      "learning_rate": 0.0001,
      "loss": 0.3903,
      "step": 545
    },
    {
      "epoch": 0.08736,
      "grad_norm": 0.22881238162517548,
      "learning_rate": 0.0001,
      "loss": 0.3923,
      "step": 546
    },
    {
      "epoch": 0.08752,
      "grad_norm": 0.2301057130098343,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 547
    },
    {
      "epoch": 0.08768,
      "grad_norm": 0.22464028000831604,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 548
    },
    {
      "epoch": 0.08784,
      "grad_norm": 0.20893977582454681,
      "learning_rate": 0.0001,
      "loss": 0.3832,
      "step": 549
    },
    {
      "epoch": 0.088,
      "grad_norm": 0.2522372007369995,
      "learning_rate": 0.0001,
      "loss": 0.395,
      "step": 550
    },
    {
      "epoch": 0.08816,
      "grad_norm": 0.21029269695281982,
      "learning_rate": 0.0001,
      "loss": 0.3832,
      "step": 551
    },
    {
      "epoch": 0.08832,
      "grad_norm": 0.23176532983779907,
      "learning_rate": 0.0001,
      "loss": 0.3814,
      "step": 552
    },
    {
      "epoch": 0.08848,
      "grad_norm": 0.23812542855739594,
      "learning_rate": 0.0001,
      "loss": 0.3909,
      "step": 553
    },
    {
      "epoch": 0.08864,
      "grad_norm": 0.26078253984451294,
      "learning_rate": 0.0001,
      "loss": 0.3966,
      "step": 554
    },
    {
      "epoch": 0.0888,
      "grad_norm": 0.2208440601825714,
      "learning_rate": 0.0001,
      "loss": 0.3832,
      "step": 555
    },
    {
      "epoch": 0.08896,
      "grad_norm": 0.2599398195743561,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 556
    },
    {
      "epoch": 0.08912,
      "grad_norm": 0.2994251251220703,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 557
    },
    {
      "epoch": 0.08928,
      "grad_norm": 0.21921689808368683,
      "learning_rate": 0.0001,
      "loss": 0.3864,
      "step": 558
    },
    {
      "epoch": 0.08944,
      "grad_norm": 0.20407164096832275,
      "learning_rate": 0.0001,
      "loss": 0.3777,
      "step": 559
    },
    {
      "epoch": 0.0896,
      "grad_norm": 0.25632327795028687,
      "learning_rate": 0.0001,
      "loss": 0.3911,
      "step": 560
    },
    {
      "epoch": 0.08976,
      "grad_norm": 0.24950185418128967,
      "learning_rate": 0.0001,
      "loss": 0.3992,
      "step": 561
    },
    {
      "epoch": 0.08992,
      "grad_norm": 0.27620700001716614,
      "learning_rate": 0.0001,
      "loss": 0.3964,
      "step": 562
    },
    {
      "epoch": 0.09008,
      "grad_norm": 0.266605019569397,
      "learning_rate": 0.0001,
      "loss": 0.3865,
      "step": 563
    },
    {
      "epoch": 0.09024,
      "grad_norm": 0.30868908762931824,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 564
    },
    {
      "epoch": 0.0904,
      "grad_norm": 0.23732270300388336,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 565
    },
    {
      "epoch": 0.09056,
      "grad_norm": 0.18655166029930115,
      "learning_rate": 0.0001,
      "loss": 0.3746,
      "step": 566
    },
    {
      "epoch": 0.09072,
      "grad_norm": 0.28634533286094666,
      "learning_rate": 0.0001,
      "loss": 0.3859,
      "step": 567
    },
    {
      "epoch": 0.09088,
      "grad_norm": 0.2732689678668976,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 568
    },
    {
      "epoch": 0.09104,
      "grad_norm": 0.22175782918930054,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 569
    },
    {
      "epoch": 0.0912,
      "grad_norm": 0.37788695096969604,
      "learning_rate": 0.0001,
      "loss": 0.3966,
      "step": 570
    },
    {
      "epoch": 0.09136,
      "grad_norm": 0.2101757675409317,
      "learning_rate": 0.0001,
      "loss": 0.381,
      "step": 571
    },
    {
      "epoch": 0.09152,
      "grad_norm": 0.21807074546813965,
      "learning_rate": 0.0001,
      "loss": 0.3853,
      "step": 572
    },
    {
      "epoch": 0.09168,
      "grad_norm": 0.27082914113998413,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 573
    },
    {
      "epoch": 0.09184,
      "grad_norm": 0.26009809970855713,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 574
    },
    {
      "epoch": 0.092,
      "grad_norm": 0.21578674018383026,
      "learning_rate": 0.0001,
      "loss": 0.3863,
      "step": 575
    },
    {
      "epoch": 0.09216,
      "grad_norm": 0.26952266693115234,
      "learning_rate": 0.0001,
      "loss": 0.3919,
      "step": 576
    },
    {
      "epoch": 0.09232,
      "grad_norm": 0.4141083061695099,
      "learning_rate": 0.0001,
      "loss": 0.3977,
      "step": 577
    },
    {
      "epoch": 0.09248,
      "grad_norm": 0.2298695296049118,
      "learning_rate": 0.0001,
      "loss": 0.3906,
      "step": 578
    },
    {
      "epoch": 0.09264,
      "grad_norm": 0.5226332545280457,
      "learning_rate": 0.0001,
      "loss": 0.3915,
      "step": 579
    },
    {
      "epoch": 0.0928,
      "grad_norm": 0.3360500931739807,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 580
    },
    {
      "epoch": 0.09296,
      "grad_norm": 0.2815108299255371,
      "learning_rate": 0.0001,
      "loss": 0.3872,
      "step": 581
    },
    {
      "epoch": 0.09312,
      "grad_norm": 0.688308835029602,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 582
    },
    {
      "epoch": 0.09328,
      "grad_norm": 0.19827334582805634,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 583
    },
    {
      "epoch": 0.09344,
      "grad_norm": 0.2517041563987732,
      "learning_rate": 0.0001,
      "loss": 0.3886,
      "step": 584
    },
    {
      "epoch": 0.0936,
      "grad_norm": 0.6681817173957825,
      "learning_rate": 0.0001,
      "loss": 0.3771,
      "step": 585
    },
    {
      "epoch": 0.09376,
      "grad_norm": 0.2011677473783493,
      "learning_rate": 0.0001,
      "loss": 0.3753,
      "step": 586
    },
    {
      "epoch": 0.09392,
      "grad_norm": 0.47789427638053894,
      "learning_rate": 0.0001,
      "loss": 0.3937,
      "step": 587
    },
    {
      "epoch": 0.09408,
      "grad_norm": 0.4253694415092468,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 588
    },
    {
      "epoch": 0.09424,
      "grad_norm": 0.36020031571388245,
      "learning_rate": 0.0001,
      "loss": 0.384,
      "step": 589
    },
    {
      "epoch": 0.0944,
      "grad_norm": 0.21620744466781616,
      "learning_rate": 0.0001,
      "loss": 0.3913,
      "step": 590
    },
    {
      "epoch": 0.09456,
      "grad_norm": 0.40034496784210205,
      "learning_rate": 0.0001,
      "loss": 0.3925,
      "step": 591
    },
    {
      "epoch": 0.09472,
      "grad_norm": 0.28442883491516113,
      "learning_rate": 0.0001,
      "loss": 0.3836,
      "step": 592
    },
    {
      "epoch": 0.09488,
      "grad_norm": 0.23294876515865326,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 593
    },
    {
      "epoch": 0.09504,
      "grad_norm": 0.2634163796901703,
      "learning_rate": 0.0001,
      "loss": 0.3663,
      "step": 594
    },
    {
      "epoch": 0.0952,
      "grad_norm": 0.24324433505535126,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 595
    },
    {
      "epoch": 0.09536,
      "grad_norm": 0.3640804588794708,
      "learning_rate": 0.0001,
      "loss": 0.3861,
      "step": 596
    },
    {
      "epoch": 0.09552,
      "grad_norm": 0.2322969287633896,
      "learning_rate": 0.0001,
      "loss": 0.3836,
      "step": 597
    },
    {
      "epoch": 0.09568,
      "grad_norm": 0.19535598158836365,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 598
    },
    {
      "epoch": 0.09584,
      "grad_norm": 0.3814096748828888,
      "learning_rate": 0.0001,
      "loss": 0.3851,
      "step": 599
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.21106262505054474,
      "learning_rate": 0.0001,
      "loss": 0.3719,
      "step": 600
    },
    {
      "epoch": 0.096,
      "eval_train_accuracy": 0.5064,
      "eval_train_loss": 0.37843936681747437,
      "eval_train_runtime": 4.0492,
      "eval_train_samples_per_second": 1234.798,
      "eval_train_steps_per_second": 15.558,
      "step": 600
    },
    {
      "epoch": 0.096,
      "eval_test_accuracy": 0.5024,
      "eval_test_loss": 0.377149760723114,
      "eval_test_runtime": 4.7767,
      "eval_test_samples_per_second": 1046.745,
      "eval_test_steps_per_second": 13.189,
      "step": 600
    },
    {
      "epoch": 0.09616,
      "grad_norm": 0.24750469624996185,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 601
    },
    {
      "epoch": 0.09632,
      "grad_norm": 0.2739443778991699,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 602
    },
    {
      "epoch": 0.09648,
      "grad_norm": 0.1898907572031021,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 603
    },
    {
      "epoch": 0.09664,
      "grad_norm": 0.20008061826229095,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 604
    },
    {
      "epoch": 0.0968,
      "grad_norm": 0.2562355101108551,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 605
    },
    {
      "epoch": 0.09696,
      "grad_norm": 0.22660401463508606,
      "learning_rate": 0.0001,
      "loss": 0.3822,
      "step": 606
    },
    {
      "epoch": 0.09712,
      "grad_norm": 0.24075619876384735,
      "learning_rate": 0.0001,
      "loss": 0.3706,
      "step": 607
    },
    {
      "epoch": 0.09728,
      "grad_norm": 0.25518712401390076,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 608
    },
    {
      "epoch": 0.09744,
      "grad_norm": 0.26687026023864746,
      "learning_rate": 0.0001,
      "loss": 0.3855,
      "step": 609
    },
    {
      "epoch": 0.0976,
      "grad_norm": 0.19588784873485565,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 610
    },
    {
      "epoch": 0.09776,
      "grad_norm": 0.2443368285894394,
      "learning_rate": 0.0001,
      "loss": 0.3757,
      "step": 611
    },
    {
      "epoch": 0.09792,
      "grad_norm": 0.21439087390899658,
      "learning_rate": 0.0001,
      "loss": 0.3812,
      "step": 612
    },
    {
      "epoch": 0.09808,
      "grad_norm": 0.16715103387832642,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 613
    },
    {
      "epoch": 0.09824,
      "grad_norm": 0.23927511274814606,
      "learning_rate": 0.0001,
      "loss": 0.3913,
      "step": 614
    },
    {
      "epoch": 0.0984,
      "grad_norm": 0.2124563753604889,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 615
    },
    {
      "epoch": 0.09856,
      "grad_norm": 0.17962846159934998,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 616
    },
    {
      "epoch": 0.09872,
      "grad_norm": 0.21711505949497223,
      "learning_rate": 0.0001,
      "loss": 0.3842,
      "step": 617
    },
    {
      "epoch": 0.09888,
      "grad_norm": 0.1919819563627243,
      "learning_rate": 0.0001,
      "loss": 0.3566,
      "step": 618
    },
    {
      "epoch": 0.09904,
      "grad_norm": 0.21339207887649536,
      "learning_rate": 0.0001,
      "loss": 0.3949,
      "step": 619
    },
    {
      "epoch": 0.0992,
      "grad_norm": 0.27117905020713806,
      "learning_rate": 0.0001,
      "loss": 0.3919,
      "step": 620
    },
    {
      "epoch": 0.09936,
      "grad_norm": 0.20555344223976135,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 621
    },
    {
      "epoch": 0.09952,
      "grad_norm": 0.1968289315700531,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 622
    },
    {
      "epoch": 0.09968,
      "grad_norm": 0.19410960376262665,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 623
    },
    {
      "epoch": 0.09984,
      "grad_norm": 0.20964619517326355,
      "learning_rate": 0.0001,
      "loss": 0.3655,
      "step": 624
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.22637300193309784,
      "learning_rate": 0.0001,
      "loss": 0.3762,
      "step": 625
    },
    {
      "epoch": 0.10016,
      "grad_norm": 0.25407060980796814,
      "learning_rate": 0.0001,
      "loss": 0.3838,
      "step": 626
    },
    {
      "epoch": 0.10032,
      "grad_norm": 0.20271989703178406,
      "learning_rate": 0.0001,
      "loss": 0.3713,
      "step": 627
    },
    {
      "epoch": 0.10048,
      "grad_norm": 0.2824758291244507,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 628
    },
    {
      "epoch": 0.10064,
      "grad_norm": 0.2050401270389557,
      "learning_rate": 0.0001,
      "loss": 0.38,
      "step": 629
    },
    {
      "epoch": 0.1008,
      "grad_norm": 0.22080586850643158,
      "learning_rate": 0.0001,
      "loss": 0.3981,
      "step": 630
    },
    {
      "epoch": 0.10096,
      "grad_norm": 0.1944112926721573,
      "learning_rate": 0.0001,
      "loss": 0.3713,
      "step": 631
    },
    {
      "epoch": 0.10112,
      "grad_norm": 0.1842879056930542,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 632
    },
    {
      "epoch": 0.10128,
      "grad_norm": 0.20213207602500916,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 633
    },
    {
      "epoch": 0.10144,
      "grad_norm": 0.18681709468364716,
      "learning_rate": 0.0001,
      "loss": 0.3873,
      "step": 634
    },
    {
      "epoch": 0.1016,
      "grad_norm": 0.20962467789649963,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 635
    },
    {
      "epoch": 0.10176,
      "grad_norm": 0.19061681628227234,
      "learning_rate": 0.0001,
      "loss": 0.3911,
      "step": 636
    },
    {
      "epoch": 0.10192,
      "grad_norm": 0.18078026175498962,
      "learning_rate": 0.0001,
      "loss": 0.3757,
      "step": 637
    },
    {
      "epoch": 0.10208,
      "grad_norm": 0.19270747900009155,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 638
    },
    {
      "epoch": 0.10224,
      "grad_norm": 0.21965761482715607,
      "learning_rate": 0.0001,
      "loss": 0.3945,
      "step": 639
    },
    {
      "epoch": 0.1024,
      "grad_norm": 0.19386205077171326,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 640
    },
    {
      "epoch": 0.10256,
      "grad_norm": 0.2278282344341278,
      "learning_rate": 0.0001,
      "loss": 0.381,
      "step": 641
    },
    {
      "epoch": 0.10272,
      "grad_norm": 0.18924272060394287,
      "learning_rate": 0.0001,
      "loss": 0.3815,
      "step": 642
    },
    {
      "epoch": 0.10288,
      "grad_norm": 0.2089606523513794,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 643
    },
    {
      "epoch": 0.10304,
      "grad_norm": 0.22001604735851288,
      "learning_rate": 0.0001,
      "loss": 0.3892,
      "step": 644
    },
    {
      "epoch": 0.1032,
      "grad_norm": 0.20927663147449493,
      "learning_rate": 0.0001,
      "loss": 0.3783,
      "step": 645
    },
    {
      "epoch": 0.10336,
      "grad_norm": 0.19866514205932617,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 646
    },
    {
      "epoch": 0.10352,
      "grad_norm": 0.1940220445394516,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 647
    },
    {
      "epoch": 0.10368,
      "grad_norm": 0.1740143597126007,
      "learning_rate": 0.0001,
      "loss": 0.3613,
      "step": 648
    },
    {
      "epoch": 0.10384,
      "grad_norm": 0.21495941281318665,
      "learning_rate": 0.0001,
      "loss": 0.3661,
      "step": 649
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.2056748867034912,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 650
    },
    {
      "epoch": 0.10416,
      "grad_norm": 0.19344620406627655,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 651
    },
    {
      "epoch": 0.10432,
      "grad_norm": 0.19788548350334167,
      "learning_rate": 0.0001,
      "loss": 0.3643,
      "step": 652
    },
    {
      "epoch": 0.10448,
      "grad_norm": 0.20664949715137482,
      "learning_rate": 0.0001,
      "loss": 0.3508,
      "step": 653
    },
    {
      "epoch": 0.10464,
      "grad_norm": 0.19971203804016113,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 654
    },
    {
      "epoch": 0.1048,
      "grad_norm": 0.23358885943889618,
      "learning_rate": 0.0001,
      "loss": 0.3872,
      "step": 655
    },
    {
      "epoch": 0.10496,
      "grad_norm": 0.16733789443969727,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 656
    },
    {
      "epoch": 0.10512,
      "grad_norm": 0.20818650722503662,
      "learning_rate": 0.0001,
      "loss": 0.3862,
      "step": 657
    },
    {
      "epoch": 0.10528,
      "grad_norm": 0.1775447577238083,
      "learning_rate": 0.0001,
      "loss": 0.368,
      "step": 658
    },
    {
      "epoch": 0.10544,
      "grad_norm": 0.22486023604869843,
      "learning_rate": 0.0001,
      "loss": 0.3908,
      "step": 659
    },
    {
      "epoch": 0.1056,
      "grad_norm": 0.20857131481170654,
      "learning_rate": 0.0001,
      "loss": 0.3721,
      "step": 660
    },
    {
      "epoch": 0.10576,
      "grad_norm": 0.19788086414337158,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 661
    },
    {
      "epoch": 0.10592,
      "grad_norm": 0.19412250816822052,
      "learning_rate": 0.0001,
      "loss": 0.3821,
      "step": 662
    },
    {
      "epoch": 0.10608,
      "grad_norm": 0.2359619289636612,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 663
    },
    {
      "epoch": 0.10624,
      "grad_norm": 0.18479293584823608,
      "learning_rate": 0.0001,
      "loss": 0.3583,
      "step": 664
    },
    {
      "epoch": 0.1064,
      "grad_norm": 0.22676007449626923,
      "learning_rate": 0.0001,
      "loss": 0.3738,
      "step": 665
    },
    {
      "epoch": 0.10656,
      "grad_norm": 0.19316081702709198,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 666
    },
    {
      "epoch": 0.10672,
      "grad_norm": 0.20281127095222473,
      "learning_rate": 0.0001,
      "loss": 0.3873,
      "step": 667
    },
    {
      "epoch": 0.10688,
      "grad_norm": 0.20554693043231964,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 668
    },
    {
      "epoch": 0.10704,
      "grad_norm": 0.1988828331232071,
      "learning_rate": 0.0001,
      "loss": 0.378,
      "step": 669
    },
    {
      "epoch": 0.1072,
      "grad_norm": 0.18422091007232666,
      "learning_rate": 0.0001,
      "loss": 0.3652,
      "step": 670
    },
    {
      "epoch": 0.10736,
      "grad_norm": 0.22921693325042725,
      "learning_rate": 0.0001,
      "loss": 0.3773,
      "step": 671
    },
    {
      "epoch": 0.10752,
      "grad_norm": 0.18219704926013947,
      "learning_rate": 0.0001,
      "loss": 0.3866,
      "step": 672
    },
    {
      "epoch": 0.10768,
      "grad_norm": 0.1825074404478073,
      "learning_rate": 0.0001,
      "loss": 0.3807,
      "step": 673
    },
    {
      "epoch": 0.10784,
      "grad_norm": 0.22828398644924164,
      "learning_rate": 0.0001,
      "loss": 0.3767,
      "step": 674
    },
    {
      "epoch": 0.108,
      "grad_norm": 0.22139529883861542,
      "learning_rate": 0.0001,
      "loss": 0.3619,
      "step": 675
    },
    {
      "epoch": 0.10816,
      "grad_norm": 0.19830255210399628,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 676
    },
    {
      "epoch": 0.10832,
      "grad_norm": 0.18902941048145294,
      "learning_rate": 0.0001,
      "loss": 0.391,
      "step": 677
    },
    {
      "epoch": 0.10848,
      "grad_norm": 0.1908821165561676,
      "learning_rate": 0.0001,
      "loss": 0.3552,
      "step": 678
    },
    {
      "epoch": 0.10864,
      "grad_norm": 0.174294576048851,
      "learning_rate": 0.0001,
      "loss": 0.3599,
      "step": 679
    },
    {
      "epoch": 0.1088,
      "grad_norm": 0.1973327100276947,
      "learning_rate": 0.0001,
      "loss": 0.3777,
      "step": 680
    },
    {
      "epoch": 0.10896,
      "grad_norm": 0.20735368132591248,
      "learning_rate": 0.0001,
      "loss": 0.3711,
      "step": 681
    },
    {
      "epoch": 0.10912,
      "grad_norm": 0.1987563967704773,
      "learning_rate": 0.0001,
      "loss": 0.3877,
      "step": 682
    },
    {
      "epoch": 0.10928,
      "grad_norm": 0.20399999618530273,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 683
    },
    {
      "epoch": 0.10944,
      "grad_norm": 0.21805192530155182,
      "learning_rate": 0.0001,
      "loss": 0.3677,
      "step": 684
    },
    {
      "epoch": 0.1096,
      "grad_norm": 0.21152937412261963,
      "learning_rate": 0.0001,
      "loss": 0.3845,
      "step": 685
    },
    {
      "epoch": 0.10976,
      "grad_norm": 0.20502229034900665,
      "learning_rate": 0.0001,
      "loss": 0.3953,
      "step": 686
    },
    {
      "epoch": 0.10992,
      "grad_norm": 0.30819156765937805,
      "learning_rate": 0.0001,
      "loss": 0.3982,
      "step": 687
    },
    {
      "epoch": 0.11008,
      "grad_norm": 0.19287970662117004,
      "learning_rate": 0.0001,
      "loss": 0.381,
      "step": 688
    },
    {
      "epoch": 0.11024,
      "grad_norm": 0.21346516907215118,
      "learning_rate": 0.0001,
      "loss": 0.3645,
      "step": 689
    },
    {
      "epoch": 0.1104,
      "grad_norm": 0.2384362518787384,
      "learning_rate": 0.0001,
      "loss": 0.3568,
      "step": 690
    },
    {
      "epoch": 0.11056,
      "grad_norm": 0.17893053591251373,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 691
    },
    {
      "epoch": 0.11072,
      "grad_norm": 0.22552450001239777,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 692
    },
    {
      "epoch": 0.11088,
      "grad_norm": 0.22137580811977386,
      "learning_rate": 0.0001,
      "loss": 0.3722,
      "step": 693
    },
    {
      "epoch": 0.11104,
      "grad_norm": 0.19724547863006592,
      "learning_rate": 0.0001,
      "loss": 0.3612,
      "step": 694
    },
    {
      "epoch": 0.1112,
      "grad_norm": 0.20876161754131317,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 695
    },
    {
      "epoch": 0.11136,
      "grad_norm": 0.18017108738422394,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 696
    },
    {
      "epoch": 0.11152,
      "grad_norm": 0.23900547623634338,
      "learning_rate": 0.0001,
      "loss": 0.3661,
      "step": 697
    },
    {
      "epoch": 0.11168,
      "grad_norm": 0.23718227446079254,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 698
    },
    {
      "epoch": 0.11184,
      "grad_norm": 0.21479883790016174,
      "learning_rate": 0.0001,
      "loss": 0.3679,
      "step": 699
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.18934740126132965,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 700
    },
    {
      "epoch": 0.112,
      "eval_train_accuracy": 0.5026,
      "eval_train_loss": 0.3720729351043701,
      "eval_train_runtime": 4.2168,
      "eval_train_samples_per_second": 1185.745,
      "eval_train_steps_per_second": 14.94,
      "step": 700
    },
    {
      "epoch": 0.112,
      "eval_test_accuracy": 0.509,
      "eval_test_loss": 0.37055742740631104,
      "eval_test_runtime": 4.6434,
      "eval_test_samples_per_second": 1076.788,
      "eval_test_steps_per_second": 13.568,
      "step": 700
    },
    {
      "epoch": 0.11216,
      "grad_norm": 0.20682209730148315,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 701
    },
    {
      "epoch": 0.11232,
      "grad_norm": 0.24137575924396515,
      "learning_rate": 0.0001,
      "loss": 0.3667,
      "step": 702
    },
    {
      "epoch": 0.11248,
      "grad_norm": 0.23336073756217957,
      "learning_rate": 0.0001,
      "loss": 0.3849,
      "step": 703
    },
    {
      "epoch": 0.11264,
      "grad_norm": 0.1852215677499771,
      "learning_rate": 0.0001,
      "loss": 0.3796,
      "step": 704
    },
    {
      "epoch": 0.1128,
      "grad_norm": 0.32149985432624817,
      "learning_rate": 0.0001,
      "loss": 0.377,
      "step": 705
    },
    {
      "epoch": 0.11296,
      "grad_norm": 0.26660263538360596,
      "learning_rate": 0.0001,
      "loss": 0.3849,
      "step": 706
    },
    {
      "epoch": 0.11312,
      "grad_norm": 0.1748826652765274,
      "learning_rate": 0.0001,
      "loss": 0.3795,
      "step": 707
    },
    {
      "epoch": 0.11328,
      "grad_norm": 0.19958879053592682,
      "learning_rate": 0.0001,
      "loss": 0.3655,
      "step": 708
    },
    {
      "epoch": 0.11344,
      "grad_norm": 0.2895336449146271,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 709
    },
    {
      "epoch": 0.1136,
      "grad_norm": 0.26369500160217285,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 710
    },
    {
      "epoch": 0.11376,
      "grad_norm": 0.22653424739837646,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 711
    },
    {
      "epoch": 0.11392,
      "grad_norm": 0.2982812225818634,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 712
    },
    {
      "epoch": 0.11408,
      "grad_norm": 0.25943636894226074,
      "learning_rate": 0.0001,
      "loss": 0.3787,
      "step": 713
    },
    {
      "epoch": 0.11424,
      "grad_norm": 0.24538014829158783,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 714
    },
    {
      "epoch": 0.1144,
      "grad_norm": 0.32144299149513245,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 715
    },
    {
      "epoch": 0.11456,
      "grad_norm": 0.220550075173378,
      "learning_rate": 0.0001,
      "loss": 0.3785,
      "step": 716
    },
    {
      "epoch": 0.11472,
      "grad_norm": 0.21057447791099548,
      "learning_rate": 0.0001,
      "loss": 0.3632,
      "step": 717
    },
    {
      "epoch": 0.11488,
      "grad_norm": 0.3428965210914612,
      "learning_rate": 0.0001,
      "loss": 0.3762,
      "step": 718
    },
    {
      "epoch": 0.11504,
      "grad_norm": 0.20038558542728424,
      "learning_rate": 0.0001,
      "loss": 0.3819,
      "step": 719
    },
    {
      "epoch": 0.1152,
      "grad_norm": 0.27152761816978455,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 720
    },
    {
      "epoch": 0.11536,
      "grad_norm": 0.24007757008075714,
      "learning_rate": 0.0001,
      "loss": 0.3668,
      "step": 721
    },
    {
      "epoch": 0.11552,
      "grad_norm": 0.2280183732509613,
      "learning_rate": 0.0001,
      "loss": 0.368,
      "step": 722
    },
    {
      "epoch": 0.11568,
      "grad_norm": 0.23887838423252106,
      "learning_rate": 0.0001,
      "loss": 0.366,
      "step": 723
    },
    {
      "epoch": 0.11584,
      "grad_norm": 0.21580825746059418,
      "learning_rate": 0.0001,
      "loss": 0.3668,
      "step": 724
    },
    {
      "epoch": 0.116,
      "grad_norm": 0.22299668192863464,
      "learning_rate": 0.0001,
      "loss": 0.3797,
      "step": 725
    },
    {
      "epoch": 0.11616,
      "grad_norm": 0.30729299783706665,
      "learning_rate": 0.0001,
      "loss": 0.3835,
      "step": 726
    },
    {
      "epoch": 0.11632,
      "grad_norm": 0.1692628562450409,
      "learning_rate": 0.0001,
      "loss": 0.3764,
      "step": 727
    },
    {
      "epoch": 0.11648,
      "grad_norm": 0.23970088362693787,
      "learning_rate": 0.0001,
      "loss": 0.3747,
      "step": 728
    },
    {
      "epoch": 0.11664,
      "grad_norm": 0.1983463168144226,
      "learning_rate": 0.0001,
      "loss": 0.38,
      "step": 729
    },
    {
      "epoch": 0.1168,
      "grad_norm": 0.19523341953754425,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 730
    },
    {
      "epoch": 0.11696,
      "grad_norm": 0.22015029191970825,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 731
    },
    {
      "epoch": 0.11712,
      "grad_norm": 0.20797933638095856,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 732
    },
    {
      "epoch": 0.11728,
      "grad_norm": 0.18098142743110657,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 733
    },
    {
      "epoch": 0.11744,
      "grad_norm": 0.2323215752840042,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 734
    },
    {
      "epoch": 0.1176,
      "grad_norm": 0.2525564432144165,
      "learning_rate": 0.0001,
      "loss": 0.3838,
      "step": 735
    },
    {
      "epoch": 0.11776,
      "grad_norm": 0.21102726459503174,
      "learning_rate": 0.0001,
      "loss": 0.3727,
      "step": 736
    },
    {
      "epoch": 0.11792,
      "grad_norm": 0.17518334090709686,
      "learning_rate": 0.0001,
      "loss": 0.3738,
      "step": 737
    },
    {
      "epoch": 0.11808,
      "grad_norm": 0.2073400318622589,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 738
    },
    {
      "epoch": 0.11824,
      "grad_norm": 0.20655858516693115,
      "learning_rate": 0.0001,
      "loss": 0.3754,
      "step": 739
    },
    {
      "epoch": 0.1184,
      "grad_norm": 0.17089755833148956,
      "learning_rate": 0.0001,
      "loss": 0.3791,
      "step": 740
    },
    {
      "epoch": 0.11856,
      "grad_norm": 0.18908105790615082,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 741
    },
    {
      "epoch": 0.11872,
      "grad_norm": 0.228749081492424,
      "learning_rate": 0.0001,
      "loss": 0.3816,
      "step": 742
    },
    {
      "epoch": 0.11888,
      "grad_norm": 0.20857051014900208,
      "learning_rate": 0.0001,
      "loss": 0.3677,
      "step": 743
    },
    {
      "epoch": 0.11904,
      "grad_norm": 0.17451563477516174,
      "learning_rate": 0.0001,
      "loss": 0.3589,
      "step": 744
    },
    {
      "epoch": 0.1192,
      "grad_norm": 0.1756225973367691,
      "learning_rate": 0.0001,
      "loss": 0.3679,
      "step": 745
    },
    {
      "epoch": 0.11936,
      "grad_norm": 0.20848938822746277,
      "learning_rate": 0.0001,
      "loss": 0.3656,
      "step": 746
    },
    {
      "epoch": 0.11952,
      "grad_norm": 0.18619774281978607,
      "learning_rate": 0.0001,
      "loss": 0.3565,
      "step": 747
    },
    {
      "epoch": 0.11968,
      "grad_norm": 0.18425512313842773,
      "learning_rate": 0.0001,
      "loss": 0.3701,
      "step": 748
    },
    {
      "epoch": 0.11984,
      "grad_norm": 0.18439218401908875,
      "learning_rate": 0.0001,
      "loss": 0.3606,
      "step": 749
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.1848987340927124,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 750
    },
    {
      "epoch": 0.12016,
      "grad_norm": 0.19117401540279388,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 751
    },
    {
      "epoch": 0.12032,
      "grad_norm": 0.1968277394771576,
      "learning_rate": 0.0001,
      "loss": 0.3851,
      "step": 752
    },
    {
      "epoch": 0.12048,
      "grad_norm": 0.18890370428562164,
      "learning_rate": 0.0001,
      "loss": 0.3711,
      "step": 753
    },
    {
      "epoch": 0.12064,
      "grad_norm": 0.1789926439523697,
      "learning_rate": 0.0001,
      "loss": 0.3692,
      "step": 754
    },
    {
      "epoch": 0.1208,
      "grad_norm": 0.18093262612819672,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 755
    },
    {
      "epoch": 0.12096,
      "grad_norm": 0.19244582951068878,
      "learning_rate": 0.0001,
      "loss": 0.3704,
      "step": 756
    },
    {
      "epoch": 0.12112,
      "grad_norm": 0.1834336519241333,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 757
    },
    {
      "epoch": 0.12128,
      "grad_norm": 0.19951772689819336,
      "learning_rate": 0.0001,
      "loss": 0.3698,
      "step": 758
    },
    {
      "epoch": 0.12144,
      "grad_norm": 0.17090025544166565,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 759
    },
    {
      "epoch": 0.1216,
      "grad_norm": 0.19565609097480774,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 760
    },
    {
      "epoch": 0.12176,
      "grad_norm": 0.2235313206911087,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 761
    },
    {
      "epoch": 0.12192,
      "grad_norm": 0.1826915144920349,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 762
    },
    {
      "epoch": 0.12208,
      "grad_norm": 0.18145166337490082,
      "learning_rate": 0.0001,
      "loss": 0.3576,
      "step": 763
    },
    {
      "epoch": 0.12224,
      "grad_norm": 0.231620192527771,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 764
    },
    {
      "epoch": 0.1224,
      "grad_norm": 0.19346801936626434,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 765
    },
    {
      "epoch": 0.12256,
      "grad_norm": 0.2005482167005539,
      "learning_rate": 0.0001,
      "loss": 0.3829,
      "step": 766
    },
    {
      "epoch": 0.12272,
      "grad_norm": 0.20894111692905426,
      "learning_rate": 0.0001,
      "loss": 0.3572,
      "step": 767
    },
    {
      "epoch": 0.12288,
      "grad_norm": 0.18930114805698395,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 768
    },
    {
      "epoch": 0.12304,
      "grad_norm": 0.1811162680387497,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 769
    },
    {
      "epoch": 0.1232,
      "grad_norm": 0.22386902570724487,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 770
    },
    {
      "epoch": 0.12336,
      "grad_norm": 0.19966906309127808,
      "learning_rate": 0.0001,
      "loss": 0.3596,
      "step": 771
    },
    {
      "epoch": 0.12352,
      "grad_norm": 0.2078552395105362,
      "learning_rate": 0.0001,
      "loss": 0.3725,
      "step": 772
    },
    {
      "epoch": 0.12368,
      "grad_norm": 0.15933701395988464,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 773
    },
    {
      "epoch": 0.12384,
      "grad_norm": 0.2013106346130371,
      "learning_rate": 0.0001,
      "loss": 0.3607,
      "step": 774
    },
    {
      "epoch": 0.124,
      "grad_norm": 0.2544011175632477,
      "learning_rate": 0.0001,
      "loss": 0.3629,
      "step": 775
    },
    {
      "epoch": 0.12416,
      "grad_norm": 0.20537200570106506,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 776
    },
    {
      "epoch": 0.12432,
      "grad_norm": 0.20732729136943817,
      "learning_rate": 0.0001,
      "loss": 0.3663,
      "step": 777
    },
    {
      "epoch": 0.12448,
      "grad_norm": 0.229488804936409,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 778
    },
    {
      "epoch": 0.12464,
      "grad_norm": 0.2036900818347931,
      "learning_rate": 0.0001,
      "loss": 0.364,
      "step": 779
    },
    {
      "epoch": 0.1248,
      "grad_norm": 0.19247016310691833,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 780
    },
    {
      "epoch": 0.12496,
      "grad_norm": 0.1858186423778534,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 781
    },
    {
      "epoch": 0.12512,
      "grad_norm": 0.21473640203475952,
      "learning_rate": 0.0001,
      "loss": 0.3678,
      "step": 782
    },
    {
      "epoch": 0.12528,
      "grad_norm": 0.19926482439041138,
      "learning_rate": 0.0001,
      "loss": 0.386,
      "step": 783
    },
    {
      "epoch": 0.12544,
      "grad_norm": 0.2232334464788437,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 784
    },
    {
      "epoch": 0.1256,
      "grad_norm": 0.23720116913318634,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 785
    },
    {
      "epoch": 0.12576,
      "grad_norm": 0.19606363773345947,
      "learning_rate": 0.0001,
      "loss": 0.3607,
      "step": 786
    },
    {
      "epoch": 0.12592,
      "grad_norm": 0.18052807450294495,
      "learning_rate": 0.0001,
      "loss": 0.3753,
      "step": 787
    },
    {
      "epoch": 0.12608,
      "grad_norm": 0.2025478184223175,
      "learning_rate": 0.0001,
      "loss": 0.3677,
      "step": 788
    },
    {
      "epoch": 0.12624,
      "grad_norm": 0.20989996194839478,
      "learning_rate": 0.0001,
      "loss": 0.359,
      "step": 789
    },
    {
      "epoch": 0.1264,
      "grad_norm": 0.22692203521728516,
      "learning_rate": 0.0001,
      "loss": 0.373,
      "step": 790
    },
    {
      "epoch": 0.12656,
      "grad_norm": 0.2050934135913849,
      "learning_rate": 0.0001,
      "loss": 0.3635,
      "step": 791
    },
    {
      "epoch": 0.12672,
      "grad_norm": 0.19683898985385895,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 792
    },
    {
      "epoch": 0.12688,
      "grad_norm": 0.18643896281719208,
      "learning_rate": 0.0001,
      "loss": 0.3571,
      "step": 793
    },
    {
      "epoch": 0.12704,
      "grad_norm": 0.21975578367710114,
      "learning_rate": 0.0001,
      "loss": 0.3644,
      "step": 794
    },
    {
      "epoch": 0.1272,
      "grad_norm": 0.1909463256597519,
      "learning_rate": 0.0001,
      "loss": 0.3548,
      "step": 795
    },
    {
      "epoch": 0.12736,
      "grad_norm": 0.21570120751857758,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 796
    },
    {
      "epoch": 0.12752,
      "grad_norm": 0.2201121598482132,
      "learning_rate": 0.0001,
      "loss": 0.3574,
      "step": 797
    },
    {
      "epoch": 0.12768,
      "grad_norm": 0.196279838681221,
      "learning_rate": 0.0001,
      "loss": 0.3652,
      "step": 798
    },
    {
      "epoch": 0.12784,
      "grad_norm": 0.18097727000713348,
      "learning_rate": 0.0001,
      "loss": 0.3633,
      "step": 799
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.23171043395996094,
      "learning_rate": 0.0001,
      "loss": 0.3607,
      "step": 800
    },
    {
      "epoch": 0.128,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3569539487361908,
      "eval_train_runtime": 4.2078,
      "eval_train_samples_per_second": 1188.261,
      "eval_train_steps_per_second": 14.972,
      "step": 800
    },
    {
      "epoch": 0.128,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3553684949874878,
      "eval_test_runtime": 5.0144,
      "eval_test_samples_per_second": 997.125,
      "eval_test_steps_per_second": 12.564,
      "step": 800
    },
    {
      "epoch": 0.12816,
      "grad_norm": 0.20147472620010376,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 801
    },
    {
      "epoch": 0.12832,
      "grad_norm": 0.18624623119831085,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 802
    },
    {
      "epoch": 0.12848,
      "grad_norm": 0.19691742956638336,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 803
    },
    {
      "epoch": 0.12864,
      "grad_norm": 0.2131369262933731,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 804
    },
    {
      "epoch": 0.1288,
      "grad_norm": 0.21981371939182281,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 805
    },
    {
      "epoch": 0.12896,
      "grad_norm": 0.20205293595790863,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 806
    },
    {
      "epoch": 0.12912,
      "grad_norm": 0.16711826622486115,
      "learning_rate": 0.0001,
      "loss": 0.358,
      "step": 807
    },
    {
      "epoch": 0.12928,
      "grad_norm": 0.19206634163856506,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 808
    },
    {
      "epoch": 0.12944,
      "grad_norm": 0.23681414127349854,
      "learning_rate": 0.0001,
      "loss": 0.356,
      "step": 809
    },
    {
      "epoch": 0.1296,
      "grad_norm": 0.22134417295455933,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 810
    },
    {
      "epoch": 0.12976,
      "grad_norm": 0.1843389868736267,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 811
    },
    {
      "epoch": 0.12992,
      "grad_norm": 0.20330068469047546,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 812
    },
    {
      "epoch": 0.13008,
      "grad_norm": 0.20051421225070953,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 813
    },
    {
      "epoch": 0.13024,
      "grad_norm": 0.20701079070568085,
      "learning_rate": 0.0001,
      "loss": 0.3557,
      "step": 814
    },
    {
      "epoch": 0.1304,
      "grad_norm": 0.1755826324224472,
      "learning_rate": 0.0001,
      "loss": 0.3637,
      "step": 815
    },
    {
      "epoch": 0.13056,
      "grad_norm": 0.19893886148929596,
      "learning_rate": 0.0001,
      "loss": 0.356,
      "step": 816
    },
    {
      "epoch": 0.13072,
      "grad_norm": 0.2727128267288208,
      "learning_rate": 0.0001,
      "loss": 0.3558,
      "step": 817
    },
    {
      "epoch": 0.13088,
      "grad_norm": 0.2151995599269867,
      "learning_rate": 0.0001,
      "loss": 0.3569,
      "step": 818
    },
    {
      "epoch": 0.13104,
      "grad_norm": 0.2160796970129013,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 819
    },
    {
      "epoch": 0.1312,
      "grad_norm": 0.1789531111717224,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 820
    },
    {
      "epoch": 0.13136,
      "grad_norm": 0.2915545403957367,
      "learning_rate": 0.0001,
      "loss": 0.3618,
      "step": 821
    },
    {
      "epoch": 0.13152,
      "grad_norm": 0.1933390349149704,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 822
    },
    {
      "epoch": 0.13168,
      "grad_norm": 0.1957373321056366,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 823
    },
    {
      "epoch": 0.13184,
      "grad_norm": 0.27016761898994446,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 824
    },
    {
      "epoch": 0.132,
      "grad_norm": 0.20308315753936768,
      "learning_rate": 0.0001,
      "loss": 0.3598,
      "step": 825
    },
    {
      "epoch": 0.13216,
      "grad_norm": 0.18662142753601074,
      "learning_rate": 0.0001,
      "loss": 0.3482,
      "step": 826
    },
    {
      "epoch": 0.13232,
      "grad_norm": 0.21952086687088013,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 827
    },
    {
      "epoch": 0.13248,
      "grad_norm": 0.25119999051094055,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 828
    },
    {
      "epoch": 0.13264,
      "grad_norm": 0.2124888151884079,
      "learning_rate": 0.0001,
      "loss": 0.3627,
      "step": 829
    },
    {
      "epoch": 0.1328,
      "grad_norm": 0.2435184270143509,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 830
    },
    {
      "epoch": 0.13296,
      "grad_norm": 0.2253703624010086,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 831
    },
    {
      "epoch": 0.13312,
      "grad_norm": 0.2436094731092453,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 832
    },
    {
      "epoch": 0.13328,
      "grad_norm": 0.20727866888046265,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 833
    },
    {
      "epoch": 0.13344,
      "grad_norm": 0.22387661039829254,
      "learning_rate": 0.0001,
      "loss": 0.3579,
      "step": 834
    },
    {
      "epoch": 0.1336,
      "grad_norm": 0.1957368403673172,
      "learning_rate": 0.0001,
      "loss": 0.3621,
      "step": 835
    },
    {
      "epoch": 0.13376,
      "grad_norm": 0.27741536498069763,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 836
    },
    {
      "epoch": 0.13392,
      "grad_norm": 0.22846968472003937,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 837
    },
    {
      "epoch": 0.13408,
      "grad_norm": 0.19755245745182037,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 838
    },
    {
      "epoch": 0.13424,
      "grad_norm": 0.2523234486579895,
      "learning_rate": 0.0001,
      "loss": 0.3572,
      "step": 839
    },
    {
      "epoch": 0.1344,
      "grad_norm": 0.24110375344753265,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 840
    },
    {
      "epoch": 0.13456,
      "grad_norm": 0.21320073306560516,
      "learning_rate": 0.0001,
      "loss": 0.3509,
      "step": 841
    },
    {
      "epoch": 0.13472,
      "grad_norm": 0.1948201209306717,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 842
    },
    {
      "epoch": 0.13488,
      "grad_norm": 0.24540092051029205,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 843
    },
    {
      "epoch": 0.13504,
      "grad_norm": 0.20822659134864807,
      "learning_rate": 0.0001,
      "loss": 0.349,
      "step": 844
    },
    {
      "epoch": 0.1352,
      "grad_norm": 0.22607599198818207,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 845
    },
    {
      "epoch": 0.13536,
      "grad_norm": 0.2351224273443222,
      "learning_rate": 0.0001,
      "loss": 0.3611,
      "step": 846
    },
    {
      "epoch": 0.13552,
      "grad_norm": 0.1753230094909668,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 847
    },
    {
      "epoch": 0.13568,
      "grad_norm": 0.21676619350910187,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 848
    },
    {
      "epoch": 0.13584,
      "grad_norm": 0.22052264213562012,
      "learning_rate": 0.0001,
      "loss": 0.3611,
      "step": 849
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.1756930649280548,
      "learning_rate": 0.0001,
      "loss": 0.365,
      "step": 850
    },
    {
      "epoch": 0.13616,
      "grad_norm": 0.19378361105918884,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 851
    },
    {
      "epoch": 0.13632,
      "grad_norm": 0.23174920678138733,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 852
    },
    {
      "epoch": 0.13648,
      "grad_norm": 0.3209342956542969,
      "learning_rate": 0.0001,
      "loss": 0.3591,
      "step": 853
    },
    {
      "epoch": 0.13664,
      "grad_norm": 0.29612088203430176,
      "learning_rate": 0.0001,
      "loss": 0.3642,
      "step": 854
    },
    {
      "epoch": 0.1368,
      "grad_norm": 0.3883107602596283,
      "learning_rate": 0.0001,
      "loss": 0.3495,
      "step": 855
    },
    {
      "epoch": 0.13696,
      "grad_norm": 0.2129882127046585,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 856
    },
    {
      "epoch": 0.13712,
      "grad_norm": 0.3548133671283722,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 857
    },
    {
      "epoch": 0.13728,
      "grad_norm": 0.22763943672180176,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 858
    },
    {
      "epoch": 0.13744,
      "grad_norm": 0.22298501431941986,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 859
    },
    {
      "epoch": 0.1376,
      "grad_norm": 0.2050994336605072,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 860
    },
    {
      "epoch": 0.13776,
      "grad_norm": 0.22487424314022064,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 861
    },
    {
      "epoch": 0.13792,
      "grad_norm": 0.2553003430366516,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 862
    },
    {
      "epoch": 0.13808,
      "grad_norm": 0.24025778472423553,
      "learning_rate": 0.0001,
      "loss": 0.3654,
      "step": 863
    },
    {
      "epoch": 0.13824,
      "grad_norm": 0.21628618240356445,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 864
    },
    {
      "epoch": 0.1384,
      "grad_norm": 0.18104206025600433,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 865
    },
    {
      "epoch": 0.13856,
      "grad_norm": 0.18185822665691376,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 866
    },
    {
      "epoch": 0.13872,
      "grad_norm": 0.21803343296051025,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 867
    },
    {
      "epoch": 0.13888,
      "grad_norm": 0.2648133933544159,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 868
    },
    {
      "epoch": 0.13904,
      "grad_norm": 0.21096256375312805,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 869
    },
    {
      "epoch": 0.1392,
      "grad_norm": 0.2380804866552353,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 870
    },
    {
      "epoch": 0.13936,
      "grad_norm": 0.26467326283454895,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 871
    },
    {
      "epoch": 0.13952,
      "grad_norm": 0.2259659767150879,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 872
    },
    {
      "epoch": 0.13968,
      "grad_norm": 0.23765145242214203,
      "learning_rate": 0.0001,
      "loss": 0.355,
      "step": 873
    },
    {
      "epoch": 0.13984,
      "grad_norm": 0.23419757187366486,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 874
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.21788768470287323,
      "learning_rate": 0.0001,
      "loss": 0.3491,
      "step": 875
    },
    {
      "epoch": 0.14016,
      "grad_norm": 0.2424650937318802,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 876
    },
    {
      "epoch": 0.14032,
      "grad_norm": 0.2150385081768036,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 877
    },
    {
      "epoch": 0.14048,
      "grad_norm": 0.2058628797531128,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 878
    },
    {
      "epoch": 0.14064,
      "grad_norm": 0.2090601772069931,
      "learning_rate": 0.0001,
      "loss": 0.3481,
      "step": 879
    },
    {
      "epoch": 0.1408,
      "grad_norm": 0.22061719000339508,
      "learning_rate": 0.0001,
      "loss": 0.3654,
      "step": 880
    },
    {
      "epoch": 0.14096,
      "grad_norm": 0.23674003779888153,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 881
    },
    {
      "epoch": 0.14112,
      "grad_norm": 0.19301001727581024,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 882
    },
    {
      "epoch": 0.14128,
      "grad_norm": 0.19176892936229706,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 883
    },
    {
      "epoch": 0.14144,
      "grad_norm": 0.3065575063228607,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 884
    },
    {
      "epoch": 0.1416,
      "grad_norm": 0.22747868299484253,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 885
    },
    {
      "epoch": 0.14176,
      "grad_norm": 0.22448280453681946,
      "learning_rate": 0.0001,
      "loss": 0.3465,
      "step": 886
    },
    {
      "epoch": 0.14192,
      "grad_norm": 0.16537359356880188,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 887
    },
    {
      "epoch": 0.14208,
      "grad_norm": 0.22950023412704468,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 888
    },
    {
      "epoch": 0.14224,
      "grad_norm": 0.1888248771429062,
      "learning_rate": 0.0001,
      "loss": 0.3518,
      "step": 889
    },
    {
      "epoch": 0.1424,
      "grad_norm": 0.18554653227329254,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 890
    },
    {
      "epoch": 0.14256,
      "grad_norm": 0.23605602979660034,
      "learning_rate": 0.0001,
      "loss": 0.3652,
      "step": 891
    },
    {
      "epoch": 0.14272,
      "grad_norm": 0.21354128420352936,
      "learning_rate": 0.0001,
      "loss": 0.3547,
      "step": 892
    },
    {
      "epoch": 0.14288,
      "grad_norm": 0.25627604126930237,
      "learning_rate": 0.0001,
      "loss": 0.3527,
      "step": 893
    },
    {
      "epoch": 0.14304,
      "grad_norm": 0.20234179496765137,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 894
    },
    {
      "epoch": 0.1432,
      "grad_norm": 0.18637344241142273,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 895
    },
    {
      "epoch": 0.14336,
      "grad_norm": 0.24848058819770813,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 896
    },
    {
      "epoch": 0.14352,
      "grad_norm": 0.22484137117862701,
      "learning_rate": 0.0001,
      "loss": 0.3572,
      "step": 897
    },
    {
      "epoch": 0.14368,
      "grad_norm": 0.20687855780124664,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 898
    },
    {
      "epoch": 0.14384,
      "grad_norm": 0.20467016100883484,
      "learning_rate": 0.0001,
      "loss": 0.3564,
      "step": 899
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.18202170729637146,
      "learning_rate": 0.0001,
      "loss": 0.3514,
      "step": 900
    },
    {
      "epoch": 0.144,
      "eval_train_accuracy": 0.5136,
      "eval_train_loss": 0.3469606041908264,
      "eval_train_runtime": 4.0885,
      "eval_train_samples_per_second": 1222.947,
      "eval_train_steps_per_second": 15.409,
      "step": 900
    },
    {
      "epoch": 0.144,
      "eval_test_accuracy": 0.5122,
      "eval_test_loss": 0.3456655442714691,
      "eval_test_runtime": 4.8405,
      "eval_test_samples_per_second": 1032.952,
      "eval_test_steps_per_second": 13.015,
      "step": 900
    },
    {
      "epoch": 0.14416,
      "grad_norm": 0.1797914206981659,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 901
    },
    {
      "epoch": 0.14432,
      "grad_norm": 0.22308599948883057,
      "learning_rate": 0.0001,
      "loss": 0.3537,
      "step": 902
    },
    {
      "epoch": 0.14448,
      "grad_norm": 0.21007120609283447,
      "learning_rate": 0.0001,
      "loss": 0.3496,
      "step": 903
    },
    {
      "epoch": 0.14464,
      "grad_norm": 0.23332713544368744,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 904
    },
    {
      "epoch": 0.1448,
      "grad_norm": 0.207045778632164,
      "learning_rate": 0.0001,
      "loss": 0.3543,
      "step": 905
    },
    {
      "epoch": 0.14496,
      "grad_norm": 0.20915833115577698,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 906
    },
    {
      "epoch": 0.14512,
      "grad_norm": 0.21506786346435547,
      "learning_rate": 0.0001,
      "loss": 0.3563,
      "step": 907
    },
    {
      "epoch": 0.14528,
      "grad_norm": 0.172720804810524,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 908
    },
    {
      "epoch": 0.14544,
      "grad_norm": 0.21452514827251434,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 909
    },
    {
      "epoch": 0.1456,
      "grad_norm": 0.2231750339269638,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 910
    },
    {
      "epoch": 0.14576,
      "grad_norm": 0.2159915417432785,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 911
    },
    {
      "epoch": 0.14592,
      "grad_norm": 0.23105473816394806,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 912
    },
    {
      "epoch": 0.14608,
      "grad_norm": 0.18325749039649963,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 913
    },
    {
      "epoch": 0.14624,
      "grad_norm": 0.2189156711101532,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 914
    },
    {
      "epoch": 0.1464,
      "grad_norm": 0.24798427522182465,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 915
    },
    {
      "epoch": 0.14656,
      "grad_norm": 0.22201038897037506,
      "learning_rate": 0.0001,
      "loss": 0.3607,
      "step": 916
    },
    {
      "epoch": 0.14672,
      "grad_norm": 0.20011462271213531,
      "learning_rate": 0.0001,
      "loss": 0.358,
      "step": 917
    },
    {
      "epoch": 0.14688,
      "grad_norm": 0.21389389038085938,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 918
    },
    {
      "epoch": 0.14704,
      "grad_norm": 0.20598889887332916,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 919
    },
    {
      "epoch": 0.1472,
      "grad_norm": 0.17606312036514282,
      "learning_rate": 0.0001,
      "loss": 0.3569,
      "step": 920
    },
    {
      "epoch": 0.14736,
      "grad_norm": 0.20537392795085907,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 921
    },
    {
      "epoch": 0.14752,
      "grad_norm": 0.20014724135398865,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 922
    },
    {
      "epoch": 0.14768,
      "grad_norm": 0.1959308236837387,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 923
    },
    {
      "epoch": 0.14784,
      "grad_norm": 0.26533153653144836,
      "learning_rate": 0.0001,
      "loss": 0.3521,
      "step": 924
    },
    {
      "epoch": 0.148,
      "grad_norm": 0.17345218360424042,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 925
    },
    {
      "epoch": 0.14816,
      "grad_norm": 0.24261245131492615,
      "learning_rate": 0.0001,
      "loss": 0.3678,
      "step": 926
    },
    {
      "epoch": 0.14832,
      "grad_norm": 0.19421418011188507,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 927
    },
    {
      "epoch": 0.14848,
      "grad_norm": 0.1841745674610138,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 928
    },
    {
      "epoch": 0.14864,
      "grad_norm": 0.22217802703380585,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 929
    },
    {
      "epoch": 0.1488,
      "grad_norm": 0.21924056112766266,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 930
    },
    {
      "epoch": 0.14896,
      "grad_norm": 0.26135191321372986,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 931
    },
    {
      "epoch": 0.14912,
      "grad_norm": 0.18730054795742035,
      "learning_rate": 0.0001,
      "loss": 0.3567,
      "step": 932
    },
    {
      "epoch": 0.14928,
      "grad_norm": 0.16307245194911957,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 933
    },
    {
      "epoch": 0.14944,
      "grad_norm": 0.22687499225139618,
      "learning_rate": 0.0001,
      "loss": 0.3504,
      "step": 934
    },
    {
      "epoch": 0.1496,
      "grad_norm": 0.21552449464797974,
      "learning_rate": 0.0001,
      "loss": 0.349,
      "step": 935
    },
    {
      "epoch": 0.14976,
      "grad_norm": 0.23769471049308777,
      "learning_rate": 0.0001,
      "loss": 0.3548,
      "step": 936
    },
    {
      "epoch": 0.14992,
      "grad_norm": 0.17871981859207153,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 937
    },
    {
      "epoch": 0.15008,
      "grad_norm": 0.1779036670923233,
      "learning_rate": 0.0001,
      "loss": 0.3502,
      "step": 938
    },
    {
      "epoch": 0.15024,
      "grad_norm": 0.21535514295101166,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 939
    },
    {
      "epoch": 0.1504,
      "grad_norm": 0.1814824342727661,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 940
    },
    {
      "epoch": 0.15056,
      "grad_norm": 0.17478875815868378,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 941
    },
    {
      "epoch": 0.15072,
      "grad_norm": 0.19970695674419403,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 942
    },
    {
      "epoch": 0.15088,
      "grad_norm": 0.15970586240291595,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 943
    },
    {
      "epoch": 0.15104,
      "grad_norm": 0.18441493809223175,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 944
    },
    {
      "epoch": 0.1512,
      "grad_norm": 0.16454492509365082,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 945
    },
    {
      "epoch": 0.15136,
      "grad_norm": 0.18136897683143616,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 946
    },
    {
      "epoch": 0.15152,
      "grad_norm": 0.16185952723026276,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 947
    },
    {
      "epoch": 0.15168,
      "grad_norm": 0.16662263870239258,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 948
    },
    {
      "epoch": 0.15184,
      "grad_norm": 0.23648451268672943,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 949
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.1888689547777176,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 950
    },
    {
      "epoch": 0.15216,
      "grad_norm": 0.22592909634113312,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 951
    },
    {
      "epoch": 0.15232,
      "grad_norm": 0.17753884196281433,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 952
    },
    {
      "epoch": 0.15248,
      "grad_norm": 0.1774575114250183,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 953
    },
    {
      "epoch": 0.15264,
      "grad_norm": 0.1729169636964798,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 954
    },
    {
      "epoch": 0.1528,
      "grad_norm": 0.18346144258975983,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 955
    },
    {
      "epoch": 0.15296,
      "grad_norm": 0.1793409138917923,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 956
    },
    {
      "epoch": 0.15312,
      "grad_norm": 0.20661698281764984,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 957
    },
    {
      "epoch": 0.15328,
      "grad_norm": 0.20278483629226685,
      "learning_rate": 0.0001,
      "loss": 0.3568,
      "step": 958
    },
    {
      "epoch": 0.15344,
      "grad_norm": 0.19667337834835052,
      "learning_rate": 0.0001,
      "loss": 0.3623,
      "step": 959
    },
    {
      "epoch": 0.1536,
      "grad_norm": 0.3125147521495819,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 960
    },
    {
      "epoch": 0.15376,
      "grad_norm": 0.19071251153945923,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 961
    },
    {
      "epoch": 0.15392,
      "grad_norm": 0.17748939990997314,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 962
    },
    {
      "epoch": 0.15408,
      "grad_norm": 0.2232213020324707,
      "learning_rate": 0.0001,
      "loss": 0.3593,
      "step": 963
    },
    {
      "epoch": 0.15424,
      "grad_norm": 0.19005408883094788,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 964
    },
    {
      "epoch": 0.1544,
      "grad_norm": 0.18351802229881287,
      "learning_rate": 0.0001,
      "loss": 0.3539,
      "step": 965
    },
    {
      "epoch": 0.15456,
      "grad_norm": 0.19078445434570312,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 966
    },
    {
      "epoch": 0.15472,
      "grad_norm": 0.1974067986011505,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 967
    },
    {
      "epoch": 0.15488,
      "grad_norm": 0.18392105400562286,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 968
    },
    {
      "epoch": 0.15504,
      "grad_norm": 0.16346190869808197,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 969
    },
    {
      "epoch": 0.1552,
      "grad_norm": 0.18922393023967743,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 970
    },
    {
      "epoch": 0.15536,
      "grad_norm": 0.17608068883419037,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 971
    },
    {
      "epoch": 0.15552,
      "grad_norm": 0.2107500582933426,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 972
    },
    {
      "epoch": 0.15568,
      "grad_norm": 0.20080837607383728,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 973
    },
    {
      "epoch": 0.15584,
      "grad_norm": 0.23169958591461182,
      "learning_rate": 0.0001,
      "loss": 0.3474,
      "step": 974
    },
    {
      "epoch": 0.156,
      "grad_norm": 0.16661199927330017,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 975
    },
    {
      "epoch": 0.15616,
      "grad_norm": 0.18745633959770203,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 976
    },
    {
      "epoch": 0.15632,
      "grad_norm": 0.1813899278640747,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 977
    },
    {
      "epoch": 0.15648,
      "grad_norm": 0.25081154704093933,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 978
    },
    {
      "epoch": 0.15664,
      "grad_norm": 0.20528054237365723,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 979
    },
    {
      "epoch": 0.1568,
      "grad_norm": 0.19333277642726898,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 980
    },
    {
      "epoch": 0.15696,
      "grad_norm": 0.16347748041152954,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 981
    },
    {
      "epoch": 0.15712,
      "grad_norm": 0.1731230765581131,
      "learning_rate": 0.0001,
      "loss": 0.3526,
      "step": 982
    },
    {
      "epoch": 0.15728,
      "grad_norm": 0.22126510739326477,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 983
    },
    {
      "epoch": 0.15744,
      "grad_norm": 0.19949251413345337,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 984
    },
    {
      "epoch": 0.1576,
      "grad_norm": 0.2502773702144623,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 985
    },
    {
      "epoch": 0.15776,
      "grad_norm": 0.19153191149234772,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 986
    },
    {
      "epoch": 0.15792,
      "grad_norm": 0.2041664570569992,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 987
    },
    {
      "epoch": 0.15808,
      "grad_norm": 0.2947552502155304,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 988
    },
    {
      "epoch": 0.15824,
      "grad_norm": 0.16818305850028992,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 989
    },
    {
      "epoch": 0.1584,
      "grad_norm": 0.22044788300991058,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 990
    },
    {
      "epoch": 0.15856,
      "grad_norm": 0.24190029501914978,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 991
    },
    {
      "epoch": 0.15872,
      "grad_norm": 0.1715463399887085,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 992
    },
    {
      "epoch": 0.15888,
      "grad_norm": 0.18063242733478546,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 993
    },
    {
      "epoch": 0.15904,
      "grad_norm": 0.19807235896587372,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 994
    },
    {
      "epoch": 0.1592,
      "grad_norm": 0.18736979365348816,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 995
    },
    {
      "epoch": 0.15936,
      "grad_norm": 0.208875834941864,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 996
    },
    {
      "epoch": 0.15952,
      "grad_norm": 0.19879968464374542,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 997
    },
    {
      "epoch": 0.15968,
      "grad_norm": 0.1727326363325119,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 998
    },
    {
      "epoch": 0.15984,
      "grad_norm": 0.17007267475128174,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 999
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.17635810375213623,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "eval_train_accuracy": 0.5006,
      "eval_train_loss": 0.3436990976333618,
      "eval_train_runtime": 4.3938,
      "eval_train_samples_per_second": 1137.955,
      "eval_train_steps_per_second": 14.338,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "eval_test_accuracy": 0.5058,
      "eval_test_loss": 0.3420068621635437,
      "eval_test_runtime": 5.1344,
      "eval_test_samples_per_second": 973.816,
      "eval_test_steps_per_second": 12.27,
      "step": 1000
    },
    {
      "epoch": 0.16016,
      "grad_norm": 0.19582758843898773,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 1001
    },
    {
      "epoch": 0.16032,
      "grad_norm": 0.20136098563671112,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1002
    },
    {
      "epoch": 0.16048,
      "grad_norm": 0.18023210763931274,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 1003
    },
    {
      "epoch": 0.16064,
      "grad_norm": 0.25801077485084534,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1004
    },
    {
      "epoch": 0.1608,
      "grad_norm": 0.1754879355430603,
      "learning_rate": 0.0001,
      "loss": 0.3494,
      "step": 1005
    },
    {
      "epoch": 0.16096,
      "grad_norm": 0.2186407893896103,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 1006
    },
    {
      "epoch": 0.16112,
      "grad_norm": 0.18762868642807007,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1007
    },
    {
      "epoch": 0.16128,
      "grad_norm": 0.1869063675403595,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1008
    },
    {
      "epoch": 0.16144,
      "grad_norm": 0.16134551167488098,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1009
    },
    {
      "epoch": 0.1616,
      "grad_norm": 0.16189199686050415,
      "learning_rate": 0.0001,
      "loss": 0.3471,
      "step": 1010
    },
    {
      "epoch": 0.16176,
      "grad_norm": 0.18954254686832428,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 1011
    },
    {
      "epoch": 0.16192,
      "grad_norm": 0.2022758275270462,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1012
    },
    {
      "epoch": 0.16208,
      "grad_norm": 0.22041822969913483,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 1013
    },
    {
      "epoch": 0.16224,
      "grad_norm": 0.18650783598423004,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 1014
    },
    {
      "epoch": 0.1624,
      "grad_norm": 0.1547229290008545,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 1015
    },
    {
      "epoch": 0.16256,
      "grad_norm": 0.16171064972877502,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 1016
    },
    {
      "epoch": 0.16272,
      "grad_norm": 0.18691277503967285,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 1017
    },
    {
      "epoch": 0.16288,
      "grad_norm": 0.14980165660381317,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1018
    },
    {
      "epoch": 0.16304,
      "grad_norm": 0.17838743329048157,
      "learning_rate": 0.0001,
      "loss": 0.3501,
      "step": 1019
    },
    {
      "epoch": 0.1632,
      "grad_norm": 0.1704004555940628,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 1020
    },
    {
      "epoch": 0.16336,
      "grad_norm": 0.18347671627998352,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 1021
    },
    {
      "epoch": 0.16352,
      "grad_norm": 0.18200716376304626,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1022
    },
    {
      "epoch": 0.16368,
      "grad_norm": 0.1625155657529831,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 1023
    },
    {
      "epoch": 0.16384,
      "grad_norm": 0.23420952260494232,
      "learning_rate": 0.0001,
      "loss": 0.3559,
      "step": 1024
    },
    {
      "epoch": 0.164,
      "grad_norm": 0.169838547706604,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 1025
    },
    {
      "epoch": 0.16416,
      "grad_norm": 0.19425900280475616,
      "learning_rate": 0.0001,
      "loss": 0.3543,
      "step": 1026
    },
    {
      "epoch": 0.16432,
      "grad_norm": 0.1562505066394806,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1027
    },
    {
      "epoch": 0.16448,
      "grad_norm": 0.19351676106452942,
      "learning_rate": 0.0001,
      "loss": 0.3604,
      "step": 1028
    },
    {
      "epoch": 0.16464,
      "grad_norm": 0.1957942545413971,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 1029
    },
    {
      "epoch": 0.1648,
      "grad_norm": 0.1903703212738037,
      "learning_rate": 0.0001,
      "loss": 0.3704,
      "step": 1030
    },
    {
      "epoch": 0.16496,
      "grad_norm": 0.17135275900363922,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1031
    },
    {
      "epoch": 0.16512,
      "grad_norm": 0.2317161113023758,
      "learning_rate": 0.0001,
      "loss": 0.3564,
      "step": 1032
    },
    {
      "epoch": 0.16528,
      "grad_norm": 0.1874457448720932,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1033
    },
    {
      "epoch": 0.16544,
      "grad_norm": 0.16207431256771088,
      "learning_rate": 0.0001,
      "loss": 0.3515,
      "step": 1034
    },
    {
      "epoch": 0.1656,
      "grad_norm": 0.18179942667484283,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 1035
    },
    {
      "epoch": 0.16576,
      "grad_norm": 0.21734675765037537,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1036
    },
    {
      "epoch": 0.16592,
      "grad_norm": 0.22533714771270752,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1037
    },
    {
      "epoch": 0.16608,
      "grad_norm": 0.1763220727443695,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 1038
    },
    {
      "epoch": 0.16624,
      "grad_norm": 0.21004176139831543,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1039
    },
    {
      "epoch": 0.1664,
      "grad_norm": 0.23613499104976654,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1040
    },
    {
      "epoch": 0.16656,
      "grad_norm": 0.21579056978225708,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 1041
    },
    {
      "epoch": 0.16672,
      "grad_norm": 0.16333529353141785,
      "learning_rate": 0.0001,
      "loss": 0.3587,
      "step": 1042
    },
    {
      "epoch": 0.16688,
      "grad_norm": 0.3273988664150238,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 1043
    },
    {
      "epoch": 0.16704,
      "grad_norm": 0.30243512988090515,
      "learning_rate": 0.0001,
      "loss": 0.3598,
      "step": 1044
    },
    {
      "epoch": 0.1672,
      "grad_norm": 0.207128643989563,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 1045
    },
    {
      "epoch": 0.16736,
      "grad_norm": 0.16787733137607574,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1046
    },
    {
      "epoch": 0.16752,
      "grad_norm": 0.15756650269031525,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 1047
    },
    {
      "epoch": 0.16768,
      "grad_norm": 0.26226046681404114,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1048
    },
    {
      "epoch": 0.16784,
      "grad_norm": 0.2679958641529083,
      "learning_rate": 0.0001,
      "loss": 0.3565,
      "step": 1049
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.19210883975028992,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 1050
    },
    {
      "epoch": 0.16816,
      "grad_norm": 0.19334715604782104,
      "learning_rate": 0.0001,
      "loss": 0.3523,
      "step": 1051
    },
    {
      "epoch": 0.16832,
      "grad_norm": 0.2251807153224945,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1052
    },
    {
      "epoch": 0.16848,
      "grad_norm": 0.1811414510011673,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1053
    },
    {
      "epoch": 0.16864,
      "grad_norm": 0.1910606175661087,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1054
    },
    {
      "epoch": 0.1688,
      "grad_norm": 0.21540206670761108,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1055
    },
    {
      "epoch": 0.16896,
      "grad_norm": 0.17727461457252502,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 1056
    },
    {
      "epoch": 0.16912,
      "grad_norm": 0.202310249209404,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1057
    },
    {
      "epoch": 0.16928,
      "grad_norm": 0.16262705624103546,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 1058
    },
    {
      "epoch": 0.16944,
      "grad_norm": 0.19868627190589905,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 1059
    },
    {
      "epoch": 0.1696,
      "grad_norm": 0.19643667340278625,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 1060
    },
    {
      "epoch": 0.16976,
      "grad_norm": 0.16601423919200897,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 1061
    },
    {
      "epoch": 0.16992,
      "grad_norm": 0.17985796928405762,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1062
    },
    {
      "epoch": 0.17008,
      "grad_norm": 0.2194170206785202,
      "learning_rate": 0.0001,
      "loss": 0.3571,
      "step": 1063
    },
    {
      "epoch": 0.17024,
      "grad_norm": 0.1804073601961136,
      "learning_rate": 0.0001,
      "loss": 0.3517,
      "step": 1064
    },
    {
      "epoch": 0.1704,
      "grad_norm": 0.17680679261684418,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1065
    },
    {
      "epoch": 0.17056,
      "grad_norm": 0.19961251318454742,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1066
    },
    {
      "epoch": 0.17072,
      "grad_norm": 0.16137424111366272,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1067
    },
    {
      "epoch": 0.17088,
      "grad_norm": 0.17494343221187592,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1068
    },
    {
      "epoch": 0.17104,
      "grad_norm": 0.2245008945465088,
      "learning_rate": 0.0001,
      "loss": 0.3564,
      "step": 1069
    },
    {
      "epoch": 0.1712,
      "grad_norm": 0.18811769783496857,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 1070
    },
    {
      "epoch": 0.17136,
      "grad_norm": 0.16596025228500366,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 1071
    },
    {
      "epoch": 0.17152,
      "grad_norm": 0.17693807184696198,
      "learning_rate": 0.0001,
      "loss": 0.3539,
      "step": 1072
    },
    {
      "epoch": 0.17168,
      "grad_norm": 0.22313521802425385,
      "learning_rate": 0.0001,
      "loss": 0.3474,
      "step": 1073
    },
    {
      "epoch": 0.17184,
      "grad_norm": 0.20134872198104858,
      "learning_rate": 0.0001,
      "loss": 0.3499,
      "step": 1074
    },
    {
      "epoch": 0.172,
      "grad_norm": 0.1569156050682068,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1075
    },
    {
      "epoch": 0.17216,
      "grad_norm": 0.16268756985664368,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 1076
    },
    {
      "epoch": 0.17232,
      "grad_norm": 0.2629736661911011,
      "learning_rate": 0.0001,
      "loss": 0.3526,
      "step": 1077
    },
    {
      "epoch": 0.17248,
      "grad_norm": 0.19521763920783997,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1078
    },
    {
      "epoch": 0.17264,
      "grad_norm": 0.18040715157985687,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 1079
    },
    {
      "epoch": 0.1728,
      "grad_norm": 0.24657301604747772,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1080
    },
    {
      "epoch": 0.17296,
      "grad_norm": 0.1691116839647293,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 1081
    },
    {
      "epoch": 0.17312,
      "grad_norm": 0.14840173721313477,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1082
    },
    {
      "epoch": 0.17328,
      "grad_norm": 0.20563654601573944,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 1083
    },
    {
      "epoch": 0.17344,
      "grad_norm": 0.15264169871807098,
      "learning_rate": 0.0001,
      "loss": 0.3692,
      "step": 1084
    },
    {
      "epoch": 0.1736,
      "grad_norm": 0.17373594641685486,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 1085
    },
    {
      "epoch": 0.17376,
      "grad_norm": 0.2020777314901352,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1086
    },
    {
      "epoch": 0.17392,
      "grad_norm": 0.18907542526721954,
      "learning_rate": 0.0001,
      "loss": 0.3552,
      "step": 1087
    },
    {
      "epoch": 0.17408,
      "grad_norm": 0.14946889877319336,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1088
    },
    {
      "epoch": 0.17424,
      "grad_norm": 0.1575455516576767,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1089
    },
    {
      "epoch": 0.1744,
      "grad_norm": 0.21408498287200928,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 1090
    },
    {
      "epoch": 0.17456,
      "grad_norm": 0.17296898365020752,
      "learning_rate": 0.0001,
      "loss": 0.3622,
      "step": 1091
    },
    {
      "epoch": 0.17472,
      "grad_norm": 0.1595555543899536,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 1092
    },
    {
      "epoch": 0.17488,
      "grad_norm": 0.17801637947559357,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1093
    },
    {
      "epoch": 0.17504,
      "grad_norm": 0.18506115674972534,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1094
    },
    {
      "epoch": 0.1752,
      "grad_norm": 0.16824395954608917,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1095
    },
    {
      "epoch": 0.17536,
      "grad_norm": 0.1988145112991333,
      "learning_rate": 0.0001,
      "loss": 0.3552,
      "step": 1096
    },
    {
      "epoch": 0.17552,
      "grad_norm": 0.1809283345937729,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1097
    },
    {
      "epoch": 0.17568,
      "grad_norm": 0.21060581505298615,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1098
    },
    {
      "epoch": 0.17584,
      "grad_norm": 0.18621636927127838,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 1099
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.165646031498909,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1100
    },
    {
      "epoch": 0.176,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3399915099143982,
      "eval_train_runtime": 4.109,
      "eval_train_samples_per_second": 1216.849,
      "eval_train_steps_per_second": 15.332,
      "step": 1100
    },
    {
      "epoch": 0.176,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3385678231716156,
      "eval_test_runtime": 4.8842,
      "eval_test_samples_per_second": 1023.713,
      "eval_test_steps_per_second": 12.899,
      "step": 1100
    },
    {
      "epoch": 0.17616,
      "grad_norm": 0.1688407063484192,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 1101
    },
    {
      "epoch": 0.17632,
      "grad_norm": 0.17853035032749176,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1102
    },
    {
      "epoch": 0.17648,
      "grad_norm": 0.15547126531600952,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 1103
    },
    {
      "epoch": 0.17664,
      "grad_norm": 0.1621035784482956,
      "learning_rate": 0.0001,
      "loss": 0.3556,
      "step": 1104
    },
    {
      "epoch": 0.1768,
      "grad_norm": 0.1580369621515274,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1105
    },
    {
      "epoch": 0.17696,
      "grad_norm": 0.15469305217266083,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 1106
    },
    {
      "epoch": 0.17712,
      "grad_norm": 0.15428408980369568,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1107
    },
    {
      "epoch": 0.17728,
      "grad_norm": 0.15424707531929016,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1108
    },
    {
      "epoch": 0.17744,
      "grad_norm": 0.22019334137439728,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 1109
    },
    {
      "epoch": 0.1776,
      "grad_norm": 0.15787528455257416,
      "learning_rate": 0.0001,
      "loss": 0.3579,
      "step": 1110
    },
    {
      "epoch": 0.17776,
      "grad_norm": 0.1827142834663391,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1111
    },
    {
      "epoch": 0.17792,
      "grad_norm": 0.16550588607788086,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 1112
    },
    {
      "epoch": 0.17808,
      "grad_norm": 0.15931186079978943,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 1113
    },
    {
      "epoch": 0.17824,
      "grad_norm": 0.1649743616580963,
      "learning_rate": 0.0001,
      "loss": 0.3575,
      "step": 1114
    },
    {
      "epoch": 0.1784,
      "grad_norm": 0.15173587203025818,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1115
    },
    {
      "epoch": 0.17856,
      "grad_norm": 0.145090714097023,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1116
    },
    {
      "epoch": 0.17872,
      "grad_norm": 0.16782334446907043,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1117
    },
    {
      "epoch": 0.17888,
      "grad_norm": 0.1573304831981659,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 1118
    },
    {
      "epoch": 0.17904,
      "grad_norm": 0.15199744701385498,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 1119
    },
    {
      "epoch": 0.1792,
      "grad_norm": 0.17407268285751343,
      "learning_rate": 0.0001,
      "loss": 0.3537,
      "step": 1120
    },
    {
      "epoch": 0.17936,
      "grad_norm": 0.28419917821884155,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 1121
    },
    {
      "epoch": 0.17952,
      "grad_norm": 0.16604755818843842,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1122
    },
    {
      "epoch": 0.17968,
      "grad_norm": 0.14467447996139526,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 1123
    },
    {
      "epoch": 0.17984,
      "grad_norm": 0.16320589184761047,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 1124
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.18813647329807281,
      "learning_rate": 0.0001,
      "loss": 0.3478,
      "step": 1125
    },
    {
      "epoch": 0.18016,
      "grad_norm": 0.23420193791389465,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 1126
    },
    {
      "epoch": 0.18032,
      "grad_norm": 0.1829865276813507,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1127
    },
    {
      "epoch": 0.18048,
      "grad_norm": 0.19168297946453094,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 1128
    },
    {
      "epoch": 0.18064,
      "grad_norm": 0.16320550441741943,
      "learning_rate": 0.0001,
      "loss": 0.3509,
      "step": 1129
    },
    {
      "epoch": 0.1808,
      "grad_norm": 0.24642056226730347,
      "learning_rate": 0.0001,
      "loss": 0.3519,
      "step": 1130
    },
    {
      "epoch": 0.18096,
      "grad_norm": 0.15512308478355408,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 1131
    },
    {
      "epoch": 0.18112,
      "grad_norm": 0.19531993567943573,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1132
    },
    {
      "epoch": 0.18128,
      "grad_norm": 0.1750439554452896,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1133
    },
    {
      "epoch": 0.18144,
      "grad_norm": 0.2157069593667984,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 1134
    },
    {
      "epoch": 0.1816,
      "grad_norm": 0.17110233008861542,
      "learning_rate": 0.0001,
      "loss": 0.3518,
      "step": 1135
    },
    {
      "epoch": 0.18176,
      "grad_norm": 0.18203617632389069,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1136
    },
    {
      "epoch": 0.18192,
      "grad_norm": 0.18028263747692108,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 1137
    },
    {
      "epoch": 0.18208,
      "grad_norm": 0.18453054130077362,
      "learning_rate": 0.0001,
      "loss": 0.3546,
      "step": 1138
    },
    {
      "epoch": 0.18224,
      "grad_norm": 0.18318861722946167,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1139
    },
    {
      "epoch": 0.1824,
      "grad_norm": 0.15263943374156952,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 1140
    },
    {
      "epoch": 0.18256,
      "grad_norm": 0.1732325702905655,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1141
    },
    {
      "epoch": 0.18272,
      "grad_norm": 0.1783369928598404,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1142
    },
    {
      "epoch": 0.18288,
      "grad_norm": 0.13167981803417206,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 1143
    },
    {
      "epoch": 0.18304,
      "grad_norm": 0.16243314743041992,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1144
    },
    {
      "epoch": 0.1832,
      "grad_norm": 0.16786183416843414,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 1145
    },
    {
      "epoch": 0.18336,
      "grad_norm": 0.15916025638580322,
      "learning_rate": 0.0001,
      "loss": 0.3499,
      "step": 1146
    },
    {
      "epoch": 0.18352,
      "grad_norm": 0.1601344347000122,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1147
    },
    {
      "epoch": 0.18368,
      "grad_norm": 0.13829956948757172,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 1148
    },
    {
      "epoch": 0.18384,
      "grad_norm": 0.14376969635486603,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1149
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.1684909611940384,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 1150
    },
    {
      "epoch": 0.18416,
      "grad_norm": 0.1824931800365448,
      "learning_rate": 0.0001,
      "loss": 0.3584,
      "step": 1151
    },
    {
      "epoch": 0.18432,
      "grad_norm": 0.16766010224819183,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 1152
    },
    {
      "epoch": 0.18448,
      "grad_norm": 0.1507328450679779,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1153
    },
    {
      "epoch": 0.18464,
      "grad_norm": 0.1492300033569336,
      "learning_rate": 0.0001,
      "loss": 0.3497,
      "step": 1154
    },
    {
      "epoch": 0.1848,
      "grad_norm": 0.265621542930603,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1155
    },
    {
      "epoch": 0.18496,
      "grad_norm": 0.17512193322181702,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1156
    },
    {
      "epoch": 0.18512,
      "grad_norm": 0.15729700028896332,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1157
    },
    {
      "epoch": 0.18528,
      "grad_norm": 0.1769946962594986,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1158
    },
    {
      "epoch": 0.18544,
      "grad_norm": 0.15735942125320435,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 1159
    },
    {
      "epoch": 0.1856,
      "grad_norm": 0.1609802395105362,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 1160
    },
    {
      "epoch": 0.18576,
      "grad_norm": 0.18271605670452118,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1161
    },
    {
      "epoch": 0.18592,
      "grad_norm": 0.18330557644367218,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 1162
    },
    {
      "epoch": 0.18608,
      "grad_norm": 0.20421257615089417,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1163
    },
    {
      "epoch": 0.18624,
      "grad_norm": 0.1856660097837448,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1164
    },
    {
      "epoch": 0.1864,
      "grad_norm": 0.19023452699184418,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 1165
    },
    {
      "epoch": 0.18656,
      "grad_norm": 0.17435932159423828,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1166
    },
    {
      "epoch": 0.18672,
      "grad_norm": 0.17663873732089996,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 1167
    },
    {
      "epoch": 0.18688,
      "grad_norm": 0.14456574618816376,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 1168
    },
    {
      "epoch": 0.18704,
      "grad_norm": 0.16192200779914856,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1169
    },
    {
      "epoch": 0.1872,
      "grad_norm": 0.1753004491329193,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1170
    },
    {
      "epoch": 0.18736,
      "grad_norm": 0.1727987378835678,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1171
    },
    {
      "epoch": 0.18752,
      "grad_norm": 0.1311994343996048,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1172
    },
    {
      "epoch": 0.18768,
      "grad_norm": 0.17476850748062134,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1173
    },
    {
      "epoch": 0.18784,
      "grad_norm": 0.14670923352241516,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 1174
    },
    {
      "epoch": 0.188,
      "grad_norm": 0.21804901957511902,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1175
    },
    {
      "epoch": 0.18816,
      "grad_norm": 0.14368034899234772,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1176
    },
    {
      "epoch": 0.18832,
      "grad_norm": 0.1992107331752777,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1177
    },
    {
      "epoch": 0.18848,
      "grad_norm": 0.2416936308145523,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 1178
    },
    {
      "epoch": 0.18864,
      "grad_norm": 0.1532512605190277,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 1179
    },
    {
      "epoch": 0.1888,
      "grad_norm": 0.17259687185287476,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1180
    },
    {
      "epoch": 0.18896,
      "grad_norm": 0.211422860622406,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1181
    },
    {
      "epoch": 0.18912,
      "grad_norm": 0.17288430035114288,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 1182
    },
    {
      "epoch": 0.18928,
      "grad_norm": 0.145375058054924,
      "learning_rate": 0.0001,
      "loss": 0.3485,
      "step": 1183
    },
    {
      "epoch": 0.18944,
      "grad_norm": 0.19791996479034424,
      "learning_rate": 0.0001,
      "loss": 0.3519,
      "step": 1184
    },
    {
      "epoch": 0.1896,
      "grad_norm": 0.1836472600698471,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1185
    },
    {
      "epoch": 0.18976,
      "grad_norm": 0.18024569749832153,
      "learning_rate": 0.0001,
      "loss": 0.3656,
      "step": 1186
    },
    {
      "epoch": 0.18992,
      "grad_norm": 0.15432897210121155,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 1187
    },
    {
      "epoch": 0.19008,
      "grad_norm": 0.18302351236343384,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1188
    },
    {
      "epoch": 0.19024,
      "grad_norm": 0.16832447052001953,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 1189
    },
    {
      "epoch": 0.1904,
      "grad_norm": 0.15471775829792023,
      "learning_rate": 0.0001,
      "loss": 0.3471,
      "step": 1190
    },
    {
      "epoch": 0.19056,
      "grad_norm": 0.15327367186546326,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1191
    },
    {
      "epoch": 0.19072,
      "grad_norm": 0.15152838826179504,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 1192
    },
    {
      "epoch": 0.19088,
      "grad_norm": 0.1536240428686142,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 1193
    },
    {
      "epoch": 0.19104,
      "grad_norm": 0.15712231397628784,
      "learning_rate": 0.0001,
      "loss": 0.3482,
      "step": 1194
    },
    {
      "epoch": 0.1912,
      "grad_norm": 0.13885250687599182,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1195
    },
    {
      "epoch": 0.19136,
      "grad_norm": 0.1406497061252594,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1196
    },
    {
      "epoch": 0.19152,
      "grad_norm": 0.1707930862903595,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1197
    },
    {
      "epoch": 0.19168,
      "grad_norm": 0.15026576817035675,
      "learning_rate": 0.0001,
      "loss": 0.3589,
      "step": 1198
    },
    {
      "epoch": 0.19184,
      "grad_norm": 0.14731444418430328,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 1199
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.13710281252861023,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1200
    },
    {
      "epoch": 0.192,
      "eval_train_accuracy": 0.5054,
      "eval_train_loss": 0.33863818645477295,
      "eval_train_runtime": 4.1124,
      "eval_train_samples_per_second": 1215.827,
      "eval_train_steps_per_second": 15.319,
      "step": 1200
    },
    {
      "epoch": 0.192,
      "eval_test_accuracy": 0.512,
      "eval_test_loss": 0.33717358112335205,
      "eval_test_runtime": 5.0629,
      "eval_test_samples_per_second": 987.579,
      "eval_test_steps_per_second": 12.443,
      "step": 1200
    },
    {
      "epoch": 0.19216,
      "grad_norm": 0.14071989059448242,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 1201
    },
    {
      "epoch": 0.19232,
      "grad_norm": 0.14537836611270905,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1202
    },
    {
      "epoch": 0.19248,
      "grad_norm": 0.17306005954742432,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1203
    },
    {
      "epoch": 0.19264,
      "grad_norm": 0.13728007674217224,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1204
    },
    {
      "epoch": 0.1928,
      "grad_norm": 0.12836164236068726,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 1205
    },
    {
      "epoch": 0.19296,
      "grad_norm": 0.24385851621627808,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1206
    },
    {
      "epoch": 0.19312,
      "grad_norm": 0.13994547724723816,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1207
    },
    {
      "epoch": 0.19328,
      "grad_norm": 0.14610223472118378,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1208
    },
    {
      "epoch": 0.19344,
      "grad_norm": 0.20203928649425507,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 1209
    },
    {
      "epoch": 0.1936,
      "grad_norm": 0.16740138828754425,
      "learning_rate": 0.0001,
      "loss": 0.3496,
      "step": 1210
    },
    {
      "epoch": 0.19376,
      "grad_norm": 0.15395678579807281,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1211
    },
    {
      "epoch": 0.19392,
      "grad_norm": 0.15115824341773987,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1212
    },
    {
      "epoch": 0.19408,
      "grad_norm": 0.1685791164636612,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1213
    },
    {
      "epoch": 0.19424,
      "grad_norm": 0.14764544367790222,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1214
    },
    {
      "epoch": 0.1944,
      "grad_norm": 0.13787612318992615,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 1215
    },
    {
      "epoch": 0.19456,
      "grad_norm": 0.14346545934677124,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 1216
    },
    {
      "epoch": 0.19472,
      "grad_norm": 0.15415038168430328,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1217
    },
    {
      "epoch": 0.19488,
      "grad_norm": 0.12493813782930374,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 1218
    },
    {
      "epoch": 0.19504,
      "grad_norm": 0.15494564175605774,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 1219
    },
    {
      "epoch": 0.1952,
      "grad_norm": 0.1241389736533165,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1220
    },
    {
      "epoch": 0.19536,
      "grad_norm": 0.16359944641590118,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 1221
    },
    {
      "epoch": 0.19552,
      "grad_norm": 0.16506025195121765,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1222
    },
    {
      "epoch": 0.19568,
      "grad_norm": 0.1511077582836151,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 1223
    },
    {
      "epoch": 0.19584,
      "grad_norm": 0.15279589593410492,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1224
    },
    {
      "epoch": 0.196,
      "grad_norm": 0.15623435378074646,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 1225
    },
    {
      "epoch": 0.19616,
      "grad_norm": 0.1475309580564499,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 1226
    },
    {
      "epoch": 0.19632,
      "grad_norm": 0.15896008908748627,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 1227
    },
    {
      "epoch": 0.19648,
      "grad_norm": 0.16246004402637482,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 1228
    },
    {
      "epoch": 0.19664,
      "grad_norm": 0.13901714980602264,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 1229
    },
    {
      "epoch": 0.1968,
      "grad_norm": 0.15642955899238586,
      "learning_rate": 0.0001,
      "loss": 0.3527,
      "step": 1230
    },
    {
      "epoch": 0.19696,
      "grad_norm": 0.13721637427806854,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1231
    },
    {
      "epoch": 0.19712,
      "grad_norm": 0.13660092651844025,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 1232
    },
    {
      "epoch": 0.19728,
      "grad_norm": 0.1485360711812973,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 1233
    },
    {
      "epoch": 0.19744,
      "grad_norm": 0.18314479291439056,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1234
    },
    {
      "epoch": 0.1976,
      "grad_norm": 0.15623724460601807,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 1235
    },
    {
      "epoch": 0.19776,
      "grad_norm": 0.14707255363464355,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1236
    },
    {
      "epoch": 0.19792,
      "grad_norm": 0.1444503664970398,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1237
    },
    {
      "epoch": 0.19808,
      "grad_norm": 0.1687050312757492,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 1238
    },
    {
      "epoch": 0.19824,
      "grad_norm": 0.14055365324020386,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1239
    },
    {
      "epoch": 0.1984,
      "grad_norm": 0.15375490486621857,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1240
    },
    {
      "epoch": 0.19856,
      "grad_norm": 0.13139750063419342,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 1241
    },
    {
      "epoch": 0.19872,
      "grad_norm": 0.13838286697864532,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 1242
    },
    {
      "epoch": 0.19888,
      "grad_norm": 0.16730685532093048,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 1243
    },
    {
      "epoch": 0.19904,
      "grad_norm": 0.21576674282550812,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1244
    },
    {
      "epoch": 0.1992,
      "grad_norm": 0.15585865080356598,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1245
    },
    {
      "epoch": 0.19936,
      "grad_norm": 0.14645594358444214,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1246
    },
    {
      "epoch": 0.19952,
      "grad_norm": 0.16389742493629456,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 1247
    },
    {
      "epoch": 0.19968,
      "grad_norm": 0.13874377310276031,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 1248
    },
    {
      "epoch": 0.19984,
      "grad_norm": 0.15632376074790955,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 1249
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.2064441442489624,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 1250
    },
    {
      "epoch": 0.20016,
      "grad_norm": 0.18391503393650055,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 1251
    },
    {
      "epoch": 0.20032,
      "grad_norm": 0.14191322028636932,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 1252
    },
    {
      "epoch": 0.20048,
      "grad_norm": 0.302067369222641,
      "learning_rate": 0.0001,
      "loss": 0.3494,
      "step": 1253
    },
    {
      "epoch": 0.20064,
      "grad_norm": 0.16249264776706696,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1254
    },
    {
      "epoch": 0.2008,
      "grad_norm": 0.1577921360731125,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 1255
    },
    {
      "epoch": 0.20096,
      "grad_norm": 0.21798531711101532,
      "learning_rate": 0.0001,
      "loss": 0.351,
      "step": 1256
    },
    {
      "epoch": 0.20112,
      "grad_norm": 0.13767987489700317,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 1257
    },
    {
      "epoch": 0.20128,
      "grad_norm": 0.16842377185821533,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 1258
    },
    {
      "epoch": 0.20144,
      "grad_norm": 0.16253429651260376,
      "learning_rate": 0.0001,
      "loss": 0.3542,
      "step": 1259
    },
    {
      "epoch": 0.2016,
      "grad_norm": 0.139997199177742,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 1260
    },
    {
      "epoch": 0.20176,
      "grad_norm": 0.15638338029384613,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1261
    },
    {
      "epoch": 0.20192,
      "grad_norm": 0.15950116515159607,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1262
    },
    {
      "epoch": 0.20208,
      "grad_norm": 0.15027131140232086,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1263
    },
    {
      "epoch": 0.20224,
      "grad_norm": 0.15510833263397217,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 1264
    },
    {
      "epoch": 0.2024,
      "grad_norm": 0.1873425394296646,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1265
    },
    {
      "epoch": 0.20256,
      "grad_norm": 0.15529534220695496,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1266
    },
    {
      "epoch": 0.20272,
      "grad_norm": 0.14706599712371826,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 1267
    },
    {
      "epoch": 0.20288,
      "grad_norm": 0.13665220141410828,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 1268
    },
    {
      "epoch": 0.20304,
      "grad_norm": 0.1495194137096405,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 1269
    },
    {
      "epoch": 0.2032,
      "grad_norm": 0.12672223150730133,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 1270
    },
    {
      "epoch": 0.20336,
      "grad_norm": 0.14322258532047272,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1271
    },
    {
      "epoch": 0.20352,
      "grad_norm": 0.1295251101255417,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1272
    },
    {
      "epoch": 0.20368,
      "grad_norm": 0.15614710748195648,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1273
    },
    {
      "epoch": 0.20384,
      "grad_norm": 0.13507147133350372,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1274
    },
    {
      "epoch": 0.204,
      "grad_norm": 0.12869782745838165,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1275
    },
    {
      "epoch": 0.20416,
      "grad_norm": 0.1374729573726654,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1276
    },
    {
      "epoch": 0.20432,
      "grad_norm": 0.1323109269142151,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 1277
    },
    {
      "epoch": 0.20448,
      "grad_norm": 0.14252521097660065,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 1278
    },
    {
      "epoch": 0.20464,
      "grad_norm": 0.13048997521400452,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1279
    },
    {
      "epoch": 0.2048,
      "grad_norm": 0.1382599025964737,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 1280
    },
    {
      "epoch": 0.20496,
      "grad_norm": 0.13325725495815277,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 1281
    },
    {
      "epoch": 0.20512,
      "grad_norm": 0.1321745663881302,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1282
    },
    {
      "epoch": 0.20528,
      "grad_norm": 0.1247236579656601,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 1283
    },
    {
      "epoch": 0.20544,
      "grad_norm": 0.14792050421237946,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 1284
    },
    {
      "epoch": 0.2056,
      "grad_norm": 0.15847481787204742,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1285
    },
    {
      "epoch": 0.20576,
      "grad_norm": 0.1523532122373581,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1286
    },
    {
      "epoch": 0.20592,
      "grad_norm": 0.12638923525810242,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 1287
    },
    {
      "epoch": 0.20608,
      "grad_norm": 0.15995359420776367,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1288
    },
    {
      "epoch": 0.20624,
      "grad_norm": 0.15449431538581848,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1289
    },
    {
      "epoch": 0.2064,
      "grad_norm": 0.15375077724456787,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 1290
    },
    {
      "epoch": 0.20656,
      "grad_norm": 0.1646110862493515,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1291
    },
    {
      "epoch": 0.20672,
      "grad_norm": 0.14354869723320007,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1292
    },
    {
      "epoch": 0.20688,
      "grad_norm": 0.14020505547523499,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 1293
    },
    {
      "epoch": 0.20704,
      "grad_norm": 0.1284700185060501,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 1294
    },
    {
      "epoch": 0.2072,
      "grad_norm": 0.15232321619987488,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1295
    },
    {
      "epoch": 0.20736,
      "grad_norm": 0.1435210257768631,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1296
    },
    {
      "epoch": 0.20752,
      "grad_norm": 0.1441219002008438,
      "learning_rate": 0.0001,
      "loss": 0.3526,
      "step": 1297
    },
    {
      "epoch": 0.20768,
      "grad_norm": 0.1506175845861435,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 1298
    },
    {
      "epoch": 0.20784,
      "grad_norm": 0.13425830006599426,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 1299
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.14614009857177734,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 1300
    },
    {
      "epoch": 0.208,
      "eval_train_accuracy": 0.509,
      "eval_train_loss": 0.3369031548500061,
      "eval_train_runtime": 4.2773,
      "eval_train_samples_per_second": 1168.972,
      "eval_train_steps_per_second": 14.729,
      "step": 1300
    },
    {
      "epoch": 0.208,
      "eval_test_accuracy": 0.5016,
      "eval_test_loss": 0.3355863392353058,
      "eval_test_runtime": 4.7396,
      "eval_test_samples_per_second": 1054.935,
      "eval_test_steps_per_second": 13.292,
      "step": 1300
    },
    {
      "epoch": 0.20816,
      "grad_norm": 0.16589035093784332,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1301
    },
    {
      "epoch": 0.20832,
      "grad_norm": 0.14802880585193634,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1302
    },
    {
      "epoch": 0.20848,
      "grad_norm": 0.14259657263755798,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1303
    },
    {
      "epoch": 0.20864,
      "grad_norm": 0.1451888531446457,
      "learning_rate": 0.0001,
      "loss": 0.3475,
      "step": 1304
    },
    {
      "epoch": 0.2088,
      "grad_norm": 0.17640046775341034,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1305
    },
    {
      "epoch": 0.20896,
      "grad_norm": 0.18634860217571259,
      "learning_rate": 0.0001,
      "loss": 0.3569,
      "step": 1306
    },
    {
      "epoch": 0.20912,
      "grad_norm": 0.12025673687458038,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1307
    },
    {
      "epoch": 0.20928,
      "grad_norm": 0.14359912276268005,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1308
    },
    {
      "epoch": 0.20944,
      "grad_norm": 0.16112247109413147,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1309
    },
    {
      "epoch": 0.2096,
      "grad_norm": 0.1619076281785965,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 1310
    },
    {
      "epoch": 0.20976,
      "grad_norm": 0.19149042665958405,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1311
    },
    {
      "epoch": 0.20992,
      "grad_norm": 0.17731313407421112,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1312
    },
    {
      "epoch": 0.21008,
      "grad_norm": 0.12833864986896515,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1313
    },
    {
      "epoch": 0.21024,
      "grad_norm": 0.16470976173877716,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 1314
    },
    {
      "epoch": 0.2104,
      "grad_norm": 0.13275039196014404,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 1315
    },
    {
      "epoch": 0.21056,
      "grad_norm": 0.143116757273674,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 1316
    },
    {
      "epoch": 0.21072,
      "grad_norm": 0.27880725264549255,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1317
    },
    {
      "epoch": 0.21088,
      "grad_norm": 0.1581854373216629,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 1318
    },
    {
      "epoch": 0.21104,
      "grad_norm": 0.15622101724147797,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 1319
    },
    {
      "epoch": 0.2112,
      "grad_norm": 0.27468153834342957,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1320
    },
    {
      "epoch": 0.21136,
      "grad_norm": 0.1886935979127884,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1321
    },
    {
      "epoch": 0.21152,
      "grad_norm": 0.15454281866550446,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 1322
    },
    {
      "epoch": 0.21168,
      "grad_norm": 0.21755656599998474,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 1323
    },
    {
      "epoch": 0.21184,
      "grad_norm": 0.1342347264289856,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 1324
    },
    {
      "epoch": 0.212,
      "grad_norm": 0.19434955716133118,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1325
    },
    {
      "epoch": 0.21216,
      "grad_norm": 0.15550869703292847,
      "learning_rate": 0.0001,
      "loss": 0.3472,
      "step": 1326
    },
    {
      "epoch": 0.21232,
      "grad_norm": 0.14767824113368988,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1327
    },
    {
      "epoch": 0.21248,
      "grad_norm": 0.14507000148296356,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 1328
    },
    {
      "epoch": 0.21264,
      "grad_norm": 0.13989484310150146,
      "learning_rate": 0.0001,
      "loss": 0.3508,
      "step": 1329
    },
    {
      "epoch": 0.2128,
      "grad_norm": 0.14985476434230804,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1330
    },
    {
      "epoch": 0.21296,
      "grad_norm": 0.16202042996883392,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1331
    },
    {
      "epoch": 0.21312,
      "grad_norm": 0.14336322247982025,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 1332
    },
    {
      "epoch": 0.21328,
      "grad_norm": 0.1338959038257599,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1333
    },
    {
      "epoch": 0.21344,
      "grad_norm": 0.14502106606960297,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1334
    },
    {
      "epoch": 0.2136,
      "grad_norm": 0.18398793041706085,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1335
    },
    {
      "epoch": 0.21376,
      "grad_norm": 0.12756547331809998,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1336
    },
    {
      "epoch": 0.21392,
      "grad_norm": 0.12687455117702484,
      "learning_rate": 0.0001,
      "loss": 0.3465,
      "step": 1337
    },
    {
      "epoch": 0.21408,
      "grad_norm": 0.17269004881381989,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 1338
    },
    {
      "epoch": 0.21424,
      "grad_norm": 0.1705126166343689,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 1339
    },
    {
      "epoch": 0.2144,
      "grad_norm": 0.12563487887382507,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1340
    },
    {
      "epoch": 0.21456,
      "grad_norm": 0.1955445259809494,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1341
    },
    {
      "epoch": 0.21472,
      "grad_norm": 0.16594600677490234,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 1342
    },
    {
      "epoch": 0.21488,
      "grad_norm": 0.13023914396762848,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 1343
    },
    {
      "epoch": 0.21504,
      "grad_norm": 0.15277844667434692,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1344
    },
    {
      "epoch": 0.2152,
      "grad_norm": 0.14079879224300385,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 1345
    },
    {
      "epoch": 0.21536,
      "grad_norm": 0.18671786785125732,
      "learning_rate": 0.0001,
      "loss": 0.3543,
      "step": 1346
    },
    {
      "epoch": 0.21552,
      "grad_norm": 0.21253304183483124,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1347
    },
    {
      "epoch": 0.21568,
      "grad_norm": 0.13707447052001953,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1348
    },
    {
      "epoch": 0.21584,
      "grad_norm": 0.15478907525539398,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1349
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.40022340416908264,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 1350
    },
    {
      "epoch": 0.21616,
      "grad_norm": 0.19652584195137024,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 1351
    },
    {
      "epoch": 0.21632,
      "grad_norm": 0.33690395951271057,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 1352
    },
    {
      "epoch": 0.21648,
      "grad_norm": 0.17792119085788727,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 1353
    },
    {
      "epoch": 0.21664,
      "grad_norm": 0.1769525706768036,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1354
    },
    {
      "epoch": 0.2168,
      "grad_norm": 0.25765520334243774,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 1355
    },
    {
      "epoch": 0.21696,
      "grad_norm": 0.2061038762331009,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1356
    },
    {
      "epoch": 0.21712,
      "grad_norm": 0.14978955686092377,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1357
    },
    {
      "epoch": 0.21728,
      "grad_norm": 0.18457399308681488,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 1358
    },
    {
      "epoch": 0.21744,
      "grad_norm": 0.19252893328666687,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1359
    },
    {
      "epoch": 0.2176,
      "grad_norm": 0.2759505808353424,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 1360
    },
    {
      "epoch": 0.21776,
      "grad_norm": 0.17229962348937988,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 1361
    },
    {
      "epoch": 0.21792,
      "grad_norm": 0.15680500864982605,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1362
    },
    {
      "epoch": 0.21808,
      "grad_norm": 0.19203120470046997,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 1363
    },
    {
      "epoch": 0.21824,
      "grad_norm": 0.23416753113269806,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1364
    },
    {
      "epoch": 0.2184,
      "grad_norm": 0.1575872004032135,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1365
    },
    {
      "epoch": 0.21856,
      "grad_norm": 0.1432751566171646,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 1366
    },
    {
      "epoch": 0.21872,
      "grad_norm": 0.1579352617263794,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1367
    },
    {
      "epoch": 0.21888,
      "grad_norm": 0.24055303633213043,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 1368
    },
    {
      "epoch": 0.21904,
      "grad_norm": 0.26524537801742554,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1369
    },
    {
      "epoch": 0.2192,
      "grad_norm": 0.14915156364440918,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 1370
    },
    {
      "epoch": 0.21936,
      "grad_norm": 0.1807958483695984,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1371
    },
    {
      "epoch": 0.21952,
      "grad_norm": 0.12209012359380722,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 1372
    },
    {
      "epoch": 0.21968,
      "grad_norm": 0.20879730582237244,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1373
    },
    {
      "epoch": 0.21984,
      "grad_norm": 0.17652808129787445,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 1374
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.1492893099784851,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 1375
    },
    {
      "epoch": 0.22016,
      "grad_norm": 0.17264392971992493,
      "learning_rate": 0.0001,
      "loss": 0.3496,
      "step": 1376
    },
    {
      "epoch": 0.22032,
      "grad_norm": 0.21911738812923431,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 1377
    },
    {
      "epoch": 0.22048,
      "grad_norm": 0.1414344608783722,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1378
    },
    {
      "epoch": 0.22064,
      "grad_norm": 0.2043011486530304,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 1379
    },
    {
      "epoch": 0.2208,
      "grad_norm": 0.1826595962047577,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 1380
    },
    {
      "epoch": 0.22096,
      "grad_norm": 0.16527904570102692,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1381
    },
    {
      "epoch": 0.22112,
      "grad_norm": 0.14992523193359375,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1382
    },
    {
      "epoch": 0.22128,
      "grad_norm": 0.14343200623989105,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1383
    },
    {
      "epoch": 0.22144,
      "grad_norm": 0.19032588601112366,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 1384
    },
    {
      "epoch": 0.2216,
      "grad_norm": 0.15334686636924744,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 1385
    },
    {
      "epoch": 0.22176,
      "grad_norm": 0.16496890783309937,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 1386
    },
    {
      "epoch": 0.22192,
      "grad_norm": 0.14542897045612335,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1387
    },
    {
      "epoch": 0.22208,
      "grad_norm": 0.13110119104385376,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1388
    },
    {
      "epoch": 0.22224,
      "grad_norm": 0.1548253297805786,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 1389
    },
    {
      "epoch": 0.2224,
      "grad_norm": 0.14527317881584167,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 1390
    },
    {
      "epoch": 0.22256,
      "grad_norm": 0.1504576951265335,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1391
    },
    {
      "epoch": 0.22272,
      "grad_norm": 0.14832612872123718,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1392
    },
    {
      "epoch": 0.22288,
      "grad_norm": 0.13668379187583923,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1393
    },
    {
      "epoch": 0.22304,
      "grad_norm": 0.16750255227088928,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1394
    },
    {
      "epoch": 0.2232,
      "grad_norm": 0.14752139151096344,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 1395
    },
    {
      "epoch": 0.22336,
      "grad_norm": 0.14462770521640778,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1396
    },
    {
      "epoch": 0.22352,
      "grad_norm": 0.12813422083854675,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 1397
    },
    {
      "epoch": 0.22368,
      "grad_norm": 0.14461471140384674,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1398
    },
    {
      "epoch": 0.22384,
      "grad_norm": 0.12961521744728088,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 1399
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.15916183590888977,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 1400
    },
    {
      "epoch": 0.224,
      "eval_train_accuracy": 0.499,
      "eval_train_loss": 0.3360961377620697,
      "eval_train_runtime": 4.2704,
      "eval_train_samples_per_second": 1170.856,
      "eval_train_steps_per_second": 14.753,
      "step": 1400
    },
    {
      "epoch": 0.224,
      "eval_test_accuracy": 0.4948,
      "eval_test_loss": 0.33458495140075684,
      "eval_test_runtime": 4.828,
      "eval_test_samples_per_second": 1035.627,
      "eval_test_steps_per_second": 13.049,
      "step": 1400
    },
    {
      "epoch": 0.22416,
      "grad_norm": 0.23556406795978546,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1401
    },
    {
      "epoch": 0.22432,
      "grad_norm": 0.1176517978310585,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 1402
    },
    {
      "epoch": 0.22448,
      "grad_norm": 0.12412220984697342,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1403
    },
    {
      "epoch": 0.22464,
      "grad_norm": 0.15101900696754456,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 1404
    },
    {
      "epoch": 0.2248,
      "grad_norm": 0.36921122670173645,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 1405
    },
    {
      "epoch": 0.22496,
      "grad_norm": 0.1324876993894577,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1406
    },
    {
      "epoch": 0.22512,
      "grad_norm": 0.12472201883792877,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 1407
    },
    {
      "epoch": 0.22528,
      "grad_norm": 0.5403371453285217,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1408
    },
    {
      "epoch": 0.22544,
      "grad_norm": 0.22652655839920044,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 1409
    },
    {
      "epoch": 0.2256,
      "grad_norm": 0.21903331577777863,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 1410
    },
    {
      "epoch": 0.22576,
      "grad_norm": 0.21795791387557983,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 1411
    },
    {
      "epoch": 0.22592,
      "grad_norm": 0.18571047484874725,
      "learning_rate": 0.0001,
      "loss": 0.3521,
      "step": 1412
    },
    {
      "epoch": 0.22608,
      "grad_norm": 0.19862988591194153,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 1413
    },
    {
      "epoch": 0.22624,
      "grad_norm": 0.21028827130794525,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 1414
    },
    {
      "epoch": 0.2264,
      "grad_norm": 0.23988422751426697,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 1415
    },
    {
      "epoch": 0.22656,
      "grad_norm": 0.15910853445529938,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 1416
    },
    {
      "epoch": 0.22672,
      "grad_norm": 0.1418539136648178,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1417
    },
    {
      "epoch": 0.22688,
      "grad_norm": 0.21074774861335754,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1418
    },
    {
      "epoch": 0.22704,
      "grad_norm": 0.17671428620815277,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 1419
    },
    {
      "epoch": 0.2272,
      "grad_norm": 0.13856196403503418,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1420
    },
    {
      "epoch": 0.22736,
      "grad_norm": 0.14120273292064667,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 1421
    },
    {
      "epoch": 0.22752,
      "grad_norm": 0.1419226974248886,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1422
    },
    {
      "epoch": 0.22768,
      "grad_norm": 0.17494039237499237,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1423
    },
    {
      "epoch": 0.22784,
      "grad_norm": 0.1634264886379242,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1424
    },
    {
      "epoch": 0.228,
      "grad_norm": 0.14796499907970428,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 1425
    },
    {
      "epoch": 0.22816,
      "grad_norm": 0.1279241293668747,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1426
    },
    {
      "epoch": 0.22832,
      "grad_norm": 0.16425256431102753,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1427
    },
    {
      "epoch": 0.22848,
      "grad_norm": 0.14771923422813416,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 1428
    },
    {
      "epoch": 0.22864,
      "grad_norm": 0.14826230704784393,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1429
    },
    {
      "epoch": 0.2288,
      "grad_norm": 0.13730646669864655,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1430
    },
    {
      "epoch": 0.22896,
      "grad_norm": 0.23308639228343964,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 1431
    },
    {
      "epoch": 0.22912,
      "grad_norm": 0.15650655329227448,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1432
    },
    {
      "epoch": 0.22928,
      "grad_norm": 0.12739461660385132,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 1433
    },
    {
      "epoch": 0.22944,
      "grad_norm": 0.17468243837356567,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 1434
    },
    {
      "epoch": 0.2296,
      "grad_norm": 0.1489018052816391,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1435
    },
    {
      "epoch": 0.22976,
      "grad_norm": 0.15209290385246277,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 1436
    },
    {
      "epoch": 0.22992,
      "grad_norm": 0.1553778201341629,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1437
    },
    {
      "epoch": 0.23008,
      "grad_norm": 0.12103312462568283,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 1438
    },
    {
      "epoch": 0.23024,
      "grad_norm": 0.146159365773201,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1439
    },
    {
      "epoch": 0.2304,
      "grad_norm": 0.13945160806179047,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 1440
    },
    {
      "epoch": 0.23056,
      "grad_norm": 0.12469269335269928,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1441
    },
    {
      "epoch": 0.23072,
      "grad_norm": 0.16207394003868103,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 1442
    },
    {
      "epoch": 0.23088,
      "grad_norm": 0.16550901532173157,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1443
    },
    {
      "epoch": 0.23104,
      "grad_norm": 0.13263271749019623,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 1444
    },
    {
      "epoch": 0.2312,
      "grad_norm": 0.15966950356960297,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 1445
    },
    {
      "epoch": 0.23136,
      "grad_norm": 0.13314932584762573,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 1446
    },
    {
      "epoch": 0.23152,
      "grad_norm": 0.13887520134449005,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 1447
    },
    {
      "epoch": 0.23168,
      "grad_norm": 0.1403239667415619,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1448
    },
    {
      "epoch": 0.23184,
      "grad_norm": 0.14480292797088623,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 1449
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.14149031043052673,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 1450
    },
    {
      "epoch": 0.23216,
      "grad_norm": 0.16664698719978333,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 1451
    },
    {
      "epoch": 0.23232,
      "grad_norm": 0.14580227434635162,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1452
    },
    {
      "epoch": 0.23248,
      "grad_norm": 0.19171930849552155,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1453
    },
    {
      "epoch": 0.23264,
      "grad_norm": 0.15136145055294037,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 1454
    },
    {
      "epoch": 0.2328,
      "grad_norm": 0.13474975526332855,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1455
    },
    {
      "epoch": 0.23296,
      "grad_norm": 0.16057798266410828,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1456
    },
    {
      "epoch": 0.23312,
      "grad_norm": 0.1375403106212616,
      "learning_rate": 0.0001,
      "loss": 0.3496,
      "step": 1457
    },
    {
      "epoch": 0.23328,
      "grad_norm": 0.1440654844045639,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1458
    },
    {
      "epoch": 0.23344,
      "grad_norm": 0.13099108636379242,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 1459
    },
    {
      "epoch": 0.2336,
      "grad_norm": 0.14508478343486786,
      "learning_rate": 0.0001,
      "loss": 0.3528,
      "step": 1460
    },
    {
      "epoch": 0.23376,
      "grad_norm": 0.15294037759304047,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 1461
    },
    {
      "epoch": 0.23392,
      "grad_norm": 0.12753276526927948,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1462
    },
    {
      "epoch": 0.23408,
      "grad_norm": 0.1394745409488678,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 1463
    },
    {
      "epoch": 0.23424,
      "grad_norm": 0.1452183723449707,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1464
    },
    {
      "epoch": 0.2344,
      "grad_norm": 0.1470375955104828,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 1465
    },
    {
      "epoch": 0.23456,
      "grad_norm": 0.12998738884925842,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 1466
    },
    {
      "epoch": 0.23472,
      "grad_norm": 0.12540212273597717,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 1467
    },
    {
      "epoch": 0.23488,
      "grad_norm": 0.14241288602352142,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 1468
    },
    {
      "epoch": 0.23504,
      "grad_norm": 0.13712717592716217,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1469
    },
    {
      "epoch": 0.2352,
      "grad_norm": 0.12628312408924103,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 1470
    },
    {
      "epoch": 0.23536,
      "grad_norm": 0.1282891035079956,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 1471
    },
    {
      "epoch": 0.23552,
      "grad_norm": 0.1401742398738861,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1472
    },
    {
      "epoch": 0.23568,
      "grad_norm": 0.1285592019557953,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 1473
    },
    {
      "epoch": 0.23584,
      "grad_norm": 0.12388540059328079,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 1474
    },
    {
      "epoch": 0.236,
      "grad_norm": 0.1542786955833435,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 1475
    },
    {
      "epoch": 0.23616,
      "grad_norm": 0.2461165338754654,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 1476
    },
    {
      "epoch": 0.23632,
      "grad_norm": 0.15182122588157654,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1477
    },
    {
      "epoch": 0.23648,
      "grad_norm": 0.11799311637878418,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1478
    },
    {
      "epoch": 0.23664,
      "grad_norm": 0.2708788514137268,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1479
    },
    {
      "epoch": 0.2368,
      "grad_norm": 0.1472320258617401,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1480
    },
    {
      "epoch": 0.23696,
      "grad_norm": 0.16443784534931183,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1481
    },
    {
      "epoch": 0.23712,
      "grad_norm": 0.21590833365917206,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1482
    },
    {
      "epoch": 0.23728,
      "grad_norm": 0.1591482013463974,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1483
    },
    {
      "epoch": 0.23744,
      "grad_norm": 0.12654829025268555,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 1484
    },
    {
      "epoch": 0.2376,
      "grad_norm": 0.16209504008293152,
      "learning_rate": 0.0001,
      "loss": 0.3506,
      "step": 1485
    },
    {
      "epoch": 0.23776,
      "grad_norm": 0.15230296552181244,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1486
    },
    {
      "epoch": 0.23792,
      "grad_norm": 0.17124783992767334,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1487
    },
    {
      "epoch": 0.23808,
      "grad_norm": 0.1695893406867981,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 1488
    },
    {
      "epoch": 0.23824,
      "grad_norm": 0.14210785925388336,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1489
    },
    {
      "epoch": 0.2384,
      "grad_norm": 0.15628553926944733,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1490
    },
    {
      "epoch": 0.23856,
      "grad_norm": 0.12292546778917313,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1491
    },
    {
      "epoch": 0.23872,
      "grad_norm": 0.1632077544927597,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1492
    },
    {
      "epoch": 0.23888,
      "grad_norm": 0.13325631618499756,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 1493
    },
    {
      "epoch": 0.23904,
      "grad_norm": 0.12743651866912842,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 1494
    },
    {
      "epoch": 0.2392,
      "grad_norm": 0.11553977429866791,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1495
    },
    {
      "epoch": 0.23936,
      "grad_norm": 0.1239985004067421,
      "learning_rate": 0.0001,
      "loss": 0.3504,
      "step": 1496
    },
    {
      "epoch": 0.23952,
      "grad_norm": 0.1711837500333786,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 1497
    },
    {
      "epoch": 0.23968,
      "grad_norm": 0.1359114795923233,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 1498
    },
    {
      "epoch": 0.23984,
      "grad_norm": 0.14755640923976898,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1499
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.14491814374923706,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "eval_train_accuracy": 0.517,
      "eval_train_loss": 0.33512556552886963,
      "eval_train_runtime": 4.0975,
      "eval_train_samples_per_second": 1220.25,
      "eval_train_steps_per_second": 15.375,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "eval_test_accuracy": 0.5056,
      "eval_test_loss": 0.33359646797180176,
      "eval_test_runtime": 4.9535,
      "eval_test_samples_per_second": 1009.382,
      "eval_test_steps_per_second": 12.718,
      "step": 1500
    },
    {
      "epoch": 0.24016,
      "grad_norm": 0.12850724160671234,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1501
    },
    {
      "epoch": 0.24032,
      "grad_norm": 0.13737551867961884,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1502
    },
    {
      "epoch": 0.24048,
      "grad_norm": 0.1419009417295456,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1503
    },
    {
      "epoch": 0.24064,
      "grad_norm": 0.12849393486976624,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 1504
    },
    {
      "epoch": 0.2408,
      "grad_norm": 0.12741804122924805,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1505
    },
    {
      "epoch": 0.24096,
      "grad_norm": 0.15723353624343872,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1506
    },
    {
      "epoch": 0.24112,
      "grad_norm": 0.18672816455364227,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1507
    },
    {
      "epoch": 0.24128,
      "grad_norm": 0.13195325434207916,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 1508
    },
    {
      "epoch": 0.24144,
      "grad_norm": 0.15749229490756989,
      "learning_rate": 0.0001,
      "loss": 0.3608,
      "step": 1509
    },
    {
      "epoch": 0.2416,
      "grad_norm": 0.16494733095169067,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1510
    },
    {
      "epoch": 0.24176,
      "grad_norm": 0.1604336053133011,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1511
    },
    {
      "epoch": 0.24192,
      "grad_norm": 0.14212125539779663,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 1512
    },
    {
      "epoch": 0.24208,
      "grad_norm": 0.13920843601226807,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 1513
    },
    {
      "epoch": 0.24224,
      "grad_norm": 0.15047994256019592,
      "learning_rate": 0.0001,
      "loss": 0.3467,
      "step": 1514
    },
    {
      "epoch": 0.2424,
      "grad_norm": 0.17487946152687073,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 1515
    },
    {
      "epoch": 0.24256,
      "grad_norm": 0.1464587152004242,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 1516
    },
    {
      "epoch": 0.24272,
      "grad_norm": 0.15216143429279327,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1517
    },
    {
      "epoch": 0.24288,
      "grad_norm": 0.16179156303405762,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 1518
    },
    {
      "epoch": 0.24304,
      "grad_norm": 0.14526161551475525,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1519
    },
    {
      "epoch": 0.2432,
      "grad_norm": 0.14425548911094666,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 1520
    },
    {
      "epoch": 0.24336,
      "grad_norm": 0.16468946635723114,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 1521
    },
    {
      "epoch": 0.24352,
      "grad_norm": 0.14569666981697083,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 1522
    },
    {
      "epoch": 0.24368,
      "grad_norm": 0.17661456763744354,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1523
    },
    {
      "epoch": 0.24384,
      "grad_norm": 0.173771932721138,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 1524
    },
    {
      "epoch": 0.244,
      "grad_norm": 0.16163212060928345,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 1525
    },
    {
      "epoch": 0.24416,
      "grad_norm": 0.11219384521245956,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 1526
    },
    {
      "epoch": 0.24432,
      "grad_norm": 0.2818809151649475,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1527
    },
    {
      "epoch": 0.24448,
      "grad_norm": 0.13863559067249298,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 1528
    },
    {
      "epoch": 0.24464,
      "grad_norm": 0.151545450091362,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 1529
    },
    {
      "epoch": 0.2448,
      "grad_norm": 0.18285097181797028,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 1530
    },
    {
      "epoch": 0.24496,
      "grad_norm": 0.1282290667295456,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 1531
    },
    {
      "epoch": 0.24512,
      "grad_norm": 0.12263070791959763,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1532
    },
    {
      "epoch": 0.24528,
      "grad_norm": 0.13204270601272583,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1533
    },
    {
      "epoch": 0.24544,
      "grad_norm": 0.13755083084106445,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 1534
    },
    {
      "epoch": 0.2456,
      "grad_norm": 0.1595465987920761,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 1535
    },
    {
      "epoch": 0.24576,
      "grad_norm": 0.13838352262973785,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1536
    },
    {
      "epoch": 0.24592,
      "grad_norm": 0.12968212366104126,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 1537
    },
    {
      "epoch": 0.24608,
      "grad_norm": 0.12604071199893951,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 1538
    },
    {
      "epoch": 0.24624,
      "grad_norm": 0.1393791139125824,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1539
    },
    {
      "epoch": 0.2464,
      "grad_norm": 0.15967915952205658,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1540
    },
    {
      "epoch": 0.24656,
      "grad_norm": 0.12633462250232697,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1541
    },
    {
      "epoch": 0.24672,
      "grad_norm": 0.14629223942756653,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 1542
    },
    {
      "epoch": 0.24688,
      "grad_norm": 0.15900540351867676,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 1543
    },
    {
      "epoch": 0.24704,
      "grad_norm": 0.139556422829628,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 1544
    },
    {
      "epoch": 0.2472,
      "grad_norm": 0.1396406590938568,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1545
    },
    {
      "epoch": 0.24736,
      "grad_norm": 0.14378607273101807,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1546
    },
    {
      "epoch": 0.24752,
      "grad_norm": 0.13580121099948883,
      "learning_rate": 0.0001,
      "loss": 0.3495,
      "step": 1547
    },
    {
      "epoch": 0.24768,
      "grad_norm": 0.16802969574928284,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 1548
    },
    {
      "epoch": 0.24784,
      "grad_norm": 0.20711401104927063,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1549
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.1391419619321823,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1550
    },
    {
      "epoch": 0.24816,
      "grad_norm": 0.1783551424741745,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 1551
    },
    {
      "epoch": 0.24832,
      "grad_norm": 0.1629391759634018,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1552
    },
    {
      "epoch": 0.24848,
      "grad_norm": 0.12343243509531021,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 1553
    },
    {
      "epoch": 0.24864,
      "grad_norm": 0.1531251072883606,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1554
    },
    {
      "epoch": 0.2488,
      "grad_norm": 0.1488095372915268,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 1555
    },
    {
      "epoch": 0.24896,
      "grad_norm": 0.12427526712417603,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 1556
    },
    {
      "epoch": 0.24912,
      "grad_norm": 0.13405747711658478,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 1557
    },
    {
      "epoch": 0.24928,
      "grad_norm": 0.1696983128786087,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1558
    },
    {
      "epoch": 0.24944,
      "grad_norm": 0.13289819657802582,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 1559
    },
    {
      "epoch": 0.2496,
      "grad_norm": 0.11836162954568863,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 1560
    },
    {
      "epoch": 0.24976,
      "grad_norm": 0.11537153273820877,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1561
    },
    {
      "epoch": 0.24992,
      "grad_norm": 0.1286933571100235,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1562
    },
    {
      "epoch": 0.25008,
      "grad_norm": 0.10510843247175217,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 1563
    },
    {
      "epoch": 0.25024,
      "grad_norm": 0.12653987109661102,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 1564
    },
    {
      "epoch": 0.2504,
      "grad_norm": 0.11507418006658554,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 1565
    },
    {
      "epoch": 0.25056,
      "grad_norm": 0.1401192992925644,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 1566
    },
    {
      "epoch": 0.25072,
      "grad_norm": 0.14328284561634064,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 1567
    },
    {
      "epoch": 0.25088,
      "grad_norm": 0.14421002566814423,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1568
    },
    {
      "epoch": 0.25104,
      "grad_norm": 0.1311556100845337,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 1569
    },
    {
      "epoch": 0.2512,
      "grad_norm": 0.13130012154579163,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 1570
    },
    {
      "epoch": 0.25136,
      "grad_norm": 0.17648087441921234,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1571
    },
    {
      "epoch": 0.25152,
      "grad_norm": 0.12588773667812347,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 1572
    },
    {
      "epoch": 0.25168,
      "grad_norm": 0.1586245894432068,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 1573
    },
    {
      "epoch": 0.25184,
      "grad_norm": 0.20076310634613037,
      "learning_rate": 0.0001,
      "loss": 0.342,
      "step": 1574
    },
    {
      "epoch": 0.252,
      "grad_norm": 0.12282285839319229,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 1575
    },
    {
      "epoch": 0.25216,
      "grad_norm": 0.1338127851486206,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1576
    },
    {
      "epoch": 0.25232,
      "grad_norm": 0.1304735243320465,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1577
    },
    {
      "epoch": 0.25248,
      "grad_norm": 0.12180054932832718,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1578
    },
    {
      "epoch": 0.25264,
      "grad_norm": 0.16321870684623718,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 1579
    },
    {
      "epoch": 0.2528,
      "grad_norm": 0.15485349297523499,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 1580
    },
    {
      "epoch": 0.25296,
      "grad_norm": 0.19209538400173187,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 1581
    },
    {
      "epoch": 0.25312,
      "grad_norm": 0.13642053306102753,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1582
    },
    {
      "epoch": 0.25328,
      "grad_norm": 0.13175687193870544,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1583
    },
    {
      "epoch": 0.25344,
      "grad_norm": 0.1592743992805481,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1584
    },
    {
      "epoch": 0.2536,
      "grad_norm": 0.15440216660499573,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 1585
    },
    {
      "epoch": 0.25376,
      "grad_norm": 0.1449015885591507,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1586
    },
    {
      "epoch": 0.25392,
      "grad_norm": 0.17312011122703552,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1587
    },
    {
      "epoch": 0.25408,
      "grad_norm": 0.1344112753868103,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 1588
    },
    {
      "epoch": 0.25424,
      "grad_norm": 0.1313263326883316,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 1589
    },
    {
      "epoch": 0.2544,
      "grad_norm": 0.13081757724285126,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 1590
    },
    {
      "epoch": 0.25456,
      "grad_norm": 0.145720437169075,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 1591
    },
    {
      "epoch": 0.25472,
      "grad_norm": 0.14336428046226501,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 1592
    },
    {
      "epoch": 0.25488,
      "grad_norm": 0.14353302121162415,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 1593
    },
    {
      "epoch": 0.25504,
      "grad_norm": 0.12606985867023468,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1594
    },
    {
      "epoch": 0.2552,
      "grad_norm": 0.15213650465011597,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 1595
    },
    {
      "epoch": 0.25536,
      "grad_norm": 0.17193835973739624,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 1596
    },
    {
      "epoch": 0.25552,
      "grad_norm": 0.1590505689382553,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 1597
    },
    {
      "epoch": 0.25568,
      "grad_norm": 0.18743005394935608,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1598
    },
    {
      "epoch": 0.25584,
      "grad_norm": 0.16778670251369476,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1599
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.15312209725379944,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1600
    },
    {
      "epoch": 0.256,
      "eval_train_accuracy": 0.5044,
      "eval_train_loss": 0.3329640030860901,
      "eval_train_runtime": 4.0828,
      "eval_train_samples_per_second": 1224.655,
      "eval_train_steps_per_second": 15.431,
      "step": 1600
    },
    {
      "epoch": 0.256,
      "eval_test_accuracy": 0.5112,
      "eval_test_loss": 0.3316289484500885,
      "eval_test_runtime": 4.9116,
      "eval_test_samples_per_second": 1018.007,
      "eval_test_steps_per_second": 12.827,
      "step": 1600
    },
    {
      "epoch": 0.25616,
      "grad_norm": 0.15294739603996277,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1601
    },
    {
      "epoch": 0.25632,
      "grad_norm": 0.19250985980033875,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1602
    },
    {
      "epoch": 0.25648,
      "grad_norm": 0.19973570108413696,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1603
    },
    {
      "epoch": 0.25664,
      "grad_norm": 0.1611214131116867,
      "learning_rate": 0.0001,
      "loss": 0.3547,
      "step": 1604
    },
    {
      "epoch": 0.2568,
      "grad_norm": 0.1339108645915985,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1605
    },
    {
      "epoch": 0.25696,
      "grad_norm": 0.21721184253692627,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1606
    },
    {
      "epoch": 0.25712,
      "grad_norm": 0.12216044962406158,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 1607
    },
    {
      "epoch": 0.25728,
      "grad_norm": 0.13624757528305054,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1608
    },
    {
      "epoch": 0.25744,
      "grad_norm": 0.14358249306678772,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 1609
    },
    {
      "epoch": 0.2576,
      "grad_norm": 0.15570688247680664,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1610
    },
    {
      "epoch": 0.25776,
      "grad_norm": 0.15557412803173065,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 1611
    },
    {
      "epoch": 0.25792,
      "grad_norm": 0.13777777552604675,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1612
    },
    {
      "epoch": 0.25808,
      "grad_norm": 0.15256941318511963,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1613
    },
    {
      "epoch": 0.25824,
      "grad_norm": 0.20240876078605652,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 1614
    },
    {
      "epoch": 0.2584,
      "grad_norm": 0.16944941878318787,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 1615
    },
    {
      "epoch": 0.25856,
      "grad_norm": 0.1615276038646698,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1616
    },
    {
      "epoch": 0.25872,
      "grad_norm": 0.11783741414546967,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 1617
    },
    {
      "epoch": 0.25888,
      "grad_norm": 0.2259695678949356,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1618
    },
    {
      "epoch": 0.25904,
      "grad_norm": 0.15198881924152374,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 1619
    },
    {
      "epoch": 0.2592,
      "grad_norm": 0.127162903547287,
      "learning_rate": 0.0001,
      "loss": 0.3493,
      "step": 1620
    },
    {
      "epoch": 0.25936,
      "grad_norm": 0.15217973291873932,
      "learning_rate": 0.0001,
      "loss": 0.3566,
      "step": 1621
    },
    {
      "epoch": 0.25952,
      "grad_norm": 0.2230812907218933,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 1622
    },
    {
      "epoch": 0.25968,
      "grad_norm": 0.12701845169067383,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 1623
    },
    {
      "epoch": 0.25984,
      "grad_norm": 0.13071227073669434,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1624
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.15291662514209747,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 1625
    },
    {
      "epoch": 0.26016,
      "grad_norm": 0.2311577945947647,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1626
    },
    {
      "epoch": 0.26032,
      "grad_norm": 0.15612199902534485,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 1627
    },
    {
      "epoch": 0.26048,
      "grad_norm": 0.16001392900943756,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 1628
    },
    {
      "epoch": 0.26064,
      "grad_norm": 0.15793967247009277,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1629
    },
    {
      "epoch": 0.2608,
      "grad_norm": 0.14730526506900787,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 1630
    },
    {
      "epoch": 0.26096,
      "grad_norm": 0.12970595061779022,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1631
    },
    {
      "epoch": 0.26112,
      "grad_norm": 0.20856371521949768,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1632
    },
    {
      "epoch": 0.26128,
      "grad_norm": 0.16191846132278442,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1633
    },
    {
      "epoch": 0.26144,
      "grad_norm": 0.1445322334766388,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 1634
    },
    {
      "epoch": 0.2616,
      "grad_norm": 0.12670622766017914,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1635
    },
    {
      "epoch": 0.26176,
      "grad_norm": 0.2171773463487625,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1636
    },
    {
      "epoch": 0.26192,
      "grad_norm": 0.1422589123249054,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1637
    },
    {
      "epoch": 0.26208,
      "grad_norm": 0.13646461069583893,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 1638
    },
    {
      "epoch": 0.26224,
      "grad_norm": 0.16333648562431335,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1639
    },
    {
      "epoch": 0.2624,
      "grad_norm": 0.18352678418159485,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1640
    },
    {
      "epoch": 0.26256,
      "grad_norm": 0.1394960582256317,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1641
    },
    {
      "epoch": 0.26272,
      "grad_norm": 0.31696072220802307,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1642
    },
    {
      "epoch": 0.26288,
      "grad_norm": 0.1404174417257309,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1643
    },
    {
      "epoch": 0.26304,
      "grad_norm": 0.14404107630252838,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 1644
    },
    {
      "epoch": 0.2632,
      "grad_norm": 0.13678884506225586,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 1645
    },
    {
      "epoch": 0.26336,
      "grad_norm": 0.2454146295785904,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1646
    },
    {
      "epoch": 0.26352,
      "grad_norm": 0.14640522003173828,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 1647
    },
    {
      "epoch": 0.26368,
      "grad_norm": 0.13897556066513062,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 1648
    },
    {
      "epoch": 0.26384,
      "grad_norm": 0.1212417408823967,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 1649
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.10768042504787445,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 1650
    },
    {
      "epoch": 0.26416,
      "grad_norm": 0.18260557949543,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 1651
    },
    {
      "epoch": 0.26432,
      "grad_norm": 0.13034352660179138,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 1652
    },
    {
      "epoch": 0.26448,
      "grad_norm": 0.12553362548351288,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1653
    },
    {
      "epoch": 0.26464,
      "grad_norm": 0.13473305106163025,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 1654
    },
    {
      "epoch": 0.2648,
      "grad_norm": 0.15782028436660767,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1655
    },
    {
      "epoch": 0.26496,
      "grad_norm": 0.12476836144924164,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 1656
    },
    {
      "epoch": 0.26512,
      "grad_norm": 0.15129166841506958,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 1657
    },
    {
      "epoch": 0.26528,
      "grad_norm": 0.12508685886859894,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 1658
    },
    {
      "epoch": 0.26544,
      "grad_norm": 0.1332271844148636,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1659
    },
    {
      "epoch": 0.2656,
      "grad_norm": 0.13482367992401123,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 1660
    },
    {
      "epoch": 0.26576,
      "grad_norm": 0.11052833497524261,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1661
    },
    {
      "epoch": 0.26592,
      "grad_norm": 0.14919382333755493,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 1662
    },
    {
      "epoch": 0.26608,
      "grad_norm": 0.12520809471607208,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 1663
    },
    {
      "epoch": 0.26624,
      "grad_norm": 0.13643209636211395,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1664
    },
    {
      "epoch": 0.2664,
      "grad_norm": 0.13039320707321167,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 1665
    },
    {
      "epoch": 0.26656,
      "grad_norm": 0.13202087581157684,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 1666
    },
    {
      "epoch": 0.26672,
      "grad_norm": 0.13739809393882751,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 1667
    },
    {
      "epoch": 0.26688,
      "grad_norm": 0.13921327888965607,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 1668
    },
    {
      "epoch": 0.26704,
      "grad_norm": 0.15879932045936584,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1669
    },
    {
      "epoch": 0.2672,
      "grad_norm": 0.16976992785930634,
      "learning_rate": 0.0001,
      "loss": 0.3391,
      "step": 1670
    },
    {
      "epoch": 0.26736,
      "grad_norm": 0.1195719912648201,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1671
    },
    {
      "epoch": 0.26752,
      "grad_norm": 0.143020898103714,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1672
    },
    {
      "epoch": 0.26768,
      "grad_norm": 0.2296193242073059,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 1673
    },
    {
      "epoch": 0.26784,
      "grad_norm": 0.15031221508979797,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 1674
    },
    {
      "epoch": 0.268,
      "grad_norm": 0.11844509094953537,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 1675
    },
    {
      "epoch": 0.26816,
      "grad_norm": 0.13259214162826538,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 1676
    },
    {
      "epoch": 0.26832,
      "grad_norm": 0.1325296312570572,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 1677
    },
    {
      "epoch": 0.26848,
      "grad_norm": 0.3458048105239868,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1678
    },
    {
      "epoch": 0.26864,
      "grad_norm": 0.11936371773481369,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 1679
    },
    {
      "epoch": 0.2688,
      "grad_norm": 0.14199945330619812,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 1680
    },
    {
      "epoch": 0.26896,
      "grad_norm": 0.3138717710971832,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1681
    },
    {
      "epoch": 0.26912,
      "grad_norm": 0.14417056739330292,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 1682
    },
    {
      "epoch": 0.26928,
      "grad_norm": 0.13341021537780762,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1683
    },
    {
      "epoch": 0.26944,
      "grad_norm": 0.1530865728855133,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 1684
    },
    {
      "epoch": 0.2696,
      "grad_norm": 0.33909717202186584,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1685
    },
    {
      "epoch": 0.26976,
      "grad_norm": 0.13070742785930634,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 1686
    },
    {
      "epoch": 0.26992,
      "grad_norm": 0.12596917152404785,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 1687
    },
    {
      "epoch": 0.27008,
      "grad_norm": 0.12063975632190704,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 1688
    },
    {
      "epoch": 0.27024,
      "grad_norm": 0.16277359426021576,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1689
    },
    {
      "epoch": 0.2704,
      "grad_norm": 0.1698295772075653,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 1690
    },
    {
      "epoch": 0.27056,
      "grad_norm": 0.18888990581035614,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 1691
    },
    {
      "epoch": 0.27072,
      "grad_norm": 0.15197400748729706,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 1692
    },
    {
      "epoch": 0.27088,
      "grad_norm": 0.19159546494483948,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1693
    },
    {
      "epoch": 0.27104,
      "grad_norm": 0.13705188035964966,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1694
    },
    {
      "epoch": 0.2712,
      "grad_norm": 0.16883526742458344,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1695
    },
    {
      "epoch": 0.27136,
      "grad_norm": 0.1504001021385193,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 1696
    },
    {
      "epoch": 0.27152,
      "grad_norm": 0.13422740995883942,
      "learning_rate": 0.0001,
      "loss": 0.3471,
      "step": 1697
    },
    {
      "epoch": 0.27168,
      "grad_norm": 0.19484736025333405,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 1698
    },
    {
      "epoch": 0.27184,
      "grad_norm": 0.1762242615222931,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 1699
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.17503762245178223,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 1700
    },
    {
      "epoch": 0.272,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.33246421813964844,
      "eval_train_runtime": 4.0681,
      "eval_train_samples_per_second": 1229.063,
      "eval_train_steps_per_second": 15.486,
      "step": 1700
    },
    {
      "epoch": 0.272,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.33124667406082153,
      "eval_test_runtime": 5.1323,
      "eval_test_samples_per_second": 974.213,
      "eval_test_steps_per_second": 12.275,
      "step": 1700
    },
    {
      "epoch": 0.27216,
      "grad_norm": 0.14107061922550201,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1701
    },
    {
      "epoch": 0.27232,
      "grad_norm": 0.14517058432102203,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1702
    },
    {
      "epoch": 0.27248,
      "grad_norm": 0.15949364006519318,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 1703
    },
    {
      "epoch": 0.27264,
      "grad_norm": 0.1334659308195114,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1704
    },
    {
      "epoch": 0.2728,
      "grad_norm": 0.1394292265176773,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 1705
    },
    {
      "epoch": 0.27296,
      "grad_norm": 0.14640942215919495,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 1706
    },
    {
      "epoch": 0.27312,
      "grad_norm": 0.1442381739616394,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 1707
    },
    {
      "epoch": 0.27328,
      "grad_norm": 0.12945912778377533,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 1708
    },
    {
      "epoch": 0.27344,
      "grad_norm": 0.11994487792253494,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 1709
    },
    {
      "epoch": 0.2736,
      "grad_norm": 0.13175803422927856,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 1710
    },
    {
      "epoch": 0.27376,
      "grad_norm": 0.14643917977809906,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 1711
    },
    {
      "epoch": 0.27392,
      "grad_norm": 0.14251956343650818,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 1712
    },
    {
      "epoch": 0.27408,
      "grad_norm": 0.12215055525302887,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1713
    },
    {
      "epoch": 0.27424,
      "grad_norm": 0.1556282490491867,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 1714
    },
    {
      "epoch": 0.2744,
      "grad_norm": 0.14131592214107513,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 1715
    },
    {
      "epoch": 0.27456,
      "grad_norm": 0.13581794500350952,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1716
    },
    {
      "epoch": 0.27472,
      "grad_norm": 0.11511216312646866,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1717
    },
    {
      "epoch": 0.27488,
      "grad_norm": 0.13562850654125214,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1718
    },
    {
      "epoch": 0.27504,
      "grad_norm": 0.13115841150283813,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1719
    },
    {
      "epoch": 0.2752,
      "grad_norm": 0.16072778403759003,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1720
    },
    {
      "epoch": 0.27536,
      "grad_norm": 0.15351812541484833,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1721
    },
    {
      "epoch": 0.27552,
      "grad_norm": 0.11200976371765137,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1722
    },
    {
      "epoch": 0.27568,
      "grad_norm": 0.1214316114783287,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 1723
    },
    {
      "epoch": 0.27584,
      "grad_norm": 0.12992799282073975,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 1724
    },
    {
      "epoch": 0.276,
      "grad_norm": 0.11852333694696426,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 1725
    },
    {
      "epoch": 0.27616,
      "grad_norm": 0.1364901214838028,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 1726
    },
    {
      "epoch": 0.27632,
      "grad_norm": 0.14532843232154846,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1727
    },
    {
      "epoch": 0.27648,
      "grad_norm": 0.1428176313638687,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 1728
    },
    {
      "epoch": 0.27664,
      "grad_norm": 0.11626321077346802,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 1729
    },
    {
      "epoch": 0.2768,
      "grad_norm": 0.17678214609622955,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 1730
    },
    {
      "epoch": 0.27696,
      "grad_norm": 0.13414457440376282,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 1731
    },
    {
      "epoch": 0.27712,
      "grad_norm": 0.1303841471672058,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 1732
    },
    {
      "epoch": 0.27728,
      "grad_norm": 0.11044907569885254,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 1733
    },
    {
      "epoch": 0.27744,
      "grad_norm": 0.1353110671043396,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 1734
    },
    {
      "epoch": 0.2776,
      "grad_norm": 0.1312415897846222,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 1735
    },
    {
      "epoch": 0.27776,
      "grad_norm": 0.1533094197511673,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 1736
    },
    {
      "epoch": 0.27792,
      "grad_norm": 0.2230333536863327,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 1737
    },
    {
      "epoch": 0.27808,
      "grad_norm": 0.1374628245830536,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1738
    },
    {
      "epoch": 0.27824,
      "grad_norm": 0.13382825255393982,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1739
    },
    {
      "epoch": 0.2784,
      "grad_norm": 0.13095763325691223,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1740
    },
    {
      "epoch": 0.27856,
      "grad_norm": 0.15525610744953156,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1741
    },
    {
      "epoch": 0.27872,
      "grad_norm": 0.13650690019130707,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1742
    },
    {
      "epoch": 0.27888,
      "grad_norm": 0.1335524320602417,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1743
    },
    {
      "epoch": 0.27904,
      "grad_norm": 0.1628049910068512,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1744
    },
    {
      "epoch": 0.2792,
      "grad_norm": 0.1208496242761612,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 1745
    },
    {
      "epoch": 0.27936,
      "grad_norm": 0.12776729464530945,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 1746
    },
    {
      "epoch": 0.27952,
      "grad_norm": 0.13029180467128754,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 1747
    },
    {
      "epoch": 0.27968,
      "grad_norm": 0.12632565200328827,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1748
    },
    {
      "epoch": 0.27984,
      "grad_norm": 0.10917958617210388,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 1749
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.10708733648061752,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 1750
    },
    {
      "epoch": 0.28016,
      "grad_norm": 0.11754243820905685,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 1751
    },
    {
      "epoch": 0.28032,
      "grad_norm": 0.1639440357685089,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 1752
    },
    {
      "epoch": 0.28048,
      "grad_norm": 0.13033545017242432,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1753
    },
    {
      "epoch": 0.28064,
      "grad_norm": 0.1184544563293457,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1754
    },
    {
      "epoch": 0.2808,
      "grad_norm": 0.10581006109714508,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 1755
    },
    {
      "epoch": 0.28096,
      "grad_norm": 0.36051666736602783,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1756
    },
    {
      "epoch": 0.28112,
      "grad_norm": 0.11891856044530869,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1757
    },
    {
      "epoch": 0.28128,
      "grad_norm": 0.14425957202911377,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1758
    },
    {
      "epoch": 0.28144,
      "grad_norm": 0.1552278995513916,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1759
    },
    {
      "epoch": 0.2816,
      "grad_norm": 0.1498657912015915,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 1760
    },
    {
      "epoch": 0.28176,
      "grad_norm": 0.11056364327669144,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 1761
    },
    {
      "epoch": 0.28192,
      "grad_norm": 0.22391420602798462,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 1762
    },
    {
      "epoch": 0.28208,
      "grad_norm": 0.11726651340723038,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 1763
    },
    {
      "epoch": 0.28224,
      "grad_norm": 0.20673121511936188,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 1764
    },
    {
      "epoch": 0.2824,
      "grad_norm": 0.14130449295043945,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 1765
    },
    {
      "epoch": 0.28256,
      "grad_norm": 0.13341312110424042,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1766
    },
    {
      "epoch": 0.28272,
      "grad_norm": 0.12614071369171143,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1767
    },
    {
      "epoch": 0.28288,
      "grad_norm": 0.17486469447612762,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1768
    },
    {
      "epoch": 0.28304,
      "grad_norm": 0.12579244375228882,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 1769
    },
    {
      "epoch": 0.2832,
      "grad_norm": 0.14620615541934967,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 1770
    },
    {
      "epoch": 0.28336,
      "grad_norm": 0.12397529184818268,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1771
    },
    {
      "epoch": 0.28352,
      "grad_norm": 0.16745731234550476,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1772
    },
    {
      "epoch": 0.28368,
      "grad_norm": 0.17318663001060486,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 1773
    },
    {
      "epoch": 0.28384,
      "grad_norm": 0.12767480313777924,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1774
    },
    {
      "epoch": 0.284,
      "grad_norm": 0.10144881159067154,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 1775
    },
    {
      "epoch": 0.28416,
      "grad_norm": 0.14343635737895966,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1776
    },
    {
      "epoch": 0.28432,
      "grad_norm": 0.20199500024318695,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 1777
    },
    {
      "epoch": 0.28448,
      "grad_norm": 0.11032939702272415,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 1778
    },
    {
      "epoch": 0.28464,
      "grad_norm": 0.1338237226009369,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 1779
    },
    {
      "epoch": 0.2848,
      "grad_norm": 0.14266330003738403,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 1780
    },
    {
      "epoch": 0.28496,
      "grad_norm": 0.12725374102592468,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 1781
    },
    {
      "epoch": 0.28512,
      "grad_norm": 0.12905699014663696,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 1782
    },
    {
      "epoch": 0.28528,
      "grad_norm": 0.11825286597013474,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1783
    },
    {
      "epoch": 0.28544,
      "grad_norm": 0.17313426733016968,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 1784
    },
    {
      "epoch": 0.2856,
      "grad_norm": 0.11952744424343109,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 1785
    },
    {
      "epoch": 0.28576,
      "grad_norm": 0.1158837303519249,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 1786
    },
    {
      "epoch": 0.28592,
      "grad_norm": 0.14659877121448517,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1787
    },
    {
      "epoch": 0.28608,
      "grad_norm": 0.11116335541009903,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 1788
    },
    {
      "epoch": 0.28624,
      "grad_norm": 0.15399153530597687,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 1789
    },
    {
      "epoch": 0.2864,
      "grad_norm": 0.13115841150283813,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 1790
    },
    {
      "epoch": 0.28656,
      "grad_norm": 0.21682633459568024,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1791
    },
    {
      "epoch": 0.28672,
      "grad_norm": 0.13827407360076904,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 1792
    },
    {
      "epoch": 0.28688,
      "grad_norm": 0.13324102759361267,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 1793
    },
    {
      "epoch": 0.28704,
      "grad_norm": 0.16347074508666992,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 1794
    },
    {
      "epoch": 0.2872,
      "grad_norm": 0.19280479848384857,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1795
    },
    {
      "epoch": 0.28736,
      "grad_norm": 0.40872663259506226,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1796
    },
    {
      "epoch": 0.28752,
      "grad_norm": 0.1373347043991089,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 1797
    },
    {
      "epoch": 0.28768,
      "grad_norm": 0.15313135087490082,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1798
    },
    {
      "epoch": 0.28784,
      "grad_norm": 0.46419647336006165,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 1799
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.14431937038898468,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 1800
    },
    {
      "epoch": 0.288,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3319077491760254,
      "eval_train_runtime": 4.0861,
      "eval_train_samples_per_second": 1223.66,
      "eval_train_steps_per_second": 15.418,
      "step": 1800
    },
    {
      "epoch": 0.288,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.330427348613739,
      "eval_test_runtime": 4.8358,
      "eval_test_samples_per_second": 1033.949,
      "eval_test_steps_per_second": 13.028,
      "step": 1800
    },
    {
      "epoch": 0.28816,
      "grad_norm": 0.14199914038181305,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 1801
    },
    {
      "epoch": 0.28832,
      "grad_norm": 0.11713866144418716,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1802
    },
    {
      "epoch": 0.28848,
      "grad_norm": 0.1906946897506714,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1803
    },
    {
      "epoch": 0.28864,
      "grad_norm": 0.2656855285167694,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 1804
    },
    {
      "epoch": 0.2888,
      "grad_norm": 0.33114495873451233,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1805
    },
    {
      "epoch": 0.28896,
      "grad_norm": 0.2590616047382355,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1806
    },
    {
      "epoch": 0.28912,
      "grad_norm": 0.39201620221138,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1807
    },
    {
      "epoch": 0.28928,
      "grad_norm": 0.14545227587223053,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 1808
    },
    {
      "epoch": 0.28944,
      "grad_norm": 0.16497720777988434,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 1809
    },
    {
      "epoch": 0.2896,
      "grad_norm": 0.20349302887916565,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1810
    },
    {
      "epoch": 0.28976,
      "grad_norm": 0.23020654916763306,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 1811
    },
    {
      "epoch": 0.28992,
      "grad_norm": 0.16424544155597687,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1812
    },
    {
      "epoch": 0.29008,
      "grad_norm": 0.14098508656024933,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 1813
    },
    {
      "epoch": 0.29024,
      "grad_norm": 0.16388574242591858,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 1814
    },
    {
      "epoch": 0.2904,
      "grad_norm": 0.17375847697257996,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 1815
    },
    {
      "epoch": 0.29056,
      "grad_norm": 0.16787773370742798,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1816
    },
    {
      "epoch": 0.29072,
      "grad_norm": 0.290306031703949,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 1817
    },
    {
      "epoch": 0.29088,
      "grad_norm": 0.16867412626743317,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1818
    },
    {
      "epoch": 0.29104,
      "grad_norm": 0.1903836578130722,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1819
    },
    {
      "epoch": 0.2912,
      "grad_norm": 0.26914486289024353,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1820
    },
    {
      "epoch": 0.29136,
      "grad_norm": 0.15523479878902435,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1821
    },
    {
      "epoch": 0.29152,
      "grad_norm": 0.13378052413463593,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 1822
    },
    {
      "epoch": 0.29168,
      "grad_norm": 0.20816493034362793,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1823
    },
    {
      "epoch": 0.29184,
      "grad_norm": 0.2801794707775116,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1824
    },
    {
      "epoch": 0.292,
      "grad_norm": 0.16462235152721405,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 1825
    },
    {
      "epoch": 0.29216,
      "grad_norm": 0.16092467308044434,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1826
    },
    {
      "epoch": 0.29232,
      "grad_norm": 0.4404745101928711,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1827
    },
    {
      "epoch": 0.29248,
      "grad_norm": 0.13576196134090424,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1828
    },
    {
      "epoch": 0.29264,
      "grad_norm": 0.3824101388454437,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 1829
    },
    {
      "epoch": 0.2928,
      "grad_norm": 0.14557816088199615,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 1830
    },
    {
      "epoch": 0.29296,
      "grad_norm": 0.10909213125705719,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 1831
    },
    {
      "epoch": 0.29312,
      "grad_norm": 0.1360863596200943,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1832
    },
    {
      "epoch": 0.29328,
      "grad_norm": 0.1684861183166504,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1833
    },
    {
      "epoch": 0.29344,
      "grad_norm": 0.32090967893600464,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 1834
    },
    {
      "epoch": 0.2936,
      "grad_norm": 0.1751152127981186,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1835
    },
    {
      "epoch": 0.29376,
      "grad_norm": 0.19222694635391235,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1836
    },
    {
      "epoch": 0.29392,
      "grad_norm": 0.1567467451095581,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1837
    },
    {
      "epoch": 0.29408,
      "grad_norm": 0.11942801624536514,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 1838
    },
    {
      "epoch": 0.29424,
      "grad_norm": 0.14140507578849792,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 1839
    },
    {
      "epoch": 0.2944,
      "grad_norm": 0.14950881898403168,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 1840
    },
    {
      "epoch": 0.29456,
      "grad_norm": 0.19369731843471527,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1841
    },
    {
      "epoch": 0.29472,
      "grad_norm": 0.12498847395181656,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1842
    },
    {
      "epoch": 0.29488,
      "grad_norm": 0.1592235416173935,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1843
    },
    {
      "epoch": 0.29504,
      "grad_norm": 0.1319035142660141,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 1844
    },
    {
      "epoch": 0.2952,
      "grad_norm": 0.1291034072637558,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 1845
    },
    {
      "epoch": 0.29536,
      "grad_norm": 0.13176776468753815,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1846
    },
    {
      "epoch": 0.29552,
      "grad_norm": 0.14946623146533966,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1847
    },
    {
      "epoch": 0.29568,
      "grad_norm": 0.1340320110321045,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 1848
    },
    {
      "epoch": 0.29584,
      "grad_norm": 0.13566352427005768,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1849
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.11071248352527618,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 1850
    },
    {
      "epoch": 0.29616,
      "grad_norm": 0.1246943473815918,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1851
    },
    {
      "epoch": 0.29632,
      "grad_norm": 0.12752169370651245,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 1852
    },
    {
      "epoch": 0.29648,
      "grad_norm": 0.20243419706821442,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1853
    },
    {
      "epoch": 0.29664,
      "grad_norm": 0.13649633526802063,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1854
    },
    {
      "epoch": 0.2968,
      "grad_norm": 0.1521904319524765,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1855
    },
    {
      "epoch": 0.29696,
      "grad_norm": 0.12374257296323776,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 1856
    },
    {
      "epoch": 0.29712,
      "grad_norm": 0.1521238535642624,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1857
    },
    {
      "epoch": 0.29728,
      "grad_norm": 0.13189342617988586,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 1858
    },
    {
      "epoch": 0.29744,
      "grad_norm": 0.1889936774969101,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 1859
    },
    {
      "epoch": 0.2976,
      "grad_norm": 0.142700657248497,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1860
    },
    {
      "epoch": 0.29776,
      "grad_norm": 0.18761423230171204,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 1861
    },
    {
      "epoch": 0.29792,
      "grad_norm": 0.12949518859386444,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1862
    },
    {
      "epoch": 0.29808,
      "grad_norm": 0.11541210114955902,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1863
    },
    {
      "epoch": 0.29824,
      "grad_norm": 0.1290399730205536,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 1864
    },
    {
      "epoch": 0.2984,
      "grad_norm": 0.16133686900138855,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1865
    },
    {
      "epoch": 0.29856,
      "grad_norm": 0.1588577926158905,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 1866
    },
    {
      "epoch": 0.29872,
      "grad_norm": 0.1330256313085556,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 1867
    },
    {
      "epoch": 0.29888,
      "grad_norm": 0.1137779951095581,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 1868
    },
    {
      "epoch": 0.29904,
      "grad_norm": 0.14660833775997162,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 1869
    },
    {
      "epoch": 0.2992,
      "grad_norm": 0.13061794638633728,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1870
    },
    {
      "epoch": 0.29936,
      "grad_norm": 0.11607308685779572,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 1871
    },
    {
      "epoch": 0.29952,
      "grad_norm": 0.1211123839020729,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 1872
    },
    {
      "epoch": 0.29968,
      "grad_norm": 0.14580188691616058,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 1873
    },
    {
      "epoch": 0.29984,
      "grad_norm": 0.1481468677520752,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1874
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.14446064829826355,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1875
    },
    {
      "epoch": 0.30016,
      "grad_norm": 0.14115452766418457,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1876
    },
    {
      "epoch": 0.30032,
      "grad_norm": 0.1250963807106018,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1877
    },
    {
      "epoch": 0.30048,
      "grad_norm": 0.15402382612228394,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 1878
    },
    {
      "epoch": 0.30064,
      "grad_norm": 0.20666079223155975,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1879
    },
    {
      "epoch": 0.3008,
      "grad_norm": 0.11706762760877609,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 1880
    },
    {
      "epoch": 0.30096,
      "grad_norm": 0.14532627165317535,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1881
    },
    {
      "epoch": 0.30112,
      "grad_norm": 0.13876532018184662,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1882
    },
    {
      "epoch": 0.30128,
      "grad_norm": 0.1837383359670639,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 1883
    },
    {
      "epoch": 0.30144,
      "grad_norm": 0.1332518607378006,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 1884
    },
    {
      "epoch": 0.3016,
      "grad_norm": 0.1346457302570343,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 1885
    },
    {
      "epoch": 0.30176,
      "grad_norm": 0.1523086279630661,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1886
    },
    {
      "epoch": 0.30192,
      "grad_norm": 0.1316625028848648,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1887
    },
    {
      "epoch": 0.30208,
      "grad_norm": 0.15706220269203186,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 1888
    },
    {
      "epoch": 0.30224,
      "grad_norm": 0.13991090655326843,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1889
    },
    {
      "epoch": 0.3024,
      "grad_norm": 0.12934213876724243,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 1890
    },
    {
      "epoch": 0.30256,
      "grad_norm": 0.14450781047344208,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 1891
    },
    {
      "epoch": 0.30272,
      "grad_norm": 0.12436500936746597,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 1892
    },
    {
      "epoch": 0.30288,
      "grad_norm": 0.1430468112230301,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1893
    },
    {
      "epoch": 0.30304,
      "grad_norm": 0.1393788456916809,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1894
    },
    {
      "epoch": 0.3032,
      "grad_norm": 0.1261591613292694,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1895
    },
    {
      "epoch": 0.30336,
      "grad_norm": 0.15624158084392548,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 1896
    },
    {
      "epoch": 0.30352,
      "grad_norm": 0.1424061805009842,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1897
    },
    {
      "epoch": 0.30368,
      "grad_norm": 0.1077457144856453,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 1898
    },
    {
      "epoch": 0.30384,
      "grad_norm": 0.13662664592266083,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 1899
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.1255357563495636,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 1900
    },
    {
      "epoch": 0.304,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.32978585362434387,
      "eval_train_runtime": 4.1187,
      "eval_train_samples_per_second": 1213.99,
      "eval_train_steps_per_second": 15.296,
      "step": 1900
    },
    {
      "epoch": 0.304,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3284830152988434,
      "eval_test_runtime": 4.7198,
      "eval_test_samples_per_second": 1059.357,
      "eval_test_steps_per_second": 13.348,
      "step": 1900
    },
    {
      "epoch": 0.30416,
      "grad_norm": 0.11284004151821136,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 1901
    },
    {
      "epoch": 0.30432,
      "grad_norm": 0.10536293685436249,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 1902
    },
    {
      "epoch": 0.30448,
      "grad_norm": 0.11821732670068741,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 1903
    },
    {
      "epoch": 0.30464,
      "grad_norm": 0.17236436903476715,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 1904
    },
    {
      "epoch": 0.3048,
      "grad_norm": 0.12527945637702942,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 1905
    },
    {
      "epoch": 0.30496,
      "grad_norm": 0.1422542929649353,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 1906
    },
    {
      "epoch": 0.30512,
      "grad_norm": 0.17884200811386108,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 1907
    },
    {
      "epoch": 0.30528,
      "grad_norm": 0.1427670568227768,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1908
    },
    {
      "epoch": 0.30544,
      "grad_norm": 0.13766847550868988,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1909
    },
    {
      "epoch": 0.3056,
      "grad_norm": 0.1166931763291359,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1910
    },
    {
      "epoch": 0.30576,
      "grad_norm": 0.1351388543844223,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 1911
    },
    {
      "epoch": 0.30592,
      "grad_norm": 0.12974701821804047,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1912
    },
    {
      "epoch": 0.30608,
      "grad_norm": 0.1580948680639267,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1913
    },
    {
      "epoch": 0.30624,
      "grad_norm": 0.11486274003982544,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 1914
    },
    {
      "epoch": 0.3064,
      "grad_norm": 0.17713066935539246,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1915
    },
    {
      "epoch": 0.30656,
      "grad_norm": 0.16121554374694824,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 1916
    },
    {
      "epoch": 0.30672,
      "grad_norm": 0.16143865883350372,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 1917
    },
    {
      "epoch": 0.30688,
      "grad_norm": 0.1695016771554947,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 1918
    },
    {
      "epoch": 0.30704,
      "grad_norm": 0.12347032129764557,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 1919
    },
    {
      "epoch": 0.3072,
      "grad_norm": 0.11742353439331055,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1920
    },
    {
      "epoch": 0.30736,
      "grad_norm": 0.15129579603672028,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 1921
    },
    {
      "epoch": 0.30752,
      "grad_norm": 0.2110418826341629,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 1922
    },
    {
      "epoch": 0.30768,
      "grad_norm": 0.1352623701095581,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1923
    },
    {
      "epoch": 0.30784,
      "grad_norm": 0.11494214832782745,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 1924
    },
    {
      "epoch": 0.308,
      "grad_norm": 0.11690303683280945,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 1925
    },
    {
      "epoch": 0.30816,
      "grad_norm": 0.17222683131694794,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 1926
    },
    {
      "epoch": 0.30832,
      "grad_norm": 0.13145537674427032,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 1927
    },
    {
      "epoch": 0.30848,
      "grad_norm": 0.1394992619752884,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1928
    },
    {
      "epoch": 0.30864,
      "grad_norm": 0.10785429179668427,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 1929
    },
    {
      "epoch": 0.3088,
      "grad_norm": 0.11816564202308655,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 1930
    },
    {
      "epoch": 0.30896,
      "grad_norm": 0.12013498693704605,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1931
    },
    {
      "epoch": 0.30912,
      "grad_norm": 0.1859370768070221,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1932
    },
    {
      "epoch": 0.30928,
      "grad_norm": 0.1315557360649109,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 1933
    },
    {
      "epoch": 0.30944,
      "grad_norm": 0.12728029489517212,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 1934
    },
    {
      "epoch": 0.3096,
      "grad_norm": 0.1340080201625824,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1935
    },
    {
      "epoch": 0.30976,
      "grad_norm": 0.1618146300315857,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1936
    },
    {
      "epoch": 0.30992,
      "grad_norm": 0.12127862125635147,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 1937
    },
    {
      "epoch": 0.31008,
      "grad_norm": 0.15335330367088318,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 1938
    },
    {
      "epoch": 0.31024,
      "grad_norm": 0.10795463621616364,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 1939
    },
    {
      "epoch": 0.3104,
      "grad_norm": 0.12687882781028748,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 1940
    },
    {
      "epoch": 0.31056,
      "grad_norm": 0.12171117961406708,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1941
    },
    {
      "epoch": 0.31072,
      "grad_norm": 0.14060629904270172,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 1942
    },
    {
      "epoch": 0.31088,
      "grad_norm": 0.14660342037677765,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1943
    },
    {
      "epoch": 0.31104,
      "grad_norm": 0.2157486081123352,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 1944
    },
    {
      "epoch": 0.3112,
      "grad_norm": 0.15899710357189178,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 1945
    },
    {
      "epoch": 0.31136,
      "grad_norm": 0.12255087494850159,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 1946
    },
    {
      "epoch": 0.31152,
      "grad_norm": 0.17026299238204956,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 1947
    },
    {
      "epoch": 0.31168,
      "grad_norm": 0.12159436196088791,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1948
    },
    {
      "epoch": 0.31184,
      "grad_norm": 0.13802921772003174,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1949
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.11738985031843185,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 1950
    },
    {
      "epoch": 0.31216,
      "grad_norm": 0.1312108188867569,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 1951
    },
    {
      "epoch": 0.31232,
      "grad_norm": 0.1437837779521942,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 1952
    },
    {
      "epoch": 0.31248,
      "grad_norm": 0.14825427532196045,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 1953
    },
    {
      "epoch": 0.31264,
      "grad_norm": 0.12787699699401855,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1954
    },
    {
      "epoch": 0.3128,
      "grad_norm": 0.17985011637210846,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 1955
    },
    {
      "epoch": 0.31296,
      "grad_norm": 0.11659165471792221,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1956
    },
    {
      "epoch": 0.31312,
      "grad_norm": 0.11053960025310516,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1957
    },
    {
      "epoch": 0.31328,
      "grad_norm": 0.11696269363164902,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 1958
    },
    {
      "epoch": 0.31344,
      "grad_norm": 0.10968999564647675,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 1959
    },
    {
      "epoch": 0.3136,
      "grad_norm": 0.13728654384613037,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 1960
    },
    {
      "epoch": 0.31376,
      "grad_norm": 0.12408888339996338,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 1961
    },
    {
      "epoch": 0.31392,
      "grad_norm": 0.10920450091362,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 1962
    },
    {
      "epoch": 0.31408,
      "grad_norm": 0.16023363173007965,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 1963
    },
    {
      "epoch": 0.31424,
      "grad_norm": 0.12883557379245758,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1964
    },
    {
      "epoch": 0.3144,
      "grad_norm": 0.1162170022726059,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 1965
    },
    {
      "epoch": 0.31456,
      "grad_norm": 0.11028392612934113,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 1966
    },
    {
      "epoch": 0.31472,
      "grad_norm": 0.12188753485679626,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 1967
    },
    {
      "epoch": 0.31488,
      "grad_norm": 0.14058488607406616,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 1968
    },
    {
      "epoch": 0.31504,
      "grad_norm": 0.11247207224369049,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1969
    },
    {
      "epoch": 0.3152,
      "grad_norm": 0.13170818984508514,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 1970
    },
    {
      "epoch": 0.31536,
      "grad_norm": 0.1173451766371727,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 1971
    },
    {
      "epoch": 0.31552,
      "grad_norm": 0.17739801108837128,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1972
    },
    {
      "epoch": 0.31568,
      "grad_norm": 0.13001175224781036,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 1973
    },
    {
      "epoch": 0.31584,
      "grad_norm": 0.123074471950531,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 1974
    },
    {
      "epoch": 0.316,
      "grad_norm": 0.10874941945075989,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 1975
    },
    {
      "epoch": 0.31616,
      "grad_norm": 0.14580905437469482,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 1976
    },
    {
      "epoch": 0.31632,
      "grad_norm": 0.12981779873371124,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 1977
    },
    {
      "epoch": 0.31648,
      "grad_norm": 0.12363201379776001,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1978
    },
    {
      "epoch": 0.31664,
      "grad_norm": 0.12176051735877991,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 1979
    },
    {
      "epoch": 0.3168,
      "grad_norm": 0.11301866173744202,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 1980
    },
    {
      "epoch": 0.31696,
      "grad_norm": 0.108700692653656,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 1981
    },
    {
      "epoch": 0.31712,
      "grad_norm": 0.11163709312677383,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 1982
    },
    {
      "epoch": 0.31728,
      "grad_norm": 0.12131388485431671,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1983
    },
    {
      "epoch": 0.31744,
      "grad_norm": 0.14317448437213898,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 1984
    },
    {
      "epoch": 0.3176,
      "grad_norm": 0.12000827491283417,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 1985
    },
    {
      "epoch": 0.31776,
      "grad_norm": 0.13165733218193054,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 1986
    },
    {
      "epoch": 0.31792,
      "grad_norm": 0.10832544416189194,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 1987
    },
    {
      "epoch": 0.31808,
      "grad_norm": 0.1332114338874817,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 1988
    },
    {
      "epoch": 0.31824,
      "grad_norm": 0.13093961775302887,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 1989
    },
    {
      "epoch": 0.3184,
      "grad_norm": 0.12935303151607513,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 1990
    },
    {
      "epoch": 0.31856,
      "grad_norm": 0.11585336923599243,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1991
    },
    {
      "epoch": 0.31872,
      "grad_norm": 0.10976587980985641,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1992
    },
    {
      "epoch": 0.31888,
      "grad_norm": 0.1111568734049797,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 1993
    },
    {
      "epoch": 0.31904,
      "grad_norm": 0.12424660474061966,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 1994
    },
    {
      "epoch": 0.3192,
      "grad_norm": 0.11828982084989548,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 1995
    },
    {
      "epoch": 0.31936,
      "grad_norm": 0.11845875531435013,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1996
    },
    {
      "epoch": 0.31952,
      "grad_norm": 0.10376892983913422,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 1997
    },
    {
      "epoch": 0.31968,
      "grad_norm": 0.2533765137195587,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 1998
    },
    {
      "epoch": 0.31984,
      "grad_norm": 0.11183638870716095,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 1999
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.1457844376564026,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2000
    },
    {
      "epoch": 0.32,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.32844117283821106,
      "eval_train_runtime": 4.0733,
      "eval_train_samples_per_second": 1227.496,
      "eval_train_steps_per_second": 15.466,
      "step": 2000
    },
    {
      "epoch": 0.32,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3270794153213501,
      "eval_test_runtime": 4.8436,
      "eval_test_samples_per_second": 1032.294,
      "eval_test_steps_per_second": 13.007,
      "step": 2000
    },
    {
      "epoch": 0.32016,
      "grad_norm": 0.11367091536521912,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2001
    },
    {
      "epoch": 0.32032,
      "grad_norm": 0.1211661770939827,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 2002
    },
    {
      "epoch": 0.32048,
      "grad_norm": 0.11877543479204178,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 2003
    },
    {
      "epoch": 0.32064,
      "grad_norm": 0.2480858862400055,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 2004
    },
    {
      "epoch": 0.3208,
      "grad_norm": 0.12593476474285126,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2005
    },
    {
      "epoch": 0.32096,
      "grad_norm": 0.12139613926410675,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2006
    },
    {
      "epoch": 0.32112,
      "grad_norm": 0.15895023941993713,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2007
    },
    {
      "epoch": 0.32128,
      "grad_norm": 0.12595801055431366,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 2008
    },
    {
      "epoch": 0.32144,
      "grad_norm": 0.12734395265579224,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2009
    },
    {
      "epoch": 0.3216,
      "grad_norm": 0.215388223528862,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2010
    },
    {
      "epoch": 0.32176,
      "grad_norm": 0.15899387001991272,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2011
    },
    {
      "epoch": 0.32192,
      "grad_norm": 0.12202546745538712,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2012
    },
    {
      "epoch": 0.32208,
      "grad_norm": 0.13592779636383057,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 2013
    },
    {
      "epoch": 0.32224,
      "grad_norm": 0.14707998931407928,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2014
    },
    {
      "epoch": 0.3224,
      "grad_norm": 0.12013411521911621,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2015
    },
    {
      "epoch": 0.32256,
      "grad_norm": 0.12262997031211853,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 2016
    },
    {
      "epoch": 0.32272,
      "grad_norm": 0.20384828746318817,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 2017
    },
    {
      "epoch": 0.32288,
      "grad_norm": 0.12698282301425934,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 2018
    },
    {
      "epoch": 0.32304,
      "grad_norm": 0.14056190848350525,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2019
    },
    {
      "epoch": 0.3232,
      "grad_norm": 0.13124480843544006,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2020
    },
    {
      "epoch": 0.32336,
      "grad_norm": 0.14480659365653992,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2021
    },
    {
      "epoch": 0.32352,
      "grad_norm": 0.11929734796285629,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2022
    },
    {
      "epoch": 0.32368,
      "grad_norm": 0.17466849088668823,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2023
    },
    {
      "epoch": 0.32384,
      "grad_norm": 0.13901637494564056,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2024
    },
    {
      "epoch": 0.324,
      "grad_norm": 0.18117955327033997,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2025
    },
    {
      "epoch": 0.32416,
      "grad_norm": 0.1153956949710846,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 2026
    },
    {
      "epoch": 0.32432,
      "grad_norm": 0.11445924639701843,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2027
    },
    {
      "epoch": 0.32448,
      "grad_norm": 0.14900001883506775,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 2028
    },
    {
      "epoch": 0.32464,
      "grad_norm": 0.1396225243806839,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2029
    },
    {
      "epoch": 0.3248,
      "grad_norm": 0.13406594097614288,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2030
    },
    {
      "epoch": 0.32496,
      "grad_norm": 0.1524297595024109,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 2031
    },
    {
      "epoch": 0.32512,
      "grad_norm": 0.17917989194393158,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2032
    },
    {
      "epoch": 0.32528,
      "grad_norm": 0.12010425329208374,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 2033
    },
    {
      "epoch": 0.32544,
      "grad_norm": 0.12704889476299286,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2034
    },
    {
      "epoch": 0.3256,
      "grad_norm": 0.12013773620128632,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 2035
    },
    {
      "epoch": 0.32576,
      "grad_norm": 0.13009589910507202,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 2036
    },
    {
      "epoch": 0.32592,
      "grad_norm": 0.1326121836900711,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2037
    },
    {
      "epoch": 0.32608,
      "grad_norm": 0.11350704729557037,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 2038
    },
    {
      "epoch": 0.32624,
      "grad_norm": 0.10991251468658447,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2039
    },
    {
      "epoch": 0.3264,
      "grad_norm": 0.12172476202249527,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2040
    },
    {
      "epoch": 0.32656,
      "grad_norm": 0.11751697957515717,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2041
    },
    {
      "epoch": 0.32672,
      "grad_norm": 0.1216939389705658,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2042
    },
    {
      "epoch": 0.32688,
      "grad_norm": 0.12158528715372086,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 2043
    },
    {
      "epoch": 0.32704,
      "grad_norm": 0.11555374413728714,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2044
    },
    {
      "epoch": 0.3272,
      "grad_norm": 0.12776325643062592,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2045
    },
    {
      "epoch": 0.32736,
      "grad_norm": 0.10255545377731323,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 2046
    },
    {
      "epoch": 0.32752,
      "grad_norm": 0.18198728561401367,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2047
    },
    {
      "epoch": 0.32768,
      "grad_norm": 0.14870193600654602,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 2048
    },
    {
      "epoch": 0.32784,
      "grad_norm": 0.10484883189201355,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2049
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.1188664436340332,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2050
    },
    {
      "epoch": 0.32816,
      "grad_norm": 0.11951324343681335,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 2051
    },
    {
      "epoch": 0.32832,
      "grad_norm": 0.12244658172130585,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 2052
    },
    {
      "epoch": 0.32848,
      "grad_norm": 0.15457263588905334,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 2053
    },
    {
      "epoch": 0.32864,
      "grad_norm": 0.2548119127750397,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2054
    },
    {
      "epoch": 0.3288,
      "grad_norm": 0.1405135840177536,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 2055
    },
    {
      "epoch": 0.32896,
      "grad_norm": 0.11905589699745178,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2056
    },
    {
      "epoch": 0.32912,
      "grad_norm": 0.1324538141489029,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2057
    },
    {
      "epoch": 0.32928,
      "grad_norm": 0.13418492674827576,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2058
    },
    {
      "epoch": 0.32944,
      "grad_norm": 0.14899224042892456,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2059
    },
    {
      "epoch": 0.3296,
      "grad_norm": 0.12334271520376205,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2060
    },
    {
      "epoch": 0.32976,
      "grad_norm": 0.1611577868461609,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2061
    },
    {
      "epoch": 0.32992,
      "grad_norm": 0.15159769356250763,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2062
    },
    {
      "epoch": 0.33008,
      "grad_norm": 0.11973686516284943,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2063
    },
    {
      "epoch": 0.33024,
      "grad_norm": 0.1291440725326538,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2064
    },
    {
      "epoch": 0.3304,
      "grad_norm": 0.1130930483341217,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 2065
    },
    {
      "epoch": 0.33056,
      "grad_norm": 0.1504027247428894,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 2066
    },
    {
      "epoch": 0.33072,
      "grad_norm": 0.1548106074333191,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2067
    },
    {
      "epoch": 0.33088,
      "grad_norm": 0.1447049081325531,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2068
    },
    {
      "epoch": 0.33104,
      "grad_norm": 0.12591642141342163,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2069
    },
    {
      "epoch": 0.3312,
      "grad_norm": 0.13815055787563324,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 2070
    },
    {
      "epoch": 0.33136,
      "grad_norm": 0.12594737112522125,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2071
    },
    {
      "epoch": 0.33152,
      "grad_norm": 0.11118237674236298,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 2072
    },
    {
      "epoch": 0.33168,
      "grad_norm": 0.1760493516921997,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2073
    },
    {
      "epoch": 0.33184,
      "grad_norm": 0.13625064492225647,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2074
    },
    {
      "epoch": 0.332,
      "grad_norm": 0.1424647867679596,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2075
    },
    {
      "epoch": 0.33216,
      "grad_norm": 0.11757511645555496,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2076
    },
    {
      "epoch": 0.33232,
      "grad_norm": 0.13520723581314087,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2077
    },
    {
      "epoch": 0.33248,
      "grad_norm": 0.10499738901853561,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2078
    },
    {
      "epoch": 0.33264,
      "grad_norm": 0.12294278293848038,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2079
    },
    {
      "epoch": 0.3328,
      "grad_norm": 0.10954389721155167,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2080
    },
    {
      "epoch": 0.33296,
      "grad_norm": 0.13178886473178864,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 2081
    },
    {
      "epoch": 0.33312,
      "grad_norm": 0.12074356526136398,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2082
    },
    {
      "epoch": 0.33328,
      "grad_norm": 0.12017054855823517,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2083
    },
    {
      "epoch": 0.33344,
      "grad_norm": 0.12047813832759857,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2084
    },
    {
      "epoch": 0.3336,
      "grad_norm": 0.09567273408174515,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2085
    },
    {
      "epoch": 0.33376,
      "grad_norm": 0.12287427484989166,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2086
    },
    {
      "epoch": 0.33392,
      "grad_norm": 0.14113111793994904,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2087
    },
    {
      "epoch": 0.33408,
      "grad_norm": 0.12493672221899033,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 2088
    },
    {
      "epoch": 0.33424,
      "grad_norm": 0.12708111107349396,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2089
    },
    {
      "epoch": 0.3344,
      "grad_norm": 0.1108529344201088,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2090
    },
    {
      "epoch": 0.33456,
      "grad_norm": 0.1392204463481903,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 2091
    },
    {
      "epoch": 0.33472,
      "grad_norm": 0.19482970237731934,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 2092
    },
    {
      "epoch": 0.33488,
      "grad_norm": 0.11334265768527985,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2093
    },
    {
      "epoch": 0.33504,
      "grad_norm": 0.11189287155866623,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2094
    },
    {
      "epoch": 0.3352,
      "grad_norm": 0.10970409214496613,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2095
    },
    {
      "epoch": 0.33536,
      "grad_norm": 0.10860662907361984,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2096
    },
    {
      "epoch": 0.33552,
      "grad_norm": 0.15515026450157166,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 2097
    },
    {
      "epoch": 0.33568,
      "grad_norm": 0.11360307037830353,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 2098
    },
    {
      "epoch": 0.33584,
      "grad_norm": 0.11137070506811142,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2099
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.12253742665052414,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 2100
    },
    {
      "epoch": 0.336,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.32778599858283997,
      "eval_train_runtime": 4.2209,
      "eval_train_samples_per_second": 1184.586,
      "eval_train_steps_per_second": 14.926,
      "step": 2100
    },
    {
      "epoch": 0.336,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3262394964694977,
      "eval_test_runtime": 4.8719,
      "eval_test_samples_per_second": 1026.288,
      "eval_test_steps_per_second": 12.931,
      "step": 2100
    },
    {
      "epoch": 0.33616,
      "grad_norm": 0.129387766122818,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 2101
    },
    {
      "epoch": 0.33632,
      "grad_norm": 0.1219576746225357,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2102
    },
    {
      "epoch": 0.33648,
      "grad_norm": 0.16414637863636017,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2103
    },
    {
      "epoch": 0.33664,
      "grad_norm": 0.18625514209270477,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 2104
    },
    {
      "epoch": 0.3368,
      "grad_norm": 0.14992424845695496,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 2105
    },
    {
      "epoch": 0.33696,
      "grad_norm": 0.11729514598846436,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2106
    },
    {
      "epoch": 0.33712,
      "grad_norm": 0.10695108771324158,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2107
    },
    {
      "epoch": 0.33728,
      "grad_norm": 0.12912914156913757,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2108
    },
    {
      "epoch": 0.33744,
      "grad_norm": 0.1363789588212967,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2109
    },
    {
      "epoch": 0.3376,
      "grad_norm": 0.11027980595827103,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2110
    },
    {
      "epoch": 0.33776,
      "grad_norm": 0.15975160896778107,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 2111
    },
    {
      "epoch": 0.33792,
      "grad_norm": 0.10389403998851776,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2112
    },
    {
      "epoch": 0.33808,
      "grad_norm": 0.11957509070634842,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2113
    },
    {
      "epoch": 0.33824,
      "grad_norm": 0.10800684988498688,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 2114
    },
    {
      "epoch": 0.3384,
      "grad_norm": 0.10620272904634476,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2115
    },
    {
      "epoch": 0.33856,
      "grad_norm": 0.12424051016569138,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2116
    },
    {
      "epoch": 0.33872,
      "grad_norm": 0.13209699094295502,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2117
    },
    {
      "epoch": 0.33888,
      "grad_norm": 0.13610005378723145,
      "learning_rate": 0.0001,
      "loss": 0.3389,
      "step": 2118
    },
    {
      "epoch": 0.33904,
      "grad_norm": 0.11885478347539902,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2119
    },
    {
      "epoch": 0.3392,
      "grad_norm": 0.12224996089935303,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2120
    },
    {
      "epoch": 0.33936,
      "grad_norm": 0.10911993682384491,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2121
    },
    {
      "epoch": 0.33952,
      "grad_norm": 0.12986819446086884,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2122
    },
    {
      "epoch": 0.33968,
      "grad_norm": 0.10837101936340332,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2123
    },
    {
      "epoch": 0.33984,
      "grad_norm": 0.09758450835943222,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2124
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.1253977119922638,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2125
    },
    {
      "epoch": 0.34016,
      "grad_norm": 0.12173014134168625,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2126
    },
    {
      "epoch": 0.34032,
      "grad_norm": 0.11130951344966888,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 2127
    },
    {
      "epoch": 0.34048,
      "grad_norm": 0.12189286202192307,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2128
    },
    {
      "epoch": 0.34064,
      "grad_norm": 0.11953867226839066,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2129
    },
    {
      "epoch": 0.3408,
      "grad_norm": 0.19722481071949005,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 2130
    },
    {
      "epoch": 0.34096,
      "grad_norm": 0.13974708318710327,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2131
    },
    {
      "epoch": 0.34112,
      "grad_norm": 0.12143262475728989,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2132
    },
    {
      "epoch": 0.34128,
      "grad_norm": 0.1441565304994583,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2133
    },
    {
      "epoch": 0.34144,
      "grad_norm": 0.10233424603939056,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2134
    },
    {
      "epoch": 0.3416,
      "grad_norm": 0.1641753762960434,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2135
    },
    {
      "epoch": 0.34176,
      "grad_norm": 0.15505586564540863,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 2136
    },
    {
      "epoch": 0.34192,
      "grad_norm": 0.16559042036533356,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 2137
    },
    {
      "epoch": 0.34208,
      "grad_norm": 0.1613696813583374,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2138
    },
    {
      "epoch": 0.34224,
      "grad_norm": 0.11052260547876358,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2139
    },
    {
      "epoch": 0.3424,
      "grad_norm": 0.1157587394118309,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2140
    },
    {
      "epoch": 0.34256,
      "grad_norm": 0.13310830295085907,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 2141
    },
    {
      "epoch": 0.34272,
      "grad_norm": 0.1459205597639084,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2142
    },
    {
      "epoch": 0.34288,
      "grad_norm": 0.2108527421951294,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2143
    },
    {
      "epoch": 0.34304,
      "grad_norm": 0.12077199667692184,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 2144
    },
    {
      "epoch": 0.3432,
      "grad_norm": 0.12130613625049591,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2145
    },
    {
      "epoch": 0.34336,
      "grad_norm": 0.14890502393245697,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2146
    },
    {
      "epoch": 0.34352,
      "grad_norm": 0.13900138437747955,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2147
    },
    {
      "epoch": 0.34368,
      "grad_norm": 0.11418746411800385,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2148
    },
    {
      "epoch": 0.34384,
      "grad_norm": 0.14433152973651886,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 2149
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.1392539143562317,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 2150
    },
    {
      "epoch": 0.34416,
      "grad_norm": 0.12417422980070114,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2151
    },
    {
      "epoch": 0.34432,
      "grad_norm": 0.12188806384801865,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 2152
    },
    {
      "epoch": 0.34448,
      "grad_norm": 0.14187273383140564,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 2153
    },
    {
      "epoch": 0.34464,
      "grad_norm": 0.13610129058361053,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2154
    },
    {
      "epoch": 0.3448,
      "grad_norm": 0.12242180854082108,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2155
    },
    {
      "epoch": 0.34496,
      "grad_norm": 0.16011899709701538,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 2156
    },
    {
      "epoch": 0.34512,
      "grad_norm": 0.11007893085479736,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2157
    },
    {
      "epoch": 0.34528,
      "grad_norm": 0.11627331376075745,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2158
    },
    {
      "epoch": 0.34544,
      "grad_norm": 0.11822054535150528,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2159
    },
    {
      "epoch": 0.3456,
      "grad_norm": 0.126288503408432,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2160
    },
    {
      "epoch": 0.34576,
      "grad_norm": 0.12681198120117188,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2161
    },
    {
      "epoch": 0.34592,
      "grad_norm": 0.13135536015033722,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2162
    },
    {
      "epoch": 0.34608,
      "grad_norm": 0.09544102847576141,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 2163
    },
    {
      "epoch": 0.34624,
      "grad_norm": 0.1300232857465744,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 2164
    },
    {
      "epoch": 0.3464,
      "grad_norm": 0.14214853942394257,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 2165
    },
    {
      "epoch": 0.34656,
      "grad_norm": 0.11347097903490067,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 2166
    },
    {
      "epoch": 0.34672,
      "grad_norm": 0.11315913498401642,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 2167
    },
    {
      "epoch": 0.34688,
      "grad_norm": 0.12386836111545563,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2168
    },
    {
      "epoch": 0.34704,
      "grad_norm": 0.1586599349975586,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 2169
    },
    {
      "epoch": 0.3472,
      "grad_norm": 0.10005620121955872,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 2170
    },
    {
      "epoch": 0.34736,
      "grad_norm": 0.12448207288980484,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 2171
    },
    {
      "epoch": 0.34752,
      "grad_norm": 0.11451568454504013,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2172
    },
    {
      "epoch": 0.34768,
      "grad_norm": 0.11756254732608795,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2173
    },
    {
      "epoch": 0.34784,
      "grad_norm": 0.16452021896839142,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2174
    },
    {
      "epoch": 0.348,
      "grad_norm": 0.10787832736968994,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2175
    },
    {
      "epoch": 0.34816,
      "grad_norm": 0.10477923601865768,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2176
    },
    {
      "epoch": 0.34832,
      "grad_norm": 0.1129358559846878,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2177
    },
    {
      "epoch": 0.34848,
      "grad_norm": 0.1280083805322647,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2178
    },
    {
      "epoch": 0.34864,
      "grad_norm": 0.138254314661026,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2179
    },
    {
      "epoch": 0.3488,
      "grad_norm": 0.19739143550395966,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2180
    },
    {
      "epoch": 0.34896,
      "grad_norm": 0.11218719929456711,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 2181
    },
    {
      "epoch": 0.34912,
      "grad_norm": 0.10474766790866852,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2182
    },
    {
      "epoch": 0.34928,
      "grad_norm": 0.11122014373540878,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2183
    },
    {
      "epoch": 0.34944,
      "grad_norm": 0.12237920612096786,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2184
    },
    {
      "epoch": 0.3496,
      "grad_norm": 0.11370402574539185,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2185
    },
    {
      "epoch": 0.34976,
      "grad_norm": 0.12336581945419312,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2186
    },
    {
      "epoch": 0.34992,
      "grad_norm": 0.11620622873306274,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2187
    },
    {
      "epoch": 0.35008,
      "grad_norm": 0.10445386916399002,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2188
    },
    {
      "epoch": 0.35024,
      "grad_norm": 0.11106449365615845,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 2189
    },
    {
      "epoch": 0.3504,
      "grad_norm": 0.22871600091457367,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2190
    },
    {
      "epoch": 0.35056,
      "grad_norm": 0.1263989806175232,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 2191
    },
    {
      "epoch": 0.35072,
      "grad_norm": 0.10995426028966904,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 2192
    },
    {
      "epoch": 0.35088,
      "grad_norm": 0.10635395348072052,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2193
    },
    {
      "epoch": 0.35104,
      "grad_norm": 0.12233541905879974,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2194
    },
    {
      "epoch": 0.3512,
      "grad_norm": 0.10917346924543381,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2195
    },
    {
      "epoch": 0.35136,
      "grad_norm": 0.10134781897068024,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2196
    },
    {
      "epoch": 0.35152,
      "grad_norm": 0.11078312247991562,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2197
    },
    {
      "epoch": 0.35168,
      "grad_norm": 0.11831972748041153,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 2198
    },
    {
      "epoch": 0.35184,
      "grad_norm": 0.10415994375944138,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2199
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.11796055734157562,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 2200
    },
    {
      "epoch": 0.352,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3260576128959656,
      "eval_train_runtime": 4.1343,
      "eval_train_samples_per_second": 1209.383,
      "eval_train_steps_per_second": 15.238,
      "step": 2200
    },
    {
      "epoch": 0.352,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.32455042004585266,
      "eval_test_runtime": 4.6596,
      "eval_test_samples_per_second": 1073.047,
      "eval_test_steps_per_second": 13.52,
      "step": 2200
    },
    {
      "epoch": 0.35216,
      "grad_norm": 0.11579883843660355,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 2201
    },
    {
      "epoch": 0.35232,
      "grad_norm": 0.11037969589233398,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 2202
    },
    {
      "epoch": 0.35248,
      "grad_norm": 0.11712289601564407,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2203
    },
    {
      "epoch": 0.35264,
      "grad_norm": 0.1066274642944336,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2204
    },
    {
      "epoch": 0.3528,
      "grad_norm": 0.1183071881532669,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2205
    },
    {
      "epoch": 0.35296,
      "grad_norm": 0.13721519708633423,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2206
    },
    {
      "epoch": 0.35312,
      "grad_norm": 0.12634016573429108,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2207
    },
    {
      "epoch": 0.35328,
      "grad_norm": 0.3529675304889679,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 2208
    },
    {
      "epoch": 0.35344,
      "grad_norm": 0.11898085474967957,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2209
    },
    {
      "epoch": 0.3536,
      "grad_norm": 0.12775865197181702,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2210
    },
    {
      "epoch": 0.35376,
      "grad_norm": 0.14131568372249603,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2211
    },
    {
      "epoch": 0.35392,
      "grad_norm": 0.2544799745082855,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2212
    },
    {
      "epoch": 0.35408,
      "grad_norm": 0.13113315403461456,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2213
    },
    {
      "epoch": 0.35424,
      "grad_norm": 0.11986728757619858,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2214
    },
    {
      "epoch": 0.3544,
      "grad_norm": 0.15782968699932098,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2215
    },
    {
      "epoch": 0.35456,
      "grad_norm": 0.10865745693445206,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2216
    },
    {
      "epoch": 0.35472,
      "grad_norm": 0.12347614765167236,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 2217
    },
    {
      "epoch": 0.35488,
      "grad_norm": 0.13418743014335632,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 2218
    },
    {
      "epoch": 0.35504,
      "grad_norm": 0.26531296968460083,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 2219
    },
    {
      "epoch": 0.3552,
      "grad_norm": 0.1091773733496666,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 2220
    },
    {
      "epoch": 0.35536,
      "grad_norm": 0.31727996468544006,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2221
    },
    {
      "epoch": 0.35552,
      "grad_norm": 0.24227285385131836,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2222
    },
    {
      "epoch": 0.35568,
      "grad_norm": 0.12136629223823547,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2223
    },
    {
      "epoch": 0.35584,
      "grad_norm": 0.1524454802274704,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 2224
    },
    {
      "epoch": 0.356,
      "grad_norm": 0.13069620728492737,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 2225
    },
    {
      "epoch": 0.35616,
      "grad_norm": 0.1782466173171997,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2226
    },
    {
      "epoch": 0.35632,
      "grad_norm": 0.12881676852703094,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2227
    },
    {
      "epoch": 0.35648,
      "grad_norm": 0.1359933465719223,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2228
    },
    {
      "epoch": 0.35664,
      "grad_norm": 0.1181279718875885,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 2229
    },
    {
      "epoch": 0.3568,
      "grad_norm": 0.47228094935417175,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2230
    },
    {
      "epoch": 0.35696,
      "grad_norm": 0.11533527821302414,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 2231
    },
    {
      "epoch": 0.35712,
      "grad_norm": 0.12002180516719818,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2232
    },
    {
      "epoch": 0.35728,
      "grad_norm": 0.17059125006198883,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2233
    },
    {
      "epoch": 0.35744,
      "grad_norm": 0.23586057126522064,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2234
    },
    {
      "epoch": 0.3576,
      "grad_norm": 0.14350999891757965,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2235
    },
    {
      "epoch": 0.35776,
      "grad_norm": 0.12551447749137878,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2236
    },
    {
      "epoch": 0.35792,
      "grad_norm": 0.18427640199661255,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2237
    },
    {
      "epoch": 0.35808,
      "grad_norm": 0.19412171840667725,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2238
    },
    {
      "epoch": 0.35824,
      "grad_norm": 0.14368128776550293,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 2239
    },
    {
      "epoch": 0.3584,
      "grad_norm": 0.129522904753685,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2240
    },
    {
      "epoch": 0.35856,
      "grad_norm": 0.11257363110780716,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2241
    },
    {
      "epoch": 0.35872,
      "grad_norm": 0.1521235853433609,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 2242
    },
    {
      "epoch": 0.35888,
      "grad_norm": 0.11258068680763245,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2243
    },
    {
      "epoch": 0.35904,
      "grad_norm": 0.11388730257749557,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 2244
    },
    {
      "epoch": 0.3592,
      "grad_norm": 0.11398710310459137,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2245
    },
    {
      "epoch": 0.35936,
      "grad_norm": 0.14327114820480347,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2246
    },
    {
      "epoch": 0.35952,
      "grad_norm": 0.15115498006343842,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2247
    },
    {
      "epoch": 0.35968,
      "grad_norm": 0.12486037611961365,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2248
    },
    {
      "epoch": 0.35984,
      "grad_norm": 0.12298544496297836,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 2249
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.10666581988334656,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2250
    },
    {
      "epoch": 0.36016,
      "grad_norm": 0.1280151754617691,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2251
    },
    {
      "epoch": 0.36032,
      "grad_norm": 0.12085993587970734,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2252
    },
    {
      "epoch": 0.36048,
      "grad_norm": 0.12082549929618835,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2253
    },
    {
      "epoch": 0.36064,
      "grad_norm": 0.14321604371070862,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 2254
    },
    {
      "epoch": 0.3608,
      "grad_norm": 0.10366538166999817,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2255
    },
    {
      "epoch": 0.36096,
      "grad_norm": 0.1278337836265564,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 2256
    },
    {
      "epoch": 0.36112,
      "grad_norm": 0.13216820359230042,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2257
    },
    {
      "epoch": 0.36128,
      "grad_norm": 0.15985238552093506,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2258
    },
    {
      "epoch": 0.36144,
      "grad_norm": 0.10876744240522385,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2259
    },
    {
      "epoch": 0.3616,
      "grad_norm": 0.12385690212249756,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2260
    },
    {
      "epoch": 0.36176,
      "grad_norm": 0.12085788697004318,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 2261
    },
    {
      "epoch": 0.36192,
      "grad_norm": 0.1183178722858429,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 2262
    },
    {
      "epoch": 0.36208,
      "grad_norm": 0.11438702791929245,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2263
    },
    {
      "epoch": 0.36224,
      "grad_norm": 0.11363593488931656,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2264
    },
    {
      "epoch": 0.3624,
      "grad_norm": 0.10754364728927612,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 2265
    },
    {
      "epoch": 0.36256,
      "grad_norm": 0.10979928076267242,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2266
    },
    {
      "epoch": 0.36272,
      "grad_norm": 0.18847401440143585,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 2267
    },
    {
      "epoch": 0.36288,
      "grad_norm": 0.12162382155656815,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2268
    },
    {
      "epoch": 0.36304,
      "grad_norm": 0.12309007346630096,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2269
    },
    {
      "epoch": 0.3632,
      "grad_norm": 0.13563160598278046,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2270
    },
    {
      "epoch": 0.36336,
      "grad_norm": 0.11999324709177017,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 2271
    },
    {
      "epoch": 0.36352,
      "grad_norm": 0.12077614665031433,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2272
    },
    {
      "epoch": 0.36368,
      "grad_norm": 0.12949608266353607,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 2273
    },
    {
      "epoch": 0.36384,
      "grad_norm": 0.12321238964796066,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2274
    },
    {
      "epoch": 0.364,
      "grad_norm": 0.10747548937797546,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 2275
    },
    {
      "epoch": 0.36416,
      "grad_norm": 0.11554594337940216,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 2276
    },
    {
      "epoch": 0.36432,
      "grad_norm": 0.10558079928159714,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 2277
    },
    {
      "epoch": 0.36448,
      "grad_norm": 0.14631396532058716,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2278
    },
    {
      "epoch": 0.36464,
      "grad_norm": 0.105309396982193,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2279
    },
    {
      "epoch": 0.3648,
      "grad_norm": 0.13247495889663696,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2280
    },
    {
      "epoch": 0.36496,
      "grad_norm": 0.11274664103984833,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2281
    },
    {
      "epoch": 0.36512,
      "grad_norm": 0.11797728389501572,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 2282
    },
    {
      "epoch": 0.36528,
      "grad_norm": 0.09605114161968231,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2283
    },
    {
      "epoch": 0.36544,
      "grad_norm": 0.11159968376159668,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2284
    },
    {
      "epoch": 0.3656,
      "grad_norm": 0.14516618847846985,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2285
    },
    {
      "epoch": 0.36576,
      "grad_norm": 0.1240188255906105,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2286
    },
    {
      "epoch": 0.36592,
      "grad_norm": 0.11656814813613892,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2287
    },
    {
      "epoch": 0.36608,
      "grad_norm": 0.1238221675157547,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2288
    },
    {
      "epoch": 0.36624,
      "grad_norm": 0.10642428696155548,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 2289
    },
    {
      "epoch": 0.3664,
      "grad_norm": 0.11262966692447662,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 2290
    },
    {
      "epoch": 0.36656,
      "grad_norm": 0.13850754499435425,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2291
    },
    {
      "epoch": 0.36672,
      "grad_norm": 0.1037135198712349,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2292
    },
    {
      "epoch": 0.36688,
      "grad_norm": 0.1349126398563385,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 2293
    },
    {
      "epoch": 0.36704,
      "grad_norm": 0.11601337790489197,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 2294
    },
    {
      "epoch": 0.3672,
      "grad_norm": 0.16779151558876038,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2295
    },
    {
      "epoch": 0.36736,
      "grad_norm": 0.11744032055139542,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2296
    },
    {
      "epoch": 0.36752,
      "grad_norm": 0.10648846626281738,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 2297
    },
    {
      "epoch": 0.36768,
      "grad_norm": 0.11079126596450806,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 2298
    },
    {
      "epoch": 0.36784,
      "grad_norm": 0.22547012567520142,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2299
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.12343202531337738,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 2300
    },
    {
      "epoch": 0.368,
      "eval_train_accuracy": 0.5314,
      "eval_train_loss": 0.32542121410369873,
      "eval_train_runtime": 4.5036,
      "eval_train_samples_per_second": 1110.227,
      "eval_train_steps_per_second": 13.989,
      "step": 2300
    },
    {
      "epoch": 0.368,
      "eval_test_accuracy": 0.5334,
      "eval_test_loss": 0.324101060628891,
      "eval_test_runtime": 4.9031,
      "eval_test_samples_per_second": 1019.754,
      "eval_test_steps_per_second": 12.849,
      "step": 2300
    },
    {
      "epoch": 0.36816,
      "grad_norm": 0.12234710901975632,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2301
    },
    {
      "epoch": 0.36832,
      "grad_norm": 0.12393317371606827,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2302
    },
    {
      "epoch": 0.36848,
      "grad_norm": 0.12241805344820023,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2303
    },
    {
      "epoch": 0.36864,
      "grad_norm": 0.11665062606334686,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 2304
    },
    {
      "epoch": 0.3688,
      "grad_norm": 0.11634700745344162,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2305
    },
    {
      "epoch": 0.36896,
      "grad_norm": 0.15589821338653564,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2306
    },
    {
      "epoch": 0.36912,
      "grad_norm": 0.10305749624967575,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2307
    },
    {
      "epoch": 0.36928,
      "grad_norm": 0.12054696679115295,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2308
    },
    {
      "epoch": 0.36944,
      "grad_norm": 0.10504144430160522,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 2309
    },
    {
      "epoch": 0.3696,
      "grad_norm": 0.12110131233930588,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2310
    },
    {
      "epoch": 0.36976,
      "grad_norm": 0.1324506551027298,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2311
    },
    {
      "epoch": 0.36992,
      "grad_norm": 0.11619684100151062,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 2312
    },
    {
      "epoch": 0.37008,
      "grad_norm": 0.10368538647890091,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2313
    },
    {
      "epoch": 0.37024,
      "grad_norm": 0.10158587992191315,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 2314
    },
    {
      "epoch": 0.3704,
      "grad_norm": 0.10352777689695358,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2315
    },
    {
      "epoch": 0.37056,
      "grad_norm": 0.11338596045970917,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2316
    },
    {
      "epoch": 0.37072,
      "grad_norm": 0.10755731910467148,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 2317
    },
    {
      "epoch": 0.37088,
      "grad_norm": 0.11089514195919037,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2318
    },
    {
      "epoch": 0.37104,
      "grad_norm": 0.1215713694691658,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2319
    },
    {
      "epoch": 0.3712,
      "grad_norm": 0.10321096330881119,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2320
    },
    {
      "epoch": 0.37136,
      "grad_norm": 0.12694881856441498,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2321
    },
    {
      "epoch": 0.37152,
      "grad_norm": 0.11554224789142609,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 2322
    },
    {
      "epoch": 0.37168,
      "grad_norm": 0.1142646074295044,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2323
    },
    {
      "epoch": 0.37184,
      "grad_norm": 0.12147699296474457,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2324
    },
    {
      "epoch": 0.372,
      "grad_norm": 0.10153897851705551,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2325
    },
    {
      "epoch": 0.37216,
      "grad_norm": 0.10542626678943634,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2326
    },
    {
      "epoch": 0.37232,
      "grad_norm": 0.11477416008710861,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2327
    },
    {
      "epoch": 0.37248,
      "grad_norm": 0.11027899384498596,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 2328
    },
    {
      "epoch": 0.37264,
      "grad_norm": 0.10390910506248474,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 2329
    },
    {
      "epoch": 0.3728,
      "grad_norm": 0.1339537352323532,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2330
    },
    {
      "epoch": 0.37296,
      "grad_norm": 0.1563470959663391,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2331
    },
    {
      "epoch": 0.37312,
      "grad_norm": 0.11742345243692398,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2332
    },
    {
      "epoch": 0.37328,
      "grad_norm": 0.10514476150274277,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 2333
    },
    {
      "epoch": 0.37344,
      "grad_norm": 0.15513701736927032,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2334
    },
    {
      "epoch": 0.3736,
      "grad_norm": 0.1300273984670639,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2335
    },
    {
      "epoch": 0.37376,
      "grad_norm": 0.13540442287921906,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2336
    },
    {
      "epoch": 0.37392,
      "grad_norm": 0.10990951210260391,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2337
    },
    {
      "epoch": 0.37408,
      "grad_norm": 0.12648987770080566,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 2338
    },
    {
      "epoch": 0.37424,
      "grad_norm": 0.13034819066524506,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 2339
    },
    {
      "epoch": 0.3744,
      "grad_norm": 0.10000229626893997,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2340
    },
    {
      "epoch": 0.37456,
      "grad_norm": 0.10292127728462219,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2341
    },
    {
      "epoch": 0.37472,
      "grad_norm": 0.11551464349031448,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2342
    },
    {
      "epoch": 0.37488,
      "grad_norm": 0.09514986723661423,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2343
    },
    {
      "epoch": 0.37504,
      "grad_norm": 0.10606412589550018,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2344
    },
    {
      "epoch": 0.3752,
      "grad_norm": 0.11169932782649994,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 2345
    },
    {
      "epoch": 0.37536,
      "grad_norm": 0.11748110502958298,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2346
    },
    {
      "epoch": 0.37552,
      "grad_norm": 0.09881089627742767,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2347
    },
    {
      "epoch": 0.37568,
      "grad_norm": 0.10443317890167236,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2348
    },
    {
      "epoch": 0.37584,
      "grad_norm": 0.11168770492076874,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 2349
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.12408431619405746,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2350
    },
    {
      "epoch": 0.37616,
      "grad_norm": 0.12243320047855377,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2351
    },
    {
      "epoch": 0.37632,
      "grad_norm": 0.14365090429782867,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2352
    },
    {
      "epoch": 0.37648,
      "grad_norm": 0.10581111162900925,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2353
    },
    {
      "epoch": 0.37664,
      "grad_norm": 0.12583404779434204,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2354
    },
    {
      "epoch": 0.3768,
      "grad_norm": 0.10651274025440216,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2355
    },
    {
      "epoch": 0.37696,
      "grad_norm": 0.11923079192638397,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2356
    },
    {
      "epoch": 0.37712,
      "grad_norm": 0.13873305916786194,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 2357
    },
    {
      "epoch": 0.37728,
      "grad_norm": 0.09780742973089218,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2358
    },
    {
      "epoch": 0.37744,
      "grad_norm": 0.10870251059532166,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 2359
    },
    {
      "epoch": 0.3776,
      "grad_norm": 0.10351570695638657,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 2360
    },
    {
      "epoch": 0.37776,
      "grad_norm": 0.10925869643688202,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2361
    },
    {
      "epoch": 0.37792,
      "grad_norm": 0.10579466074705124,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2362
    },
    {
      "epoch": 0.37808,
      "grad_norm": 0.12255460023880005,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 2363
    },
    {
      "epoch": 0.37824,
      "grad_norm": 0.13803568482398987,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2364
    },
    {
      "epoch": 0.3784,
      "grad_norm": 0.15671207010746002,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2365
    },
    {
      "epoch": 0.37856,
      "grad_norm": 0.11038796603679657,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2366
    },
    {
      "epoch": 0.37872,
      "grad_norm": 0.10818414390087128,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2367
    },
    {
      "epoch": 0.37888,
      "grad_norm": 0.11392533779144287,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2368
    },
    {
      "epoch": 0.37904,
      "grad_norm": 0.12027720361948013,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 2369
    },
    {
      "epoch": 0.3792,
      "grad_norm": 0.1080719605088234,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2370
    },
    {
      "epoch": 0.37936,
      "grad_norm": 0.12956194579601288,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2371
    },
    {
      "epoch": 0.37952,
      "grad_norm": 0.13845235109329224,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2372
    },
    {
      "epoch": 0.37968,
      "grad_norm": 0.1250457763671875,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 2373
    },
    {
      "epoch": 0.37984,
      "grad_norm": 0.11883029341697693,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2374
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.13595661520957947,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2375
    },
    {
      "epoch": 0.38016,
      "grad_norm": 0.13024386763572693,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2376
    },
    {
      "epoch": 0.38032,
      "grad_norm": 0.10180073231458664,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 2377
    },
    {
      "epoch": 0.38048,
      "grad_norm": 0.1289522647857666,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2378
    },
    {
      "epoch": 0.38064,
      "grad_norm": 0.1309906244277954,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 2379
    },
    {
      "epoch": 0.3808,
      "grad_norm": 0.11095349490642548,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2380
    },
    {
      "epoch": 0.38096,
      "grad_norm": 0.12326326221227646,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2381
    },
    {
      "epoch": 0.38112,
      "grad_norm": 0.12927930057048798,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2382
    },
    {
      "epoch": 0.38128,
      "grad_norm": 0.11347060650587082,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2383
    },
    {
      "epoch": 0.38144,
      "grad_norm": 0.11483477801084518,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 2384
    },
    {
      "epoch": 0.3816,
      "grad_norm": 0.17667879164218903,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2385
    },
    {
      "epoch": 0.38176,
      "grad_norm": 0.11996132135391235,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2386
    },
    {
      "epoch": 0.38192,
      "grad_norm": 0.1221967488527298,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2387
    },
    {
      "epoch": 0.38208,
      "grad_norm": 0.12894243001937866,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2388
    },
    {
      "epoch": 0.38224,
      "grad_norm": 0.12466363608837128,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2389
    },
    {
      "epoch": 0.3824,
      "grad_norm": 0.12348698079586029,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 2390
    },
    {
      "epoch": 0.38256,
      "grad_norm": 0.13157130777835846,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2391
    },
    {
      "epoch": 0.38272,
      "grad_norm": 0.12410497665405273,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2392
    },
    {
      "epoch": 0.38288,
      "grad_norm": 0.10948964208364487,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2393
    },
    {
      "epoch": 0.38304,
      "grad_norm": 0.11458386480808258,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2394
    },
    {
      "epoch": 0.3832,
      "grad_norm": 0.1582554429769516,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2395
    },
    {
      "epoch": 0.38336,
      "grad_norm": 0.1336921900510788,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2396
    },
    {
      "epoch": 0.38352,
      "grad_norm": 0.10674655437469482,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2397
    },
    {
      "epoch": 0.38368,
      "grad_norm": 0.11523010581731796,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2398
    },
    {
      "epoch": 0.38384,
      "grad_norm": 0.10716719925403595,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2399
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.10652142763137817,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2400
    },
    {
      "epoch": 0.384,
      "eval_train_accuracy": 0.6788,
      "eval_train_loss": 0.3245276212692261,
      "eval_train_runtime": 4.0572,
      "eval_train_samples_per_second": 1232.38,
      "eval_train_steps_per_second": 15.528,
      "step": 2400
    },
    {
      "epoch": 0.384,
      "eval_test_accuracy": 0.6904,
      "eval_test_loss": 0.32310840487480164,
      "eval_test_runtime": 5.0676,
      "eval_test_samples_per_second": 986.657,
      "eval_test_steps_per_second": 12.432,
      "step": 2400
    },
    {
      "epoch": 0.38416,
      "grad_norm": 0.10747989267110825,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2401
    },
    {
      "epoch": 0.38432,
      "grad_norm": 0.11177417635917664,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 2402
    },
    {
      "epoch": 0.38448,
      "grad_norm": 0.17026659846305847,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2403
    },
    {
      "epoch": 0.38464,
      "grad_norm": 0.1073913723230362,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2404
    },
    {
      "epoch": 0.3848,
      "grad_norm": 0.12339530885219574,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2405
    },
    {
      "epoch": 0.38496,
      "grad_norm": 0.10273664444684982,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2406
    },
    {
      "epoch": 0.38512,
      "grad_norm": 0.114778533577919,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2407
    },
    {
      "epoch": 0.38528,
      "grad_norm": 0.21125738322734833,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2408
    },
    {
      "epoch": 0.38544,
      "grad_norm": 0.12626634538173676,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 2409
    },
    {
      "epoch": 0.3856,
      "grad_norm": 0.1431693732738495,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2410
    },
    {
      "epoch": 0.38576,
      "grad_norm": 0.11106137186288834,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2411
    },
    {
      "epoch": 0.38592,
      "grad_norm": 0.12310248613357544,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2412
    },
    {
      "epoch": 0.38608,
      "grad_norm": 0.14784656465053558,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 2413
    },
    {
      "epoch": 0.38624,
      "grad_norm": 0.13158254325389862,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 2414
    },
    {
      "epoch": 0.3864,
      "grad_norm": 0.18271566927433014,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 2415
    },
    {
      "epoch": 0.38656,
      "grad_norm": 0.11230696737766266,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 2416
    },
    {
      "epoch": 0.38672,
      "grad_norm": 0.10791205614805222,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 2417
    },
    {
      "epoch": 0.38688,
      "grad_norm": 0.18142250180244446,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2418
    },
    {
      "epoch": 0.38704,
      "grad_norm": 0.14520597457885742,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 2419
    },
    {
      "epoch": 0.3872,
      "grad_norm": 0.1155993863940239,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2420
    },
    {
      "epoch": 0.38736,
      "grad_norm": 0.24449948966503143,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 2421
    },
    {
      "epoch": 0.38752,
      "grad_norm": 0.1210230365395546,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 2422
    },
    {
      "epoch": 0.38768,
      "grad_norm": 0.15726956725120544,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2423
    },
    {
      "epoch": 0.38784,
      "grad_norm": 0.2827160954475403,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2424
    },
    {
      "epoch": 0.388,
      "grad_norm": 0.1208440512418747,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2425
    },
    {
      "epoch": 0.38816,
      "grad_norm": 0.15955853462219238,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2426
    },
    {
      "epoch": 0.38832,
      "grad_norm": 0.375602126121521,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2427
    },
    {
      "epoch": 0.38848,
      "grad_norm": 0.11913121491670609,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2428
    },
    {
      "epoch": 0.38864,
      "grad_norm": 0.2128930389881134,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 2429
    },
    {
      "epoch": 0.3888,
      "grad_norm": 0.13701850175857544,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 2430
    },
    {
      "epoch": 0.38896,
      "grad_norm": 0.1384068876504898,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2431
    },
    {
      "epoch": 0.38912,
      "grad_norm": 0.15861323475837708,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2432
    },
    {
      "epoch": 0.38928,
      "grad_norm": 0.13422581553459167,
      "learning_rate": 0.0001,
      "loss": 0.3005,
      "step": 2433
    },
    {
      "epoch": 0.38944,
      "grad_norm": 0.14473678171634674,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2434
    },
    {
      "epoch": 0.3896,
      "grad_norm": 0.1295120120048523,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 2435
    },
    {
      "epoch": 0.38976,
      "grad_norm": 0.1338934302330017,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 2436
    },
    {
      "epoch": 0.38992,
      "grad_norm": 0.17626439034938812,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 2437
    },
    {
      "epoch": 0.39008,
      "grad_norm": 0.15484865009784698,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2438
    },
    {
      "epoch": 0.39024,
      "grad_norm": 0.11520970612764359,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2439
    },
    {
      "epoch": 0.3904,
      "grad_norm": 0.13263575732707977,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 2440
    },
    {
      "epoch": 0.39056,
      "grad_norm": 0.137464702129364,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2441
    },
    {
      "epoch": 0.39072,
      "grad_norm": 0.1443982869386673,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2442
    },
    {
      "epoch": 0.39088,
      "grad_norm": 0.10614851862192154,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2443
    },
    {
      "epoch": 0.39104,
      "grad_norm": 0.1264694482088089,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2444
    },
    {
      "epoch": 0.3912,
      "grad_norm": 0.1697823405265808,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2445
    },
    {
      "epoch": 0.39136,
      "grad_norm": 0.13837428390979767,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2446
    },
    {
      "epoch": 0.39152,
      "grad_norm": 0.13882380723953247,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2447
    },
    {
      "epoch": 0.39168,
      "grad_norm": 0.107103131711483,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 2448
    },
    {
      "epoch": 0.39184,
      "grad_norm": 0.14198604226112366,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2449
    },
    {
      "epoch": 0.392,
      "grad_norm": 0.11363136023283005,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2450
    },
    {
      "epoch": 0.39216,
      "grad_norm": 0.15148676931858063,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 2451
    },
    {
      "epoch": 0.39232,
      "grad_norm": 0.11229991167783737,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2452
    },
    {
      "epoch": 0.39248,
      "grad_norm": 0.10492608696222305,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2453
    },
    {
      "epoch": 0.39264,
      "grad_norm": 0.1132364273071289,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2454
    },
    {
      "epoch": 0.3928,
      "grad_norm": 0.18402177095413208,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2455
    },
    {
      "epoch": 0.39296,
      "grad_norm": 0.12926843762397766,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2456
    },
    {
      "epoch": 0.39312,
      "grad_norm": 0.10696440190076828,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2457
    },
    {
      "epoch": 0.39328,
      "grad_norm": 0.10572939366102219,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2458
    },
    {
      "epoch": 0.39344,
      "grad_norm": 0.13228094577789307,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2459
    },
    {
      "epoch": 0.3936,
      "grad_norm": 0.1563679724931717,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 2460
    },
    {
      "epoch": 0.39376,
      "grad_norm": 0.33045047521591187,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2461
    },
    {
      "epoch": 0.39392,
      "grad_norm": 0.13872261345386505,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2462
    },
    {
      "epoch": 0.39408,
      "grad_norm": 0.1776776760816574,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2463
    },
    {
      "epoch": 0.39424,
      "grad_norm": 0.22913576662540436,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 2464
    },
    {
      "epoch": 0.3944,
      "grad_norm": 0.14369148015975952,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 2465
    },
    {
      "epoch": 0.39456,
      "grad_norm": 0.18393272161483765,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 2466
    },
    {
      "epoch": 0.39472,
      "grad_norm": 0.11007234454154968,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 2467
    },
    {
      "epoch": 0.39488,
      "grad_norm": 0.16740818321704865,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2468
    },
    {
      "epoch": 0.39504,
      "grad_norm": 0.14154532551765442,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2469
    },
    {
      "epoch": 0.3952,
      "grad_norm": 0.15028567612171173,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2470
    },
    {
      "epoch": 0.39536,
      "grad_norm": 0.1599677950143814,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 2471
    },
    {
      "epoch": 0.39552,
      "grad_norm": 0.14336203038692474,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2472
    },
    {
      "epoch": 0.39568,
      "grad_norm": 0.1629016101360321,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 2473
    },
    {
      "epoch": 0.39584,
      "grad_norm": 0.11081822216510773,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2474
    },
    {
      "epoch": 0.396,
      "grad_norm": 0.11404497176408768,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 2475
    },
    {
      "epoch": 0.39616,
      "grad_norm": 0.12803597748279572,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2476
    },
    {
      "epoch": 0.39632,
      "grad_norm": 0.16161426901817322,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2477
    },
    {
      "epoch": 0.39648,
      "grad_norm": 0.10562841594219208,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 2478
    },
    {
      "epoch": 0.39664,
      "grad_norm": 0.20216801762580872,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2479
    },
    {
      "epoch": 0.3968,
      "grad_norm": 0.1077875941991806,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 2480
    },
    {
      "epoch": 0.39696,
      "grad_norm": 0.12408171594142914,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2481
    },
    {
      "epoch": 0.39712,
      "grad_norm": 0.15048858523368835,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2482
    },
    {
      "epoch": 0.39728,
      "grad_norm": 0.12107796221971512,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2483
    },
    {
      "epoch": 0.39744,
      "grad_norm": 0.11900009959936142,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2484
    },
    {
      "epoch": 0.3976,
      "grad_norm": 0.12519969046115875,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2485
    },
    {
      "epoch": 0.39776,
      "grad_norm": 0.18143633008003235,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2486
    },
    {
      "epoch": 0.39792,
      "grad_norm": 0.11266526579856873,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 2487
    },
    {
      "epoch": 0.39808,
      "grad_norm": 0.11226256191730499,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2488
    },
    {
      "epoch": 0.39824,
      "grad_norm": 0.09671229869127274,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2489
    },
    {
      "epoch": 0.3984,
      "grad_norm": 0.12894678115844727,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 2490
    },
    {
      "epoch": 0.39856,
      "grad_norm": 0.12000137567520142,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2491
    },
    {
      "epoch": 0.39872,
      "grad_norm": 0.1585276871919632,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 2492
    },
    {
      "epoch": 0.39888,
      "grad_norm": 0.10200025886297226,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 2493
    },
    {
      "epoch": 0.39904,
      "grad_norm": 0.12987764179706573,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2494
    },
    {
      "epoch": 0.3992,
      "grad_norm": 0.12712039053440094,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2495
    },
    {
      "epoch": 0.39936,
      "grad_norm": 0.14120984077453613,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2496
    },
    {
      "epoch": 0.39952,
      "grad_norm": 0.1581098586320877,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2497
    },
    {
      "epoch": 0.39968,
      "grad_norm": 0.12230660766363144,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2498
    },
    {
      "epoch": 0.39984,
      "grad_norm": 0.11815525591373444,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 2499
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.1323769986629486,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 2500
    },
    {
      "epoch": 0.4,
      "eval_train_accuracy": 0.7426,
      "eval_train_loss": 0.32362309098243713,
      "eval_train_runtime": 4.1363,
      "eval_train_samples_per_second": 1208.81,
      "eval_train_steps_per_second": 15.231,
      "step": 2500
    },
    {
      "epoch": 0.4,
      "eval_test_accuracy": 0.747,
      "eval_test_loss": 0.32223784923553467,
      "eval_test_runtime": 4.9698,
      "eval_test_samples_per_second": 1006.077,
      "eval_test_steps_per_second": 12.677,
      "step": 2500
    },
    {
      "epoch": 0.40016,
      "grad_norm": 0.11582349240779877,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 2501
    },
    {
      "epoch": 0.40032,
      "grad_norm": 0.1047043725848198,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2502
    },
    {
      "epoch": 0.40048,
      "grad_norm": 0.1169673353433609,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 2503
    },
    {
      "epoch": 0.40064,
      "grad_norm": 0.10395359247922897,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2504
    },
    {
      "epoch": 0.4008,
      "grad_norm": 0.09677432477474213,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2505
    },
    {
      "epoch": 0.40096,
      "grad_norm": 0.12827709317207336,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 2506
    },
    {
      "epoch": 0.40112,
      "grad_norm": 0.11681554466485977,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 2507
    },
    {
      "epoch": 0.40128,
      "grad_norm": 0.09876133501529694,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2508
    },
    {
      "epoch": 0.40144,
      "grad_norm": 0.10381720960140228,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 2509
    },
    {
      "epoch": 0.4016,
      "grad_norm": 0.10850965976715088,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 2510
    },
    {
      "epoch": 0.40176,
      "grad_norm": 0.12355365604162216,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2511
    },
    {
      "epoch": 0.40192,
      "grad_norm": 0.09478185325860977,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2512
    },
    {
      "epoch": 0.40208,
      "grad_norm": 0.10700369626283646,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2513
    },
    {
      "epoch": 0.40224,
      "grad_norm": 0.10318486392498016,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2514
    },
    {
      "epoch": 0.4024,
      "grad_norm": 0.12385929375886917,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2515
    },
    {
      "epoch": 0.40256,
      "grad_norm": 0.12661373615264893,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 2516
    },
    {
      "epoch": 0.40272,
      "grad_norm": 0.12470251321792603,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 2517
    },
    {
      "epoch": 0.40288,
      "grad_norm": 0.11154281347990036,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2518
    },
    {
      "epoch": 0.40304,
      "grad_norm": 0.128149151802063,
      "learning_rate": 0.0001,
      "loss": 0.3418,
      "step": 2519
    },
    {
      "epoch": 0.4032,
      "grad_norm": 0.1410633772611618,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2520
    },
    {
      "epoch": 0.40336,
      "grad_norm": 0.15172727406024933,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2521
    },
    {
      "epoch": 0.40352,
      "grad_norm": 0.11196920275688171,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 2522
    },
    {
      "epoch": 0.40368,
      "grad_norm": 0.10962597280740738,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2523
    },
    {
      "epoch": 0.40384,
      "grad_norm": 0.11207281798124313,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 2524
    },
    {
      "epoch": 0.404,
      "grad_norm": 0.1168472021818161,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2525
    },
    {
      "epoch": 0.40416,
      "grad_norm": 0.11888640373945236,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 2526
    },
    {
      "epoch": 0.40432,
      "grad_norm": 0.11893723905086517,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2527
    },
    {
      "epoch": 0.40448,
      "grad_norm": 0.10311131924390793,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2528
    },
    {
      "epoch": 0.40464,
      "grad_norm": 0.1056593507528305,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2529
    },
    {
      "epoch": 0.4048,
      "grad_norm": 0.10350129753351212,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2530
    },
    {
      "epoch": 0.40496,
      "grad_norm": 0.10026389360427856,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 2531
    },
    {
      "epoch": 0.40512,
      "grad_norm": 0.10843861848115921,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 2532
    },
    {
      "epoch": 0.40528,
      "grad_norm": 0.12025801837444305,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 2533
    },
    {
      "epoch": 0.40544,
      "grad_norm": 0.15796813368797302,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2534
    },
    {
      "epoch": 0.4056,
      "grad_norm": 0.11282377690076828,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 2535
    },
    {
      "epoch": 0.40576,
      "grad_norm": 0.11310349404811859,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 2536
    },
    {
      "epoch": 0.40592,
      "grad_norm": 0.09769357740879059,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2537
    },
    {
      "epoch": 0.40608,
      "grad_norm": 0.11995689570903778,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2538
    },
    {
      "epoch": 0.40624,
      "grad_norm": 0.11560877412557602,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2539
    },
    {
      "epoch": 0.4064,
      "grad_norm": 0.13342821598052979,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2540
    },
    {
      "epoch": 0.40656,
      "grad_norm": 0.13217845559120178,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2541
    },
    {
      "epoch": 0.40672,
      "grad_norm": 0.14398075640201569,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2542
    },
    {
      "epoch": 0.40688,
      "grad_norm": 0.10590174049139023,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2543
    },
    {
      "epoch": 0.40704,
      "grad_norm": 0.123258076608181,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2544
    },
    {
      "epoch": 0.4072,
      "grad_norm": 0.10174952447414398,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 2545
    },
    {
      "epoch": 0.40736,
      "grad_norm": 0.11299318075180054,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2546
    },
    {
      "epoch": 0.40752,
      "grad_norm": 0.10181573778390884,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2547
    },
    {
      "epoch": 0.40768,
      "grad_norm": 0.14164608716964722,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 2548
    },
    {
      "epoch": 0.40784,
      "grad_norm": 0.1200191080570221,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2549
    },
    {
      "epoch": 0.408,
      "grad_norm": 0.12448751926422119,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 2550
    },
    {
      "epoch": 0.40816,
      "grad_norm": 0.13802847266197205,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2551
    },
    {
      "epoch": 0.40832,
      "grad_norm": 0.09837442636489868,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2552
    },
    {
      "epoch": 0.40848,
      "grad_norm": 0.13275393843650818,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 2553
    },
    {
      "epoch": 0.40864,
      "grad_norm": 0.13009029626846313,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 2554
    },
    {
      "epoch": 0.4088,
      "grad_norm": 0.10984208434820175,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2555
    },
    {
      "epoch": 0.40896,
      "grad_norm": 0.11849188804626465,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2556
    },
    {
      "epoch": 0.40912,
      "grad_norm": 0.09942393004894257,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 2557
    },
    {
      "epoch": 0.40928,
      "grad_norm": 0.10543383657932281,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 2558
    },
    {
      "epoch": 0.40944,
      "grad_norm": 0.11187929660081863,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2559
    },
    {
      "epoch": 0.4096,
      "grad_norm": 0.10974433273077011,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 2560
    },
    {
      "epoch": 0.40976,
      "grad_norm": 0.09915843605995178,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2561
    },
    {
      "epoch": 0.40992,
      "grad_norm": 0.10248491913080215,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2562
    },
    {
      "epoch": 0.41008,
      "grad_norm": 0.11617068946361542,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2563
    },
    {
      "epoch": 0.41024,
      "grad_norm": 0.11822386831045151,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2564
    },
    {
      "epoch": 0.4104,
      "grad_norm": 0.10518089681863785,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2565
    },
    {
      "epoch": 0.41056,
      "grad_norm": 0.10709932446479797,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2566
    },
    {
      "epoch": 0.41072,
      "grad_norm": 0.10899267345666885,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2567
    },
    {
      "epoch": 0.41088,
      "grad_norm": 0.13143199682235718,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2568
    },
    {
      "epoch": 0.41104,
      "grad_norm": 0.12334710359573364,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2569
    },
    {
      "epoch": 0.4112,
      "grad_norm": 0.1307666301727295,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2570
    },
    {
      "epoch": 0.41136,
      "grad_norm": 0.10722175240516663,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 2571
    },
    {
      "epoch": 0.41152,
      "grad_norm": 0.09989871084690094,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2572
    },
    {
      "epoch": 0.41168,
      "grad_norm": 0.10707149654626846,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2573
    },
    {
      "epoch": 0.41184,
      "grad_norm": 0.1240626722574234,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 2574
    },
    {
      "epoch": 0.412,
      "grad_norm": 0.11807060241699219,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 2575
    },
    {
      "epoch": 0.41216,
      "grad_norm": 0.10958989709615707,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 2576
    },
    {
      "epoch": 0.41232,
      "grad_norm": 0.11141838878393173,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2577
    },
    {
      "epoch": 0.41248,
      "grad_norm": 0.1186382919549942,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2578
    },
    {
      "epoch": 0.41264,
      "grad_norm": 0.10202714800834656,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 2579
    },
    {
      "epoch": 0.4128,
      "grad_norm": 0.1198311448097229,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 2580
    },
    {
      "epoch": 0.41296,
      "grad_norm": 0.1327478289604187,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 2581
    },
    {
      "epoch": 0.41312,
      "grad_norm": 0.11077796667814255,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2582
    },
    {
      "epoch": 0.41328,
      "grad_norm": 0.10883497446775436,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 2583
    },
    {
      "epoch": 0.41344,
      "grad_norm": 0.1122002974152565,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 2584
    },
    {
      "epoch": 0.4136,
      "grad_norm": 0.10762489587068558,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2585
    },
    {
      "epoch": 0.41376,
      "grad_norm": 0.11541450768709183,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 2586
    },
    {
      "epoch": 0.41392,
      "grad_norm": 0.13558566570281982,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 2587
    },
    {
      "epoch": 0.41408,
      "grad_norm": 0.10488512367010117,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2588
    },
    {
      "epoch": 0.41424,
      "grad_norm": 0.0998014509677887,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 2589
    },
    {
      "epoch": 0.4144,
      "grad_norm": 0.10250917822122574,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 2590
    },
    {
      "epoch": 0.41456,
      "grad_norm": 0.1073417216539383,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2591
    },
    {
      "epoch": 0.41472,
      "grad_norm": 0.10966341942548752,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2592
    },
    {
      "epoch": 0.41488,
      "grad_norm": 0.10879063606262207,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2593
    },
    {
      "epoch": 0.41504,
      "grad_norm": 0.1037004366517067,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 2594
    },
    {
      "epoch": 0.4152,
      "grad_norm": 0.1071472093462944,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2595
    },
    {
      "epoch": 0.41536,
      "grad_norm": 0.11157556623220444,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 2596
    },
    {
      "epoch": 0.41552,
      "grad_norm": 0.11511557549238205,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2597
    },
    {
      "epoch": 0.41568,
      "grad_norm": 0.0994914174079895,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2598
    },
    {
      "epoch": 0.41584,
      "grad_norm": 0.11640981584787369,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2599
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.1108134537935257,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 2600
    },
    {
      "epoch": 0.416,
      "eval_train_accuracy": 0.8048,
      "eval_train_loss": 0.32273322343826294,
      "eval_train_runtime": 4.2155,
      "eval_train_samples_per_second": 1186.109,
      "eval_train_steps_per_second": 14.945,
      "step": 2600
    },
    {
      "epoch": 0.416,
      "eval_test_accuracy": 0.8104,
      "eval_test_loss": 0.3214097023010254,
      "eval_test_runtime": 5.0488,
      "eval_test_samples_per_second": 990.335,
      "eval_test_steps_per_second": 12.478,
      "step": 2600
    },
    {
      "epoch": 0.41616,
      "grad_norm": 0.10079016536474228,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 2601
    },
    {
      "epoch": 0.41632,
      "grad_norm": 0.11038649082183838,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 2602
    },
    {
      "epoch": 0.41648,
      "grad_norm": 0.10283079743385315,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 2603
    },
    {
      "epoch": 0.41664,
      "grad_norm": 0.11383924633264542,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2604
    },
    {
      "epoch": 0.4168,
      "grad_norm": 0.09498624503612518,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2605
    },
    {
      "epoch": 0.41696,
      "grad_norm": 0.11280341446399689,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 2606
    },
    {
      "epoch": 0.41712,
      "grad_norm": 0.11105598509311676,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 2607
    },
    {
      "epoch": 0.41728,
      "grad_norm": 0.09769042581319809,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2608
    },
    {
      "epoch": 0.41744,
      "grad_norm": 0.1120457798242569,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 2609
    },
    {
      "epoch": 0.4176,
      "grad_norm": 0.1758907586336136,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2610
    },
    {
      "epoch": 0.41776,
      "grad_norm": 0.1029554232954979,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 2611
    },
    {
      "epoch": 0.41792,
      "grad_norm": 0.11353807896375656,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2612
    },
    {
      "epoch": 0.41808,
      "grad_norm": 0.10511662065982819,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2613
    },
    {
      "epoch": 0.41824,
      "grad_norm": 0.1039964035153389,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 2614
    },
    {
      "epoch": 0.4184,
      "grad_norm": 0.11221079528331757,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 2615
    },
    {
      "epoch": 0.41856,
      "grad_norm": 0.09975054115056992,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2616
    },
    {
      "epoch": 0.41872,
      "grad_norm": 0.12223261594772339,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 2617
    },
    {
      "epoch": 0.41888,
      "grad_norm": 0.10857853293418884,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 2618
    },
    {
      "epoch": 0.41904,
      "grad_norm": 0.09743118286132812,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2619
    },
    {
      "epoch": 0.4192,
      "grad_norm": 0.12414038181304932,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 2620
    },
    {
      "epoch": 0.41936,
      "grad_norm": 0.10929007828235626,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2621
    },
    {
      "epoch": 0.41952,
      "grad_norm": 0.10399610549211502,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2622
    },
    {
      "epoch": 0.41968,
      "grad_norm": 0.10776747018098831,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2623
    },
    {
      "epoch": 0.41984,
      "grad_norm": 0.11133759468793869,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2624
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.12367179989814758,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 2625
    },
    {
      "epoch": 0.42016,
      "grad_norm": 0.11739270389080048,
      "learning_rate": 0.0001,
      "loss": 0.3036,
      "step": 2626
    },
    {
      "epoch": 0.42032,
      "grad_norm": 0.10049425810575485,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 2627
    },
    {
      "epoch": 0.42048,
      "grad_norm": 0.1696789711713791,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 2628
    },
    {
      "epoch": 0.42064,
      "grad_norm": 0.11936946213245392,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2629
    },
    {
      "epoch": 0.4208,
      "grad_norm": 0.12160590291023254,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2630
    },
    {
      "epoch": 0.42096,
      "grad_norm": 0.11592204123735428,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 2631
    },
    {
      "epoch": 0.42112,
      "grad_norm": 0.109146349132061,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 2632
    },
    {
      "epoch": 0.42128,
      "grad_norm": 0.12249308079481125,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 2633
    },
    {
      "epoch": 0.42144,
      "grad_norm": 0.11590667814016342,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2634
    },
    {
      "epoch": 0.4216,
      "grad_norm": 0.10119236260652542,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2635
    },
    {
      "epoch": 0.42176,
      "grad_norm": 0.10319478064775467,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2636
    },
    {
      "epoch": 0.42192,
      "grad_norm": 0.11722022294998169,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 2637
    },
    {
      "epoch": 0.42208,
      "grad_norm": 0.10442198067903519,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2638
    },
    {
      "epoch": 0.42224,
      "grad_norm": 0.13043814897537231,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2639
    },
    {
      "epoch": 0.4224,
      "grad_norm": 0.12475743144750595,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 2640
    },
    {
      "epoch": 0.42256,
      "grad_norm": 0.11482406407594681,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2641
    },
    {
      "epoch": 0.42272,
      "grad_norm": 0.09633524715900421,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 2642
    },
    {
      "epoch": 0.42288,
      "grad_norm": 0.10844068229198456,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 2643
    },
    {
      "epoch": 0.42304,
      "grad_norm": 0.11841168999671936,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2644
    },
    {
      "epoch": 0.4232,
      "grad_norm": 0.13243508338928223,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2645
    },
    {
      "epoch": 0.42336,
      "grad_norm": 0.09623164683580399,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 2646
    },
    {
      "epoch": 0.42352,
      "grad_norm": 0.11283896863460541,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 2647
    },
    {
      "epoch": 0.42368,
      "grad_norm": 0.11286674439907074,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 2648
    },
    {
      "epoch": 0.42384,
      "grad_norm": 0.1479852944612503,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2649
    },
    {
      "epoch": 0.424,
      "grad_norm": 0.12400073558092117,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2650
    },
    {
      "epoch": 0.42416,
      "grad_norm": 0.09739860892295837,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2651
    },
    {
      "epoch": 0.42432,
      "grad_norm": 0.11024969816207886,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2652
    },
    {
      "epoch": 0.42448,
      "grad_norm": 0.12792563438415527,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2653
    },
    {
      "epoch": 0.42464,
      "grad_norm": 0.18920566141605377,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 2654
    },
    {
      "epoch": 0.4248,
      "grad_norm": 0.12605956196784973,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 2655
    },
    {
      "epoch": 0.42496,
      "grad_norm": 0.11012927442789078,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2656
    },
    {
      "epoch": 0.42512,
      "grad_norm": 0.14008739590644836,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2657
    },
    {
      "epoch": 0.42528,
      "grad_norm": 0.13308261334896088,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2658
    },
    {
      "epoch": 0.42544,
      "grad_norm": 0.14177289605140686,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2659
    },
    {
      "epoch": 0.4256,
      "grad_norm": 0.13456937670707703,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2660
    },
    {
      "epoch": 0.42576,
      "grad_norm": 0.10706065595149994,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 2661
    },
    {
      "epoch": 0.42592,
      "grad_norm": 0.1154838278889656,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 2662
    },
    {
      "epoch": 0.42608,
      "grad_norm": 0.1153707429766655,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 2663
    },
    {
      "epoch": 0.42624,
      "grad_norm": 0.16798004508018494,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2664
    },
    {
      "epoch": 0.4264,
      "grad_norm": 0.11066203564405441,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2665
    },
    {
      "epoch": 0.42656,
      "grad_norm": 0.09961109608411789,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2666
    },
    {
      "epoch": 0.42672,
      "grad_norm": 0.1394730806350708,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2667
    },
    {
      "epoch": 0.42688,
      "grad_norm": 0.12325872480869293,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 2668
    },
    {
      "epoch": 0.42704,
      "grad_norm": 0.11339433491230011,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2669
    },
    {
      "epoch": 0.4272,
      "grad_norm": 0.12653671205043793,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2670
    },
    {
      "epoch": 0.42736,
      "grad_norm": 0.16330230236053467,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2671
    },
    {
      "epoch": 0.42752,
      "grad_norm": 0.10810599476099014,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2672
    },
    {
      "epoch": 0.42768,
      "grad_norm": 0.10544660687446594,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 2673
    },
    {
      "epoch": 0.42784,
      "grad_norm": 0.10342631489038467,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 2674
    },
    {
      "epoch": 0.428,
      "grad_norm": 0.142362579703331,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2675
    },
    {
      "epoch": 0.42816,
      "grad_norm": 0.11347315460443497,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 2676
    },
    {
      "epoch": 0.42832,
      "grad_norm": 0.11835528165102005,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 2677
    },
    {
      "epoch": 0.42848,
      "grad_norm": 0.10342149436473846,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2678
    },
    {
      "epoch": 0.42864,
      "grad_norm": 0.10388867557048798,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2679
    },
    {
      "epoch": 0.4288,
      "grad_norm": 0.09666138887405396,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2680
    },
    {
      "epoch": 0.42896,
      "grad_norm": 0.09076989442110062,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 2681
    },
    {
      "epoch": 0.42912,
      "grad_norm": 0.13846631348133087,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2682
    },
    {
      "epoch": 0.42928,
      "grad_norm": 0.11086908727884293,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2683
    },
    {
      "epoch": 0.42944,
      "grad_norm": 0.10611280053853989,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2684
    },
    {
      "epoch": 0.4296,
      "grad_norm": 0.11077755689620972,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 2685
    },
    {
      "epoch": 0.42976,
      "grad_norm": 0.10859295725822449,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 2686
    },
    {
      "epoch": 0.42992,
      "grad_norm": 0.17321035265922546,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 2687
    },
    {
      "epoch": 0.43008,
      "grad_norm": 0.10576114803552628,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2688
    },
    {
      "epoch": 0.43024,
      "grad_norm": 0.1351432353258133,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 2689
    },
    {
      "epoch": 0.4304,
      "grad_norm": 0.10087903589010239,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 2690
    },
    {
      "epoch": 0.43056,
      "grad_norm": 0.14708538353443146,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 2691
    },
    {
      "epoch": 0.43072,
      "grad_norm": 0.11669965088367462,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 2692
    },
    {
      "epoch": 0.43088,
      "grad_norm": 0.1256394237279892,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 2693
    },
    {
      "epoch": 0.43104,
      "grad_norm": 0.11226612329483032,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 2694
    },
    {
      "epoch": 0.4312,
      "grad_norm": 0.11114359647035599,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 2695
    },
    {
      "epoch": 0.43136,
      "grad_norm": 0.14999358355998993,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2696
    },
    {
      "epoch": 0.43152,
      "grad_norm": 0.11509864032268524,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2697
    },
    {
      "epoch": 0.43168,
      "grad_norm": 0.09949944913387299,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2698
    },
    {
      "epoch": 0.43184,
      "grad_norm": 0.11768188327550888,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2699
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.11677326261997223,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2700
    },
    {
      "epoch": 0.432,
      "eval_train_accuracy": 0.881,
      "eval_train_loss": 0.32187411189079285,
      "eval_train_runtime": 4.215,
      "eval_train_samples_per_second": 1186.232,
      "eval_train_steps_per_second": 14.947,
      "step": 2700
    },
    {
      "epoch": 0.432,
      "eval_test_accuracy": 0.8872,
      "eval_test_loss": 0.3205346167087555,
      "eval_test_runtime": 5.0431,
      "eval_test_samples_per_second": 991.458,
      "eval_test_steps_per_second": 12.492,
      "step": 2700
    },
    {
      "epoch": 0.43216,
      "grad_norm": 0.12503057718276978,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 2701
    },
    {
      "epoch": 0.43232,
      "grad_norm": 0.11992169171571732,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2702
    },
    {
      "epoch": 0.43248,
      "grad_norm": 0.12547366321086884,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2703
    },
    {
      "epoch": 0.43264,
      "grad_norm": 0.11154327541589737,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 2704
    },
    {
      "epoch": 0.4328,
      "grad_norm": 0.11072258651256561,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 2705
    },
    {
      "epoch": 0.43296,
      "grad_norm": 0.11575625091791153,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2706
    },
    {
      "epoch": 0.43312,
      "grad_norm": 0.11971543729305267,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 2707
    },
    {
      "epoch": 0.43328,
      "grad_norm": 0.11937635391950607,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2708
    },
    {
      "epoch": 0.43344,
      "grad_norm": 0.10485561192035675,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 2709
    },
    {
      "epoch": 0.4336,
      "grad_norm": 0.13022929430007935,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 2710
    },
    {
      "epoch": 0.43376,
      "grad_norm": 0.10747510194778442,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2711
    },
    {
      "epoch": 0.43392,
      "grad_norm": 0.10465464740991592,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2712
    },
    {
      "epoch": 0.43408,
      "grad_norm": 0.10303350538015366,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2713
    },
    {
      "epoch": 0.43424,
      "grad_norm": 0.13231073319911957,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 2714
    },
    {
      "epoch": 0.4344,
      "grad_norm": 0.11448723077774048,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2715
    },
    {
      "epoch": 0.43456,
      "grad_norm": 0.1116509735584259,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2716
    },
    {
      "epoch": 0.43472,
      "grad_norm": 0.09418844431638718,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 2717
    },
    {
      "epoch": 0.43488,
      "grad_norm": 0.1469184160232544,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2718
    },
    {
      "epoch": 0.43504,
      "grad_norm": 0.1050325334072113,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2719
    },
    {
      "epoch": 0.4352,
      "grad_norm": 0.11587122082710266,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2720
    },
    {
      "epoch": 0.43536,
      "grad_norm": 0.18054062128067017,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2721
    },
    {
      "epoch": 0.43552,
      "grad_norm": 0.11672119796276093,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2722
    },
    {
      "epoch": 0.43568,
      "grad_norm": 0.11850754171609879,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2723
    },
    {
      "epoch": 0.43584,
      "grad_norm": 0.1061621829867363,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2724
    },
    {
      "epoch": 0.436,
      "grad_norm": 0.1068657785654068,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 2725
    },
    {
      "epoch": 0.43616,
      "grad_norm": 0.12624827027320862,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2726
    },
    {
      "epoch": 0.43632,
      "grad_norm": 0.1484760344028473,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2727
    },
    {
      "epoch": 0.43648,
      "grad_norm": 0.11132913082838058,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2728
    },
    {
      "epoch": 0.43664,
      "grad_norm": 0.10992412269115448,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2729
    },
    {
      "epoch": 0.4368,
      "grad_norm": 0.10855383425951004,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2730
    },
    {
      "epoch": 0.43696,
      "grad_norm": 0.11259263008832932,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2731
    },
    {
      "epoch": 0.43712,
      "grad_norm": 0.10088556259870529,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 2732
    },
    {
      "epoch": 0.43728,
      "grad_norm": 0.11953538656234741,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2733
    },
    {
      "epoch": 0.43744,
      "grad_norm": 0.10668614506721497,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 2734
    },
    {
      "epoch": 0.4376,
      "grad_norm": 0.10510395467281342,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2735
    },
    {
      "epoch": 0.43776,
      "grad_norm": 0.11280282586812973,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2736
    },
    {
      "epoch": 0.43792,
      "grad_norm": 0.10551659762859344,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2737
    },
    {
      "epoch": 0.43808,
      "grad_norm": 0.11437080055475235,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2738
    },
    {
      "epoch": 0.43824,
      "grad_norm": 0.12453226745128632,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 2739
    },
    {
      "epoch": 0.4384,
      "grad_norm": 0.1045239269733429,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2740
    },
    {
      "epoch": 0.43856,
      "grad_norm": 0.09164854884147644,
      "learning_rate": 0.0001,
      "loss": 0.3025,
      "step": 2741
    },
    {
      "epoch": 0.43872,
      "grad_norm": 0.10154610127210617,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2742
    },
    {
      "epoch": 0.43888,
      "grad_norm": 0.11274431645870209,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 2743
    },
    {
      "epoch": 0.43904,
      "grad_norm": 0.10923925787210464,
      "learning_rate": 0.0001,
      "loss": 0.3061,
      "step": 2744
    },
    {
      "epoch": 0.4392,
      "grad_norm": 0.13003379106521606,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2745
    },
    {
      "epoch": 0.43936,
      "grad_norm": 0.12737968564033508,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2746
    },
    {
      "epoch": 0.43952,
      "grad_norm": 0.11507855355739594,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2747
    },
    {
      "epoch": 0.43968,
      "grad_norm": 0.09419771283864975,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 2748
    },
    {
      "epoch": 0.43984,
      "grad_norm": 0.102553591132164,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2749
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.13615144789218903,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2750
    },
    {
      "epoch": 0.44016,
      "grad_norm": 0.10463440418243408,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2751
    },
    {
      "epoch": 0.44032,
      "grad_norm": 0.10848019272089005,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2752
    },
    {
      "epoch": 0.44048,
      "grad_norm": 0.11228593438863754,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 2753
    },
    {
      "epoch": 0.44064,
      "grad_norm": 0.10062629729509354,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 2754
    },
    {
      "epoch": 0.4408,
      "grad_norm": 0.09978277236223221,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2755
    },
    {
      "epoch": 0.44096,
      "grad_norm": 0.11298370361328125,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 2756
    },
    {
      "epoch": 0.44112,
      "grad_norm": 0.14639142155647278,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2757
    },
    {
      "epoch": 0.44128,
      "grad_norm": 0.1059843897819519,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 2758
    },
    {
      "epoch": 0.44144,
      "grad_norm": 0.11378608644008636,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2759
    },
    {
      "epoch": 0.4416,
      "grad_norm": 0.10544894635677338,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2760
    },
    {
      "epoch": 0.44176,
      "grad_norm": 0.11142978072166443,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 2761
    },
    {
      "epoch": 0.44192,
      "grad_norm": 0.10813171416521072,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 2762
    },
    {
      "epoch": 0.44208,
      "grad_norm": 0.10796234011650085,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 2763
    },
    {
      "epoch": 0.44224,
      "grad_norm": 0.10849925875663757,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 2764
    },
    {
      "epoch": 0.4424,
      "grad_norm": 0.10964875668287277,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2765
    },
    {
      "epoch": 0.44256,
      "grad_norm": 0.10158101469278336,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 2766
    },
    {
      "epoch": 0.44272,
      "grad_norm": 0.10579468309879303,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2767
    },
    {
      "epoch": 0.44288,
      "grad_norm": 0.11318852007389069,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 2768
    },
    {
      "epoch": 0.44304,
      "grad_norm": 0.1384897232055664,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 2769
    },
    {
      "epoch": 0.4432,
      "grad_norm": 0.10298092663288116,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 2770
    },
    {
      "epoch": 0.44336,
      "grad_norm": 0.10431516170501709,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 2771
    },
    {
      "epoch": 0.44352,
      "grad_norm": 0.10094898194074631,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2772
    },
    {
      "epoch": 0.44368,
      "grad_norm": 0.10225864499807358,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2773
    },
    {
      "epoch": 0.44384,
      "grad_norm": 0.12442434579133987,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2774
    },
    {
      "epoch": 0.444,
      "grad_norm": 0.1220717579126358,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2775
    },
    {
      "epoch": 0.44416,
      "grad_norm": 0.09526855498552322,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2776
    },
    {
      "epoch": 0.44432,
      "grad_norm": 0.10685978084802628,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2777
    },
    {
      "epoch": 0.44448,
      "grad_norm": 0.10959985852241516,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2778
    },
    {
      "epoch": 0.44464,
      "grad_norm": 0.10233093798160553,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2779
    },
    {
      "epoch": 0.4448,
      "grad_norm": 0.1478695273399353,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2780
    },
    {
      "epoch": 0.44496,
      "grad_norm": 0.10530313104391098,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2781
    },
    {
      "epoch": 0.44512,
      "grad_norm": 0.10069511830806732,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 2782
    },
    {
      "epoch": 0.44528,
      "grad_norm": 0.12323980778455734,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 2783
    },
    {
      "epoch": 0.44544,
      "grad_norm": 0.11635302752256393,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2784
    },
    {
      "epoch": 0.4456,
      "grad_norm": 0.13341908156871796,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 2785
    },
    {
      "epoch": 0.44576,
      "grad_norm": 0.11033214628696442,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2786
    },
    {
      "epoch": 0.44592,
      "grad_norm": 0.108792245388031,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2787
    },
    {
      "epoch": 0.44608,
      "grad_norm": 0.11446108669042587,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2788
    },
    {
      "epoch": 0.44624,
      "grad_norm": 0.1308017075061798,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2789
    },
    {
      "epoch": 0.4464,
      "grad_norm": 0.11006288975477219,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 2790
    },
    {
      "epoch": 0.44656,
      "grad_norm": 0.10526398569345474,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2791
    },
    {
      "epoch": 0.44672,
      "grad_norm": 0.0884258821606636,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 2792
    },
    {
      "epoch": 0.44688,
      "grad_norm": 0.1134815514087677,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 2793
    },
    {
      "epoch": 0.44704,
      "grad_norm": 0.11135957390069962,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2794
    },
    {
      "epoch": 0.4472,
      "grad_norm": 0.1557762175798416,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2795
    },
    {
      "epoch": 0.44736,
      "grad_norm": 0.09966876357793808,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2796
    },
    {
      "epoch": 0.44752,
      "grad_norm": 0.09527310729026794,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 2797
    },
    {
      "epoch": 0.44768,
      "grad_norm": 0.09265919029712677,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2798
    },
    {
      "epoch": 0.44784,
      "grad_norm": 0.25013428926467896,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 2799
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.11183871328830719,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2800
    },
    {
      "epoch": 0.448,
      "eval_train_accuracy": 0.9124,
      "eval_train_loss": 0.3212004005908966,
      "eval_train_runtime": 4.3675,
      "eval_train_samples_per_second": 1144.833,
      "eval_train_steps_per_second": 14.425,
      "step": 2800
    },
    {
      "epoch": 0.448,
      "eval_test_accuracy": 0.9096,
      "eval_test_loss": 0.31987157464027405,
      "eval_test_runtime": 5.0324,
      "eval_test_samples_per_second": 993.561,
      "eval_test_steps_per_second": 12.519,
      "step": 2800
    },
    {
      "epoch": 0.44816,
      "grad_norm": 0.11560887098312378,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2801
    },
    {
      "epoch": 0.44832,
      "grad_norm": 0.10828565061092377,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 2802
    },
    {
      "epoch": 0.44848,
      "grad_norm": 0.1315097063779831,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 2803
    },
    {
      "epoch": 0.44864,
      "grad_norm": 0.13135644793510437,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2804
    },
    {
      "epoch": 0.4488,
      "grad_norm": 0.1491176187992096,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2805
    },
    {
      "epoch": 0.44896,
      "grad_norm": 0.12149056792259216,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2806
    },
    {
      "epoch": 0.44912,
      "grad_norm": 0.09406686574220657,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2807
    },
    {
      "epoch": 0.44928,
      "grad_norm": 0.11710270494222641,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2808
    },
    {
      "epoch": 0.44944,
      "grad_norm": 0.15745358169078827,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2809
    },
    {
      "epoch": 0.4496,
      "grad_norm": 0.13575588166713715,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2810
    },
    {
      "epoch": 0.44976,
      "grad_norm": 0.12041379511356354,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2811
    },
    {
      "epoch": 0.44992,
      "grad_norm": 0.12129949033260345,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2812
    },
    {
      "epoch": 0.45008,
      "grad_norm": 0.17178182303905487,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2813
    },
    {
      "epoch": 0.45024,
      "grad_norm": 0.11145275086164474,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 2814
    },
    {
      "epoch": 0.4504,
      "grad_norm": 0.11231845617294312,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2815
    },
    {
      "epoch": 0.45056,
      "grad_norm": 0.15798352658748627,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2816
    },
    {
      "epoch": 0.45072,
      "grad_norm": 0.22542443871498108,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2817
    },
    {
      "epoch": 0.45088,
      "grad_norm": 0.10149405151605606,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2818
    },
    {
      "epoch": 0.45104,
      "grad_norm": 0.10566375404596329,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2819
    },
    {
      "epoch": 0.4512,
      "grad_norm": 0.10281475633382797,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 2820
    },
    {
      "epoch": 0.45136,
      "grad_norm": 0.11517565697431564,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2821
    },
    {
      "epoch": 0.45152,
      "grad_norm": 0.17762896418571472,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 2822
    },
    {
      "epoch": 0.45168,
      "grad_norm": 0.17881768941879272,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2823
    },
    {
      "epoch": 0.45184,
      "grad_norm": 0.11345379054546356,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2824
    },
    {
      "epoch": 0.452,
      "grad_norm": 0.11915529519319534,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 2825
    },
    {
      "epoch": 0.45216,
      "grad_norm": 0.23748622834682465,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2826
    },
    {
      "epoch": 0.45232,
      "grad_norm": 0.19015341997146606,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 2827
    },
    {
      "epoch": 0.45248,
      "grad_norm": 0.10308601707220078,
      "learning_rate": 0.0001,
      "loss": 0.3061,
      "step": 2828
    },
    {
      "epoch": 0.45264,
      "grad_norm": 0.10089027881622314,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 2829
    },
    {
      "epoch": 0.4528,
      "grad_norm": 0.19947503507137299,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 2830
    },
    {
      "epoch": 0.45296,
      "grad_norm": 0.12316735833883286,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2831
    },
    {
      "epoch": 0.45312,
      "grad_norm": 0.15705783665180206,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2832
    },
    {
      "epoch": 0.45328,
      "grad_norm": 0.1230349913239479,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2833
    },
    {
      "epoch": 0.45344,
      "grad_norm": 0.1379709243774414,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2834
    },
    {
      "epoch": 0.4536,
      "grad_norm": 0.14858582615852356,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 2835
    },
    {
      "epoch": 0.45376,
      "grad_norm": 0.1328333020210266,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2836
    },
    {
      "epoch": 0.45392,
      "grad_norm": 0.11950816214084625,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2837
    },
    {
      "epoch": 0.45408,
      "grad_norm": 0.13010655343532562,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 2838
    },
    {
      "epoch": 0.45424,
      "grad_norm": 0.12010554224252701,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2839
    },
    {
      "epoch": 0.4544,
      "grad_norm": 0.14396023750305176,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2840
    },
    {
      "epoch": 0.45456,
      "grad_norm": 0.11044161021709442,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2841
    },
    {
      "epoch": 0.45472,
      "grad_norm": 0.13996413350105286,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2842
    },
    {
      "epoch": 0.45488,
      "grad_norm": 0.11897249519824982,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2843
    },
    {
      "epoch": 0.45504,
      "grad_norm": 0.16583913564682007,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 2844
    },
    {
      "epoch": 0.4552,
      "grad_norm": 0.11890600621700287,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2845
    },
    {
      "epoch": 0.45536,
      "grad_norm": 0.11812873184680939,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 2846
    },
    {
      "epoch": 0.45552,
      "grad_norm": 0.14784370362758636,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2847
    },
    {
      "epoch": 0.45568,
      "grad_norm": 0.11866938322782516,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2848
    },
    {
      "epoch": 0.45584,
      "grad_norm": 0.13566865026950836,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2849
    },
    {
      "epoch": 0.456,
      "grad_norm": 0.1466798633337021,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 2850
    },
    {
      "epoch": 0.45616,
      "grad_norm": 0.10115473717451096,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 2851
    },
    {
      "epoch": 0.45632,
      "grad_norm": 0.1190604716539383,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2852
    },
    {
      "epoch": 0.45648,
      "grad_norm": 0.13369296491146088,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 2853
    },
    {
      "epoch": 0.45664,
      "grad_norm": 0.13210180401802063,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 2854
    },
    {
      "epoch": 0.4568,
      "grad_norm": 0.10851164162158966,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2855
    },
    {
      "epoch": 0.45696,
      "grad_norm": 0.1892869770526886,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2856
    },
    {
      "epoch": 0.45712,
      "grad_norm": 0.12856425344944,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2857
    },
    {
      "epoch": 0.45728,
      "grad_norm": 0.13109318912029266,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2858
    },
    {
      "epoch": 0.45744,
      "grad_norm": 0.11452677100896835,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2859
    },
    {
      "epoch": 0.4576,
      "grad_norm": 0.12489821761846542,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2860
    },
    {
      "epoch": 0.45776,
      "grad_norm": 0.19357697665691376,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2861
    },
    {
      "epoch": 0.45792,
      "grad_norm": 0.14871786534786224,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 2862
    },
    {
      "epoch": 0.45808,
      "grad_norm": 0.22345951199531555,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2863
    },
    {
      "epoch": 0.45824,
      "grad_norm": 0.1326616257429123,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 2864
    },
    {
      "epoch": 0.4584,
      "grad_norm": 0.14367099106311798,
      "learning_rate": 0.0001,
      "loss": 0.2962,
      "step": 2865
    },
    {
      "epoch": 0.45856,
      "grad_norm": 0.10423234105110168,
      "learning_rate": 0.0001,
      "loss": 0.3016,
      "step": 2866
    },
    {
      "epoch": 0.45872,
      "grad_norm": 0.11831282824277878,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2867
    },
    {
      "epoch": 0.45888,
      "grad_norm": 0.16578219830989838,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2868
    },
    {
      "epoch": 0.45904,
      "grad_norm": 0.13479255139827728,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2869
    },
    {
      "epoch": 0.4592,
      "grad_norm": 0.14097127318382263,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2870
    },
    {
      "epoch": 0.45936,
      "grad_norm": 0.13921979069709778,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2871
    },
    {
      "epoch": 0.45952,
      "grad_norm": 0.1151004508137703,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2872
    },
    {
      "epoch": 0.45968,
      "grad_norm": 0.10191455483436584,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2873
    },
    {
      "epoch": 0.45984,
      "grad_norm": 0.12015846371650696,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 2874
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.12763148546218872,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2875
    },
    {
      "epoch": 0.46016,
      "grad_norm": 0.12458164244890213,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2876
    },
    {
      "epoch": 0.46032,
      "grad_norm": 0.11870229989290237,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2877
    },
    {
      "epoch": 0.46048,
      "grad_norm": 0.11058848351240158,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2878
    },
    {
      "epoch": 0.46064,
      "grad_norm": 0.11438819766044617,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2879
    },
    {
      "epoch": 0.4608,
      "grad_norm": 0.11092320084571838,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2880
    },
    {
      "epoch": 0.46096,
      "grad_norm": 0.13223479688167572,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2881
    },
    {
      "epoch": 0.46112,
      "grad_norm": 0.10613351315259933,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 2882
    },
    {
      "epoch": 0.46128,
      "grad_norm": 0.11400067806243896,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 2883
    },
    {
      "epoch": 0.46144,
      "grad_norm": 0.10818230360746384,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2884
    },
    {
      "epoch": 0.4616,
      "grad_norm": 0.09571381658315659,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2885
    },
    {
      "epoch": 0.46176,
      "grad_norm": 0.12213289737701416,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2886
    },
    {
      "epoch": 0.46192,
      "grad_norm": 0.10438700020313263,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2887
    },
    {
      "epoch": 0.46208,
      "grad_norm": 0.0901758074760437,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 2888
    },
    {
      "epoch": 0.46224,
      "grad_norm": 0.10498987138271332,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2889
    },
    {
      "epoch": 0.4624,
      "grad_norm": 0.11324870586395264,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2890
    },
    {
      "epoch": 0.46256,
      "grad_norm": 0.12182259559631348,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2891
    },
    {
      "epoch": 0.46272,
      "grad_norm": 0.12041064351797104,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2892
    },
    {
      "epoch": 0.46288,
      "grad_norm": 0.11315647512674332,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2893
    },
    {
      "epoch": 0.46304,
      "grad_norm": 0.11674563586711884,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 2894
    },
    {
      "epoch": 0.4632,
      "grad_norm": 0.10655530542135239,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 2895
    },
    {
      "epoch": 0.46336,
      "grad_norm": 0.12490794062614441,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 2896
    },
    {
      "epoch": 0.46352,
      "grad_norm": 0.10908199101686478,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 2897
    },
    {
      "epoch": 0.46368,
      "grad_norm": 0.10245943814516068,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2898
    },
    {
      "epoch": 0.46384,
      "grad_norm": 0.10951904952526093,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 2899
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.12184298783540726,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 2900
    },
    {
      "epoch": 0.464,
      "eval_train_accuracy": 0.9362,
      "eval_train_loss": 0.3208251893520355,
      "eval_train_runtime": 4.2435,
      "eval_train_samples_per_second": 1178.284,
      "eval_train_steps_per_second": 14.846,
      "step": 2900
    },
    {
      "epoch": 0.464,
      "eval_test_accuracy": 0.9302,
      "eval_test_loss": 0.3195894658565521,
      "eval_test_runtime": 4.7642,
      "eval_test_samples_per_second": 1049.486,
      "eval_test_steps_per_second": 13.224,
      "step": 2900
    },
    {
      "epoch": 0.46416,
      "grad_norm": 0.1200004294514656,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2901
    },
    {
      "epoch": 0.46432,
      "grad_norm": 0.11651317775249481,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2902
    },
    {
      "epoch": 0.46448,
      "grad_norm": 0.13067759573459625,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 2903
    },
    {
      "epoch": 0.46464,
      "grad_norm": 0.11846351623535156,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2904
    },
    {
      "epoch": 0.4648,
      "grad_norm": 0.10482408106327057,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 2905
    },
    {
      "epoch": 0.46496,
      "grad_norm": 0.11855417490005493,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2906
    },
    {
      "epoch": 0.46512,
      "grad_norm": 0.15024356544017792,
      "learning_rate": 0.0001,
      "loss": 0.342,
      "step": 2907
    },
    {
      "epoch": 0.46528,
      "grad_norm": 0.12185999006032944,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 2908
    },
    {
      "epoch": 0.46544,
      "grad_norm": 0.10915792733430862,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2909
    },
    {
      "epoch": 0.4656,
      "grad_norm": 0.09713644534349442,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2910
    },
    {
      "epoch": 0.46576,
      "grad_norm": 0.10860394686460495,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2911
    },
    {
      "epoch": 0.46592,
      "grad_norm": 0.10334663093090057,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2912
    },
    {
      "epoch": 0.46608,
      "grad_norm": 0.10827144980430603,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 2913
    },
    {
      "epoch": 0.46624,
      "grad_norm": 0.11257775872945786,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2914
    },
    {
      "epoch": 0.4664,
      "grad_norm": 0.10108932852745056,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 2915
    },
    {
      "epoch": 0.46656,
      "grad_norm": 0.09935056418180466,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 2916
    },
    {
      "epoch": 0.46672,
      "grad_norm": 0.10048732161521912,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 2917
    },
    {
      "epoch": 0.46688,
      "grad_norm": 0.14433415234088898,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 2918
    },
    {
      "epoch": 0.46704,
      "grad_norm": 0.10534157603979111,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2919
    },
    {
      "epoch": 0.4672,
      "grad_norm": 0.0975884273648262,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2920
    },
    {
      "epoch": 0.46736,
      "grad_norm": 0.10632607340812683,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2921
    },
    {
      "epoch": 0.46752,
      "grad_norm": 0.11849072575569153,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 2922
    },
    {
      "epoch": 0.46768,
      "grad_norm": 0.11667123436927795,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 2923
    },
    {
      "epoch": 0.46784,
      "grad_norm": 0.10373083502054214,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 2924
    },
    {
      "epoch": 0.468,
      "grad_norm": 0.11274516582489014,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2925
    },
    {
      "epoch": 0.46816,
      "grad_norm": 0.10318677127361298,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2926
    },
    {
      "epoch": 0.46832,
      "grad_norm": 0.12418495863676071,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 2927
    },
    {
      "epoch": 0.46848,
      "grad_norm": 0.09974972903728485,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 2928
    },
    {
      "epoch": 0.46864,
      "grad_norm": 0.10541130602359772,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2929
    },
    {
      "epoch": 0.4688,
      "grad_norm": 0.11157545447349548,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2930
    },
    {
      "epoch": 0.46896,
      "grad_norm": 0.11241749674081802,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2931
    },
    {
      "epoch": 0.46912,
      "grad_norm": 0.13195443153381348,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 2932
    },
    {
      "epoch": 0.46928,
      "grad_norm": 0.11295965313911438,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2933
    },
    {
      "epoch": 0.46944,
      "grad_norm": 0.09724254906177521,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2934
    },
    {
      "epoch": 0.4696,
      "grad_norm": 0.0979016050696373,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2935
    },
    {
      "epoch": 0.46976,
      "grad_norm": 0.09966248273849487,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 2936
    },
    {
      "epoch": 0.46992,
      "grad_norm": 0.10782303661108017,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2937
    },
    {
      "epoch": 0.47008,
      "grad_norm": 0.12338323146104813,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2938
    },
    {
      "epoch": 0.47024,
      "grad_norm": 0.08769723773002625,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 2939
    },
    {
      "epoch": 0.4704,
      "grad_norm": 0.10540192574262619,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 2940
    },
    {
      "epoch": 0.47056,
      "grad_norm": 0.10227848589420319,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 2941
    },
    {
      "epoch": 0.47072,
      "grad_norm": 0.18551568686962128,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2942
    },
    {
      "epoch": 0.47088,
      "grad_norm": 0.10197846591472626,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 2943
    },
    {
      "epoch": 0.47104,
      "grad_norm": 0.09827844798564911,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2944
    },
    {
      "epoch": 0.4712,
      "grad_norm": 0.12691955268383026,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2945
    },
    {
      "epoch": 0.47136,
      "grad_norm": 0.09973446279764175,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 2946
    },
    {
      "epoch": 0.47152,
      "grad_norm": 0.15656441450119019,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 2947
    },
    {
      "epoch": 0.47168,
      "grad_norm": 0.1055326834321022,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2948
    },
    {
      "epoch": 0.47184,
      "grad_norm": 0.13985878229141235,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 2949
    },
    {
      "epoch": 0.472,
      "grad_norm": 0.14132088422775269,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2950
    },
    {
      "epoch": 0.47216,
      "grad_norm": 0.12920264899730682,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2951
    },
    {
      "epoch": 0.47232,
      "grad_norm": 0.19159512221813202,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2952
    },
    {
      "epoch": 0.47248,
      "grad_norm": 0.1041935533285141,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 2953
    },
    {
      "epoch": 0.47264,
      "grad_norm": 0.10387575626373291,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2954
    },
    {
      "epoch": 0.4728,
      "grad_norm": 0.09913671761751175,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 2955
    },
    {
      "epoch": 0.47296,
      "grad_norm": 0.11800333857536316,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2956
    },
    {
      "epoch": 0.47312,
      "grad_norm": 0.13643649220466614,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 2957
    },
    {
      "epoch": 0.47328,
      "grad_norm": 0.12566791474819183,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2958
    },
    {
      "epoch": 0.47344,
      "grad_norm": 0.11449321359395981,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 2959
    },
    {
      "epoch": 0.4736,
      "grad_norm": 0.09635094553232193,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 2960
    },
    {
      "epoch": 0.47376,
      "grad_norm": 0.11292620003223419,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 2961
    },
    {
      "epoch": 0.47392,
      "grad_norm": 0.10812141746282578,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2962
    },
    {
      "epoch": 0.47408,
      "grad_norm": 0.09635835140943527,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2963
    },
    {
      "epoch": 0.47424,
      "grad_norm": 0.10532999783754349,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 2964
    },
    {
      "epoch": 0.4744,
      "grad_norm": 0.10350952297449112,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 2965
    },
    {
      "epoch": 0.47456,
      "grad_norm": 0.11635168641805649,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 2966
    },
    {
      "epoch": 0.47472,
      "grad_norm": 0.18128280341625214,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2967
    },
    {
      "epoch": 0.47488,
      "grad_norm": 0.11820186674594879,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2968
    },
    {
      "epoch": 0.47504,
      "grad_norm": 0.10214298963546753,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2969
    },
    {
      "epoch": 0.4752,
      "grad_norm": 0.11070625483989716,
      "learning_rate": 0.0001,
      "loss": 0.3043,
      "step": 2970
    },
    {
      "epoch": 0.47536,
      "grad_norm": 0.10733676701784134,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2971
    },
    {
      "epoch": 0.47552,
      "grad_norm": 0.12006985396146774,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2972
    },
    {
      "epoch": 0.47568,
      "grad_norm": 0.21214739978313446,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2973
    },
    {
      "epoch": 0.47584,
      "grad_norm": 0.09352414309978485,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2974
    },
    {
      "epoch": 0.476,
      "grad_norm": 0.09644054621458054,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 2975
    },
    {
      "epoch": 0.47616,
      "grad_norm": 0.13919882476329803,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2976
    },
    {
      "epoch": 0.47632,
      "grad_norm": 0.19709736108779907,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2977
    },
    {
      "epoch": 0.47648,
      "grad_norm": 0.1280502825975418,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 2978
    },
    {
      "epoch": 0.47664,
      "grad_norm": 0.12230231612920761,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2979
    },
    {
      "epoch": 0.4768,
      "grad_norm": 0.1992444396018982,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2980
    },
    {
      "epoch": 0.47696,
      "grad_norm": 0.12241511046886444,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 2981
    },
    {
      "epoch": 0.47712,
      "grad_norm": 0.11896306276321411,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2982
    },
    {
      "epoch": 0.47728,
      "grad_norm": 0.15804490447044373,
      "learning_rate": 0.0001,
      "loss": 0.3023,
      "step": 2983
    },
    {
      "epoch": 0.47744,
      "grad_norm": 0.11824562400579453,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 2984
    },
    {
      "epoch": 0.4776,
      "grad_norm": 0.1311667561531067,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 2985
    },
    {
      "epoch": 0.47776,
      "grad_norm": 0.11747883260250092,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2986
    },
    {
      "epoch": 0.47792,
      "grad_norm": 0.14360080659389496,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 2987
    },
    {
      "epoch": 0.47808,
      "grad_norm": 0.1055406779050827,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2988
    },
    {
      "epoch": 0.47824,
      "grad_norm": 0.14004629850387573,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2989
    },
    {
      "epoch": 0.4784,
      "grad_norm": 0.1767047941684723,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2990
    },
    {
      "epoch": 0.47856,
      "grad_norm": 0.128445103764534,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2991
    },
    {
      "epoch": 0.47872,
      "grad_norm": 0.12566335499286652,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2992
    },
    {
      "epoch": 0.47888,
      "grad_norm": 0.11033318936824799,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2993
    },
    {
      "epoch": 0.47904,
      "grad_norm": 0.26415911316871643,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2994
    },
    {
      "epoch": 0.4792,
      "grad_norm": 0.10977185517549515,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2995
    },
    {
      "epoch": 0.47936,
      "grad_norm": 0.15077649056911469,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2996
    },
    {
      "epoch": 0.47952,
      "grad_norm": 0.23384800553321838,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 2997
    },
    {
      "epoch": 0.47968,
      "grad_norm": 0.14364643394947052,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 2998
    },
    {
      "epoch": 0.47984,
      "grad_norm": 0.12014313042163849,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2999
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.1625840812921524,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3000
    },
    {
      "epoch": 0.48,
      "eval_train_accuracy": 0.977,
      "eval_train_loss": 0.32072675228118896,
      "eval_train_runtime": 4.2117,
      "eval_train_samples_per_second": 1187.177,
      "eval_train_steps_per_second": 14.958,
      "step": 3000
    },
    {
      "epoch": 0.48,
      "eval_test_accuracy": 0.9768,
      "eval_test_loss": 0.3192235231399536,
      "eval_test_runtime": 4.8835,
      "eval_test_samples_per_second": 1023.853,
      "eval_test_steps_per_second": 12.901,
      "step": 3000
    },
    {
      "epoch": 0.48016,
      "grad_norm": 0.16265486180782318,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3001
    },
    {
      "epoch": 0.48032,
      "grad_norm": 0.1328650563955307,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3002
    },
    {
      "epoch": 0.48048,
      "grad_norm": 0.214478999376297,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 3003
    },
    {
      "epoch": 0.48064,
      "grad_norm": 0.18075333535671234,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 3004
    },
    {
      "epoch": 0.4808,
      "grad_norm": 0.25934264063835144,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3005
    },
    {
      "epoch": 0.48096,
      "grad_norm": 0.12051306664943695,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3006
    },
    {
      "epoch": 0.48112,
      "grad_norm": 0.10559987276792526,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3007
    },
    {
      "epoch": 0.48128,
      "grad_norm": 0.20866549015045166,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3008
    },
    {
      "epoch": 0.48144,
      "grad_norm": 0.1078842356801033,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3009
    },
    {
      "epoch": 0.4816,
      "grad_norm": 0.14176960289478302,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 3010
    },
    {
      "epoch": 0.48176,
      "grad_norm": 0.15901890397071838,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3011
    },
    {
      "epoch": 0.48192,
      "grad_norm": 0.2527318596839905,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3012
    },
    {
      "epoch": 0.48208,
      "grad_norm": 0.11308573931455612,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3013
    },
    {
      "epoch": 0.48224,
      "grad_norm": 0.12571462988853455,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3014
    },
    {
      "epoch": 0.4824,
      "grad_norm": 0.11038676649332047,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3015
    },
    {
      "epoch": 0.48256,
      "grad_norm": 0.13459065556526184,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3016
    },
    {
      "epoch": 0.48272,
      "grad_norm": 0.1785513162612915,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3017
    },
    {
      "epoch": 0.48288,
      "grad_norm": 0.17175708711147308,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3018
    },
    {
      "epoch": 0.48304,
      "grad_norm": 0.11943862587213516,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 3019
    },
    {
      "epoch": 0.4832,
      "grad_norm": 0.10475588589906693,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 3020
    },
    {
      "epoch": 0.48336,
      "grad_norm": 0.15376317501068115,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 3021
    },
    {
      "epoch": 0.48352,
      "grad_norm": 0.13453024625778198,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3022
    },
    {
      "epoch": 0.48368,
      "grad_norm": 0.14512962102890015,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3023
    },
    {
      "epoch": 0.48384,
      "grad_norm": 0.14395664632320404,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3024
    },
    {
      "epoch": 0.484,
      "grad_norm": 0.12163326889276505,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 3025
    },
    {
      "epoch": 0.48416,
      "grad_norm": 0.12336718291044235,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 3026
    },
    {
      "epoch": 0.48432,
      "grad_norm": 0.12350764870643616,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3027
    },
    {
      "epoch": 0.48448,
      "grad_norm": 0.11760476976633072,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3028
    },
    {
      "epoch": 0.48464,
      "grad_norm": 0.12892146408557892,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3029
    },
    {
      "epoch": 0.4848,
      "grad_norm": 0.11013440787792206,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3030
    },
    {
      "epoch": 0.48496,
      "grad_norm": 0.09906651824712753,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3031
    },
    {
      "epoch": 0.48512,
      "grad_norm": 0.1060870885848999,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3032
    },
    {
      "epoch": 0.48528,
      "grad_norm": 0.1420067548751831,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 3033
    },
    {
      "epoch": 0.48544,
      "grad_norm": 0.15167021751403809,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3034
    },
    {
      "epoch": 0.4856,
      "grad_norm": 0.11554165929555893,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3035
    },
    {
      "epoch": 0.48576,
      "grad_norm": 0.10284151881933212,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3036
    },
    {
      "epoch": 0.48592,
      "grad_norm": 0.12360209226608276,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3037
    },
    {
      "epoch": 0.48608,
      "grad_norm": 0.11863870173692703,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 3038
    },
    {
      "epoch": 0.48624,
      "grad_norm": 0.13880261778831482,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 3039
    },
    {
      "epoch": 0.4864,
      "grad_norm": 0.0913928672671318,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 3040
    },
    {
      "epoch": 0.48656,
      "grad_norm": 0.15241549909114838,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 3041
    },
    {
      "epoch": 0.48672,
      "grad_norm": 0.11324365437030792,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 3042
    },
    {
      "epoch": 0.48688,
      "grad_norm": 0.09701263904571533,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3043
    },
    {
      "epoch": 0.48704,
      "grad_norm": 0.12941627204418182,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3044
    },
    {
      "epoch": 0.4872,
      "grad_norm": 0.10609030723571777,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3045
    },
    {
      "epoch": 0.48736,
      "grad_norm": 0.10855976492166519,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 3046
    },
    {
      "epoch": 0.48752,
      "grad_norm": 0.11714297533035278,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 3047
    },
    {
      "epoch": 0.48768,
      "grad_norm": 0.124489925801754,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3048
    },
    {
      "epoch": 0.48784,
      "grad_norm": 0.1539585441350937,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3049
    },
    {
      "epoch": 0.488,
      "grad_norm": 0.13677407801151276,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3050
    },
    {
      "epoch": 0.48816,
      "grad_norm": 0.12457139045000076,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3051
    },
    {
      "epoch": 0.48832,
      "grad_norm": 0.12920239567756653,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 3052
    },
    {
      "epoch": 0.48848,
      "grad_norm": 0.10789794474840164,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3053
    },
    {
      "epoch": 0.48864,
      "grad_norm": 0.10339813679456711,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3054
    },
    {
      "epoch": 0.4888,
      "grad_norm": 0.16729888319969177,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 3055
    },
    {
      "epoch": 0.48896,
      "grad_norm": 0.11377531290054321,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 3056
    },
    {
      "epoch": 0.48912,
      "grad_norm": 0.10287119448184967,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3057
    },
    {
      "epoch": 0.48928,
      "grad_norm": 0.12132026255130768,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3058
    },
    {
      "epoch": 0.48944,
      "grad_norm": 0.23306146264076233,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3059
    },
    {
      "epoch": 0.4896,
      "grad_norm": 0.11938658356666565,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 3060
    },
    {
      "epoch": 0.48976,
      "grad_norm": 0.10896977037191391,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 3061
    },
    {
      "epoch": 0.48992,
      "grad_norm": 0.1270660161972046,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3062
    },
    {
      "epoch": 0.49008,
      "grad_norm": 0.18828222155570984,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 3063
    },
    {
      "epoch": 0.49024,
      "grad_norm": 0.10698802769184113,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3064
    },
    {
      "epoch": 0.4904,
      "grad_norm": 0.10451580584049225,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3065
    },
    {
      "epoch": 0.49056,
      "grad_norm": 0.1243361309170723,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3066
    },
    {
      "epoch": 0.49072,
      "grad_norm": 0.12573136389255524,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3067
    },
    {
      "epoch": 0.49088,
      "grad_norm": 0.10360056161880493,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 3068
    },
    {
      "epoch": 0.49104,
      "grad_norm": 0.1290171593427658,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3069
    },
    {
      "epoch": 0.4912,
      "grad_norm": 0.12315646559000015,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3070
    },
    {
      "epoch": 0.49136,
      "grad_norm": 0.10998234897851944,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3071
    },
    {
      "epoch": 0.49152,
      "grad_norm": 0.11085914820432663,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3072
    },
    {
      "epoch": 0.49168,
      "grad_norm": 0.15678593516349792,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3073
    },
    {
      "epoch": 0.49184,
      "grad_norm": 0.11002585291862488,
      "learning_rate": 0.0001,
      "loss": 0.3011,
      "step": 3074
    },
    {
      "epoch": 0.492,
      "grad_norm": 0.19032439589500427,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3075
    },
    {
      "epoch": 0.49216,
      "grad_norm": 0.10263759642839432,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3076
    },
    {
      "epoch": 0.49232,
      "grad_norm": 0.11495035886764526,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3077
    },
    {
      "epoch": 0.49248,
      "grad_norm": 0.130154088139534,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3078
    },
    {
      "epoch": 0.49264,
      "grad_norm": 0.16826260089874268,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3079
    },
    {
      "epoch": 0.4928,
      "grad_norm": 0.12941502034664154,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3080
    },
    {
      "epoch": 0.49296,
      "grad_norm": 0.09885506331920624,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 3081
    },
    {
      "epoch": 0.49312,
      "grad_norm": 0.11502300202846527,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3082
    },
    {
      "epoch": 0.49328,
      "grad_norm": 0.1411963403224945,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 3083
    },
    {
      "epoch": 0.49344,
      "grad_norm": 0.23219303786754608,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3084
    },
    {
      "epoch": 0.4936,
      "grad_norm": 0.10378767549991608,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 3085
    },
    {
      "epoch": 0.49376,
      "grad_norm": 0.151272252202034,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3086
    },
    {
      "epoch": 0.49392,
      "grad_norm": 0.10820832848548889,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3087
    },
    {
      "epoch": 0.49408,
      "grad_norm": 0.15523913502693176,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3088
    },
    {
      "epoch": 0.49424,
      "grad_norm": 0.23925934731960297,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3089
    },
    {
      "epoch": 0.4944,
      "grad_norm": 0.12530405819416046,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 3090
    },
    {
      "epoch": 0.49456,
      "grad_norm": 0.15163278579711914,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3091
    },
    {
      "epoch": 0.49472,
      "grad_norm": 0.11106797307729721,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 3092
    },
    {
      "epoch": 0.49488,
      "grad_norm": 0.11011262238025665,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 3093
    },
    {
      "epoch": 0.49504,
      "grad_norm": 0.10153382271528244,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3094
    },
    {
      "epoch": 0.4952,
      "grad_norm": 0.15930253267288208,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3095
    },
    {
      "epoch": 0.49536,
      "grad_norm": 0.5516898036003113,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3096
    },
    {
      "epoch": 0.49552,
      "grad_norm": 0.10612662136554718,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3097
    },
    {
      "epoch": 0.49568,
      "grad_norm": 0.21645697951316833,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 3098
    },
    {
      "epoch": 0.49584,
      "grad_norm": 0.14310668408870697,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3099
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.1264762282371521,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3100
    },
    {
      "epoch": 0.496,
      "eval_train_accuracy": 0.9808,
      "eval_train_loss": 0.32107824087142944,
      "eval_train_runtime": 4.1345,
      "eval_train_samples_per_second": 1209.349,
      "eval_train_steps_per_second": 15.238,
      "step": 3100
    },
    {
      "epoch": 0.496,
      "eval_test_accuracy": 0.977,
      "eval_test_loss": 0.31971773505210876,
      "eval_test_runtime": 4.9454,
      "eval_test_samples_per_second": 1011.043,
      "eval_test_steps_per_second": 12.739,
      "step": 3100
    },
    {
      "epoch": 0.49616,
      "grad_norm": 0.16738545894622803,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 3101
    },
    {
      "epoch": 0.49632,
      "grad_norm": 0.13062332570552826,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 3102
    },
    {
      "epoch": 0.49648,
      "grad_norm": 0.12617430090904236,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3103
    },
    {
      "epoch": 0.49664,
      "grad_norm": 0.14746573567390442,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3104
    },
    {
      "epoch": 0.4968,
      "grad_norm": 0.1545274555683136,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3105
    },
    {
      "epoch": 0.49696,
      "grad_norm": 0.1920778602361679,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3106
    },
    {
      "epoch": 0.49712,
      "grad_norm": 0.15937983989715576,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 3107
    },
    {
      "epoch": 0.49728,
      "grad_norm": 0.12902116775512695,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 3108
    },
    {
      "epoch": 0.49744,
      "grad_norm": 0.1062052994966507,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3109
    },
    {
      "epoch": 0.4976,
      "grad_norm": 0.18676738440990448,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3110
    },
    {
      "epoch": 0.49776,
      "grad_norm": 0.10225480794906616,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 3111
    },
    {
      "epoch": 0.49792,
      "grad_norm": 0.09516838937997818,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3112
    },
    {
      "epoch": 0.49808,
      "grad_norm": 0.11509101837873459,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3113
    },
    {
      "epoch": 0.49824,
      "grad_norm": 0.12528672814369202,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3114
    },
    {
      "epoch": 0.4984,
      "grad_norm": 0.11434523016214371,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3115
    },
    {
      "epoch": 0.49856,
      "grad_norm": 0.19536848366260529,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 3116
    },
    {
      "epoch": 0.49872,
      "grad_norm": 0.1093999519944191,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3117
    },
    {
      "epoch": 0.49888,
      "grad_norm": 0.13988958299160004,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3118
    },
    {
      "epoch": 0.49904,
      "grad_norm": 0.11928746849298477,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3119
    },
    {
      "epoch": 0.4992,
      "grad_norm": 0.12390244007110596,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3120
    },
    {
      "epoch": 0.49936,
      "grad_norm": 0.23672913014888763,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 3121
    },
    {
      "epoch": 0.49952,
      "grad_norm": 0.11646459251642227,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 3122
    },
    {
      "epoch": 0.49968,
      "grad_norm": 0.21687562763690948,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 3123
    },
    {
      "epoch": 0.49984,
      "grad_norm": 0.17983700335025787,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3124
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.12033899128437042,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3125
    },
    {
      "epoch": 0.50016,
      "grad_norm": 0.19035544991493225,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3126
    },
    {
      "epoch": 0.50032,
      "grad_norm": 0.12198127806186676,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 3127
    },
    {
      "epoch": 0.50048,
      "grad_norm": 0.204365536570549,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3128
    },
    {
      "epoch": 0.50064,
      "grad_norm": 0.16039501130580902,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 3129
    },
    {
      "epoch": 0.5008,
      "grad_norm": 0.1243327260017395,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3130
    },
    {
      "epoch": 0.50096,
      "grad_norm": 0.17896521091461182,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3131
    },
    {
      "epoch": 0.50112,
      "grad_norm": 0.1641237735748291,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3132
    },
    {
      "epoch": 0.50128,
      "grad_norm": 0.18181656301021576,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3133
    },
    {
      "epoch": 0.50144,
      "grad_norm": 0.1446913480758667,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3134
    },
    {
      "epoch": 0.5016,
      "grad_norm": 0.12636609375476837,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 3135
    },
    {
      "epoch": 0.50176,
      "grad_norm": 0.1494416892528534,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3136
    },
    {
      "epoch": 0.50192,
      "grad_norm": 0.39898353815078735,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3137
    },
    {
      "epoch": 0.50208,
      "grad_norm": 0.11624374240636826,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3138
    },
    {
      "epoch": 0.50224,
      "grad_norm": 0.19550538063049316,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3139
    },
    {
      "epoch": 0.5024,
      "grad_norm": 0.23682458698749542,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3140
    },
    {
      "epoch": 0.50256,
      "grad_norm": 0.10952247679233551,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3141
    },
    {
      "epoch": 0.50272,
      "grad_norm": 0.18545961380004883,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3142
    },
    {
      "epoch": 0.50288,
      "grad_norm": 0.2261037975549698,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3143
    },
    {
      "epoch": 0.50304,
      "grad_norm": 0.120768703520298,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3144
    },
    {
      "epoch": 0.5032,
      "grad_norm": 0.1749250888824463,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 3145
    },
    {
      "epoch": 0.50336,
      "grad_norm": 0.1104358434677124,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 3146
    },
    {
      "epoch": 0.50352,
      "grad_norm": 0.13673707842826843,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3147
    },
    {
      "epoch": 0.50368,
      "grad_norm": 0.10635993629693985,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3148
    },
    {
      "epoch": 0.50384,
      "grad_norm": 0.14717890322208405,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3149
    },
    {
      "epoch": 0.504,
      "grad_norm": 0.15240342915058136,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3150
    },
    {
      "epoch": 0.50416,
      "grad_norm": 0.20171120762825012,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 3151
    },
    {
      "epoch": 0.50432,
      "grad_norm": 0.11708015203475952,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3152
    },
    {
      "epoch": 0.50448,
      "grad_norm": 0.15492375195026398,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3153
    },
    {
      "epoch": 0.50464,
      "grad_norm": 0.20246784389019012,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3154
    },
    {
      "epoch": 0.5048,
      "grad_norm": 0.24222418665885925,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3155
    },
    {
      "epoch": 0.50496,
      "grad_norm": 0.1543584018945694,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3156
    },
    {
      "epoch": 0.50512,
      "grad_norm": 0.1208951473236084,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3157
    },
    {
      "epoch": 0.50528,
      "grad_norm": 0.1765441745519638,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3158
    },
    {
      "epoch": 0.50544,
      "grad_norm": 0.12612733244895935,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 3159
    },
    {
      "epoch": 0.5056,
      "grad_norm": 0.12735925614833832,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3160
    },
    {
      "epoch": 0.50576,
      "grad_norm": 0.15131857991218567,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3161
    },
    {
      "epoch": 0.50592,
      "grad_norm": 0.16434751451015472,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 3162
    },
    {
      "epoch": 0.50608,
      "grad_norm": 0.17614726722240448,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3163
    },
    {
      "epoch": 0.50624,
      "grad_norm": 0.12050691992044449,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 3164
    },
    {
      "epoch": 0.5064,
      "grad_norm": 0.14182816445827484,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3165
    },
    {
      "epoch": 0.50656,
      "grad_norm": 0.11822140961885452,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3166
    },
    {
      "epoch": 0.50672,
      "grad_norm": 0.16001175343990326,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 3167
    },
    {
      "epoch": 0.50688,
      "grad_norm": 0.14459624886512756,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3168
    },
    {
      "epoch": 0.50704,
      "grad_norm": 0.15643735229969025,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 3169
    },
    {
      "epoch": 0.5072,
      "grad_norm": 0.1143341064453125,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3170
    },
    {
      "epoch": 0.50736,
      "grad_norm": 0.14269624650478363,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3171
    },
    {
      "epoch": 0.50752,
      "grad_norm": 0.11054796725511551,
      "learning_rate": 0.0001,
      "loss": 0.3031,
      "step": 3172
    },
    {
      "epoch": 0.50768,
      "grad_norm": 0.1239457055926323,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3173
    },
    {
      "epoch": 0.50784,
      "grad_norm": 0.12369084358215332,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3174
    },
    {
      "epoch": 0.508,
      "grad_norm": 0.140681654214859,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3175
    },
    {
      "epoch": 0.50816,
      "grad_norm": 0.1802617758512497,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3176
    },
    {
      "epoch": 0.50832,
      "grad_norm": 0.11322241276502609,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3177
    },
    {
      "epoch": 0.50848,
      "grad_norm": 0.18996980786323547,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3178
    },
    {
      "epoch": 0.50864,
      "grad_norm": 0.10849584639072418,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3179
    },
    {
      "epoch": 0.5088,
      "grad_norm": 0.11719806492328644,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3180
    },
    {
      "epoch": 0.50896,
      "grad_norm": 0.18752862513065338,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3181
    },
    {
      "epoch": 0.50912,
      "grad_norm": 0.13862019777297974,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3182
    },
    {
      "epoch": 0.50928,
      "grad_norm": 0.12720711529254913,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3183
    },
    {
      "epoch": 0.50944,
      "grad_norm": 0.3129781484603882,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3184
    },
    {
      "epoch": 0.5096,
      "grad_norm": 0.16875657439231873,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 3185
    },
    {
      "epoch": 0.50976,
      "grad_norm": 0.7359816431999207,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3186
    },
    {
      "epoch": 0.50992,
      "grad_norm": 0.11843600869178772,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3187
    },
    {
      "epoch": 0.51008,
      "grad_norm": 0.19224120676517487,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3188
    },
    {
      "epoch": 0.51024,
      "grad_norm": 0.17173892259597778,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3189
    },
    {
      "epoch": 0.5104,
      "grad_norm": 0.1860424280166626,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 3190
    },
    {
      "epoch": 0.51056,
      "grad_norm": 0.14276406168937683,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3191
    },
    {
      "epoch": 0.51072,
      "grad_norm": 0.2176530361175537,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 3192
    },
    {
      "epoch": 0.51088,
      "grad_norm": 0.1950269490480423,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3193
    },
    {
      "epoch": 0.51104,
      "grad_norm": 0.1224505603313446,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3194
    },
    {
      "epoch": 0.5112,
      "grad_norm": 0.15092012286186218,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 3195
    },
    {
      "epoch": 0.51136,
      "grad_norm": 0.14118967950344086,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3196
    },
    {
      "epoch": 0.51152,
      "grad_norm": 0.210279643535614,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3197
    },
    {
      "epoch": 0.51168,
      "grad_norm": 0.14732466638088226,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3198
    },
    {
      "epoch": 0.51184,
      "grad_norm": 0.2093518227338791,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3199
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.13584372401237488,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3200
    },
    {
      "epoch": 0.512,
      "eval_train_accuracy": 0.983,
      "eval_train_loss": 0.3207608461380005,
      "eval_train_runtime": 4.1918,
      "eval_train_samples_per_second": 1192.798,
      "eval_train_steps_per_second": 15.029,
      "step": 3200
    },
    {
      "epoch": 0.512,
      "eval_test_accuracy": 0.9834,
      "eval_test_loss": 0.3197557330131531,
      "eval_test_runtime": 4.8243,
      "eval_test_samples_per_second": 1036.425,
      "eval_test_steps_per_second": 13.059,
      "step": 3200
    },
    {
      "epoch": 0.51216,
      "grad_norm": 0.29580703377723694,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 3201
    },
    {
      "epoch": 0.51232,
      "grad_norm": 0.13954441249370575,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3202
    },
    {
      "epoch": 0.51248,
      "grad_norm": 0.11782759428024292,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3203
    },
    {
      "epoch": 0.51264,
      "grad_norm": 0.33385515213012695,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 3204
    },
    {
      "epoch": 0.5128,
      "grad_norm": 0.10570886731147766,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3205
    },
    {
      "epoch": 0.51296,
      "grad_norm": 0.22513574361801147,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 3206
    },
    {
      "epoch": 0.51312,
      "grad_norm": 0.1653578281402588,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3207
    },
    {
      "epoch": 0.51328,
      "grad_norm": 0.16542471945285797,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3208
    },
    {
      "epoch": 0.51344,
      "grad_norm": 0.11573108285665512,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 3209
    },
    {
      "epoch": 0.5136,
      "grad_norm": 0.1383720189332962,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3210
    },
    {
      "epoch": 0.51376,
      "grad_norm": 0.14090366661548615,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3211
    },
    {
      "epoch": 0.51392,
      "grad_norm": 0.17063474655151367,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3212
    },
    {
      "epoch": 0.51408,
      "grad_norm": 0.13354209065437317,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3213
    },
    {
      "epoch": 0.51424,
      "grad_norm": 0.18785783648490906,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3214
    },
    {
      "epoch": 0.5144,
      "grad_norm": 0.21125975251197815,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3215
    },
    {
      "epoch": 0.51456,
      "grad_norm": 0.12677507102489471,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3216
    },
    {
      "epoch": 0.51472,
      "grad_norm": 0.12910377979278564,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3217
    },
    {
      "epoch": 0.51488,
      "grad_norm": 0.12556327879428864,
      "learning_rate": 0.0001,
      "loss": 0.3017,
      "step": 3218
    },
    {
      "epoch": 0.51504,
      "grad_norm": 0.17237700521945953,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3219
    },
    {
      "epoch": 0.5152,
      "grad_norm": 0.15886980295181274,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3220
    },
    {
      "epoch": 0.51536,
      "grad_norm": 0.27418482303619385,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 3221
    },
    {
      "epoch": 0.51552,
      "grad_norm": 0.2578693628311157,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 3222
    },
    {
      "epoch": 0.51568,
      "grad_norm": 0.230605810880661,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3223
    },
    {
      "epoch": 0.51584,
      "grad_norm": 0.12451185286045074,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3224
    },
    {
      "epoch": 0.516,
      "grad_norm": 0.203547403216362,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 3225
    },
    {
      "epoch": 0.51616,
      "grad_norm": 0.17217499017715454,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3226
    },
    {
      "epoch": 0.51632,
      "grad_norm": 0.13441595435142517,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3227
    },
    {
      "epoch": 0.51648,
      "grad_norm": 0.11632373929023743,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3228
    },
    {
      "epoch": 0.51664,
      "grad_norm": 0.14003971219062805,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3229
    },
    {
      "epoch": 0.5168,
      "grad_norm": 0.16406522691249847,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3230
    },
    {
      "epoch": 0.51696,
      "grad_norm": 0.1202084869146347,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 3231
    },
    {
      "epoch": 0.51712,
      "grad_norm": 0.109534852206707,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3232
    },
    {
      "epoch": 0.51728,
      "grad_norm": 0.22586333751678467,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3233
    },
    {
      "epoch": 0.51744,
      "grad_norm": 0.11286605149507523,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 3234
    },
    {
      "epoch": 0.5176,
      "grad_norm": 0.1197928637266159,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3235
    },
    {
      "epoch": 0.51776,
      "grad_norm": 0.12362935394048691,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3236
    },
    {
      "epoch": 0.51792,
      "grad_norm": 0.12163010984659195,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3237
    },
    {
      "epoch": 0.51808,
      "grad_norm": 0.11381333321332932,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3238
    },
    {
      "epoch": 0.51824,
      "grad_norm": 0.12735287845134735,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3239
    },
    {
      "epoch": 0.5184,
      "grad_norm": 0.12415094673633575,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3240
    },
    {
      "epoch": 0.51856,
      "grad_norm": 0.13041245937347412,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3241
    },
    {
      "epoch": 0.51872,
      "grad_norm": 0.15781787037849426,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 3242
    },
    {
      "epoch": 0.51888,
      "grad_norm": 0.11174479126930237,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3243
    },
    {
      "epoch": 0.51904,
      "grad_norm": 0.0992090180516243,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3244
    },
    {
      "epoch": 0.5192,
      "grad_norm": 0.10396700352430344,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3245
    },
    {
      "epoch": 0.51936,
      "grad_norm": 0.20390425622463226,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 3246
    },
    {
      "epoch": 0.51952,
      "grad_norm": 0.12170989066362381,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 3247
    },
    {
      "epoch": 0.51968,
      "grad_norm": 0.14292484521865845,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3248
    },
    {
      "epoch": 0.51984,
      "grad_norm": 0.17855952680110931,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 3249
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.12399110198020935,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 3250
    },
    {
      "epoch": 0.52016,
      "grad_norm": 0.13657449185848236,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3251
    },
    {
      "epoch": 0.52032,
      "grad_norm": 0.10611047595739365,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3252
    },
    {
      "epoch": 0.52048,
      "grad_norm": 0.17000141739845276,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 3253
    },
    {
      "epoch": 0.52064,
      "grad_norm": 0.20083867013454437,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 3254
    },
    {
      "epoch": 0.5208,
      "grad_norm": 0.10772409290075302,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3255
    },
    {
      "epoch": 0.52096,
      "grad_norm": 0.10080686956644058,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 3256
    },
    {
      "epoch": 0.52112,
      "grad_norm": 0.12615849077701569,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3257
    },
    {
      "epoch": 0.52128,
      "grad_norm": 0.1690431833267212,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3258
    },
    {
      "epoch": 0.52144,
      "grad_norm": 0.13887447118759155,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3259
    },
    {
      "epoch": 0.5216,
      "grad_norm": 0.19728831946849823,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3260
    },
    {
      "epoch": 0.52176,
      "grad_norm": 0.16625265777111053,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 3261
    },
    {
      "epoch": 0.52192,
      "grad_norm": 0.1307239979505539,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3262
    },
    {
      "epoch": 0.52208,
      "grad_norm": 0.11711318790912628,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3263
    },
    {
      "epoch": 0.52224,
      "grad_norm": 0.13756051659584045,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3264
    },
    {
      "epoch": 0.5224,
      "grad_norm": 0.1278756558895111,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3265
    },
    {
      "epoch": 0.52256,
      "grad_norm": 0.14494648575782776,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3266
    },
    {
      "epoch": 0.52272,
      "grad_norm": 0.11501076817512512,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3267
    },
    {
      "epoch": 0.52288,
      "grad_norm": 0.09663407504558563,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3268
    },
    {
      "epoch": 0.52304,
      "grad_norm": 0.10953684151172638,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 3269
    },
    {
      "epoch": 0.5232,
      "grad_norm": 0.09232155978679657,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3270
    },
    {
      "epoch": 0.52336,
      "grad_norm": 0.1006808876991272,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 3271
    },
    {
      "epoch": 0.52352,
      "grad_norm": 0.16304974257946014,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3272
    },
    {
      "epoch": 0.52368,
      "grad_norm": 0.11497969180345535,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3273
    },
    {
      "epoch": 0.52384,
      "grad_norm": 0.1836049109697342,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 3274
    },
    {
      "epoch": 0.524,
      "grad_norm": 0.1667373776435852,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3275
    },
    {
      "epoch": 0.52416,
      "grad_norm": 0.10117842257022858,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3276
    },
    {
      "epoch": 0.52432,
      "grad_norm": 0.12225232273340225,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 3277
    },
    {
      "epoch": 0.52448,
      "grad_norm": 0.11890346556901932,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3278
    },
    {
      "epoch": 0.52464,
      "grad_norm": 0.11280043423175812,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 3279
    },
    {
      "epoch": 0.5248,
      "grad_norm": 0.12157174944877625,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 3280
    },
    {
      "epoch": 0.52496,
      "grad_norm": 0.1637333780527115,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3281
    },
    {
      "epoch": 0.52512,
      "grad_norm": 0.1835414171218872,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3282
    },
    {
      "epoch": 0.52528,
      "grad_norm": 0.13729900121688843,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 3283
    },
    {
      "epoch": 0.52544,
      "grad_norm": 0.11237026751041412,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3284
    },
    {
      "epoch": 0.5256,
      "grad_norm": 0.12871527671813965,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3285
    },
    {
      "epoch": 0.52576,
      "grad_norm": 0.1225331723690033,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3286
    },
    {
      "epoch": 0.52592,
      "grad_norm": 0.11316513270139694,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3287
    },
    {
      "epoch": 0.52608,
      "grad_norm": 0.11475392431020737,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3288
    },
    {
      "epoch": 0.52624,
      "grad_norm": 0.2305043786764145,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 3289
    },
    {
      "epoch": 0.5264,
      "grad_norm": 0.0998280718922615,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 3290
    },
    {
      "epoch": 0.52656,
      "grad_norm": 0.10230669379234314,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3291
    },
    {
      "epoch": 0.52672,
      "grad_norm": 0.11150969564914703,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3292
    },
    {
      "epoch": 0.52688,
      "grad_norm": 0.23846012353897095,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3293
    },
    {
      "epoch": 0.52704,
      "grad_norm": 0.17826522886753082,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3294
    },
    {
      "epoch": 0.5272,
      "grad_norm": 0.12728595733642578,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3295
    },
    {
      "epoch": 0.52736,
      "grad_norm": 0.10841888934373856,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3296
    },
    {
      "epoch": 0.52752,
      "grad_norm": 0.16045446693897247,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 3297
    },
    {
      "epoch": 0.52768,
      "grad_norm": 0.12406647950410843,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 3298
    },
    {
      "epoch": 0.52784,
      "grad_norm": 0.1474330723285675,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3299
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.12244715541601181,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 3300
    },
    {
      "epoch": 0.528,
      "eval_train_accuracy": 0.9882,
      "eval_train_loss": 0.3200684189796448,
      "eval_train_runtime": 4.0584,
      "eval_train_samples_per_second": 1232.018,
      "eval_train_steps_per_second": 15.523,
      "step": 3300
    },
    {
      "epoch": 0.528,
      "eval_test_accuracy": 0.988,
      "eval_test_loss": 0.31891754269599915,
      "eval_test_runtime": 4.7877,
      "eval_test_samples_per_second": 1044.339,
      "eval_test_steps_per_second": 13.159,
      "step": 3300
    },
    {
      "epoch": 0.52816,
      "grad_norm": 0.14863350987434387,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3301
    },
    {
      "epoch": 0.52832,
      "grad_norm": 0.12370523810386658,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3302
    },
    {
      "epoch": 0.52848,
      "grad_norm": 0.14828163385391235,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 3303
    },
    {
      "epoch": 0.52864,
      "grad_norm": 0.09977713972330093,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 3304
    },
    {
      "epoch": 0.5288,
      "grad_norm": 0.11295855045318604,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 3305
    },
    {
      "epoch": 0.52896,
      "grad_norm": 0.11573389917612076,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 3306
    },
    {
      "epoch": 0.52912,
      "grad_norm": 0.12314732372760773,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3307
    },
    {
      "epoch": 0.52928,
      "grad_norm": 0.10818979144096375,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 3308
    },
    {
      "epoch": 0.52944,
      "grad_norm": 0.10523024946451187,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3309
    },
    {
      "epoch": 0.5296,
      "grad_norm": 0.1284976601600647,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 3310
    },
    {
      "epoch": 0.52976,
      "grad_norm": 0.095648393034935,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3311
    },
    {
      "epoch": 0.52992,
      "grad_norm": 0.13800686597824097,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3312
    },
    {
      "epoch": 0.53008,
      "grad_norm": 0.08966177701950073,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3313
    },
    {
      "epoch": 0.53024,
      "grad_norm": 0.12604492902755737,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3314
    },
    {
      "epoch": 0.5304,
      "grad_norm": 0.10226476192474365,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3315
    },
    {
      "epoch": 0.53056,
      "grad_norm": 0.108592189848423,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3316
    },
    {
      "epoch": 0.53072,
      "grad_norm": 0.08691772073507309,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3317
    },
    {
      "epoch": 0.53088,
      "grad_norm": 0.10648048669099808,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3318
    },
    {
      "epoch": 0.53104,
      "grad_norm": 0.10024162381887436,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3319
    },
    {
      "epoch": 0.5312,
      "grad_norm": 0.11952635645866394,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3320
    },
    {
      "epoch": 0.53136,
      "grad_norm": 0.09625405818223953,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3321
    },
    {
      "epoch": 0.53152,
      "grad_norm": 0.12916986644268036,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3322
    },
    {
      "epoch": 0.53168,
      "grad_norm": 0.1028459295630455,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3323
    },
    {
      "epoch": 0.53184,
      "grad_norm": 0.836391031742096,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3324
    },
    {
      "epoch": 0.532,
      "grad_norm": 0.11712680757045746,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 3325
    },
    {
      "epoch": 0.53216,
      "grad_norm": 0.10496681183576584,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3326
    },
    {
      "epoch": 0.53232,
      "grad_norm": 0.17874722182750702,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3327
    },
    {
      "epoch": 0.53248,
      "grad_norm": 0.28274455666542053,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3328
    },
    {
      "epoch": 0.53264,
      "grad_norm": 0.21163612604141235,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 3329
    },
    {
      "epoch": 0.5328,
      "grad_norm": 0.31174200773239136,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3330
    },
    {
      "epoch": 0.53296,
      "grad_norm": 0.14487971365451813,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3331
    },
    {
      "epoch": 0.53312,
      "grad_norm": 0.3139082193374634,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3332
    },
    {
      "epoch": 0.53328,
      "grad_norm": 0.15056973695755005,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3333
    },
    {
      "epoch": 0.53344,
      "grad_norm": 0.19260619580745697,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 3334
    },
    {
      "epoch": 0.5336,
      "grad_norm": 0.14528118073940277,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 3335
    },
    {
      "epoch": 0.53376,
      "grad_norm": 0.13466022908687592,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3336
    },
    {
      "epoch": 0.53392,
      "grad_norm": 0.2557229697704315,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 3337
    },
    {
      "epoch": 0.53408,
      "grad_norm": 0.17226417362689972,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 3338
    },
    {
      "epoch": 0.53424,
      "grad_norm": 0.12252230942249298,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3339
    },
    {
      "epoch": 0.5344,
      "grad_norm": 0.12969432771205902,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 3340
    },
    {
      "epoch": 0.53456,
      "grad_norm": 0.19330064952373505,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3341
    },
    {
      "epoch": 0.53472,
      "grad_norm": 0.10007994621992111,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 3342
    },
    {
      "epoch": 0.53488,
      "grad_norm": 0.1445176750421524,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3343
    },
    {
      "epoch": 0.53504,
      "grad_norm": 0.11125106364488602,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3344
    },
    {
      "epoch": 0.5352,
      "grad_norm": 0.14111082255840302,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3345
    },
    {
      "epoch": 0.53536,
      "grad_norm": 0.1472996026277542,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 3346
    },
    {
      "epoch": 0.53552,
      "grad_norm": 0.12728054821491241,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3347
    },
    {
      "epoch": 0.53568,
      "grad_norm": 0.14666563272476196,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3348
    },
    {
      "epoch": 0.53584,
      "grad_norm": 0.15499114990234375,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3349
    },
    {
      "epoch": 0.536,
      "grad_norm": 0.13833844661712646,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3350
    },
    {
      "epoch": 0.53616,
      "grad_norm": 0.12612797319889069,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3351
    },
    {
      "epoch": 0.53632,
      "grad_norm": 0.1898922473192215,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3352
    },
    {
      "epoch": 0.53648,
      "grad_norm": 0.09867317974567413,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3353
    },
    {
      "epoch": 0.53664,
      "grad_norm": 0.1524198204278946,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 3354
    },
    {
      "epoch": 0.5368,
      "grad_norm": 0.15503902733325958,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3355
    },
    {
      "epoch": 0.53696,
      "grad_norm": 0.11668746918439865,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3356
    },
    {
      "epoch": 0.53712,
      "grad_norm": 0.1180446594953537,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3357
    },
    {
      "epoch": 0.53728,
      "grad_norm": 0.101191945374012,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3358
    },
    {
      "epoch": 0.53744,
      "grad_norm": 0.10494006425142288,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3359
    },
    {
      "epoch": 0.5376,
      "grad_norm": 0.11140304803848267,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 3360
    },
    {
      "epoch": 0.53776,
      "grad_norm": 0.12300042062997818,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 3361
    },
    {
      "epoch": 0.53792,
      "grad_norm": 0.11378412693738937,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3362
    },
    {
      "epoch": 0.53808,
      "grad_norm": 0.09578884392976761,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3363
    },
    {
      "epoch": 0.53824,
      "grad_norm": 0.1192234680056572,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3364
    },
    {
      "epoch": 0.5384,
      "grad_norm": 0.11212106049060822,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 3365
    },
    {
      "epoch": 0.53856,
      "grad_norm": 0.10157596319913864,
      "learning_rate": 0.0001,
      "loss": 0.3023,
      "step": 3366
    },
    {
      "epoch": 0.53872,
      "grad_norm": 0.1396099030971527,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3367
    },
    {
      "epoch": 0.53888,
      "grad_norm": 0.10939455032348633,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3368
    },
    {
      "epoch": 0.53904,
      "grad_norm": 0.10015084594488144,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 3369
    },
    {
      "epoch": 0.5392,
      "grad_norm": 0.10502756386995316,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3370
    },
    {
      "epoch": 0.53936,
      "grad_norm": 0.11254394054412842,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3371
    },
    {
      "epoch": 0.53952,
      "grad_norm": 0.17791996896266937,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 3372
    },
    {
      "epoch": 0.53968,
      "grad_norm": 0.09525439888238907,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3373
    },
    {
      "epoch": 0.53984,
      "grad_norm": 0.13560277223587036,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3374
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.11175809800624847,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3375
    },
    {
      "epoch": 0.54016,
      "grad_norm": 0.10699024796485901,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3376
    },
    {
      "epoch": 0.54032,
      "grad_norm": 0.13592292368412018,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3377
    },
    {
      "epoch": 0.54048,
      "grad_norm": 0.12697292864322662,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3378
    },
    {
      "epoch": 0.54064,
      "grad_norm": 0.17386780679225922,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3379
    },
    {
      "epoch": 0.5408,
      "grad_norm": 0.09707138687372208,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3380
    },
    {
      "epoch": 0.54096,
      "grad_norm": 0.10697929561138153,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3381
    },
    {
      "epoch": 0.54112,
      "grad_norm": 0.1131083220243454,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 3382
    },
    {
      "epoch": 0.54128,
      "grad_norm": 0.11128127574920654,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3383
    },
    {
      "epoch": 0.54144,
      "grad_norm": 0.09089181572198868,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3384
    },
    {
      "epoch": 0.5416,
      "grad_norm": 0.09878874570131302,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3385
    },
    {
      "epoch": 0.54176,
      "grad_norm": 0.1202060803771019,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3386
    },
    {
      "epoch": 0.54192,
      "grad_norm": 0.10456633567810059,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 3387
    },
    {
      "epoch": 0.54208,
      "grad_norm": 0.1122647151350975,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3388
    },
    {
      "epoch": 0.54224,
      "grad_norm": 0.1350654661655426,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 3389
    },
    {
      "epoch": 0.5424,
      "grad_norm": 0.10119981318712234,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 3390
    },
    {
      "epoch": 0.54256,
      "grad_norm": 0.10009413957595825,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3391
    },
    {
      "epoch": 0.54272,
      "grad_norm": 0.16130685806274414,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3392
    },
    {
      "epoch": 0.54288,
      "grad_norm": 0.10996116697788239,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3393
    },
    {
      "epoch": 0.54304,
      "grad_norm": 0.12543831765651703,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3394
    },
    {
      "epoch": 0.5432,
      "grad_norm": 0.1091829314827919,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3395
    },
    {
      "epoch": 0.54336,
      "grad_norm": 0.10317611694335938,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3396
    },
    {
      "epoch": 0.54352,
      "grad_norm": 0.10776038467884064,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3397
    },
    {
      "epoch": 0.54368,
      "grad_norm": 0.1004701629281044,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3398
    },
    {
      "epoch": 0.54384,
      "grad_norm": 0.1184043437242508,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 3399
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.10120641440153122,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3400
    },
    {
      "epoch": 0.544,
      "eval_train_accuracy": 0.9944,
      "eval_train_loss": 0.31994837522506714,
      "eval_train_runtime": 4.0787,
      "eval_train_samples_per_second": 1225.871,
      "eval_train_steps_per_second": 15.446,
      "step": 3400
    },
    {
      "epoch": 0.544,
      "eval_test_accuracy": 0.9914,
      "eval_test_loss": 0.3186294436454773,
      "eval_test_runtime": 4.656,
      "eval_test_samples_per_second": 1073.893,
      "eval_test_steps_per_second": 13.531,
      "step": 3400
    },
    {
      "epoch": 0.54416,
      "grad_norm": 0.15557830035686493,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3401
    },
    {
      "epoch": 0.54432,
      "grad_norm": 0.12240831553936005,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 3402
    },
    {
      "epoch": 0.54448,
      "grad_norm": 0.11164719611406326,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3403
    },
    {
      "epoch": 0.54464,
      "grad_norm": 0.15732933580875397,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 3404
    },
    {
      "epoch": 0.5448,
      "grad_norm": 0.11052405834197998,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3405
    },
    {
      "epoch": 0.54496,
      "grad_norm": 0.11308450996875763,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3406
    },
    {
      "epoch": 0.54512,
      "grad_norm": 0.114859938621521,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3407
    },
    {
      "epoch": 0.54528,
      "grad_norm": 0.10232625156641006,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3408
    },
    {
      "epoch": 0.54544,
      "grad_norm": 0.10602415353059769,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 3409
    },
    {
      "epoch": 0.5456,
      "grad_norm": 0.11677675694227219,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3410
    },
    {
      "epoch": 0.54576,
      "grad_norm": 0.08882476389408112,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3411
    },
    {
      "epoch": 0.54592,
      "grad_norm": 0.10732158273458481,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3412
    },
    {
      "epoch": 0.54608,
      "grad_norm": 0.09580101072788239,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3413
    },
    {
      "epoch": 0.54624,
      "grad_norm": 0.09901941567659378,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3414
    },
    {
      "epoch": 0.5464,
      "grad_norm": 0.10295119136571884,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3415
    },
    {
      "epoch": 0.54656,
      "grad_norm": 0.09261579811573029,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 3416
    },
    {
      "epoch": 0.54672,
      "grad_norm": 0.10781090706586838,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3417
    },
    {
      "epoch": 0.54688,
      "grad_norm": 0.1205095425248146,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3418
    },
    {
      "epoch": 0.54704,
      "grad_norm": 0.10028302669525146,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3419
    },
    {
      "epoch": 0.5472,
      "grad_norm": 0.08881637454032898,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 3420
    },
    {
      "epoch": 0.54736,
      "grad_norm": 0.09963775426149368,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 3421
    },
    {
      "epoch": 0.54752,
      "grad_norm": 0.09095465391874313,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3422
    },
    {
      "epoch": 0.54768,
      "grad_norm": 0.09114304184913635,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3423
    },
    {
      "epoch": 0.54784,
      "grad_norm": 0.10682615637779236,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3424
    },
    {
      "epoch": 0.548,
      "grad_norm": 0.10430964082479477,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 3425
    },
    {
      "epoch": 0.54816,
      "grad_norm": 0.10191604495048523,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3426
    },
    {
      "epoch": 0.54832,
      "grad_norm": 0.09956352412700653,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3427
    },
    {
      "epoch": 0.54848,
      "grad_norm": 0.0973113402724266,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3428
    },
    {
      "epoch": 0.54864,
      "grad_norm": 0.10402929037809372,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 3429
    },
    {
      "epoch": 0.5488,
      "grad_norm": 0.5181101560592651,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3430
    },
    {
      "epoch": 0.54896,
      "grad_norm": 0.09538685530424118,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3431
    },
    {
      "epoch": 0.54912,
      "grad_norm": 0.10679171234369278,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3432
    },
    {
      "epoch": 0.54928,
      "grad_norm": 0.1068548858165741,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 3433
    },
    {
      "epoch": 0.54944,
      "grad_norm": 0.1626078486442566,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3434
    },
    {
      "epoch": 0.5496,
      "grad_norm": 0.5103786587715149,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 3435
    },
    {
      "epoch": 0.54976,
      "grad_norm": 0.16847775876522064,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 3436
    },
    {
      "epoch": 0.54992,
      "grad_norm": 0.19125960767269135,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3437
    },
    {
      "epoch": 0.55008,
      "grad_norm": 0.131229430437088,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3438
    },
    {
      "epoch": 0.55024,
      "grad_norm": 0.18964222073554993,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3439
    },
    {
      "epoch": 0.5504,
      "grad_norm": 0.19700783491134644,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3440
    },
    {
      "epoch": 0.55056,
      "grad_norm": 0.18054765462875366,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3441
    },
    {
      "epoch": 0.55072,
      "grad_norm": 0.17589090764522552,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3442
    },
    {
      "epoch": 0.55088,
      "grad_norm": 0.11814317107200623,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3443
    },
    {
      "epoch": 0.55104,
      "grad_norm": 0.19666688144207,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3444
    },
    {
      "epoch": 0.5512,
      "grad_norm": 0.1359635293483734,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3445
    },
    {
      "epoch": 0.55136,
      "grad_norm": 0.10762181133031845,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 3446
    },
    {
      "epoch": 0.55152,
      "grad_norm": 0.13132041692733765,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3447
    },
    {
      "epoch": 0.55168,
      "grad_norm": 0.09903924912214279,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 3448
    },
    {
      "epoch": 0.55184,
      "grad_norm": 0.12971441447734833,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3449
    },
    {
      "epoch": 0.552,
      "grad_norm": 0.10542476922273636,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 3450
    },
    {
      "epoch": 0.55216,
      "grad_norm": 0.12219392508268356,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3451
    },
    {
      "epoch": 0.55232,
      "grad_norm": 0.11325868964195251,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3452
    },
    {
      "epoch": 0.55248,
      "grad_norm": 0.10540337860584259,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3453
    },
    {
      "epoch": 0.55264,
      "grad_norm": 0.10938708484172821,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3454
    },
    {
      "epoch": 0.5528,
      "grad_norm": 0.10584519803524017,
      "learning_rate": 0.0001,
      "loss": 0.3389,
      "step": 3455
    },
    {
      "epoch": 0.55296,
      "grad_norm": 0.13967418670654297,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3456
    },
    {
      "epoch": 0.55312,
      "grad_norm": 0.10349965840578079,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3457
    },
    {
      "epoch": 0.55328,
      "grad_norm": 0.12038084864616394,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3458
    },
    {
      "epoch": 0.55344,
      "grad_norm": 0.09590435028076172,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3459
    },
    {
      "epoch": 0.5536,
      "grad_norm": 0.09362691640853882,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3460
    },
    {
      "epoch": 0.55376,
      "grad_norm": 0.09181298315525055,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3461
    },
    {
      "epoch": 0.55392,
      "grad_norm": 0.11684607714414597,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3462
    },
    {
      "epoch": 0.55408,
      "grad_norm": 0.08598766475915909,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 3463
    },
    {
      "epoch": 0.55424,
      "grad_norm": 0.09446936845779419,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3464
    },
    {
      "epoch": 0.5544,
      "grad_norm": 0.11839593946933746,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3465
    },
    {
      "epoch": 0.55456,
      "grad_norm": 0.1217014491558075,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3466
    },
    {
      "epoch": 0.55472,
      "grad_norm": 0.10280370712280273,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 3467
    },
    {
      "epoch": 0.55488,
      "grad_norm": 0.11031454801559448,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3468
    },
    {
      "epoch": 0.55504,
      "grad_norm": 0.10220959782600403,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 3469
    },
    {
      "epoch": 0.5552,
      "grad_norm": 0.09489460289478302,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 3470
    },
    {
      "epoch": 0.55536,
      "grad_norm": 0.1045602336525917,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3471
    },
    {
      "epoch": 0.55552,
      "grad_norm": 0.10506007075309753,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3472
    },
    {
      "epoch": 0.55568,
      "grad_norm": 0.09769585728645325,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3473
    },
    {
      "epoch": 0.55584,
      "grad_norm": 0.09954562783241272,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3474
    },
    {
      "epoch": 0.556,
      "grad_norm": 0.09651655703783035,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3475
    },
    {
      "epoch": 0.55616,
      "grad_norm": 0.10535580664873123,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3476
    },
    {
      "epoch": 0.55632,
      "grad_norm": 0.10494866222143173,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 3477
    },
    {
      "epoch": 0.55648,
      "grad_norm": 0.1111963763833046,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 3478
    },
    {
      "epoch": 0.55664,
      "grad_norm": 0.1107124462723732,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3479
    },
    {
      "epoch": 0.5568,
      "grad_norm": 0.08792755752801895,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3480
    },
    {
      "epoch": 0.55696,
      "grad_norm": 0.10560042411088943,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3481
    },
    {
      "epoch": 0.55712,
      "grad_norm": 0.1073012575507164,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3482
    },
    {
      "epoch": 0.55728,
      "grad_norm": 0.09773129224777222,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3483
    },
    {
      "epoch": 0.55744,
      "grad_norm": 0.10045385360717773,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 3484
    },
    {
      "epoch": 0.5576,
      "grad_norm": 0.08313401788473129,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 3485
    },
    {
      "epoch": 0.55776,
      "grad_norm": 0.10091694444417953,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3486
    },
    {
      "epoch": 0.55792,
      "grad_norm": 0.08908886462450027,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 3487
    },
    {
      "epoch": 0.55808,
      "grad_norm": 0.09747426956892014,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 3488
    },
    {
      "epoch": 0.55824,
      "grad_norm": 0.19051192700862885,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 3489
    },
    {
      "epoch": 0.5584,
      "grad_norm": 0.08383829891681671,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3490
    },
    {
      "epoch": 0.55856,
      "grad_norm": 0.09049396961927414,
      "learning_rate": 0.0001,
      "loss": 0.3019,
      "step": 3491
    },
    {
      "epoch": 0.55872,
      "grad_norm": 0.10302558541297913,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3492
    },
    {
      "epoch": 0.55888,
      "grad_norm": 0.09962590038776398,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3493
    },
    {
      "epoch": 0.55904,
      "grad_norm": 0.09742962568998337,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3494
    },
    {
      "epoch": 0.5592,
      "grad_norm": 0.09220189601182938,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3495
    },
    {
      "epoch": 0.55936,
      "grad_norm": 0.10859262198209763,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3496
    },
    {
      "epoch": 0.55952,
      "grad_norm": 0.10001470148563385,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3497
    },
    {
      "epoch": 0.55968,
      "grad_norm": 0.09572295099496841,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3498
    },
    {
      "epoch": 0.55984,
      "grad_norm": 0.14115935564041138,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3499
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.12071511894464493,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3500
    },
    {
      "epoch": 0.56,
      "eval_train_accuracy": 0.9952,
      "eval_train_loss": 0.31947430968284607,
      "eval_train_runtime": 4.1844,
      "eval_train_samples_per_second": 1194.908,
      "eval_train_steps_per_second": 15.056,
      "step": 3500
    },
    {
      "epoch": 0.56,
      "eval_test_accuracy": 0.9926,
      "eval_test_loss": 0.3183499574661255,
      "eval_test_runtime": 4.9058,
      "eval_test_samples_per_second": 1019.198,
      "eval_test_steps_per_second": 12.842,
      "step": 3500
    },
    {
      "epoch": 0.56016,
      "grad_norm": 0.08519215136766434,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3501
    },
    {
      "epoch": 0.56032,
      "grad_norm": 0.09828601032495499,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3502
    },
    {
      "epoch": 0.56048,
      "grad_norm": 0.10830318182706833,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3503
    },
    {
      "epoch": 0.56064,
      "grad_norm": 0.08561482280492783,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3504
    },
    {
      "epoch": 0.5608,
      "grad_norm": 0.1947229951620102,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3505
    },
    {
      "epoch": 0.56096,
      "grad_norm": 0.11150733381509781,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3506
    },
    {
      "epoch": 0.56112,
      "grad_norm": 0.10216297954320908,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3507
    },
    {
      "epoch": 0.56128,
      "grad_norm": 0.09849546104669571,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3508
    },
    {
      "epoch": 0.56144,
      "grad_norm": 0.10512486100196838,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3509
    },
    {
      "epoch": 0.5616,
      "grad_norm": 0.170803040266037,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3510
    },
    {
      "epoch": 0.56176,
      "grad_norm": 0.18101340532302856,
      "learning_rate": 0.0001,
      "loss": 0.3481,
      "step": 3511
    },
    {
      "epoch": 0.56192,
      "grad_norm": 0.14689631760120392,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3512
    },
    {
      "epoch": 0.56208,
      "grad_norm": 0.09927169978618622,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3513
    },
    {
      "epoch": 0.56224,
      "grad_norm": 0.15391069650650024,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3514
    },
    {
      "epoch": 0.5624,
      "grad_norm": 0.1126285120844841,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3515
    },
    {
      "epoch": 0.56256,
      "grad_norm": 0.12888981401920319,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3516
    },
    {
      "epoch": 0.56272,
      "grad_norm": 0.10475526005029678,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3517
    },
    {
      "epoch": 0.56288,
      "grad_norm": 0.10190580040216446,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3518
    },
    {
      "epoch": 0.56304,
      "grad_norm": 0.10232923179864883,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3519
    },
    {
      "epoch": 0.5632,
      "grad_norm": 0.1371188759803772,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 3520
    },
    {
      "epoch": 0.56336,
      "grad_norm": 0.11935316026210785,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 3521
    },
    {
      "epoch": 0.56352,
      "grad_norm": 0.09133568406105042,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3522
    },
    {
      "epoch": 0.56368,
      "grad_norm": 0.11029570549726486,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3523
    },
    {
      "epoch": 0.56384,
      "grad_norm": 0.11173147708177567,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3524
    },
    {
      "epoch": 0.564,
      "grad_norm": 0.0960998609662056,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 3525
    },
    {
      "epoch": 0.56416,
      "grad_norm": 0.10297515243291855,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3526
    },
    {
      "epoch": 0.56432,
      "grad_norm": 0.11141253262758255,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3527
    },
    {
      "epoch": 0.56448,
      "grad_norm": 0.10287131369113922,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 3528
    },
    {
      "epoch": 0.56464,
      "grad_norm": 0.10353010147809982,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 3529
    },
    {
      "epoch": 0.5648,
      "grad_norm": 0.0860632061958313,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3530
    },
    {
      "epoch": 0.56496,
      "grad_norm": 0.09367886930704117,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3531
    },
    {
      "epoch": 0.56512,
      "grad_norm": 0.10725878924131393,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3532
    },
    {
      "epoch": 0.56528,
      "grad_norm": 0.09959129989147186,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3533
    },
    {
      "epoch": 0.56544,
      "grad_norm": 0.163296177983284,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3534
    },
    {
      "epoch": 0.5656,
      "grad_norm": 0.09333799034357071,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3535
    },
    {
      "epoch": 0.56576,
      "grad_norm": 0.09429775178432465,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3536
    },
    {
      "epoch": 0.56592,
      "grad_norm": 0.09254460781812668,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3537
    },
    {
      "epoch": 0.56608,
      "grad_norm": 0.10287781059741974,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3538
    },
    {
      "epoch": 0.56624,
      "grad_norm": 0.10453210771083832,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3539
    },
    {
      "epoch": 0.5664,
      "grad_norm": 0.09791507571935654,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3540
    },
    {
      "epoch": 0.56656,
      "grad_norm": 0.0882137343287468,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3541
    },
    {
      "epoch": 0.56672,
      "grad_norm": 0.10841568559408188,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3542
    },
    {
      "epoch": 0.56688,
      "grad_norm": 0.09319933503866196,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3543
    },
    {
      "epoch": 0.56704,
      "grad_norm": 0.08926621079444885,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3544
    },
    {
      "epoch": 0.5672,
      "grad_norm": 0.08940592408180237,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3545
    },
    {
      "epoch": 0.56736,
      "grad_norm": 0.0987095832824707,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3546
    },
    {
      "epoch": 0.56752,
      "grad_norm": 0.09197597950696945,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 3547
    },
    {
      "epoch": 0.56768,
      "grad_norm": 0.11907985806465149,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 3548
    },
    {
      "epoch": 0.56784,
      "grad_norm": 0.09525533765554428,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3549
    },
    {
      "epoch": 0.568,
      "grad_norm": 0.09064458310604095,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3550
    },
    {
      "epoch": 0.56816,
      "grad_norm": 0.08292321115732193,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 3551
    },
    {
      "epoch": 0.56832,
      "grad_norm": 0.08753208070993423,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3552
    },
    {
      "epoch": 0.56848,
      "grad_norm": 0.10426256060600281,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 3553
    },
    {
      "epoch": 0.56864,
      "grad_norm": 0.12302002310752869,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3554
    },
    {
      "epoch": 0.5688,
      "grad_norm": 0.09420997649431229,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3555
    },
    {
      "epoch": 0.56896,
      "grad_norm": 0.09159686416387558,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3556
    },
    {
      "epoch": 0.56912,
      "grad_norm": 0.08830182254314423,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3557
    },
    {
      "epoch": 0.56928,
      "grad_norm": 0.10435766726732254,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3558
    },
    {
      "epoch": 0.56944,
      "grad_norm": 0.11309056729078293,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 3559
    },
    {
      "epoch": 0.5696,
      "grad_norm": 0.12137104570865631,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 3560
    },
    {
      "epoch": 0.56976,
      "grad_norm": 0.11109388619661331,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3561
    },
    {
      "epoch": 0.56992,
      "grad_norm": 0.09976249188184738,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3562
    },
    {
      "epoch": 0.57008,
      "grad_norm": 0.08787424117326736,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 3563
    },
    {
      "epoch": 0.57024,
      "grad_norm": 0.09099151194095612,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3564
    },
    {
      "epoch": 0.5704,
      "grad_norm": 0.09411560744047165,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 3565
    },
    {
      "epoch": 0.57056,
      "grad_norm": 0.11519668996334076,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 3566
    },
    {
      "epoch": 0.57072,
      "grad_norm": 0.10070772469043732,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 3567
    },
    {
      "epoch": 0.57088,
      "grad_norm": 0.0892036035656929,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 3568
    },
    {
      "epoch": 0.57104,
      "grad_norm": 0.10450639575719833,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3569
    },
    {
      "epoch": 0.5712,
      "grad_norm": 0.08889253437519073,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3570
    },
    {
      "epoch": 0.57136,
      "grad_norm": 0.09138992428779602,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3571
    },
    {
      "epoch": 0.57152,
      "grad_norm": 0.09858200699090958,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3572
    },
    {
      "epoch": 0.57168,
      "grad_norm": 0.09534361213445663,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3573
    },
    {
      "epoch": 0.57184,
      "grad_norm": 0.09135989099740982,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3574
    },
    {
      "epoch": 0.572,
      "grad_norm": 0.09955442696809769,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 3575
    },
    {
      "epoch": 0.57216,
      "grad_norm": 0.10139088332653046,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3576
    },
    {
      "epoch": 0.57232,
      "grad_norm": 0.10418836027383804,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3577
    },
    {
      "epoch": 0.57248,
      "grad_norm": 0.08803915232419968,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 3578
    },
    {
      "epoch": 0.57264,
      "grad_norm": 0.1018337830901146,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 3579
    },
    {
      "epoch": 0.5728,
      "grad_norm": 0.10243120044469833,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3580
    },
    {
      "epoch": 0.57296,
      "grad_norm": 0.10481085628271103,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 3581
    },
    {
      "epoch": 0.57312,
      "grad_norm": 0.09043866395950317,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3582
    },
    {
      "epoch": 0.57328,
      "grad_norm": 0.10258802771568298,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3583
    },
    {
      "epoch": 0.57344,
      "grad_norm": 0.09751662611961365,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 3584
    },
    {
      "epoch": 0.5736,
      "grad_norm": 0.09728806465864182,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3585
    },
    {
      "epoch": 0.57376,
      "grad_norm": 0.08264529705047607,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3586
    },
    {
      "epoch": 0.57392,
      "grad_norm": 0.09208936244249344,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3587
    },
    {
      "epoch": 0.57408,
      "grad_norm": 0.08567629754543304,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3588
    },
    {
      "epoch": 0.57424,
      "grad_norm": 0.10015252977609634,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 3589
    },
    {
      "epoch": 0.5744,
      "grad_norm": 0.08827846497297287,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 3590
    },
    {
      "epoch": 0.57456,
      "grad_norm": 0.09310632199048996,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3591
    },
    {
      "epoch": 0.57472,
      "grad_norm": 0.09405821561813354,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3592
    },
    {
      "epoch": 0.57488,
      "grad_norm": 0.1065264344215393,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 3593
    },
    {
      "epoch": 0.57504,
      "grad_norm": 0.0962202176451683,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 3594
    },
    {
      "epoch": 0.5752,
      "grad_norm": 0.0977725088596344,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3595
    },
    {
      "epoch": 0.57536,
      "grad_norm": 0.0910673439502716,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3596
    },
    {
      "epoch": 0.57552,
      "grad_norm": 0.08701852709054947,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3597
    },
    {
      "epoch": 0.57568,
      "grad_norm": 0.09561201184988022,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 3598
    },
    {
      "epoch": 0.57584,
      "grad_norm": 0.10991550981998444,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3599
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.09072860330343246,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3600
    },
    {
      "epoch": 0.576,
      "eval_train_accuracy": 0.9946,
      "eval_train_loss": 0.31915903091430664,
      "eval_train_runtime": 4.0734,
      "eval_train_samples_per_second": 1227.491,
      "eval_train_steps_per_second": 15.466,
      "step": 3600
    },
    {
      "epoch": 0.576,
      "eval_test_accuracy": 0.9936,
      "eval_test_loss": 0.3178800046443939,
      "eval_test_runtime": 5.1132,
      "eval_test_samples_per_second": 977.869,
      "eval_test_steps_per_second": 12.321,
      "step": 3600
    },
    {
      "epoch": 0.57616,
      "grad_norm": 0.09154734015464783,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3601
    },
    {
      "epoch": 0.57632,
      "grad_norm": 0.09960856288671494,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3602
    },
    {
      "epoch": 0.57648,
      "grad_norm": 0.09398730099201202,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3603
    },
    {
      "epoch": 0.57664,
      "grad_norm": 0.10356272011995316,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3604
    },
    {
      "epoch": 0.5768,
      "grad_norm": 0.08718031644821167,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 3605
    },
    {
      "epoch": 0.57696,
      "grad_norm": 0.09224389493465424,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 3606
    },
    {
      "epoch": 0.57712,
      "grad_norm": 0.09119690209627151,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3607
    },
    {
      "epoch": 0.57728,
      "grad_norm": 0.09295324981212616,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3608
    },
    {
      "epoch": 0.57744,
      "grad_norm": 0.08632834255695343,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3609
    },
    {
      "epoch": 0.5776,
      "grad_norm": 0.09517315030097961,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3610
    },
    {
      "epoch": 0.57776,
      "grad_norm": 0.09382835775613785,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3611
    },
    {
      "epoch": 0.57792,
      "grad_norm": 0.09829175472259521,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3612
    },
    {
      "epoch": 0.57808,
      "grad_norm": 0.09717214852571487,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 3613
    },
    {
      "epoch": 0.57824,
      "grad_norm": 0.0994662418961525,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3614
    },
    {
      "epoch": 0.5784,
      "grad_norm": 0.08965285122394562,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3615
    },
    {
      "epoch": 0.57856,
      "grad_norm": 0.09565103054046631,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3616
    },
    {
      "epoch": 0.57872,
      "grad_norm": 0.11654552072286606,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3617
    },
    {
      "epoch": 0.57888,
      "grad_norm": 0.0947762057185173,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3618
    },
    {
      "epoch": 0.57904,
      "grad_norm": 0.08786381781101227,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3619
    },
    {
      "epoch": 0.5792,
      "grad_norm": 0.09194686263799667,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3620
    },
    {
      "epoch": 0.57936,
      "grad_norm": 0.10153467208147049,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3621
    },
    {
      "epoch": 0.57952,
      "grad_norm": 0.09763262420892715,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3622
    },
    {
      "epoch": 0.57968,
      "grad_norm": 0.10476125776767731,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 3623
    },
    {
      "epoch": 0.57984,
      "grad_norm": 0.10653995722532272,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3624
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.09219294786453247,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3625
    },
    {
      "epoch": 0.58016,
      "grad_norm": 0.08810354024171829,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 3626
    },
    {
      "epoch": 0.58032,
      "grad_norm": 0.08694679290056229,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3627
    },
    {
      "epoch": 0.58048,
      "grad_norm": 0.09003540128469467,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3628
    },
    {
      "epoch": 0.58064,
      "grad_norm": 0.09567464888095856,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3629
    },
    {
      "epoch": 0.5808,
      "grad_norm": 0.08704809844493866,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3630
    },
    {
      "epoch": 0.58096,
      "grad_norm": 0.08435790985822678,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3631
    },
    {
      "epoch": 0.58112,
      "grad_norm": 0.1049591600894928,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3632
    },
    {
      "epoch": 0.58128,
      "grad_norm": 0.09595724195241928,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3633
    },
    {
      "epoch": 0.58144,
      "grad_norm": 0.08685202151536942,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3634
    },
    {
      "epoch": 0.5816,
      "grad_norm": 0.1266845464706421,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 3635
    },
    {
      "epoch": 0.58176,
      "grad_norm": 0.09530894458293915,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3636
    },
    {
      "epoch": 0.58192,
      "grad_norm": 0.09349611401557922,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3637
    },
    {
      "epoch": 0.58208,
      "grad_norm": 0.10771429538726807,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3638
    },
    {
      "epoch": 0.58224,
      "grad_norm": 0.08496194332838058,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3639
    },
    {
      "epoch": 0.5824,
      "grad_norm": 0.09201178699731827,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 3640
    },
    {
      "epoch": 0.58256,
      "grad_norm": 0.11004301905632019,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 3641
    },
    {
      "epoch": 0.58272,
      "grad_norm": 0.09956130385398865,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3642
    },
    {
      "epoch": 0.58288,
      "grad_norm": 0.0905715823173523,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 3643
    },
    {
      "epoch": 0.58304,
      "grad_norm": 0.09283602237701416,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3644
    },
    {
      "epoch": 0.5832,
      "grad_norm": 0.10785748064517975,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 3645
    },
    {
      "epoch": 0.58336,
      "grad_norm": 0.09633823484182358,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 3646
    },
    {
      "epoch": 0.58352,
      "grad_norm": 0.09292034059762955,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3647
    },
    {
      "epoch": 0.58368,
      "grad_norm": 0.09046676754951477,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3648
    },
    {
      "epoch": 0.58384,
      "grad_norm": 0.10475035011768341,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 3649
    },
    {
      "epoch": 0.584,
      "grad_norm": 0.08623360097408295,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3650
    },
    {
      "epoch": 0.58416,
      "grad_norm": 0.10158530622720718,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3651
    },
    {
      "epoch": 0.58432,
      "grad_norm": 0.1049533486366272,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3652
    },
    {
      "epoch": 0.58448,
      "grad_norm": 0.10341393947601318,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3653
    },
    {
      "epoch": 0.58464,
      "grad_norm": 0.0970870852470398,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3654
    },
    {
      "epoch": 0.5848,
      "grad_norm": 0.10167448967695236,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 3655
    },
    {
      "epoch": 0.58496,
      "grad_norm": 0.09769682586193085,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 3656
    },
    {
      "epoch": 0.58512,
      "grad_norm": 0.09544293582439423,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 3657
    },
    {
      "epoch": 0.58528,
      "grad_norm": 0.09080532193183899,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3658
    },
    {
      "epoch": 0.58544,
      "grad_norm": 0.0831504687666893,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 3659
    },
    {
      "epoch": 0.5856,
      "grad_norm": 0.08979236334562302,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3660
    },
    {
      "epoch": 0.58576,
      "grad_norm": 0.08912798017263412,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3661
    },
    {
      "epoch": 0.58592,
      "grad_norm": 0.10799632966518402,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3662
    },
    {
      "epoch": 0.58608,
      "grad_norm": 0.10112285614013672,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3663
    },
    {
      "epoch": 0.58624,
      "grad_norm": 0.08996933698654175,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3664
    },
    {
      "epoch": 0.5864,
      "grad_norm": 0.13155969977378845,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3665
    },
    {
      "epoch": 0.58656,
      "grad_norm": 0.09665654599666595,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3666
    },
    {
      "epoch": 0.58672,
      "grad_norm": 0.09182365238666534,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3667
    },
    {
      "epoch": 0.58688,
      "grad_norm": 0.08847613632678986,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 3668
    },
    {
      "epoch": 0.58704,
      "grad_norm": 0.0943790152668953,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3669
    },
    {
      "epoch": 0.5872,
      "grad_norm": 0.08624039590358734,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3670
    },
    {
      "epoch": 0.58736,
      "grad_norm": 0.0863182321190834,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3671
    },
    {
      "epoch": 0.58752,
      "grad_norm": 0.0949363261461258,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3672
    },
    {
      "epoch": 0.58768,
      "grad_norm": 0.08936356753110886,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3673
    },
    {
      "epoch": 0.58784,
      "grad_norm": 0.08365689963102341,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3674
    },
    {
      "epoch": 0.588,
      "grad_norm": 0.09115792065858841,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 3675
    },
    {
      "epoch": 0.58816,
      "grad_norm": 0.10450824350118637,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3676
    },
    {
      "epoch": 0.58832,
      "grad_norm": 0.08823421597480774,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3677
    },
    {
      "epoch": 0.58848,
      "grad_norm": 0.10564269870519638,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3678
    },
    {
      "epoch": 0.58864,
      "grad_norm": 0.09341444820165634,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3679
    },
    {
      "epoch": 0.5888,
      "grad_norm": 0.08727344125509262,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3680
    },
    {
      "epoch": 0.58896,
      "grad_norm": 0.08296642452478409,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 3681
    },
    {
      "epoch": 0.58912,
      "grad_norm": 0.08725055307149887,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3682
    },
    {
      "epoch": 0.58928,
      "grad_norm": 0.08626846224069595,
      "learning_rate": 0.0001,
      "loss": 0.2993,
      "step": 3683
    },
    {
      "epoch": 0.58944,
      "grad_norm": 0.08808682858943939,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3684
    },
    {
      "epoch": 0.5896,
      "grad_norm": 0.09327855706214905,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 3685
    },
    {
      "epoch": 0.58976,
      "grad_norm": 0.08938966691493988,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3686
    },
    {
      "epoch": 0.58992,
      "grad_norm": 0.1043708547949791,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 3687
    },
    {
      "epoch": 0.59008,
      "grad_norm": 0.08731169998645782,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3688
    },
    {
      "epoch": 0.59024,
      "grad_norm": 0.09827764332294464,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3689
    },
    {
      "epoch": 0.5904,
      "grad_norm": 0.09274990111589432,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3690
    },
    {
      "epoch": 0.59056,
      "grad_norm": 0.10781851410865784,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 3691
    },
    {
      "epoch": 0.59072,
      "grad_norm": 0.09967458993196487,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 3692
    },
    {
      "epoch": 0.59088,
      "grad_norm": 0.09171596169471741,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3693
    },
    {
      "epoch": 0.59104,
      "grad_norm": 0.09079805016517639,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3694
    },
    {
      "epoch": 0.5912,
      "grad_norm": 0.09172920882701874,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 3695
    },
    {
      "epoch": 0.59136,
      "grad_norm": 0.09405459463596344,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3696
    },
    {
      "epoch": 0.59152,
      "grad_norm": 0.09082898497581482,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 3697
    },
    {
      "epoch": 0.59168,
      "grad_norm": 0.09875478595495224,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 3698
    },
    {
      "epoch": 0.59184,
      "grad_norm": 0.09167958796024323,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3699
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.08552785217761993,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3700
    },
    {
      "epoch": 0.592,
      "eval_train_accuracy": 0.994,
      "eval_train_loss": 0.31893566250801086,
      "eval_train_runtime": 4.0984,
      "eval_train_samples_per_second": 1219.987,
      "eval_train_steps_per_second": 15.372,
      "step": 3700
    },
    {
      "epoch": 0.592,
      "eval_test_accuracy": 0.994,
      "eval_test_loss": 0.31771543622016907,
      "eval_test_runtime": 4.8778,
      "eval_test_samples_per_second": 1025.05,
      "eval_test_steps_per_second": 12.916,
      "step": 3700
    },
    {
      "epoch": 0.59216,
      "grad_norm": 0.10003504157066345,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3701
    },
    {
      "epoch": 0.59232,
      "grad_norm": 0.10802445560693741,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3702
    },
    {
      "epoch": 0.59248,
      "grad_norm": 0.09445320814847946,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3703
    },
    {
      "epoch": 0.59264,
      "grad_norm": 0.09074743837118149,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3704
    },
    {
      "epoch": 0.5928,
      "grad_norm": 0.0855245292186737,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 3705
    },
    {
      "epoch": 0.59296,
      "grad_norm": 0.08342277258634567,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3706
    },
    {
      "epoch": 0.59312,
      "grad_norm": 0.09617432206869125,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3707
    },
    {
      "epoch": 0.59328,
      "grad_norm": 0.08921711891889572,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3708
    },
    {
      "epoch": 0.59344,
      "grad_norm": 0.12101948261260986,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3709
    },
    {
      "epoch": 0.5936,
      "grad_norm": 0.1381293088197708,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 3710
    },
    {
      "epoch": 0.59376,
      "grad_norm": 0.10719206929206848,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 3711
    },
    {
      "epoch": 0.59392,
      "grad_norm": 0.08607781678438187,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3712
    },
    {
      "epoch": 0.59408,
      "grad_norm": 0.0973341092467308,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3713
    },
    {
      "epoch": 0.59424,
      "grad_norm": 0.10702937841415405,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 3714
    },
    {
      "epoch": 0.5944,
      "grad_norm": 0.12239932268857956,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 3715
    },
    {
      "epoch": 0.59456,
      "grad_norm": 0.10627377778291702,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 3716
    },
    {
      "epoch": 0.59472,
      "grad_norm": 0.08922210335731506,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3717
    },
    {
      "epoch": 0.59488,
      "grad_norm": 0.09335220605134964,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3718
    },
    {
      "epoch": 0.59504,
      "grad_norm": 0.09029290825128555,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3719
    },
    {
      "epoch": 0.5952,
      "grad_norm": 0.11058039963245392,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3720
    },
    {
      "epoch": 0.59536,
      "grad_norm": 0.10659731179475784,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 3721
    },
    {
      "epoch": 0.59552,
      "grad_norm": 0.09325550496578217,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3722
    },
    {
      "epoch": 0.59568,
      "grad_norm": 0.08554349094629288,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3723
    },
    {
      "epoch": 0.59584,
      "grad_norm": 0.11516740918159485,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3724
    },
    {
      "epoch": 0.596,
      "grad_norm": 0.10268548130989075,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3725
    },
    {
      "epoch": 0.59616,
      "grad_norm": 0.09176561236381531,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3726
    },
    {
      "epoch": 0.59632,
      "grad_norm": 0.0865568220615387,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3727
    },
    {
      "epoch": 0.59648,
      "grad_norm": 0.08775611221790314,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3728
    },
    {
      "epoch": 0.59664,
      "grad_norm": 0.09334822744131088,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3729
    },
    {
      "epoch": 0.5968,
      "grad_norm": 0.08904603868722916,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3730
    },
    {
      "epoch": 0.59696,
      "grad_norm": 0.12069903314113617,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3731
    },
    {
      "epoch": 0.59712,
      "grad_norm": 0.08569841831922531,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 3732
    },
    {
      "epoch": 0.59728,
      "grad_norm": 0.09265092760324478,
      "learning_rate": 0.0001,
      "loss": 0.2982,
      "step": 3733
    },
    {
      "epoch": 0.59744,
      "grad_norm": 0.09500502794981003,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 3734
    },
    {
      "epoch": 0.5976,
      "grad_norm": 0.10545147955417633,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3735
    },
    {
      "epoch": 0.59776,
      "grad_norm": 0.08938686549663544,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3736
    },
    {
      "epoch": 0.59792,
      "grad_norm": 0.10106033831834793,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 3737
    },
    {
      "epoch": 0.59808,
      "grad_norm": 0.08693340420722961,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3738
    },
    {
      "epoch": 0.59824,
      "grad_norm": 0.13348476588726044,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3739
    },
    {
      "epoch": 0.5984,
      "grad_norm": 0.08957505226135254,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3740
    },
    {
      "epoch": 0.59856,
      "grad_norm": 0.11136428266763687,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3741
    },
    {
      "epoch": 0.59872,
      "grad_norm": 0.15550784766674042,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3742
    },
    {
      "epoch": 0.59888,
      "grad_norm": 0.08387833833694458,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3743
    },
    {
      "epoch": 0.59904,
      "grad_norm": 0.10957594960927963,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3744
    },
    {
      "epoch": 0.5992,
      "grad_norm": 0.12273399531841278,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3745
    },
    {
      "epoch": 0.59936,
      "grad_norm": 0.14904378354549408,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3746
    },
    {
      "epoch": 0.59952,
      "grad_norm": 0.07900069653987885,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3747
    },
    {
      "epoch": 0.59968,
      "grad_norm": 0.07867231965065002,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3748
    },
    {
      "epoch": 0.59984,
      "grad_norm": 0.09225602447986603,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3749
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.08239848911762238,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 3750
    },
    {
      "epoch": 0.60016,
      "grad_norm": 0.08849158883094788,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 3751
    },
    {
      "epoch": 0.60032,
      "grad_norm": 0.11987745016813278,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 3752
    },
    {
      "epoch": 0.60048,
      "grad_norm": 0.08738866448402405,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 3753
    },
    {
      "epoch": 0.60064,
      "grad_norm": 0.08385788649320602,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3754
    },
    {
      "epoch": 0.6008,
      "grad_norm": 0.08449367433786392,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3755
    },
    {
      "epoch": 0.60096,
      "grad_norm": 0.08762525022029877,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 3756
    },
    {
      "epoch": 0.60112,
      "grad_norm": 0.09741827845573425,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3757
    },
    {
      "epoch": 0.60128,
      "grad_norm": 0.09982658922672272,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3758
    },
    {
      "epoch": 0.60144,
      "grad_norm": 0.10860474407672882,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 3759
    },
    {
      "epoch": 0.6016,
      "grad_norm": 0.08848274499177933,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 3760
    },
    {
      "epoch": 0.60176,
      "grad_norm": 0.1164049580693245,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 3761
    },
    {
      "epoch": 0.60192,
      "grad_norm": 0.08880653977394104,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3762
    },
    {
      "epoch": 0.60208,
      "grad_norm": 0.10413268953561783,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3763
    },
    {
      "epoch": 0.60224,
      "grad_norm": 0.11694672703742981,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 3764
    },
    {
      "epoch": 0.6024,
      "grad_norm": 0.18907925486564636,
      "learning_rate": 0.0001,
      "loss": 0.2999,
      "step": 3765
    },
    {
      "epoch": 0.60256,
      "grad_norm": 0.09974423050880432,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3766
    },
    {
      "epoch": 0.60272,
      "grad_norm": 0.09222706407308578,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3767
    },
    {
      "epoch": 0.60288,
      "grad_norm": 0.10279548913240433,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 3768
    },
    {
      "epoch": 0.60304,
      "grad_norm": 0.08882252871990204,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 3769
    },
    {
      "epoch": 0.6032,
      "grad_norm": 0.09328906983137131,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3770
    },
    {
      "epoch": 0.60336,
      "grad_norm": 0.09764375537633896,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3771
    },
    {
      "epoch": 0.60352,
      "grad_norm": 0.08923134952783585,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3772
    },
    {
      "epoch": 0.60368,
      "grad_norm": 0.13985756039619446,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 3773
    },
    {
      "epoch": 0.60384,
      "grad_norm": 0.09467557072639465,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3774
    },
    {
      "epoch": 0.604,
      "grad_norm": 0.22016626596450806,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3775
    },
    {
      "epoch": 0.60416,
      "grad_norm": 0.09720093756914139,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 3776
    },
    {
      "epoch": 0.60432,
      "grad_norm": 0.08870046585798264,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3777
    },
    {
      "epoch": 0.60448,
      "grad_norm": 0.11057472229003906,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3778
    },
    {
      "epoch": 0.60464,
      "grad_norm": 0.13582223653793335,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3779
    },
    {
      "epoch": 0.6048,
      "grad_norm": 0.12037120014429092,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 3780
    },
    {
      "epoch": 0.60496,
      "grad_norm": 0.10068167746067047,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3781
    },
    {
      "epoch": 0.60512,
      "grad_norm": 0.09800348430871964,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 3782
    },
    {
      "epoch": 0.60528,
      "grad_norm": 0.104241281747818,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3783
    },
    {
      "epoch": 0.60544,
      "grad_norm": 0.169247567653656,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3784
    },
    {
      "epoch": 0.6056,
      "grad_norm": 0.08064059913158417,
      "learning_rate": 0.0001,
      "loss": 0.3018,
      "step": 3785
    },
    {
      "epoch": 0.60576,
      "grad_norm": 0.09259837120771408,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3786
    },
    {
      "epoch": 0.60592,
      "grad_norm": 0.09775640815496445,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3787
    },
    {
      "epoch": 0.60608,
      "grad_norm": 0.1022244244813919,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 3788
    },
    {
      "epoch": 0.60624,
      "grad_norm": 0.11917915940284729,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 3789
    },
    {
      "epoch": 0.6064,
      "grad_norm": 0.09652800858020782,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3790
    },
    {
      "epoch": 0.60656,
      "grad_norm": 0.11227152496576309,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3791
    },
    {
      "epoch": 0.60672,
      "grad_norm": 0.09705837070941925,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3792
    },
    {
      "epoch": 0.60688,
      "grad_norm": 0.08850881457328796,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3793
    },
    {
      "epoch": 0.60704,
      "grad_norm": 0.085405632853508,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3794
    },
    {
      "epoch": 0.6072,
      "grad_norm": 0.10357258468866348,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 3795
    },
    {
      "epoch": 0.60736,
      "grad_norm": 0.08779946714639664,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3796
    },
    {
      "epoch": 0.60752,
      "grad_norm": 0.10133559256792068,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 3797
    },
    {
      "epoch": 0.60768,
      "grad_norm": 0.10281828790903091,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3798
    },
    {
      "epoch": 0.60784,
      "grad_norm": 0.11018051207065582,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3799
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.09638355672359467,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3800
    },
    {
      "epoch": 0.608,
      "eval_train_accuracy": 0.9936,
      "eval_train_loss": 0.3190517723560333,
      "eval_train_runtime": 4.2569,
      "eval_train_samples_per_second": 1174.55,
      "eval_train_steps_per_second": 14.799,
      "step": 3800
    },
    {
      "epoch": 0.608,
      "eval_test_accuracy": 0.9886,
      "eval_test_loss": 0.3177546262741089,
      "eval_test_runtime": 4.6144,
      "eval_test_samples_per_second": 1083.559,
      "eval_test_steps_per_second": 13.653,
      "step": 3800
    },
    {
      "epoch": 0.60816,
      "grad_norm": 0.09318779408931732,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 3801
    },
    {
      "epoch": 0.60832,
      "grad_norm": 0.089589424431324,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3802
    },
    {
      "epoch": 0.60848,
      "grad_norm": 0.08675061911344528,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 3803
    },
    {
      "epoch": 0.60864,
      "grad_norm": 0.08323805779218674,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3804
    },
    {
      "epoch": 0.6088,
      "grad_norm": 0.10598499327898026,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3805
    },
    {
      "epoch": 0.60896,
      "grad_norm": 0.10307452827692032,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3806
    },
    {
      "epoch": 0.60912,
      "grad_norm": 0.08573365956544876,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3807
    },
    {
      "epoch": 0.60928,
      "grad_norm": 0.2186959981918335,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3808
    },
    {
      "epoch": 0.60944,
      "grad_norm": 0.08633149415254593,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3809
    },
    {
      "epoch": 0.6096,
      "grad_norm": 0.11535211652517319,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 3810
    },
    {
      "epoch": 0.60976,
      "grad_norm": 0.10131809115409851,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 3811
    },
    {
      "epoch": 0.60992,
      "grad_norm": 0.10006377100944519,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3812
    },
    {
      "epoch": 0.61008,
      "grad_norm": 0.08390084654092789,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 3813
    },
    {
      "epoch": 0.61024,
      "grad_norm": 0.1090870052576065,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 3814
    },
    {
      "epoch": 0.6104,
      "grad_norm": 0.09938445687294006,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3815
    },
    {
      "epoch": 0.61056,
      "grad_norm": 0.08969719707965851,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3816
    },
    {
      "epoch": 0.61072,
      "grad_norm": 0.11315895617008209,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3817
    },
    {
      "epoch": 0.61088,
      "grad_norm": 0.08841776847839355,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3818
    },
    {
      "epoch": 0.61104,
      "grad_norm": 0.09537770599126816,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3819
    },
    {
      "epoch": 0.6112,
      "grad_norm": 0.09780025482177734,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3820
    },
    {
      "epoch": 0.61136,
      "grad_norm": 0.10264371335506439,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3821
    },
    {
      "epoch": 0.61152,
      "grad_norm": 0.12355799973011017,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 3822
    },
    {
      "epoch": 0.61168,
      "grad_norm": 0.08373242616653442,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3823
    },
    {
      "epoch": 0.61184,
      "grad_norm": 0.0951242446899414,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3824
    },
    {
      "epoch": 0.612,
      "grad_norm": 0.1018066555261612,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3825
    },
    {
      "epoch": 0.61216,
      "grad_norm": 0.08492293208837509,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3826
    },
    {
      "epoch": 0.61232,
      "grad_norm": 0.09705628454685211,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3827
    },
    {
      "epoch": 0.61248,
      "grad_norm": 0.09950286149978638,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 3828
    },
    {
      "epoch": 0.61264,
      "grad_norm": 0.08633006364107132,
      "learning_rate": 0.0001,
      "loss": 0.3037,
      "step": 3829
    },
    {
      "epoch": 0.6128,
      "grad_norm": 0.09640330076217651,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3830
    },
    {
      "epoch": 0.61296,
      "grad_norm": 0.09667255729436874,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 3831
    },
    {
      "epoch": 0.61312,
      "grad_norm": 0.0810551568865776,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3832
    },
    {
      "epoch": 0.61328,
      "grad_norm": 0.2710418701171875,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3833
    },
    {
      "epoch": 0.61344,
      "grad_norm": 0.09457990527153015,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3834
    },
    {
      "epoch": 0.6136,
      "grad_norm": 0.10840888321399689,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3835
    },
    {
      "epoch": 0.61376,
      "grad_norm": 0.11953707784414291,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3836
    },
    {
      "epoch": 0.61392,
      "grad_norm": 0.10237804055213928,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3837
    },
    {
      "epoch": 0.61408,
      "grad_norm": 0.13158784806728363,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3838
    },
    {
      "epoch": 0.61424,
      "grad_norm": 0.11115091294050217,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3839
    },
    {
      "epoch": 0.6144,
      "grad_norm": 0.1018969714641571,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 3840
    },
    {
      "epoch": 0.61456,
      "grad_norm": 0.11300596594810486,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 3841
    },
    {
      "epoch": 0.61472,
      "grad_norm": 0.12569841742515564,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3842
    },
    {
      "epoch": 0.61488,
      "grad_norm": 0.18285509943962097,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3843
    },
    {
      "epoch": 0.61504,
      "grad_norm": 0.1279098242521286,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3844
    },
    {
      "epoch": 0.6152,
      "grad_norm": 0.08882351964712143,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 3845
    },
    {
      "epoch": 0.61536,
      "grad_norm": 0.2014109045267105,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3846
    },
    {
      "epoch": 0.61552,
      "grad_norm": 0.1426590234041214,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3847
    },
    {
      "epoch": 0.61568,
      "grad_norm": 0.15295852720737457,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 3848
    },
    {
      "epoch": 0.61584,
      "grad_norm": 0.12768076360225677,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3849
    },
    {
      "epoch": 0.616,
      "grad_norm": 0.10507883131504059,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 3850
    },
    {
      "epoch": 0.61616,
      "grad_norm": 0.13173148036003113,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3851
    },
    {
      "epoch": 0.61632,
      "grad_norm": 0.10292694717645645,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3852
    },
    {
      "epoch": 0.61648,
      "grad_norm": 0.0979575365781784,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3853
    },
    {
      "epoch": 0.61664,
      "grad_norm": 0.11390919983386993,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3854
    },
    {
      "epoch": 0.6168,
      "grad_norm": 0.11203788965940475,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3855
    },
    {
      "epoch": 0.61696,
      "grad_norm": 0.10438580065965652,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3856
    },
    {
      "epoch": 0.61712,
      "grad_norm": 0.10904756933450699,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3857
    },
    {
      "epoch": 0.61728,
      "grad_norm": 0.10842124372720718,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 3858
    },
    {
      "epoch": 0.61744,
      "grad_norm": 0.08479223400354385,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3859
    },
    {
      "epoch": 0.6176,
      "grad_norm": 0.10305863618850708,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3860
    },
    {
      "epoch": 0.61776,
      "grad_norm": 0.09408670663833618,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 3861
    },
    {
      "epoch": 0.61792,
      "grad_norm": 0.09731464833021164,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3862
    },
    {
      "epoch": 0.61808,
      "grad_norm": 0.0943698137998581,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3863
    },
    {
      "epoch": 0.61824,
      "grad_norm": 0.08373007923364639,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3864
    },
    {
      "epoch": 0.6184,
      "grad_norm": 0.08904393017292023,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 3865
    },
    {
      "epoch": 0.61856,
      "grad_norm": 0.10298497974872589,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3866
    },
    {
      "epoch": 0.61872,
      "grad_norm": 0.09325762093067169,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 3867
    },
    {
      "epoch": 0.61888,
      "grad_norm": 0.08873818069696426,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3868
    },
    {
      "epoch": 0.61904,
      "grad_norm": 0.09623024612665176,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3869
    },
    {
      "epoch": 0.6192,
      "grad_norm": 0.10288579016923904,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3870
    },
    {
      "epoch": 0.61936,
      "grad_norm": 0.1046050637960434,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3871
    },
    {
      "epoch": 0.61952,
      "grad_norm": 0.11202207952737808,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3872
    },
    {
      "epoch": 0.61968,
      "grad_norm": 0.10155992209911346,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3873
    },
    {
      "epoch": 0.61984,
      "grad_norm": 0.09547574073076248,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3874
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.11530216038227081,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 3875
    },
    {
      "epoch": 0.62016,
      "grad_norm": 0.10669735819101334,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3876
    },
    {
      "epoch": 0.62032,
      "grad_norm": 0.09074217826128006,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 3877
    },
    {
      "epoch": 0.62048,
      "grad_norm": 0.09395413845777512,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3878
    },
    {
      "epoch": 0.62064,
      "grad_norm": 0.09695236384868622,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3879
    },
    {
      "epoch": 0.6208,
      "grad_norm": 0.08310166746377945,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3880
    },
    {
      "epoch": 0.62096,
      "grad_norm": 0.09321362525224686,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3881
    },
    {
      "epoch": 0.62112,
      "grad_norm": 0.0982205793261528,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 3882
    },
    {
      "epoch": 0.62128,
      "grad_norm": 0.09054107218980789,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3883
    },
    {
      "epoch": 0.62144,
      "grad_norm": 0.10408664494752884,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 3884
    },
    {
      "epoch": 0.6216,
      "grad_norm": 0.0957980751991272,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3885
    },
    {
      "epoch": 0.62176,
      "grad_norm": 0.2002027928829193,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3886
    },
    {
      "epoch": 0.62192,
      "grad_norm": 0.08929572999477386,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3887
    },
    {
      "epoch": 0.62208,
      "grad_norm": 0.0975913405418396,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 3888
    },
    {
      "epoch": 0.62224,
      "grad_norm": 0.08152339607477188,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3889
    },
    {
      "epoch": 0.6224,
      "grad_norm": 0.0872361809015274,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3890
    },
    {
      "epoch": 0.62256,
      "grad_norm": 0.09676755964756012,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3891
    },
    {
      "epoch": 0.62272,
      "grad_norm": 0.10426392406225204,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3892
    },
    {
      "epoch": 0.62288,
      "grad_norm": 0.09622209519147873,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3893
    },
    {
      "epoch": 0.62304,
      "grad_norm": 0.0995459333062172,
      "learning_rate": 0.0001,
      "loss": 0.3024,
      "step": 3894
    },
    {
      "epoch": 0.6232,
      "grad_norm": 0.11316091567277908,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3895
    },
    {
      "epoch": 0.62336,
      "grad_norm": 0.12809090316295624,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 3896
    },
    {
      "epoch": 0.62352,
      "grad_norm": 0.10714437812566757,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 3897
    },
    {
      "epoch": 0.62368,
      "grad_norm": 0.09750522673130035,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3898
    },
    {
      "epoch": 0.62384,
      "grad_norm": 0.1000853031873703,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3899
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.08937019109725952,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 3900
    },
    {
      "epoch": 0.624,
      "eval_train_accuracy": 0.9942,
      "eval_train_loss": 0.3194349408149719,
      "eval_train_runtime": 4.0752,
      "eval_train_samples_per_second": 1226.935,
      "eval_train_steps_per_second": 15.459,
      "step": 3900
    },
    {
      "epoch": 0.624,
      "eval_test_accuracy": 0.9926,
      "eval_test_loss": 0.3181317150592804,
      "eval_test_runtime": 4.7273,
      "eval_test_samples_per_second": 1057.677,
      "eval_test_steps_per_second": 13.327,
      "step": 3900
    },
    {
      "epoch": 0.62416,
      "grad_norm": 0.09333031624555588,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3901
    },
    {
      "epoch": 0.62432,
      "grad_norm": 0.09920453280210495,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3902
    },
    {
      "epoch": 0.62448,
      "grad_norm": 0.09617789834737778,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3903
    },
    {
      "epoch": 0.62464,
      "grad_norm": 0.09367837011814117,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 3904
    },
    {
      "epoch": 0.6248,
      "grad_norm": 0.10122548043727875,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3905
    },
    {
      "epoch": 0.62496,
      "grad_norm": 0.09709659218788147,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3906
    },
    {
      "epoch": 0.62512,
      "grad_norm": 0.09258096665143967,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3907
    },
    {
      "epoch": 0.62528,
      "grad_norm": 0.1040441021323204,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3908
    },
    {
      "epoch": 0.62544,
      "grad_norm": 0.09280425310134888,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3909
    },
    {
      "epoch": 0.6256,
      "grad_norm": 0.0912674218416214,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3910
    },
    {
      "epoch": 0.62576,
      "grad_norm": 0.08449404686689377,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3911
    },
    {
      "epoch": 0.62592,
      "grad_norm": 0.10465260595083237,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 3912
    },
    {
      "epoch": 0.62608,
      "grad_norm": 0.09383119642734528,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 3913
    },
    {
      "epoch": 0.62624,
      "grad_norm": 0.10241828858852386,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3914
    },
    {
      "epoch": 0.6264,
      "grad_norm": 0.09350240975618362,
      "learning_rate": 0.0001,
      "loss": 0.3049,
      "step": 3915
    },
    {
      "epoch": 0.62656,
      "grad_norm": 0.09838086366653442,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3916
    },
    {
      "epoch": 0.62672,
      "grad_norm": 0.186345636844635,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 3917
    },
    {
      "epoch": 0.62688,
      "grad_norm": 0.0907171219587326,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3918
    },
    {
      "epoch": 0.62704,
      "grad_norm": 0.0920744463801384,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 3919
    },
    {
      "epoch": 0.6272,
      "grad_norm": 0.0987362489104271,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 3920
    },
    {
      "epoch": 0.62736,
      "grad_norm": 0.09130405634641647,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3921
    },
    {
      "epoch": 0.62752,
      "grad_norm": 0.10823944956064224,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3922
    },
    {
      "epoch": 0.62768,
      "grad_norm": 0.08426140993833542,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3923
    },
    {
      "epoch": 0.62784,
      "grad_norm": 0.09239345043897629,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3924
    },
    {
      "epoch": 0.628,
      "grad_norm": 0.15916433930397034,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3925
    },
    {
      "epoch": 0.62816,
      "grad_norm": 0.10713255405426025,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3926
    },
    {
      "epoch": 0.62832,
      "grad_norm": 0.10660995543003082,
      "learning_rate": 0.0001,
      "loss": 0.342,
      "step": 3927
    },
    {
      "epoch": 0.62848,
      "grad_norm": 0.16666170954704285,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3928
    },
    {
      "epoch": 0.62864,
      "grad_norm": 0.08320184051990509,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3929
    },
    {
      "epoch": 0.6288,
      "grad_norm": 0.08193153142929077,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 3930
    },
    {
      "epoch": 0.62896,
      "grad_norm": 0.09787450730800629,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3931
    },
    {
      "epoch": 0.62912,
      "grad_norm": 0.13125629723072052,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3932
    },
    {
      "epoch": 0.62928,
      "grad_norm": 0.12082720547914505,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 3933
    },
    {
      "epoch": 0.62944,
      "grad_norm": 0.097016841173172,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3934
    },
    {
      "epoch": 0.6296,
      "grad_norm": 0.10687272995710373,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3935
    },
    {
      "epoch": 0.62976,
      "grad_norm": 0.1256582885980606,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3936
    },
    {
      "epoch": 0.62992,
      "grad_norm": 0.08872630447149277,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3937
    },
    {
      "epoch": 0.63008,
      "grad_norm": 0.08679110556840897,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3938
    },
    {
      "epoch": 0.63024,
      "grad_norm": 0.0908510759472847,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3939
    },
    {
      "epoch": 0.6304,
      "grad_norm": 0.0873895063996315,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3940
    },
    {
      "epoch": 0.63056,
      "grad_norm": 0.08800330013036728,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3941
    },
    {
      "epoch": 0.63072,
      "grad_norm": 0.09886208176612854,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3942
    },
    {
      "epoch": 0.63088,
      "grad_norm": 0.09634111821651459,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3943
    },
    {
      "epoch": 0.63104,
      "grad_norm": 0.08875059336423874,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3944
    },
    {
      "epoch": 0.6312,
      "grad_norm": 0.08012398332357407,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3945
    },
    {
      "epoch": 0.63136,
      "grad_norm": 0.1061459481716156,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3946
    },
    {
      "epoch": 0.63152,
      "grad_norm": 0.09340465813875198,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3947
    },
    {
      "epoch": 0.63168,
      "grad_norm": 0.09253484010696411,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3948
    },
    {
      "epoch": 0.63184,
      "grad_norm": 0.16053242981433868,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3949
    },
    {
      "epoch": 0.632,
      "grad_norm": 0.0914260670542717,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3950
    },
    {
      "epoch": 0.63216,
      "grad_norm": 0.0839008167386055,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3951
    },
    {
      "epoch": 0.63232,
      "grad_norm": 0.07913947105407715,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 3952
    },
    {
      "epoch": 0.63248,
      "grad_norm": 0.0898929312825203,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3953
    },
    {
      "epoch": 0.63264,
      "grad_norm": 0.0854753702878952,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3954
    },
    {
      "epoch": 0.6328,
      "grad_norm": 0.09289949387311935,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3955
    },
    {
      "epoch": 0.63296,
      "grad_norm": 0.1035848930478096,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 3956
    },
    {
      "epoch": 0.63312,
      "grad_norm": 0.0828159898519516,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3957
    },
    {
      "epoch": 0.63328,
      "grad_norm": 0.0906711295247078,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3958
    },
    {
      "epoch": 0.63344,
      "grad_norm": 0.07925961166620255,
      "learning_rate": 0.0001,
      "loss": 0.3016,
      "step": 3959
    },
    {
      "epoch": 0.6336,
      "grad_norm": 0.10607355833053589,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3960
    },
    {
      "epoch": 0.63376,
      "grad_norm": 0.13542760908603668,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3961
    },
    {
      "epoch": 0.63392,
      "grad_norm": 0.09272360801696777,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3962
    },
    {
      "epoch": 0.63408,
      "grad_norm": 0.09292378276586533,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3963
    },
    {
      "epoch": 0.63424,
      "grad_norm": 0.10847658663988113,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3964
    },
    {
      "epoch": 0.6344,
      "grad_norm": 0.0835505947470665,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3965
    },
    {
      "epoch": 0.63456,
      "grad_norm": 0.0872952938079834,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3966
    },
    {
      "epoch": 0.63472,
      "grad_norm": 0.10994330048561096,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3967
    },
    {
      "epoch": 0.63488,
      "grad_norm": 0.09470222890377045,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 3968
    },
    {
      "epoch": 0.63504,
      "grad_norm": 0.09449471533298492,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3969
    },
    {
      "epoch": 0.6352,
      "grad_norm": 0.08707638084888458,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3970
    },
    {
      "epoch": 0.63536,
      "grad_norm": 0.0985056683421135,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3971
    },
    {
      "epoch": 0.63552,
      "grad_norm": 0.10000071674585342,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3972
    },
    {
      "epoch": 0.63568,
      "grad_norm": 0.08964302390813828,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3973
    },
    {
      "epoch": 0.63584,
      "grad_norm": 0.08869940787553787,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 3974
    },
    {
      "epoch": 0.636,
      "grad_norm": 0.09831058979034424,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3975
    },
    {
      "epoch": 0.63616,
      "grad_norm": 0.09131669998168945,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 3976
    },
    {
      "epoch": 0.63632,
      "grad_norm": 0.09925341606140137,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3977
    },
    {
      "epoch": 0.63648,
      "grad_norm": 0.08831005543470383,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3978
    },
    {
      "epoch": 0.63664,
      "grad_norm": 0.08953221142292023,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3979
    },
    {
      "epoch": 0.6368,
      "grad_norm": 0.09952973574399948,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3980
    },
    {
      "epoch": 0.63696,
      "grad_norm": 0.10086982697248459,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3981
    },
    {
      "epoch": 0.63712,
      "grad_norm": 0.08875405043363571,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3982
    },
    {
      "epoch": 0.63728,
      "grad_norm": 0.08394505828619003,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3983
    },
    {
      "epoch": 0.63744,
      "grad_norm": 0.10311850160360336,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3984
    },
    {
      "epoch": 0.6376,
      "grad_norm": 0.09180747717618942,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 3985
    },
    {
      "epoch": 0.63776,
      "grad_norm": 0.1010235846042633,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3986
    },
    {
      "epoch": 0.63792,
      "grad_norm": 0.07793354243040085,
      "learning_rate": 0.0001,
      "loss": 0.2998,
      "step": 3987
    },
    {
      "epoch": 0.63808,
      "grad_norm": 0.08565941452980042,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3988
    },
    {
      "epoch": 0.63824,
      "grad_norm": 0.0899265855550766,
      "learning_rate": 0.0001,
      "loss": 0.2895,
      "step": 3989
    },
    {
      "epoch": 0.6384,
      "grad_norm": 0.1040802150964737,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 3990
    },
    {
      "epoch": 0.63856,
      "grad_norm": 0.10719306021928787,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3991
    },
    {
      "epoch": 0.63872,
      "grad_norm": 0.09351789206266403,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3992
    },
    {
      "epoch": 0.63888,
      "grad_norm": 0.08875835686922073,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3993
    },
    {
      "epoch": 0.63904,
      "grad_norm": 0.08736731112003326,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3994
    },
    {
      "epoch": 0.6392,
      "grad_norm": 0.07896845787763596,
      "learning_rate": 0.0001,
      "loss": 0.2931,
      "step": 3995
    },
    {
      "epoch": 0.63936,
      "grad_norm": 0.09989969432353973,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3996
    },
    {
      "epoch": 0.63952,
      "grad_norm": 0.10840623080730438,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3997
    },
    {
      "epoch": 0.63968,
      "grad_norm": 0.09500956535339355,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3998
    },
    {
      "epoch": 0.63984,
      "grad_norm": 0.08833357691764832,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3999
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.09593518078327179,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4000
    },
    {
      "epoch": 0.64,
      "eval_train_accuracy": 0.996,
      "eval_train_loss": 0.31926748156547546,
      "eval_train_runtime": 4.2452,
      "eval_train_samples_per_second": 1177.791,
      "eval_train_steps_per_second": 14.84,
      "step": 4000
    },
    {
      "epoch": 0.64,
      "eval_test_accuracy": 0.9944,
      "eval_test_loss": 0.318104088306427,
      "eval_test_runtime": 4.7121,
      "eval_test_samples_per_second": 1061.087,
      "eval_test_steps_per_second": 13.37,
      "step": 4000
    },
    {
      "epoch": 0.64016,
      "grad_norm": 0.0932156890630722,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4001
    },
    {
      "epoch": 0.64032,
      "grad_norm": 0.09577526897192001,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 4002
    },
    {
      "epoch": 0.64048,
      "grad_norm": 0.10690510272979736,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4003
    },
    {
      "epoch": 0.64064,
      "grad_norm": 0.0964619368314743,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4004
    },
    {
      "epoch": 0.6408,
      "grad_norm": 0.08702845871448517,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4005
    },
    {
      "epoch": 0.64096,
      "grad_norm": 0.0927053838968277,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4006
    },
    {
      "epoch": 0.64112,
      "grad_norm": 0.09590406715869904,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4007
    },
    {
      "epoch": 0.64128,
      "grad_norm": 0.10137973725795746,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4008
    },
    {
      "epoch": 0.64144,
      "grad_norm": 0.0946703627705574,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4009
    },
    {
      "epoch": 0.6416,
      "grad_norm": 0.12195964902639389,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4010
    },
    {
      "epoch": 0.64176,
      "grad_norm": 0.10365630686283112,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 4011
    },
    {
      "epoch": 0.64192,
      "grad_norm": 0.09306465089321136,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4012
    },
    {
      "epoch": 0.64208,
      "grad_norm": 0.08727070689201355,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 4013
    },
    {
      "epoch": 0.64224,
      "grad_norm": 0.11376315355300903,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4014
    },
    {
      "epoch": 0.6424,
      "grad_norm": 0.10428988933563232,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4015
    },
    {
      "epoch": 0.64256,
      "grad_norm": 0.1098279282450676,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4016
    },
    {
      "epoch": 0.64272,
      "grad_norm": 0.0919824093580246,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4017
    },
    {
      "epoch": 0.64288,
      "grad_norm": 0.10895507782697678,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4018
    },
    {
      "epoch": 0.64304,
      "grad_norm": 0.10087629407644272,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4019
    },
    {
      "epoch": 0.6432,
      "grad_norm": 0.09944511204957962,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 4020
    },
    {
      "epoch": 0.64336,
      "grad_norm": 0.08999960869550705,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4021
    },
    {
      "epoch": 0.64352,
      "grad_norm": 0.08392785489559174,
      "learning_rate": 0.0001,
      "loss": 0.3061,
      "step": 4022
    },
    {
      "epoch": 0.64368,
      "grad_norm": 0.10791028290987015,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4023
    },
    {
      "epoch": 0.64384,
      "grad_norm": 0.08737921714782715,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4024
    },
    {
      "epoch": 0.644,
      "grad_norm": 0.09372717887163162,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4025
    },
    {
      "epoch": 0.64416,
      "grad_norm": 0.14987023174762726,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4026
    },
    {
      "epoch": 0.64432,
      "grad_norm": 0.12403582036495209,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4027
    },
    {
      "epoch": 0.64448,
      "grad_norm": 0.09259911626577377,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 4028
    },
    {
      "epoch": 0.64464,
      "grad_norm": 0.09325969964265823,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4029
    },
    {
      "epoch": 0.6448,
      "grad_norm": 0.11013662815093994,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4030
    },
    {
      "epoch": 0.64496,
      "grad_norm": 0.11825466901063919,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4031
    },
    {
      "epoch": 0.64512,
      "grad_norm": 0.09259648621082306,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4032
    },
    {
      "epoch": 0.64528,
      "grad_norm": 0.09629496932029724,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 4033
    },
    {
      "epoch": 0.64544,
      "grad_norm": 0.10838311910629272,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4034
    },
    {
      "epoch": 0.6456,
      "grad_norm": 0.10165326297283173,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4035
    },
    {
      "epoch": 0.64576,
      "grad_norm": 0.11613397300243378,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 4036
    },
    {
      "epoch": 0.64592,
      "grad_norm": 0.10302277654409409,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4037
    },
    {
      "epoch": 0.64608,
      "grad_norm": 0.12680703401565552,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4038
    },
    {
      "epoch": 0.64624,
      "grad_norm": 0.09185715764760971,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4039
    },
    {
      "epoch": 0.6464,
      "grad_norm": 0.10009785741567612,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4040
    },
    {
      "epoch": 0.64656,
      "grad_norm": 0.0896274596452713,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4041
    },
    {
      "epoch": 0.64672,
      "grad_norm": 0.08713546395301819,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4042
    },
    {
      "epoch": 0.64688,
      "grad_norm": 0.0990552082657814,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4043
    },
    {
      "epoch": 0.64704,
      "grad_norm": 0.1006002202630043,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4044
    },
    {
      "epoch": 0.6472,
      "grad_norm": 0.10225652158260345,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4045
    },
    {
      "epoch": 0.64736,
      "grad_norm": 0.09433677047491074,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 4046
    },
    {
      "epoch": 0.64752,
      "grad_norm": 0.10007495433092117,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4047
    },
    {
      "epoch": 0.64768,
      "grad_norm": 0.08329170197248459,
      "learning_rate": 0.0001,
      "loss": 0.3028,
      "step": 4048
    },
    {
      "epoch": 0.64784,
      "grad_norm": 0.09371341019868851,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4049
    },
    {
      "epoch": 0.648,
      "grad_norm": 0.09804249554872513,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 4050
    },
    {
      "epoch": 0.64816,
      "grad_norm": 0.10556422173976898,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4051
    },
    {
      "epoch": 0.64832,
      "grad_norm": 0.08976194262504578,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4052
    },
    {
      "epoch": 0.64848,
      "grad_norm": 0.1064172089099884,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4053
    },
    {
      "epoch": 0.64864,
      "grad_norm": 0.08476493507623672,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4054
    },
    {
      "epoch": 0.6488,
      "grad_norm": 0.09855805337429047,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4055
    },
    {
      "epoch": 0.64896,
      "grad_norm": 0.10243772715330124,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4056
    },
    {
      "epoch": 0.64912,
      "grad_norm": 0.110176220536232,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4057
    },
    {
      "epoch": 0.64928,
      "grad_norm": 0.10843901336193085,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4058
    },
    {
      "epoch": 0.64944,
      "grad_norm": 0.11099371314048767,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 4059
    },
    {
      "epoch": 0.6496,
      "grad_norm": 0.09234333038330078,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4060
    },
    {
      "epoch": 0.64976,
      "grad_norm": 0.08683176338672638,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4061
    },
    {
      "epoch": 0.64992,
      "grad_norm": 0.11079318821430206,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 4062
    },
    {
      "epoch": 0.65008,
      "grad_norm": 0.09121082723140717,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4063
    },
    {
      "epoch": 0.65024,
      "grad_norm": 0.09927741438150406,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4064
    },
    {
      "epoch": 0.6504,
      "grad_norm": 0.10687212646007538,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4065
    },
    {
      "epoch": 0.65056,
      "grad_norm": 0.08001774549484253,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4066
    },
    {
      "epoch": 0.65072,
      "grad_norm": 0.09311274439096451,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 4067
    },
    {
      "epoch": 0.65088,
      "grad_norm": 0.08999521285295486,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4068
    },
    {
      "epoch": 0.65104,
      "grad_norm": 0.0942997932434082,
      "learning_rate": 0.0001,
      "loss": 0.2987,
      "step": 4069
    },
    {
      "epoch": 0.6512,
      "grad_norm": 0.10646379739046097,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4070
    },
    {
      "epoch": 0.65136,
      "grad_norm": 0.09616905450820923,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 4071
    },
    {
      "epoch": 0.65152,
      "grad_norm": 0.09210468828678131,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4072
    },
    {
      "epoch": 0.65168,
      "grad_norm": 0.09094178676605225,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4073
    },
    {
      "epoch": 0.65184,
      "grad_norm": 0.09644269943237305,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4074
    },
    {
      "epoch": 0.652,
      "grad_norm": 0.10931231826543808,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 4075
    },
    {
      "epoch": 0.65216,
      "grad_norm": 0.09084407985210419,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 4076
    },
    {
      "epoch": 0.65232,
      "grad_norm": 0.10288571566343307,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4077
    },
    {
      "epoch": 0.65248,
      "grad_norm": 0.08738991618156433,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4078
    },
    {
      "epoch": 0.65264,
      "grad_norm": 0.114516481757164,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4079
    },
    {
      "epoch": 0.6528,
      "grad_norm": 0.09389463812112808,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4080
    },
    {
      "epoch": 0.65296,
      "grad_norm": 0.0885300487279892,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4081
    },
    {
      "epoch": 0.65312,
      "grad_norm": 0.08630675077438354,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 4082
    },
    {
      "epoch": 0.65328,
      "grad_norm": 0.09453649073839188,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4083
    },
    {
      "epoch": 0.65344,
      "grad_norm": 0.1005617156624794,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4084
    },
    {
      "epoch": 0.6536,
      "grad_norm": 0.09882019460201263,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 4085
    },
    {
      "epoch": 0.65376,
      "grad_norm": 0.092132069170475,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 4086
    },
    {
      "epoch": 0.65392,
      "grad_norm": 0.10421949625015259,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4087
    },
    {
      "epoch": 0.65408,
      "grad_norm": 0.07899360358715057,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4088
    },
    {
      "epoch": 0.65424,
      "grad_norm": 0.09627123177051544,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4089
    },
    {
      "epoch": 0.6544,
      "grad_norm": 0.09326642006635666,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4090
    },
    {
      "epoch": 0.65456,
      "grad_norm": 0.11111835390329361,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4091
    },
    {
      "epoch": 0.65472,
      "grad_norm": 0.10387585312128067,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4092
    },
    {
      "epoch": 0.65488,
      "grad_norm": 0.09184081852436066,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 4093
    },
    {
      "epoch": 0.65504,
      "grad_norm": 0.09262824803590775,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4094
    },
    {
      "epoch": 0.6552,
      "grad_norm": 0.09882670640945435,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4095
    },
    {
      "epoch": 0.65536,
      "grad_norm": 0.10896332561969757,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4096
    },
    {
      "epoch": 0.65552,
      "grad_norm": 0.09847410023212433,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4097
    },
    {
      "epoch": 0.65568,
      "grad_norm": 0.08025560528039932,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4098
    },
    {
      "epoch": 0.65584,
      "grad_norm": 0.0926240012049675,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 4099
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.09134454280138016,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4100
    },
    {
      "epoch": 0.656,
      "eval_train_accuracy": 0.996,
      "eval_train_loss": 0.3187902867794037,
      "eval_train_runtime": 4.1569,
      "eval_train_samples_per_second": 1202.811,
      "eval_train_steps_per_second": 15.155,
      "step": 4100
    },
    {
      "epoch": 0.656,
      "eval_test_accuracy": 0.9956,
      "eval_test_loss": 0.31744638085365295,
      "eval_test_runtime": 4.7992,
      "eval_test_samples_per_second": 1041.845,
      "eval_test_steps_per_second": 13.127,
      "step": 4100
    },
    {
      "epoch": 0.65616,
      "grad_norm": 0.08667249977588654,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4101
    },
    {
      "epoch": 0.65632,
      "grad_norm": 0.10761808604001999,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4102
    },
    {
      "epoch": 0.65648,
      "grad_norm": 0.0898372009396553,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4103
    },
    {
      "epoch": 0.65664,
      "grad_norm": 0.09067147225141525,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4104
    },
    {
      "epoch": 0.6568,
      "grad_norm": 0.09378329664468765,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 4105
    },
    {
      "epoch": 0.65696,
      "grad_norm": 0.09551557898521423,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4106
    },
    {
      "epoch": 0.65712,
      "grad_norm": 0.10010777413845062,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 4107
    },
    {
      "epoch": 0.65728,
      "grad_norm": 0.09477461874485016,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4108
    },
    {
      "epoch": 0.65744,
      "grad_norm": 0.093726247549057,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 4109
    },
    {
      "epoch": 0.6576,
      "grad_norm": 0.13451112806797028,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 4110
    },
    {
      "epoch": 0.65776,
      "grad_norm": 0.0959787517786026,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4111
    },
    {
      "epoch": 0.65792,
      "grad_norm": 0.10326004028320312,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 4112
    },
    {
      "epoch": 0.65808,
      "grad_norm": 0.10420472919940948,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4113
    },
    {
      "epoch": 0.65824,
      "grad_norm": 0.09783875942230225,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4114
    },
    {
      "epoch": 0.6584,
      "grad_norm": 0.09905960410833359,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4115
    },
    {
      "epoch": 0.65856,
      "grad_norm": 0.08884893357753754,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4116
    },
    {
      "epoch": 0.65872,
      "grad_norm": 0.09072493761777878,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4117
    },
    {
      "epoch": 0.65888,
      "grad_norm": 0.10673902183771133,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4118
    },
    {
      "epoch": 0.65904,
      "grad_norm": 0.10901600867509842,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4119
    },
    {
      "epoch": 0.6592,
      "grad_norm": 0.09266931563615799,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 4120
    },
    {
      "epoch": 0.65936,
      "grad_norm": 0.09971415251493454,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4121
    },
    {
      "epoch": 0.65952,
      "grad_norm": 0.11005179584026337,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4122
    },
    {
      "epoch": 0.65968,
      "grad_norm": 0.0980779156088829,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4123
    },
    {
      "epoch": 0.65984,
      "grad_norm": 0.1775578111410141,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4124
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.08696728199720383,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4125
    },
    {
      "epoch": 0.66016,
      "grad_norm": 0.10994190722703934,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 4126
    },
    {
      "epoch": 0.66032,
      "grad_norm": 0.08582088351249695,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4127
    },
    {
      "epoch": 0.66048,
      "grad_norm": 0.13671427965164185,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4128
    },
    {
      "epoch": 0.66064,
      "grad_norm": 0.10336355865001678,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 4129
    },
    {
      "epoch": 0.6608,
      "grad_norm": 0.12121445685625076,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4130
    },
    {
      "epoch": 0.66096,
      "grad_norm": 0.11694088578224182,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 4131
    },
    {
      "epoch": 0.66112,
      "grad_norm": 0.10500799119472504,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4132
    },
    {
      "epoch": 0.66128,
      "grad_norm": 0.08679883182048798,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4133
    },
    {
      "epoch": 0.66144,
      "grad_norm": 0.09312518686056137,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4134
    },
    {
      "epoch": 0.6616,
      "grad_norm": 0.1393686830997467,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4135
    },
    {
      "epoch": 0.66176,
      "grad_norm": 0.17337460815906525,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4136
    },
    {
      "epoch": 0.66192,
      "grad_norm": 0.14139361679553986,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 4137
    },
    {
      "epoch": 0.66208,
      "grad_norm": 0.08965973556041718,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4138
    },
    {
      "epoch": 0.66224,
      "grad_norm": 0.1436086744070053,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4139
    },
    {
      "epoch": 0.6624,
      "grad_norm": 0.19774523377418518,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4140
    },
    {
      "epoch": 0.66256,
      "grad_norm": 0.23849570751190186,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4141
    },
    {
      "epoch": 0.66272,
      "grad_norm": 0.09481257945299149,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4142
    },
    {
      "epoch": 0.66288,
      "grad_norm": 0.1025761291384697,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4143
    },
    {
      "epoch": 0.66304,
      "grad_norm": 0.22220361232757568,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 4144
    },
    {
      "epoch": 0.6632,
      "grad_norm": 0.15230868756771088,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 4145
    },
    {
      "epoch": 0.66336,
      "grad_norm": 0.10603617131710052,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4146
    },
    {
      "epoch": 0.66352,
      "grad_norm": 0.10716453939676285,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4147
    },
    {
      "epoch": 0.66368,
      "grad_norm": 0.16205236315727234,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4148
    },
    {
      "epoch": 0.66384,
      "grad_norm": 0.08869486302137375,
      "learning_rate": 0.0001,
      "loss": 0.2983,
      "step": 4149
    },
    {
      "epoch": 0.664,
      "grad_norm": 0.08708242326974869,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4150
    },
    {
      "epoch": 0.66416,
      "grad_norm": 0.1293354332447052,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4151
    },
    {
      "epoch": 0.66432,
      "grad_norm": 0.12183128297328949,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4152
    },
    {
      "epoch": 0.66448,
      "grad_norm": 0.14071790874004364,
      "learning_rate": 0.0001,
      "loss": 0.2985,
      "step": 4153
    },
    {
      "epoch": 0.66464,
      "grad_norm": 0.15656889975070953,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 4154
    },
    {
      "epoch": 0.6648,
      "grad_norm": 0.13687288761138916,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4155
    },
    {
      "epoch": 0.66496,
      "grad_norm": 0.08996236324310303,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4156
    },
    {
      "epoch": 0.66512,
      "grad_norm": 0.09034545719623566,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4157
    },
    {
      "epoch": 0.66528,
      "grad_norm": 0.09719724953174591,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 4158
    },
    {
      "epoch": 0.66544,
      "grad_norm": 0.10935365408658981,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4159
    },
    {
      "epoch": 0.6656,
      "grad_norm": 0.11274547129869461,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 4160
    },
    {
      "epoch": 0.66576,
      "grad_norm": 0.11611298471689224,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4161
    },
    {
      "epoch": 0.66592,
      "grad_norm": 0.1415461301803589,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 4162
    },
    {
      "epoch": 0.66608,
      "grad_norm": 0.10688252747058868,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4163
    },
    {
      "epoch": 0.66624,
      "grad_norm": 0.09435340017080307,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4164
    },
    {
      "epoch": 0.6664,
      "grad_norm": 0.0951419472694397,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4165
    },
    {
      "epoch": 0.66656,
      "grad_norm": 0.1464022696018219,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4166
    },
    {
      "epoch": 0.66672,
      "grad_norm": 0.12018332630395889,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4167
    },
    {
      "epoch": 0.66688,
      "grad_norm": 0.17129471898078918,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 4168
    },
    {
      "epoch": 0.66704,
      "grad_norm": 0.13132251799106598,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4169
    },
    {
      "epoch": 0.6672,
      "grad_norm": 0.1863057166337967,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4170
    },
    {
      "epoch": 0.66736,
      "grad_norm": 0.17261327803134918,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4171
    },
    {
      "epoch": 0.66752,
      "grad_norm": 0.11505019664764404,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4172
    },
    {
      "epoch": 0.66768,
      "grad_norm": 0.2127932757139206,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4173
    },
    {
      "epoch": 0.66784,
      "grad_norm": 0.11022589355707169,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4174
    },
    {
      "epoch": 0.668,
      "grad_norm": 0.14248913526535034,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4175
    },
    {
      "epoch": 0.66816,
      "grad_norm": 0.15078917145729065,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 4176
    },
    {
      "epoch": 0.66832,
      "grad_norm": 0.11275336146354675,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4177
    },
    {
      "epoch": 0.66848,
      "grad_norm": 0.10799925029277802,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 4178
    },
    {
      "epoch": 0.66864,
      "grad_norm": 0.1421971470117569,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4179
    },
    {
      "epoch": 0.6688,
      "grad_norm": 0.14368936419487,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 4180
    },
    {
      "epoch": 0.66896,
      "grad_norm": 0.191392719745636,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4181
    },
    {
      "epoch": 0.66912,
      "grad_norm": 0.17436930537223816,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4182
    },
    {
      "epoch": 0.66928,
      "grad_norm": 0.14030571281909943,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4183
    },
    {
      "epoch": 0.66944,
      "grad_norm": 0.13008767366409302,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 4184
    },
    {
      "epoch": 0.6696,
      "grad_norm": 0.10165207833051682,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4185
    },
    {
      "epoch": 0.66976,
      "grad_norm": 0.1221017986536026,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4186
    },
    {
      "epoch": 0.66992,
      "grad_norm": 0.12239546328783035,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4187
    },
    {
      "epoch": 0.67008,
      "grad_norm": 0.11863898485898972,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4188
    },
    {
      "epoch": 0.67024,
      "grad_norm": 0.0951399952173233,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 4189
    },
    {
      "epoch": 0.6704,
      "grad_norm": 0.11001498997211456,
      "learning_rate": 0.0001,
      "loss": 0.301,
      "step": 4190
    },
    {
      "epoch": 0.67056,
      "grad_norm": 0.24205176532268524,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 4191
    },
    {
      "epoch": 0.67072,
      "grad_norm": 0.12078464031219482,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 4192
    },
    {
      "epoch": 0.67088,
      "grad_norm": 0.10275658965110779,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4193
    },
    {
      "epoch": 0.67104,
      "grad_norm": 0.09611769020557404,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4194
    },
    {
      "epoch": 0.6712,
      "grad_norm": 0.1508742868900299,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4195
    },
    {
      "epoch": 0.67136,
      "grad_norm": 0.1601584553718567,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4196
    },
    {
      "epoch": 0.67152,
      "grad_norm": 0.08826766163110733,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4197
    },
    {
      "epoch": 0.67168,
      "grad_norm": 0.11013350635766983,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 4198
    },
    {
      "epoch": 0.67184,
      "grad_norm": 0.10048186033964157,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4199
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.09387166053056717,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 4200
    },
    {
      "epoch": 0.672,
      "eval_train_accuracy": 0.9958,
      "eval_train_loss": 0.3191874027252197,
      "eval_train_runtime": 4.2385,
      "eval_train_samples_per_second": 1179.673,
      "eval_train_steps_per_second": 14.864,
      "step": 4200
    },
    {
      "epoch": 0.672,
      "eval_test_accuracy": 0.9962,
      "eval_test_loss": 0.31799018383026123,
      "eval_test_runtime": 4.9841,
      "eval_test_samples_per_second": 1003.18,
      "eval_test_steps_per_second": 12.64,
      "step": 4200
    },
    {
      "epoch": 0.67216,
      "grad_norm": 0.10288532078266144,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4201
    },
    {
      "epoch": 0.67232,
      "grad_norm": 0.13413262367248535,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4202
    },
    {
      "epoch": 0.67248,
      "grad_norm": 0.12364780902862549,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4203
    },
    {
      "epoch": 0.67264,
      "grad_norm": 0.09347065538167953,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4204
    },
    {
      "epoch": 0.6728,
      "grad_norm": 0.09615916013717651,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4205
    },
    {
      "epoch": 0.67296,
      "grad_norm": 0.0927523672580719,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 4206
    },
    {
      "epoch": 0.67312,
      "grad_norm": 0.10765375941991806,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4207
    },
    {
      "epoch": 0.67328,
      "grad_norm": 0.1308852881193161,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 4208
    },
    {
      "epoch": 0.67344,
      "grad_norm": 0.08968270570039749,
      "learning_rate": 0.0001,
      "loss": 0.2997,
      "step": 4209
    },
    {
      "epoch": 0.6736,
      "grad_norm": 0.10496653616428375,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4210
    },
    {
      "epoch": 0.67376,
      "grad_norm": 0.09045130759477615,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4211
    },
    {
      "epoch": 0.67392,
      "grad_norm": 0.08769742399454117,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4212
    },
    {
      "epoch": 0.67408,
      "grad_norm": 0.10007360577583313,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4213
    },
    {
      "epoch": 0.67424,
      "grad_norm": 0.11365325003862381,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4214
    },
    {
      "epoch": 0.6744,
      "grad_norm": 0.10238843411207199,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4215
    },
    {
      "epoch": 0.67456,
      "grad_norm": 0.09261872619390488,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4216
    },
    {
      "epoch": 0.67472,
      "grad_norm": 0.08483188599348068,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 4217
    },
    {
      "epoch": 0.67488,
      "grad_norm": 0.14098776876926422,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4218
    },
    {
      "epoch": 0.67504,
      "grad_norm": 0.13815081119537354,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 4219
    },
    {
      "epoch": 0.6752,
      "grad_norm": 0.10671903938055038,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 4220
    },
    {
      "epoch": 0.67536,
      "grad_norm": 0.10829750448465347,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4221
    },
    {
      "epoch": 0.67552,
      "grad_norm": 0.12907080352306366,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4222
    },
    {
      "epoch": 0.67568,
      "grad_norm": 0.09121904522180557,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4223
    },
    {
      "epoch": 0.67584,
      "grad_norm": 0.1630139797925949,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 4224
    },
    {
      "epoch": 0.676,
      "grad_norm": 0.17342278361320496,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4225
    },
    {
      "epoch": 0.67616,
      "grad_norm": 0.09140925109386444,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4226
    },
    {
      "epoch": 0.67632,
      "grad_norm": 0.10159240663051605,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4227
    },
    {
      "epoch": 0.67648,
      "grad_norm": 0.11018043756484985,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4228
    },
    {
      "epoch": 0.67664,
      "grad_norm": 0.11347661167383194,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4229
    },
    {
      "epoch": 0.6768,
      "grad_norm": 0.10658407211303711,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4230
    },
    {
      "epoch": 0.67696,
      "grad_norm": 0.12565641105175018,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 4231
    },
    {
      "epoch": 0.67712,
      "grad_norm": 0.12023011595010757,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4232
    },
    {
      "epoch": 0.67728,
      "grad_norm": 0.09142352640628815,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4233
    },
    {
      "epoch": 0.67744,
      "grad_norm": 0.08821917325258255,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4234
    },
    {
      "epoch": 0.6776,
      "grad_norm": 0.09837639331817627,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4235
    },
    {
      "epoch": 0.67776,
      "grad_norm": 0.095987968146801,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4236
    },
    {
      "epoch": 0.67792,
      "grad_norm": 0.11521422117948532,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4237
    },
    {
      "epoch": 0.67808,
      "grad_norm": 0.11605950444936752,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4238
    },
    {
      "epoch": 0.67824,
      "grad_norm": 0.16347382962703705,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 4239
    },
    {
      "epoch": 0.6784,
      "grad_norm": 0.09788544476032257,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4240
    },
    {
      "epoch": 0.67856,
      "grad_norm": 0.10326757282018661,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4241
    },
    {
      "epoch": 0.67872,
      "grad_norm": 0.29613056778907776,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 4242
    },
    {
      "epoch": 0.67888,
      "grad_norm": 0.14143939316272736,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4243
    },
    {
      "epoch": 0.67904,
      "grad_norm": 0.2840301990509033,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4244
    },
    {
      "epoch": 0.6792,
      "grad_norm": 0.15330392122268677,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 4245
    },
    {
      "epoch": 0.67936,
      "grad_norm": 0.44665515422821045,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 4246
    },
    {
      "epoch": 0.67952,
      "grad_norm": 0.10044756531715393,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4247
    },
    {
      "epoch": 0.67968,
      "grad_norm": 0.3678511381149292,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 4248
    },
    {
      "epoch": 0.67984,
      "grad_norm": 0.5058865547180176,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4249
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.9377855062484741,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 4250
    },
    {
      "epoch": 0.68016,
      "grad_norm": 2.0327699184417725,
      "learning_rate": 0.0001,
      "loss": 0.3519,
      "step": 4251
    },
    {
      "epoch": 0.68032,
      "grad_norm": 0.3770652115345001,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4252
    },
    {
      "epoch": 0.68048,
      "grad_norm": 1.1108317375183105,
      "learning_rate": 0.0001,
      "loss": 0.3572,
      "step": 4253
    },
    {
      "epoch": 0.68064,
      "grad_norm": 4.160608768463135,
      "learning_rate": 0.0001,
      "loss": 0.4584,
      "step": 4254
    },
    {
      "epoch": 0.6808,
      "grad_norm": 2.4219186305999756,
      "learning_rate": 0.0001,
      "loss": 0.4324,
      "step": 4255
    },
    {
      "epoch": 0.68096,
      "grad_norm": 2.766577959060669,
      "learning_rate": 0.0001,
      "loss": 0.4282,
      "step": 4256
    },
    {
      "epoch": 0.68112,
      "grad_norm": 1.5379154682159424,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 4257
    },
    {
      "epoch": 0.68128,
      "grad_norm": 2.0991361141204834,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 4258
    },
    {
      "epoch": 0.68144,
      "grad_norm": 1.3497759103775024,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 4259
    },
    {
      "epoch": 0.6816,
      "grad_norm": 2.1670055389404297,
      "learning_rate": 0.0001,
      "loss": 0.4172,
      "step": 4260
    },
    {
      "epoch": 0.68176,
      "grad_norm": 1.946250081062317,
      "learning_rate": 0.0001,
      "loss": 0.3767,
      "step": 4261
    },
    {
      "epoch": 0.68192,
      "grad_norm": 1.4496301412582397,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 4262
    },
    {
      "epoch": 0.68208,
      "grad_norm": 2.988621473312378,
      "learning_rate": 0.0001,
      "loss": 0.4484,
      "step": 4263
    },
    {
      "epoch": 0.68224,
      "grad_norm": 0.6364216208457947,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 4264
    },
    {
      "epoch": 0.6824,
      "grad_norm": 0.6350489854812622,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 4265
    },
    {
      "epoch": 0.68256,
      "grad_norm": 1.2656489610671997,
      "learning_rate": 0.0001,
      "loss": 0.3573,
      "step": 4266
    },
    {
      "epoch": 0.68272,
      "grad_norm": 0.3014744818210602,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 4267
    },
    {
      "epoch": 0.68288,
      "grad_norm": 0.5823450088500977,
      "learning_rate": 0.0001,
      "loss": 0.3478,
      "step": 4268
    },
    {
      "epoch": 0.68304,
      "grad_norm": 0.3581940829753876,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 4269
    },
    {
      "epoch": 0.6832,
      "grad_norm": 0.33329126238822937,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 4270
    },
    {
      "epoch": 0.68336,
      "grad_norm": 0.2966190576553345,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 4271
    },
    {
      "epoch": 0.68352,
      "grad_norm": 0.4493212103843689,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 4272
    },
    {
      "epoch": 0.68368,
      "grad_norm": 0.17514532804489136,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 4273
    },
    {
      "epoch": 0.68384,
      "grad_norm": 0.16183505952358246,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 4274
    },
    {
      "epoch": 0.684,
      "grad_norm": 0.12841685116291046,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4275
    },
    {
      "epoch": 0.68416,
      "grad_norm": 0.15333573520183563,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4276
    },
    {
      "epoch": 0.68432,
      "grad_norm": 0.19436372816562653,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 4277
    },
    {
      "epoch": 0.68448,
      "grad_norm": 0.1853417456150055,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 4278
    },
    {
      "epoch": 0.68464,
      "grad_norm": 0.18636059761047363,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4279
    },
    {
      "epoch": 0.6848,
      "grad_norm": 0.15387147665023804,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4280
    },
    {
      "epoch": 0.68496,
      "grad_norm": 0.16126711666584015,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 4281
    },
    {
      "epoch": 0.68512,
      "grad_norm": 0.15741397440433502,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4282
    },
    {
      "epoch": 0.68528,
      "grad_norm": 0.15829460322856903,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4283
    },
    {
      "epoch": 0.68544,
      "grad_norm": 0.11479313671588898,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4284
    },
    {
      "epoch": 0.6856,
      "grad_norm": 0.17856577038764954,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4285
    },
    {
      "epoch": 0.68576,
      "grad_norm": 0.149746835231781,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4286
    },
    {
      "epoch": 0.68592,
      "grad_norm": 0.1626691222190857,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4287
    },
    {
      "epoch": 0.68608,
      "grad_norm": 0.10945073515176773,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4288
    },
    {
      "epoch": 0.68624,
      "grad_norm": 0.11291835457086563,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4289
    },
    {
      "epoch": 0.6864,
      "grad_norm": 0.11626488715410233,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 4290
    },
    {
      "epoch": 0.68656,
      "grad_norm": 0.11751098930835724,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4291
    },
    {
      "epoch": 0.68672,
      "grad_norm": 0.11091221868991852,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4292
    },
    {
      "epoch": 0.68688,
      "grad_norm": 0.11928211897611618,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 4293
    },
    {
      "epoch": 0.68704,
      "grad_norm": 0.09567002952098846,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 4294
    },
    {
      "epoch": 0.6872,
      "grad_norm": 0.13217219710350037,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4295
    },
    {
      "epoch": 0.68736,
      "grad_norm": 0.12967516481876373,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4296
    },
    {
      "epoch": 0.68752,
      "grad_norm": 0.10509304702281952,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 4297
    },
    {
      "epoch": 0.68768,
      "grad_norm": 0.09754002839326859,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 4298
    },
    {
      "epoch": 0.68784,
      "grad_norm": 0.10458827018737793,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4299
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.11640683561563492,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4300
    },
    {
      "epoch": 0.688,
      "eval_train_accuracy": 0.993,
      "eval_train_loss": 0.3196205496788025,
      "eval_train_runtime": 4.0742,
      "eval_train_samples_per_second": 1227.224,
      "eval_train_steps_per_second": 15.463,
      "step": 4300
    },
    {
      "epoch": 0.688,
      "eval_test_accuracy": 0.9928,
      "eval_test_loss": 0.3182014226913452,
      "eval_test_runtime": 4.9717,
      "eval_test_samples_per_second": 1005.688,
      "eval_test_steps_per_second": 12.672,
      "step": 4300
    },
    {
      "epoch": 0.68816,
      "grad_norm": 0.09882506728172302,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4301
    },
    {
      "epoch": 0.68832,
      "grad_norm": 0.11059856414794922,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4302
    },
    {
      "epoch": 0.68848,
      "grad_norm": 0.12637752294540405,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4303
    },
    {
      "epoch": 0.68864,
      "grad_norm": 0.1316092610359192,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4304
    },
    {
      "epoch": 0.6888,
      "grad_norm": 0.09542497992515564,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 4305
    },
    {
      "epoch": 0.68896,
      "grad_norm": 0.1426743119955063,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4306
    },
    {
      "epoch": 0.68912,
      "grad_norm": 0.1051853820681572,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4307
    },
    {
      "epoch": 0.68928,
      "grad_norm": 0.1323881894350052,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 4308
    },
    {
      "epoch": 0.68944,
      "grad_norm": 0.09134234488010406,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4309
    },
    {
      "epoch": 0.6896,
      "grad_norm": 0.1292104423046112,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4310
    },
    {
      "epoch": 0.68976,
      "grad_norm": 0.10731000453233719,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4311
    },
    {
      "epoch": 0.68992,
      "grad_norm": 0.0884978324174881,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4312
    },
    {
      "epoch": 0.69008,
      "grad_norm": 0.09805256873369217,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4313
    },
    {
      "epoch": 0.69024,
      "grad_norm": 0.09442351758480072,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4314
    },
    {
      "epoch": 0.6904,
      "grad_norm": 0.10355932265520096,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4315
    },
    {
      "epoch": 0.69056,
      "grad_norm": 0.1181226596236229,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4316
    },
    {
      "epoch": 0.69072,
      "grad_norm": 0.09009853005409241,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4317
    },
    {
      "epoch": 0.69088,
      "grad_norm": 0.09840010851621628,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 4318
    },
    {
      "epoch": 0.69104,
      "grad_norm": 0.09731972962617874,
      "learning_rate": 0.0001,
      "loss": 0.3013,
      "step": 4319
    },
    {
      "epoch": 0.6912,
      "grad_norm": 0.09538167715072632,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 4320
    },
    {
      "epoch": 0.69136,
      "grad_norm": 0.09337098151445389,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 4321
    },
    {
      "epoch": 0.69152,
      "grad_norm": 0.08504482358694077,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4322
    },
    {
      "epoch": 0.69168,
      "grad_norm": 0.09267448633909225,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4323
    },
    {
      "epoch": 0.69184,
      "grad_norm": 0.10075730830430984,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 4324
    },
    {
      "epoch": 0.692,
      "grad_norm": 0.09612447023391724,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4325
    },
    {
      "epoch": 0.69216,
      "grad_norm": 0.09439920634031296,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4326
    },
    {
      "epoch": 0.69232,
      "grad_norm": 0.1899033784866333,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 4327
    },
    {
      "epoch": 0.69248,
      "grad_norm": 0.09067200124263763,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4328
    },
    {
      "epoch": 0.69264,
      "grad_norm": 0.09183083474636078,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4329
    },
    {
      "epoch": 0.6928,
      "grad_norm": 0.10634874552488327,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4330
    },
    {
      "epoch": 0.69296,
      "grad_norm": 0.0888609066605568,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4331
    },
    {
      "epoch": 0.69312,
      "grad_norm": 0.09522057324647903,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4332
    },
    {
      "epoch": 0.69328,
      "grad_norm": 0.09427393227815628,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 4333
    },
    {
      "epoch": 0.69344,
      "grad_norm": 0.19618074595928192,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4334
    },
    {
      "epoch": 0.6936,
      "grad_norm": 0.11349230259656906,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4335
    },
    {
      "epoch": 0.69376,
      "grad_norm": 0.08757109940052032,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4336
    },
    {
      "epoch": 0.69392,
      "grad_norm": 0.08584999293088913,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4337
    },
    {
      "epoch": 0.69408,
      "grad_norm": 0.09535194933414459,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4338
    },
    {
      "epoch": 0.69424,
      "grad_norm": 0.09539426863193512,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4339
    },
    {
      "epoch": 0.6944,
      "grad_norm": 0.11973700672388077,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 4340
    },
    {
      "epoch": 0.69456,
      "grad_norm": 0.09816981106996536,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4341
    },
    {
      "epoch": 0.69472,
      "grad_norm": 0.09738034754991531,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 4342
    },
    {
      "epoch": 0.69488,
      "grad_norm": 0.11626878380775452,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4343
    },
    {
      "epoch": 0.69504,
      "grad_norm": 0.11154557019472122,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4344
    },
    {
      "epoch": 0.6952,
      "grad_norm": 0.09168434888124466,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4345
    },
    {
      "epoch": 0.69536,
      "grad_norm": 0.10550922900438309,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4346
    },
    {
      "epoch": 0.69552,
      "grad_norm": 0.09399164468050003,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4347
    },
    {
      "epoch": 0.69568,
      "grad_norm": 0.09800362586975098,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 4348
    },
    {
      "epoch": 0.69584,
      "grad_norm": 0.1007382944226265,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4349
    },
    {
      "epoch": 0.696,
      "grad_norm": 0.10009032487869263,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4350
    },
    {
      "epoch": 0.69616,
      "grad_norm": 0.10132721811532974,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 4351
    },
    {
      "epoch": 0.69632,
      "grad_norm": 0.08640364557504654,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 4352
    },
    {
      "epoch": 0.69648,
      "grad_norm": 0.09731267392635345,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4353
    },
    {
      "epoch": 0.69664,
      "grad_norm": 0.10613111406564713,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4354
    },
    {
      "epoch": 0.6968,
      "grad_norm": 0.08546990901231766,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4355
    },
    {
      "epoch": 0.69696,
      "grad_norm": 0.09579188376665115,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4356
    },
    {
      "epoch": 0.69712,
      "grad_norm": 0.09075279533863068,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4357
    },
    {
      "epoch": 0.69728,
      "grad_norm": 0.09509024024009705,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 4358
    },
    {
      "epoch": 0.69744,
      "grad_norm": 0.09112471342086792,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4359
    },
    {
      "epoch": 0.6976,
      "grad_norm": 0.12438245117664337,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4360
    },
    {
      "epoch": 0.69776,
      "grad_norm": 0.09963519126176834,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4361
    },
    {
      "epoch": 0.69792,
      "grad_norm": 0.09205642342567444,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 4362
    },
    {
      "epoch": 0.69808,
      "grad_norm": 0.10607349872589111,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4363
    },
    {
      "epoch": 0.69824,
      "grad_norm": 0.08580764383077621,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4364
    },
    {
      "epoch": 0.6984,
      "grad_norm": 0.08552584797143936,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4365
    },
    {
      "epoch": 0.69856,
      "grad_norm": 0.0983668640255928,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4366
    },
    {
      "epoch": 0.69872,
      "grad_norm": 0.1075202152132988,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4367
    },
    {
      "epoch": 0.69888,
      "grad_norm": 0.09758582711219788,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4368
    },
    {
      "epoch": 0.69904,
      "grad_norm": 0.08648435026407242,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 4369
    },
    {
      "epoch": 0.6992,
      "grad_norm": 0.09341944009065628,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4370
    },
    {
      "epoch": 0.69936,
      "grad_norm": 0.08585915714502335,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4371
    },
    {
      "epoch": 0.69952,
      "grad_norm": 0.15866228938102722,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 4372
    },
    {
      "epoch": 0.69968,
      "grad_norm": 0.09976333379745483,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4373
    },
    {
      "epoch": 0.69984,
      "grad_norm": 0.10047225654125214,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4374
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.11040319502353668,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4375
    },
    {
      "epoch": 0.70016,
      "grad_norm": 0.12230165302753448,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4376
    },
    {
      "epoch": 0.70032,
      "grad_norm": 0.09395001828670502,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4377
    },
    {
      "epoch": 0.70048,
      "grad_norm": 0.10182826220989227,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4378
    },
    {
      "epoch": 0.70064,
      "grad_norm": 0.09632346034049988,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4379
    },
    {
      "epoch": 0.7008,
      "grad_norm": 0.10509630292654037,
      "learning_rate": 0.0001,
      "loss": 0.302,
      "step": 4380
    },
    {
      "epoch": 0.70096,
      "grad_norm": 0.08586416393518448,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4381
    },
    {
      "epoch": 0.70112,
      "grad_norm": 0.08678821474313736,
      "learning_rate": 0.0001,
      "loss": 0.3068,
      "step": 4382
    },
    {
      "epoch": 0.70128,
      "grad_norm": 0.08364371210336685,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4383
    },
    {
      "epoch": 0.70144,
      "grad_norm": 0.11471431702375412,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4384
    },
    {
      "epoch": 0.7016,
      "grad_norm": 0.08383314311504364,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4385
    },
    {
      "epoch": 0.70176,
      "grad_norm": 0.09640203416347504,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 4386
    },
    {
      "epoch": 0.70192,
      "grad_norm": 0.09677939862012863,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4387
    },
    {
      "epoch": 0.70208,
      "grad_norm": 0.09185153990983963,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 4388
    },
    {
      "epoch": 0.70224,
      "grad_norm": 0.09145776182413101,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4389
    },
    {
      "epoch": 0.7024,
      "grad_norm": 0.0899108350276947,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4390
    },
    {
      "epoch": 0.70256,
      "grad_norm": 0.09227007627487183,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4391
    },
    {
      "epoch": 0.70272,
      "grad_norm": 0.0804251953959465,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4392
    },
    {
      "epoch": 0.70288,
      "grad_norm": 0.09857862442731857,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4393
    },
    {
      "epoch": 0.70304,
      "grad_norm": 0.0992220789194107,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4394
    },
    {
      "epoch": 0.7032,
      "grad_norm": 0.09820158034563065,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4395
    },
    {
      "epoch": 0.70336,
      "grad_norm": 0.09644672274589539,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4396
    },
    {
      "epoch": 0.70352,
      "grad_norm": 0.09801653027534485,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4397
    },
    {
      "epoch": 0.70368,
      "grad_norm": 0.08613044768571854,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4398
    },
    {
      "epoch": 0.70384,
      "grad_norm": 0.10221032798290253,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4399
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.08821790665388107,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4400
    },
    {
      "epoch": 0.704,
      "eval_train_accuracy": 0.9948,
      "eval_train_loss": 0.3179911971092224,
      "eval_train_runtime": 4.0754,
      "eval_train_samples_per_second": 1226.874,
      "eval_train_steps_per_second": 15.459,
      "step": 4400
    },
    {
      "epoch": 0.704,
      "eval_test_accuracy": 0.9964,
      "eval_test_loss": 0.31659549474716187,
      "eval_test_runtime": 5.1357,
      "eval_test_samples_per_second": 973.568,
      "eval_test_steps_per_second": 12.267,
      "step": 4400
    },
    {
      "epoch": 0.70416,
      "grad_norm": 0.09280814230442047,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4401
    },
    {
      "epoch": 0.70432,
      "grad_norm": 0.09286488592624664,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4402
    },
    {
      "epoch": 0.70448,
      "grad_norm": 0.08516814559698105,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4403
    },
    {
      "epoch": 0.70464,
      "grad_norm": 0.09274452179670334,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4404
    },
    {
      "epoch": 0.7048,
      "grad_norm": 0.13210085034370422,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4405
    },
    {
      "epoch": 0.70496,
      "grad_norm": 0.09631293267011642,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 4406
    },
    {
      "epoch": 0.70512,
      "grad_norm": 0.09483381360769272,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 4407
    },
    {
      "epoch": 0.70528,
      "grad_norm": 0.09819445759057999,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4408
    },
    {
      "epoch": 0.70544,
      "grad_norm": 0.09668165445327759,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4409
    },
    {
      "epoch": 0.7056,
      "grad_norm": 0.18448308110237122,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4410
    },
    {
      "epoch": 0.70576,
      "grad_norm": 0.08542316406965256,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4411
    },
    {
      "epoch": 0.70592,
      "grad_norm": 0.09846038371324539,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4412
    },
    {
      "epoch": 0.70608,
      "grad_norm": 0.09125738590955734,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4413
    },
    {
      "epoch": 0.70624,
      "grad_norm": 0.08814021944999695,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4414
    },
    {
      "epoch": 0.7064,
      "grad_norm": 0.08568074554204941,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4415
    },
    {
      "epoch": 0.70656,
      "grad_norm": 0.11021539568901062,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4416
    },
    {
      "epoch": 0.70672,
      "grad_norm": 0.10412903875112534,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4417
    },
    {
      "epoch": 0.70688,
      "grad_norm": 0.08061201870441437,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4418
    },
    {
      "epoch": 0.70704,
      "grad_norm": 0.09687480330467224,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4419
    },
    {
      "epoch": 0.7072,
      "grad_norm": 0.10101381689310074,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4420
    },
    {
      "epoch": 0.70736,
      "grad_norm": 0.09165916591882706,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4421
    },
    {
      "epoch": 0.70752,
      "grad_norm": 0.08629006892442703,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 4422
    },
    {
      "epoch": 0.70768,
      "grad_norm": 0.09614178538322449,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4423
    },
    {
      "epoch": 0.70784,
      "grad_norm": 0.10084107518196106,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 4424
    },
    {
      "epoch": 0.708,
      "grad_norm": 0.092702217400074,
      "learning_rate": 0.0001,
      "loss": 0.2987,
      "step": 4425
    },
    {
      "epoch": 0.70816,
      "grad_norm": 0.0989244282245636,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 4426
    },
    {
      "epoch": 0.70832,
      "grad_norm": 0.08826049417257309,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 4427
    },
    {
      "epoch": 0.70848,
      "grad_norm": 0.09077322483062744,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4428
    },
    {
      "epoch": 0.70864,
      "grad_norm": 0.1274401843547821,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4429
    },
    {
      "epoch": 0.7088,
      "grad_norm": 0.08498789370059967,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4430
    },
    {
      "epoch": 0.70896,
      "grad_norm": 0.1025843620300293,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 4431
    },
    {
      "epoch": 0.70912,
      "grad_norm": 0.09332459419965744,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4432
    },
    {
      "epoch": 0.70928,
      "grad_norm": 0.11414452642202377,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4433
    },
    {
      "epoch": 0.70944,
      "grad_norm": 0.08070934563875198,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4434
    },
    {
      "epoch": 0.7096,
      "grad_norm": 0.10291784256696701,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 4435
    },
    {
      "epoch": 0.70976,
      "grad_norm": 0.09883212298154831,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4436
    },
    {
      "epoch": 0.70992,
      "grad_norm": 0.09329187124967575,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 4437
    },
    {
      "epoch": 0.71008,
      "grad_norm": 0.08573323488235474,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4438
    },
    {
      "epoch": 0.71024,
      "grad_norm": 0.09585745632648468,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4439
    },
    {
      "epoch": 0.7104,
      "grad_norm": 0.09264911711215973,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4440
    },
    {
      "epoch": 0.71056,
      "grad_norm": 0.0834050178527832,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4441
    },
    {
      "epoch": 0.71072,
      "grad_norm": 0.09774447977542877,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 4442
    },
    {
      "epoch": 0.71088,
      "grad_norm": 0.1429544985294342,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 4443
    },
    {
      "epoch": 0.71104,
      "grad_norm": 0.09933732450008392,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 4444
    },
    {
      "epoch": 0.7112,
      "grad_norm": 0.08360952883958817,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 4445
    },
    {
      "epoch": 0.71136,
      "grad_norm": 0.09117738902568817,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4446
    },
    {
      "epoch": 0.71152,
      "grad_norm": 0.09538491815328598,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4447
    },
    {
      "epoch": 0.71168,
      "grad_norm": 0.10629772394895554,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4448
    },
    {
      "epoch": 0.71184,
      "grad_norm": 0.11335305869579315,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4449
    },
    {
      "epoch": 0.712,
      "grad_norm": 0.0844777449965477,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4450
    },
    {
      "epoch": 0.71216,
      "grad_norm": 0.08558289706707001,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 4451
    },
    {
      "epoch": 0.71232,
      "grad_norm": 0.07508745789527893,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4452
    },
    {
      "epoch": 0.71248,
      "grad_norm": 0.08527764678001404,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4453
    },
    {
      "epoch": 0.71264,
      "grad_norm": 0.09500310570001602,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4454
    },
    {
      "epoch": 0.7128,
      "grad_norm": 0.11415784060955048,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4455
    },
    {
      "epoch": 0.71296,
      "grad_norm": 0.08585266023874283,
      "learning_rate": 0.0001,
      "loss": 0.303,
      "step": 4456
    },
    {
      "epoch": 0.71312,
      "grad_norm": 0.11012841761112213,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4457
    },
    {
      "epoch": 0.71328,
      "grad_norm": 0.09758485853672028,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4458
    },
    {
      "epoch": 0.71344,
      "grad_norm": 0.10776116698980331,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 4459
    },
    {
      "epoch": 0.7136,
      "grad_norm": 0.08677197247743607,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4460
    },
    {
      "epoch": 0.71376,
      "grad_norm": 0.09290692210197449,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4461
    },
    {
      "epoch": 0.71392,
      "grad_norm": 0.08714313060045242,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4462
    },
    {
      "epoch": 0.71408,
      "grad_norm": 0.10030229389667511,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 4463
    },
    {
      "epoch": 0.71424,
      "grad_norm": 0.10271342843770981,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4464
    },
    {
      "epoch": 0.7144,
      "grad_norm": 0.09646206349134445,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4465
    },
    {
      "epoch": 0.71456,
      "grad_norm": 0.08869308978319168,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4466
    },
    {
      "epoch": 0.71472,
      "grad_norm": 0.09283728897571564,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4467
    },
    {
      "epoch": 0.71488,
      "grad_norm": 0.09214979410171509,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4468
    },
    {
      "epoch": 0.71504,
      "grad_norm": 0.12082716077566147,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 4469
    },
    {
      "epoch": 0.7152,
      "grad_norm": 0.08463967591524124,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4470
    },
    {
      "epoch": 0.71536,
      "grad_norm": 0.0817175805568695,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 4471
    },
    {
      "epoch": 0.71552,
      "grad_norm": 0.09081939607858658,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 4472
    },
    {
      "epoch": 0.71568,
      "grad_norm": 0.08755480498075485,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4473
    },
    {
      "epoch": 0.71584,
      "grad_norm": 0.0898279920220375,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 4474
    },
    {
      "epoch": 0.716,
      "grad_norm": 0.07636702805757523,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4475
    },
    {
      "epoch": 0.71616,
      "grad_norm": 0.10725586861371994,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 4476
    },
    {
      "epoch": 0.71632,
      "grad_norm": 0.08313754945993423,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 4477
    },
    {
      "epoch": 0.71648,
      "grad_norm": 0.0895119234919548,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4478
    },
    {
      "epoch": 0.71664,
      "grad_norm": 0.10887670516967773,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 4479
    },
    {
      "epoch": 0.7168,
      "grad_norm": 0.08675671368837357,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4480
    },
    {
      "epoch": 0.71696,
      "grad_norm": 0.1101142019033432,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4481
    },
    {
      "epoch": 0.71712,
      "grad_norm": 0.10223747044801712,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4482
    },
    {
      "epoch": 0.71728,
      "grad_norm": 0.084354929625988,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4483
    },
    {
      "epoch": 0.71744,
      "grad_norm": 0.09979567676782608,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 4484
    },
    {
      "epoch": 0.7176,
      "grad_norm": 0.09438921511173248,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 4485
    },
    {
      "epoch": 0.71776,
      "grad_norm": 0.09226212650537491,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4486
    },
    {
      "epoch": 0.71792,
      "grad_norm": 0.08612857758998871,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 4487
    },
    {
      "epoch": 0.71808,
      "grad_norm": 0.08427134156227112,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4488
    },
    {
      "epoch": 0.71824,
      "grad_norm": 0.11308234184980392,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4489
    },
    {
      "epoch": 0.7184,
      "grad_norm": 0.09220688790082932,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 4490
    },
    {
      "epoch": 0.71856,
      "grad_norm": 0.08665703982114792,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 4491
    },
    {
      "epoch": 0.71872,
      "grad_norm": 0.08197914808988571,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4492
    },
    {
      "epoch": 0.71888,
      "grad_norm": 0.09233274310827255,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4493
    },
    {
      "epoch": 0.71904,
      "grad_norm": 0.10043098777532578,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4494
    },
    {
      "epoch": 0.7192,
      "grad_norm": 0.08627527952194214,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4495
    },
    {
      "epoch": 0.71936,
      "grad_norm": 0.09776709973812103,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4496
    },
    {
      "epoch": 0.71952,
      "grad_norm": 0.09183310717344284,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4497
    },
    {
      "epoch": 0.71968,
      "grad_norm": 0.08446234464645386,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4498
    },
    {
      "epoch": 0.71984,
      "grad_norm": 0.08818863332271576,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4499
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.09123308211565018,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4500
    },
    {
      "epoch": 0.72,
      "eval_train_accuracy": 0.994,
      "eval_train_loss": 0.3181908130645752,
      "eval_train_runtime": 4.1925,
      "eval_train_samples_per_second": 1192.614,
      "eval_train_steps_per_second": 15.027,
      "step": 4500
    },
    {
      "epoch": 0.72,
      "eval_test_accuracy": 0.9956,
      "eval_test_loss": 0.3168122470378876,
      "eval_test_runtime": 4.7653,
      "eval_test_samples_per_second": 1049.261,
      "eval_test_steps_per_second": 13.221,
      "step": 4500
    },
    {
      "epoch": 0.72016,
      "grad_norm": 0.11271881312131882,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4501
    },
    {
      "epoch": 0.72032,
      "grad_norm": 0.09139873832464218,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4502
    },
    {
      "epoch": 0.72048,
      "grad_norm": 0.08997642248868942,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 4503
    },
    {
      "epoch": 0.72064,
      "grad_norm": 0.08671852201223373,
      "learning_rate": 0.0001,
      "loss": 0.3043,
      "step": 4504
    },
    {
      "epoch": 0.7208,
      "grad_norm": 0.09876111894845963,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4505
    },
    {
      "epoch": 0.72096,
      "grad_norm": 0.09274905920028687,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4506
    },
    {
      "epoch": 0.72112,
      "grad_norm": 0.10134242475032806,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4507
    },
    {
      "epoch": 0.72128,
      "grad_norm": 0.10863059014081955,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 4508
    },
    {
      "epoch": 0.72144,
      "grad_norm": 0.09463746845722198,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4509
    },
    {
      "epoch": 0.7216,
      "grad_norm": 0.10233539342880249,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 4510
    },
    {
      "epoch": 0.72176,
      "grad_norm": 0.10435957461595535,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4511
    },
    {
      "epoch": 0.72192,
      "grad_norm": 0.10780119895935059,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4512
    },
    {
      "epoch": 0.72208,
      "grad_norm": 0.0915401503443718,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 4513
    },
    {
      "epoch": 0.72224,
      "grad_norm": 0.08090965449810028,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4514
    },
    {
      "epoch": 0.7224,
      "grad_norm": 0.08765751868486404,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4515
    },
    {
      "epoch": 0.72256,
      "grad_norm": 0.09094984084367752,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 4516
    },
    {
      "epoch": 0.72272,
      "grad_norm": 0.08429769426584244,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 4517
    },
    {
      "epoch": 0.72288,
      "grad_norm": 0.08148687332868576,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4518
    },
    {
      "epoch": 0.72304,
      "grad_norm": 0.08678079396486282,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4519
    },
    {
      "epoch": 0.7232,
      "grad_norm": 0.08484106510877609,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4520
    },
    {
      "epoch": 0.72336,
      "grad_norm": 0.10951252281665802,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4521
    },
    {
      "epoch": 0.72352,
      "grad_norm": 0.07714463770389557,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4522
    },
    {
      "epoch": 0.72368,
      "grad_norm": 0.08681652694940567,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4523
    },
    {
      "epoch": 0.72384,
      "grad_norm": 0.0905999094247818,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 4524
    },
    {
      "epoch": 0.724,
      "grad_norm": 0.08572820574045181,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4525
    },
    {
      "epoch": 0.72416,
      "grad_norm": 0.1010471060872078,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 4526
    },
    {
      "epoch": 0.72432,
      "grad_norm": 0.10965248942375183,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4527
    },
    {
      "epoch": 0.72448,
      "grad_norm": 0.09388065338134766,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4528
    },
    {
      "epoch": 0.72464,
      "grad_norm": 0.08267522603273392,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 4529
    },
    {
      "epoch": 0.7248,
      "grad_norm": 0.09567766636610031,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4530
    },
    {
      "epoch": 0.72496,
      "grad_norm": 0.10038579255342484,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4531
    },
    {
      "epoch": 0.72512,
      "grad_norm": 0.09193623065948486,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4532
    },
    {
      "epoch": 0.72528,
      "grad_norm": 0.10439150035381317,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 4533
    },
    {
      "epoch": 0.72544,
      "grad_norm": 0.08604302257299423,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4534
    },
    {
      "epoch": 0.7256,
      "grad_norm": 0.08653825521469116,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 4535
    },
    {
      "epoch": 0.72576,
      "grad_norm": 0.08378643542528152,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4536
    },
    {
      "epoch": 0.72592,
      "grad_norm": 0.08437151461839676,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4537
    },
    {
      "epoch": 0.72608,
      "grad_norm": 0.08097954094409943,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4538
    },
    {
      "epoch": 0.72624,
      "grad_norm": 0.0965486466884613,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4539
    },
    {
      "epoch": 0.7264,
      "grad_norm": 0.10574740916490555,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4540
    },
    {
      "epoch": 0.72656,
      "grad_norm": 0.0875646248459816,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4541
    },
    {
      "epoch": 0.72672,
      "grad_norm": 0.10426957160234451,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 4542
    },
    {
      "epoch": 0.72688,
      "grad_norm": 0.10449061542749405,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 4543
    },
    {
      "epoch": 0.72704,
      "grad_norm": 0.08985716104507446,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4544
    },
    {
      "epoch": 0.7272,
      "grad_norm": 0.079313725233078,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 4545
    },
    {
      "epoch": 0.72736,
      "grad_norm": 0.09411894530057907,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4546
    },
    {
      "epoch": 0.72752,
      "grad_norm": 0.0912262424826622,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 4547
    },
    {
      "epoch": 0.72768,
      "grad_norm": 0.10460235178470612,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 4548
    },
    {
      "epoch": 0.72784,
      "grad_norm": 0.08002018183469772,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4549
    },
    {
      "epoch": 0.728,
      "grad_norm": 0.10224206745624542,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4550
    },
    {
      "epoch": 0.72816,
      "grad_norm": 0.0911945030093193,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4551
    },
    {
      "epoch": 0.72832,
      "grad_norm": 0.09273548424243927,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4552
    },
    {
      "epoch": 0.72848,
      "grad_norm": 0.1014406755566597,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4553
    },
    {
      "epoch": 0.72864,
      "grad_norm": 0.09349941462278366,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4554
    },
    {
      "epoch": 0.7288,
      "grad_norm": 0.09849508851766586,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4555
    },
    {
      "epoch": 0.72896,
      "grad_norm": 0.10275733470916748,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4556
    },
    {
      "epoch": 0.72912,
      "grad_norm": 0.08895471692085266,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 4557
    },
    {
      "epoch": 0.72928,
      "grad_norm": 0.09019401669502258,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4558
    },
    {
      "epoch": 0.72944,
      "grad_norm": 0.08786287903785706,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4559
    },
    {
      "epoch": 0.7296,
      "grad_norm": 0.08459921181201935,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4560
    },
    {
      "epoch": 0.72976,
      "grad_norm": 0.08935576677322388,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4561
    },
    {
      "epoch": 0.72992,
      "grad_norm": 0.09093037992715836,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4562
    },
    {
      "epoch": 0.73008,
      "grad_norm": 0.09638290107250214,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 4563
    },
    {
      "epoch": 0.73024,
      "grad_norm": 0.08616334944963455,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 4564
    },
    {
      "epoch": 0.7304,
      "grad_norm": 0.08259942382574081,
      "learning_rate": 0.0001,
      "loss": 0.3025,
      "step": 4565
    },
    {
      "epoch": 0.73056,
      "grad_norm": 0.08763574063777924,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4566
    },
    {
      "epoch": 0.73072,
      "grad_norm": 0.08139827102422714,
      "learning_rate": 0.0001,
      "loss": 0.2998,
      "step": 4567
    },
    {
      "epoch": 0.73088,
      "grad_norm": 0.0905224084854126,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4568
    },
    {
      "epoch": 0.73104,
      "grad_norm": 0.09097132086753845,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 4569
    },
    {
      "epoch": 0.7312,
      "grad_norm": 0.08768779039382935,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4570
    },
    {
      "epoch": 0.73136,
      "grad_norm": 0.07823491096496582,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4571
    },
    {
      "epoch": 0.73152,
      "grad_norm": 0.08009670674800873,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 4572
    },
    {
      "epoch": 0.73168,
      "grad_norm": 0.08471319824457169,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 4573
    },
    {
      "epoch": 0.73184,
      "grad_norm": 0.0908062532544136,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 4574
    },
    {
      "epoch": 0.732,
      "grad_norm": 0.09253116697072983,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4575
    },
    {
      "epoch": 0.73216,
      "grad_norm": 0.11168286204338074,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4576
    },
    {
      "epoch": 0.73232,
      "grad_norm": 0.08706369996070862,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4577
    },
    {
      "epoch": 0.73248,
      "grad_norm": 0.08775486052036285,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4578
    },
    {
      "epoch": 0.73264,
      "grad_norm": 0.08570105582475662,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4579
    },
    {
      "epoch": 0.7328,
      "grad_norm": 0.08609078079462051,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4580
    },
    {
      "epoch": 0.73296,
      "grad_norm": 0.07819792628288269,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4581
    },
    {
      "epoch": 0.73312,
      "grad_norm": 0.08778588473796844,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 4582
    },
    {
      "epoch": 0.73328,
      "grad_norm": 0.08512045443058014,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4583
    },
    {
      "epoch": 0.73344,
      "grad_norm": 0.08622729033231735,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4584
    },
    {
      "epoch": 0.7336,
      "grad_norm": 0.07937224209308624,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 4585
    },
    {
      "epoch": 0.73376,
      "grad_norm": 0.07904400676488876,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4586
    },
    {
      "epoch": 0.73392,
      "grad_norm": 0.09378065913915634,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4587
    },
    {
      "epoch": 0.73408,
      "grad_norm": 0.08934847265481949,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 4588
    },
    {
      "epoch": 0.73424,
      "grad_norm": 0.08711972832679749,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 4589
    },
    {
      "epoch": 0.7344,
      "grad_norm": 0.08623239398002625,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4590
    },
    {
      "epoch": 0.73456,
      "grad_norm": 0.09150116890668869,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 4591
    },
    {
      "epoch": 0.73472,
      "grad_norm": 0.08302371203899384,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4592
    },
    {
      "epoch": 0.73488,
      "grad_norm": 0.08903402090072632,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4593
    },
    {
      "epoch": 0.73504,
      "grad_norm": 0.07654822617769241,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 4594
    },
    {
      "epoch": 0.7352,
      "grad_norm": 0.09152120351791382,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4595
    },
    {
      "epoch": 0.73536,
      "grad_norm": 0.07771782577037811,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4596
    },
    {
      "epoch": 0.73552,
      "grad_norm": 0.0824931189417839,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4597
    },
    {
      "epoch": 0.73568,
      "grad_norm": 0.08798044919967651,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4598
    },
    {
      "epoch": 0.73584,
      "grad_norm": 0.09811084717512131,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4599
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.0854680985212326,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4600
    },
    {
      "epoch": 0.736,
      "eval_train_accuracy": 0.9978,
      "eval_train_loss": 0.317768394947052,
      "eval_train_runtime": 4.1174,
      "eval_train_samples_per_second": 1214.366,
      "eval_train_steps_per_second": 15.301,
      "step": 4600
    },
    {
      "epoch": 0.736,
      "eval_test_accuracy": 0.9974,
      "eval_test_loss": 0.31655648350715637,
      "eval_test_runtime": 5.0251,
      "eval_test_samples_per_second": 995.0,
      "eval_test_steps_per_second": 12.537,
      "step": 4600
    },
    {
      "epoch": 0.73616,
      "grad_norm": 0.08527728170156479,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 4601
    },
    {
      "epoch": 0.73632,
      "grad_norm": 0.08806588500738144,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 4602
    },
    {
      "epoch": 0.73648,
      "grad_norm": 0.07306025922298431,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4603
    },
    {
      "epoch": 0.73664,
      "grad_norm": 0.08922704309225082,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4604
    },
    {
      "epoch": 0.7368,
      "grad_norm": 0.08245612680912018,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4605
    },
    {
      "epoch": 0.73696,
      "grad_norm": 0.08532731235027313,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4606
    },
    {
      "epoch": 0.73712,
      "grad_norm": 0.10082335770130157,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4607
    },
    {
      "epoch": 0.73728,
      "grad_norm": 0.09409105032682419,
      "learning_rate": 0.0001,
      "loss": 0.3036,
      "step": 4608
    },
    {
      "epoch": 0.73744,
      "grad_norm": 0.08419547975063324,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 4609
    },
    {
      "epoch": 0.7376,
      "grad_norm": 0.08700691908597946,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4610
    },
    {
      "epoch": 0.73776,
      "grad_norm": 0.0878685861825943,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4611
    },
    {
      "epoch": 0.73792,
      "grad_norm": 0.08330920338630676,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4612
    },
    {
      "epoch": 0.73808,
      "grad_norm": 0.08555873483419418,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4613
    },
    {
      "epoch": 0.73824,
      "grad_norm": 0.10256832093000412,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4614
    },
    {
      "epoch": 0.7384,
      "grad_norm": 0.09144232422113419,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4615
    },
    {
      "epoch": 0.73856,
      "grad_norm": 0.08978888392448425,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4616
    },
    {
      "epoch": 0.73872,
      "grad_norm": 0.07439679652452469,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4617
    },
    {
      "epoch": 0.73888,
      "grad_norm": 0.08550810813903809,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 4618
    },
    {
      "epoch": 0.73904,
      "grad_norm": 0.0821191594004631,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4619
    },
    {
      "epoch": 0.7392,
      "grad_norm": 0.10114515572786331,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 4620
    },
    {
      "epoch": 0.73936,
      "grad_norm": 0.08540336787700653,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4621
    },
    {
      "epoch": 0.73952,
      "grad_norm": 0.10032441467046738,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4622
    },
    {
      "epoch": 0.73968,
      "grad_norm": 0.09098339080810547,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 4623
    },
    {
      "epoch": 0.73984,
      "grad_norm": 0.08420019596815109,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4624
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.0845932811498642,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4625
    },
    {
      "epoch": 0.74016,
      "grad_norm": 0.08088042587041855,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 4626
    },
    {
      "epoch": 0.74032,
      "grad_norm": 0.0910692885518074,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4627
    },
    {
      "epoch": 0.74048,
      "grad_norm": 0.0937054231762886,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4628
    },
    {
      "epoch": 0.74064,
      "grad_norm": 0.09465176612138748,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4629
    },
    {
      "epoch": 0.7408,
      "grad_norm": 0.0930795818567276,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4630
    },
    {
      "epoch": 0.74096,
      "grad_norm": 0.1006547138094902,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 4631
    },
    {
      "epoch": 0.74112,
      "grad_norm": 0.08057347685098648,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4632
    },
    {
      "epoch": 0.74128,
      "grad_norm": 0.08709803223609924,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4633
    },
    {
      "epoch": 0.74144,
      "grad_norm": 0.08348308503627777,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4634
    },
    {
      "epoch": 0.7416,
      "grad_norm": 0.08690235018730164,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4635
    },
    {
      "epoch": 0.74176,
      "grad_norm": 0.09821111708879471,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4636
    },
    {
      "epoch": 0.74192,
      "grad_norm": 0.09736886620521545,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4637
    },
    {
      "epoch": 0.74208,
      "grad_norm": 0.08516298234462738,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4638
    },
    {
      "epoch": 0.74224,
      "grad_norm": 0.07521422952413559,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4639
    },
    {
      "epoch": 0.7424,
      "grad_norm": 0.09057603776454926,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4640
    },
    {
      "epoch": 0.74256,
      "grad_norm": 0.08889006823301315,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 4641
    },
    {
      "epoch": 0.74272,
      "grad_norm": 0.09340362250804901,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4642
    },
    {
      "epoch": 0.74288,
      "grad_norm": 0.09611637890338898,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4643
    },
    {
      "epoch": 0.74304,
      "grad_norm": 0.08246657997369766,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4644
    },
    {
      "epoch": 0.7432,
      "grad_norm": 0.08892081677913666,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4645
    },
    {
      "epoch": 0.74336,
      "grad_norm": 0.08751508593559265,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4646
    },
    {
      "epoch": 0.74352,
      "grad_norm": 0.09185668081045151,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 4647
    },
    {
      "epoch": 0.74368,
      "grad_norm": 0.09160498529672623,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4648
    },
    {
      "epoch": 0.74384,
      "grad_norm": 0.08112020045518875,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4649
    },
    {
      "epoch": 0.744,
      "grad_norm": 0.09259343147277832,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 4650
    },
    {
      "epoch": 0.74416,
      "grad_norm": 0.08588230609893799,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4651
    },
    {
      "epoch": 0.74432,
      "grad_norm": 0.0923842191696167,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4652
    },
    {
      "epoch": 0.74448,
      "grad_norm": 0.08885995298624039,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4653
    },
    {
      "epoch": 0.74464,
      "grad_norm": 0.09327784925699234,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4654
    },
    {
      "epoch": 0.7448,
      "grad_norm": 0.08442871272563934,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4655
    },
    {
      "epoch": 0.74496,
      "grad_norm": 0.08745377510786057,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4656
    },
    {
      "epoch": 0.74512,
      "grad_norm": 0.08760874718427658,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 4657
    },
    {
      "epoch": 0.74528,
      "grad_norm": 0.08938024193048477,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4658
    },
    {
      "epoch": 0.74544,
      "grad_norm": 0.08450332283973694,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4659
    },
    {
      "epoch": 0.7456,
      "grad_norm": 0.09014230966567993,
      "learning_rate": 0.0001,
      "loss": 0.3034,
      "step": 4660
    },
    {
      "epoch": 0.74576,
      "grad_norm": 0.09183544665575027,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 4661
    },
    {
      "epoch": 0.74592,
      "grad_norm": 0.08117078989744186,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4662
    },
    {
      "epoch": 0.74608,
      "grad_norm": 0.0749521479010582,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 4663
    },
    {
      "epoch": 0.74624,
      "grad_norm": 0.08124920725822449,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4664
    },
    {
      "epoch": 0.7464,
      "grad_norm": 0.0889640748500824,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 4665
    },
    {
      "epoch": 0.74656,
      "grad_norm": 0.0766160860657692,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4666
    },
    {
      "epoch": 0.74672,
      "grad_norm": 0.08688855916261673,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4667
    },
    {
      "epoch": 0.74688,
      "grad_norm": 0.0859958827495575,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4668
    },
    {
      "epoch": 0.74704,
      "grad_norm": 0.08471181243658066,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 4669
    },
    {
      "epoch": 0.7472,
      "grad_norm": 0.07829350233078003,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4670
    },
    {
      "epoch": 0.74736,
      "grad_norm": 0.09393586218357086,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4671
    },
    {
      "epoch": 0.74752,
      "grad_norm": 0.08765015006065369,
      "learning_rate": 0.0001,
      "loss": 0.2985,
      "step": 4672
    },
    {
      "epoch": 0.74768,
      "grad_norm": 0.08285582810640335,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 4673
    },
    {
      "epoch": 0.74784,
      "grad_norm": 0.09649502485990524,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4674
    },
    {
      "epoch": 0.748,
      "grad_norm": 0.08198586106300354,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4675
    },
    {
      "epoch": 0.74816,
      "grad_norm": 0.08661512285470963,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 4676
    },
    {
      "epoch": 0.74832,
      "grad_norm": 0.10004609823226929,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 4677
    },
    {
      "epoch": 0.74848,
      "grad_norm": 0.09222320467233658,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4678
    },
    {
      "epoch": 0.74864,
      "grad_norm": 0.08678918331861496,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4679
    },
    {
      "epoch": 0.7488,
      "grad_norm": 0.08578365296125412,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4680
    },
    {
      "epoch": 0.74896,
      "grad_norm": 0.08706612884998322,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4681
    },
    {
      "epoch": 0.74912,
      "grad_norm": 0.0842277929186821,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4682
    },
    {
      "epoch": 0.74928,
      "grad_norm": 0.09876900166273117,
      "learning_rate": 0.0001,
      "loss": 0.3015,
      "step": 4683
    },
    {
      "epoch": 0.74944,
      "grad_norm": 0.0970401018857956,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4684
    },
    {
      "epoch": 0.7496,
      "grad_norm": 0.08993466943502426,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4685
    },
    {
      "epoch": 0.74976,
      "grad_norm": 0.08458495885133743,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4686
    },
    {
      "epoch": 0.74992,
      "grad_norm": 0.08400201797485352,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4687
    },
    {
      "epoch": 0.75008,
      "grad_norm": 0.07953571528196335,
      "learning_rate": 0.0001,
      "loss": 0.2936,
      "step": 4688
    },
    {
      "epoch": 0.75024,
      "grad_norm": 0.08705402165651321,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4689
    },
    {
      "epoch": 0.7504,
      "grad_norm": 0.09411490708589554,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 4690
    },
    {
      "epoch": 0.75056,
      "grad_norm": 0.0911155715584755,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 4691
    },
    {
      "epoch": 0.75072,
      "grad_norm": 0.08091124892234802,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 4692
    },
    {
      "epoch": 0.75088,
      "grad_norm": 0.11414752900600433,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 4693
    },
    {
      "epoch": 0.75104,
      "grad_norm": 0.08964470028877258,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 4694
    },
    {
      "epoch": 0.7512,
      "grad_norm": 0.08855491876602173,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4695
    },
    {
      "epoch": 0.75136,
      "grad_norm": 0.09185641258955002,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 4696
    },
    {
      "epoch": 0.75152,
      "grad_norm": 0.1002572551369667,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 4697
    },
    {
      "epoch": 0.75168,
      "grad_norm": 0.09089912474155426,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4698
    },
    {
      "epoch": 0.75184,
      "grad_norm": 0.08813654631376266,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 4699
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.08872738480567932,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4700
    },
    {
      "epoch": 0.752,
      "eval_train_accuracy": 0.9986,
      "eval_train_loss": 0.318002849817276,
      "eval_train_runtime": 4.1691,
      "eval_train_samples_per_second": 1199.291,
      "eval_train_steps_per_second": 15.111,
      "step": 4700
    },
    {
      "epoch": 0.752,
      "eval_test_accuracy": 0.9974,
      "eval_test_loss": 0.3166089355945587,
      "eval_test_runtime": 4.6519,
      "eval_test_samples_per_second": 1074.821,
      "eval_test_steps_per_second": 13.543,
      "step": 4700
    },
    {
      "epoch": 0.75216,
      "grad_norm": 0.09070233255624771,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4701
    },
    {
      "epoch": 0.75232,
      "grad_norm": 0.09965884685516357,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4702
    },
    {
      "epoch": 0.75248,
      "grad_norm": 0.1074647456407547,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 4703
    },
    {
      "epoch": 0.75264,
      "grad_norm": 0.08444125950336456,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4704
    },
    {
      "epoch": 0.7528,
      "grad_norm": 0.09441941231489182,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4705
    },
    {
      "epoch": 0.75296,
      "grad_norm": 0.08623056858778,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4706
    },
    {
      "epoch": 0.75312,
      "grad_norm": 0.08028100430965424,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4707
    },
    {
      "epoch": 0.75328,
      "grad_norm": 0.09064161777496338,
      "learning_rate": 0.0001,
      "loss": 0.2927,
      "step": 4708
    },
    {
      "epoch": 0.75344,
      "grad_norm": 0.09948428720235825,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4709
    },
    {
      "epoch": 0.7536,
      "grad_norm": 0.10042234510183334,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4710
    },
    {
      "epoch": 0.75376,
      "grad_norm": 0.09655631333589554,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 4711
    },
    {
      "epoch": 0.75392,
      "grad_norm": 0.09764020889997482,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4712
    },
    {
      "epoch": 0.75408,
      "grad_norm": 0.09334484487771988,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4713
    },
    {
      "epoch": 0.75424,
      "grad_norm": 0.08220288157463074,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 4714
    },
    {
      "epoch": 0.7544,
      "grad_norm": 0.08768385648727417,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4715
    },
    {
      "epoch": 0.75456,
      "grad_norm": 0.0854850634932518,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4716
    },
    {
      "epoch": 0.75472,
      "grad_norm": 0.09076069295406342,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 4717
    },
    {
      "epoch": 0.75488,
      "grad_norm": 0.08758203685283661,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 4718
    },
    {
      "epoch": 0.75504,
      "grad_norm": 0.08223193138837814,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4719
    },
    {
      "epoch": 0.7552,
      "grad_norm": 0.0974159687757492,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4720
    },
    {
      "epoch": 0.75536,
      "grad_norm": 0.10028881579637527,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4721
    },
    {
      "epoch": 0.75552,
      "grad_norm": 0.077708400785923,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 4722
    },
    {
      "epoch": 0.75568,
      "grad_norm": 0.08359333127737045,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4723
    },
    {
      "epoch": 0.75584,
      "grad_norm": 0.10177158564329147,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4724
    },
    {
      "epoch": 0.756,
      "grad_norm": 0.10192692279815674,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 4725
    },
    {
      "epoch": 0.75616,
      "grad_norm": 0.09091733396053314,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4726
    },
    {
      "epoch": 0.75632,
      "grad_norm": 0.09234921634197235,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4727
    },
    {
      "epoch": 0.75648,
      "grad_norm": 0.09727615118026733,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4728
    },
    {
      "epoch": 0.75664,
      "grad_norm": 0.08651969581842422,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4729
    },
    {
      "epoch": 0.7568,
      "grad_norm": 0.0983988344669342,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 4730
    },
    {
      "epoch": 0.75696,
      "grad_norm": 0.0885421559214592,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4731
    },
    {
      "epoch": 0.75712,
      "grad_norm": 0.11776471138000488,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4732
    },
    {
      "epoch": 0.75728,
      "grad_norm": 0.09245292842388153,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4733
    },
    {
      "epoch": 0.75744,
      "grad_norm": 0.09187696874141693,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4734
    },
    {
      "epoch": 0.7576,
      "grad_norm": 0.11625389754772186,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4735
    },
    {
      "epoch": 0.75776,
      "grad_norm": 0.0777568370103836,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4736
    },
    {
      "epoch": 0.75792,
      "grad_norm": 0.08281663805246353,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4737
    },
    {
      "epoch": 0.75808,
      "grad_norm": 0.08297807723283768,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4738
    },
    {
      "epoch": 0.75824,
      "grad_norm": 0.08855149149894714,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4739
    },
    {
      "epoch": 0.7584,
      "grad_norm": 0.08198598027229309,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 4740
    },
    {
      "epoch": 0.75856,
      "grad_norm": 0.07374317198991776,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4741
    },
    {
      "epoch": 0.75872,
      "grad_norm": 0.10253314673900604,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4742
    },
    {
      "epoch": 0.75888,
      "grad_norm": 0.09616384655237198,
      "learning_rate": 0.0001,
      "loss": 0.3044,
      "step": 4743
    },
    {
      "epoch": 0.75904,
      "grad_norm": 0.08960062265396118,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4744
    },
    {
      "epoch": 0.7592,
      "grad_norm": 0.11307898163795471,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4745
    },
    {
      "epoch": 0.75936,
      "grad_norm": 0.09153435379266739,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4746
    },
    {
      "epoch": 0.75952,
      "grad_norm": 0.08772232383489609,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4747
    },
    {
      "epoch": 0.75968,
      "grad_norm": 0.10093092173337936,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4748
    },
    {
      "epoch": 0.75984,
      "grad_norm": 0.08849401026964188,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4749
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.08272994309663773,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4750
    },
    {
      "epoch": 0.76016,
      "grad_norm": 0.14009660482406616,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4751
    },
    {
      "epoch": 0.76032,
      "grad_norm": 0.08430041372776031,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4752
    },
    {
      "epoch": 0.76048,
      "grad_norm": 0.07722064852714539,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4753
    },
    {
      "epoch": 0.76064,
      "grad_norm": 0.08633284270763397,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 4754
    },
    {
      "epoch": 0.7608,
      "grad_norm": 0.0842151939868927,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4755
    },
    {
      "epoch": 0.76096,
      "grad_norm": 0.08928875625133514,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4756
    },
    {
      "epoch": 0.76112,
      "grad_norm": 0.07836401462554932,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4757
    },
    {
      "epoch": 0.76128,
      "grad_norm": 0.089431032538414,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4758
    },
    {
      "epoch": 0.76144,
      "grad_norm": 0.09725870937108994,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 4759
    },
    {
      "epoch": 0.7616,
      "grad_norm": 0.08326151967048645,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 4760
    },
    {
      "epoch": 0.76176,
      "grad_norm": 0.1003643274307251,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4761
    },
    {
      "epoch": 0.76192,
      "grad_norm": 0.10002864897251129,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4762
    },
    {
      "epoch": 0.76208,
      "grad_norm": 0.12266440689563751,
      "learning_rate": 0.0001,
      "loss": 0.299,
      "step": 4763
    },
    {
      "epoch": 0.76224,
      "grad_norm": 0.10913907736539841,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 4764
    },
    {
      "epoch": 0.7624,
      "grad_norm": 0.10182001441717148,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 4765
    },
    {
      "epoch": 0.76256,
      "grad_norm": 0.08429867774248123,
      "learning_rate": 0.0001,
      "loss": 0.2989,
      "step": 4766
    },
    {
      "epoch": 0.76272,
      "grad_norm": 0.07336893677711487,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4767
    },
    {
      "epoch": 0.76288,
      "grad_norm": 0.09135456383228302,
      "learning_rate": 0.0001,
      "loss": 0.3022,
      "step": 4768
    },
    {
      "epoch": 0.76304,
      "grad_norm": 0.09339327365159988,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4769
    },
    {
      "epoch": 0.7632,
      "grad_norm": 0.09398496896028519,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 4770
    },
    {
      "epoch": 0.76336,
      "grad_norm": 0.1011749804019928,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 4771
    },
    {
      "epoch": 0.76352,
      "grad_norm": 0.08908194303512573,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4772
    },
    {
      "epoch": 0.76368,
      "grad_norm": 0.10450313985347748,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4773
    },
    {
      "epoch": 0.76384,
      "grad_norm": 0.08896131813526154,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 4774
    },
    {
      "epoch": 0.764,
      "grad_norm": 0.08188414573669434,
      "learning_rate": 0.0001,
      "loss": 0.2978,
      "step": 4775
    },
    {
      "epoch": 0.76416,
      "grad_norm": 0.08689437806606293,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 4776
    },
    {
      "epoch": 0.76432,
      "grad_norm": 0.10215786099433899,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4777
    },
    {
      "epoch": 0.76448,
      "grad_norm": 0.12173621356487274,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 4778
    },
    {
      "epoch": 0.76464,
      "grad_norm": 0.0970006138086319,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 4779
    },
    {
      "epoch": 0.7648,
      "grad_norm": 0.08676169067621231,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4780
    },
    {
      "epoch": 0.76496,
      "grad_norm": 0.08729138225317001,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4781
    },
    {
      "epoch": 0.76512,
      "grad_norm": 0.08231379836797714,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4782
    },
    {
      "epoch": 0.76528,
      "grad_norm": 0.09297355264425278,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4783
    },
    {
      "epoch": 0.76544,
      "grad_norm": 0.09475427120923996,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4784
    },
    {
      "epoch": 0.7656,
      "grad_norm": 0.08756649494171143,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4785
    },
    {
      "epoch": 0.76576,
      "grad_norm": 0.08298169076442719,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 4786
    },
    {
      "epoch": 0.76592,
      "grad_norm": 0.08341703563928604,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4787
    },
    {
      "epoch": 0.76608,
      "grad_norm": 0.10562235862016678,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 4788
    },
    {
      "epoch": 0.76624,
      "grad_norm": 0.09535590559244156,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4789
    },
    {
      "epoch": 0.7664,
      "grad_norm": 0.09843647480010986,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4790
    },
    {
      "epoch": 0.76656,
      "grad_norm": 0.07784155011177063,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4791
    },
    {
      "epoch": 0.76672,
      "grad_norm": 0.08426032960414886,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4792
    },
    {
      "epoch": 0.76688,
      "grad_norm": 0.09155303239822388,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 4793
    },
    {
      "epoch": 0.76704,
      "grad_norm": 0.07892940193414688,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4794
    },
    {
      "epoch": 0.7672,
      "grad_norm": 0.1068810224533081,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4795
    },
    {
      "epoch": 0.76736,
      "grad_norm": 0.10193327814340591,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4796
    },
    {
      "epoch": 0.76752,
      "grad_norm": 0.0847925990819931,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4797
    },
    {
      "epoch": 0.76768,
      "grad_norm": 0.10107337683439255,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 4798
    },
    {
      "epoch": 0.76784,
      "grad_norm": 0.09969659894704819,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4799
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.07725737988948822,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4800
    },
    {
      "epoch": 0.768,
      "eval_train_accuracy": 0.9992,
      "eval_train_loss": 0.31774771213531494,
      "eval_train_runtime": 4.0375,
      "eval_train_samples_per_second": 1238.4,
      "eval_train_steps_per_second": 15.604,
      "step": 4800
    },
    {
      "epoch": 0.768,
      "eval_test_accuracy": 0.9982,
      "eval_test_loss": 0.31647899746894836,
      "eval_test_runtime": 4.7432,
      "eval_test_samples_per_second": 1054.13,
      "eval_test_steps_per_second": 13.282,
      "step": 4800
    },
    {
      "epoch": 0.76816,
      "grad_norm": 0.08410829305648804,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4801
    },
    {
      "epoch": 0.76832,
      "grad_norm": 0.09202821552753448,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4802
    },
    {
      "epoch": 0.76848,
      "grad_norm": 0.09352108836174011,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4803
    },
    {
      "epoch": 0.76864,
      "grad_norm": 0.08705738186836243,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4804
    },
    {
      "epoch": 0.7688,
      "grad_norm": 0.09004320204257965,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 4805
    },
    {
      "epoch": 0.76896,
      "grad_norm": 0.11272154748439789,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4806
    },
    {
      "epoch": 0.76912,
      "grad_norm": 0.08773919194936752,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 4807
    },
    {
      "epoch": 0.76928,
      "grad_norm": 0.10011997073888779,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 4808
    },
    {
      "epoch": 0.76944,
      "grad_norm": 0.08287198841571808,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4809
    },
    {
      "epoch": 0.7696,
      "grad_norm": 0.08655137568712234,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4810
    },
    {
      "epoch": 0.76976,
      "grad_norm": 0.08414289355278015,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4811
    },
    {
      "epoch": 0.76992,
      "grad_norm": 0.08927848190069199,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 4812
    },
    {
      "epoch": 0.77008,
      "grad_norm": 0.08218301832675934,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 4813
    },
    {
      "epoch": 0.77024,
      "grad_norm": 0.09043259918689728,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4814
    },
    {
      "epoch": 0.7704,
      "grad_norm": 0.08439834415912628,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4815
    },
    {
      "epoch": 0.77056,
      "grad_norm": 0.08021906018257141,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4816
    },
    {
      "epoch": 0.77072,
      "grad_norm": 0.08045818656682968,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4817
    },
    {
      "epoch": 0.77088,
      "grad_norm": 0.09370866417884827,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4818
    },
    {
      "epoch": 0.77104,
      "grad_norm": 0.1106676235795021,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4819
    },
    {
      "epoch": 0.7712,
      "grad_norm": 0.09512150287628174,
      "learning_rate": 0.0001,
      "loss": 0.3068,
      "step": 4820
    },
    {
      "epoch": 0.77136,
      "grad_norm": 0.09272532165050507,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4821
    },
    {
      "epoch": 0.77152,
      "grad_norm": 0.09085975587368011,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4822
    },
    {
      "epoch": 0.77168,
      "grad_norm": 0.08205710351467133,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4823
    },
    {
      "epoch": 0.77184,
      "grad_norm": 0.0913340300321579,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4824
    },
    {
      "epoch": 0.772,
      "grad_norm": 0.11097217351198196,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4825
    },
    {
      "epoch": 0.77216,
      "grad_norm": 0.09551462531089783,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 4826
    },
    {
      "epoch": 0.77232,
      "grad_norm": 0.09375449270009995,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4827
    },
    {
      "epoch": 0.77248,
      "grad_norm": 0.08811596035957336,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4828
    },
    {
      "epoch": 0.77264,
      "grad_norm": 0.09928885847330093,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 4829
    },
    {
      "epoch": 0.7728,
      "grad_norm": 0.09029415994882584,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4830
    },
    {
      "epoch": 0.77296,
      "grad_norm": 0.07943248748779297,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4831
    },
    {
      "epoch": 0.77312,
      "grad_norm": 0.08547211438417435,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 4832
    },
    {
      "epoch": 0.77328,
      "grad_norm": 0.10382518917322159,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4833
    },
    {
      "epoch": 0.77344,
      "grad_norm": 0.09226837754249573,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4834
    },
    {
      "epoch": 0.7736,
      "grad_norm": 0.09309370815753937,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4835
    },
    {
      "epoch": 0.77376,
      "grad_norm": 0.09063832461833954,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4836
    },
    {
      "epoch": 0.77392,
      "grad_norm": 0.08162424713373184,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4837
    },
    {
      "epoch": 0.77408,
      "grad_norm": 0.09232041239738464,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4838
    },
    {
      "epoch": 0.77424,
      "grad_norm": 0.0837569460272789,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4839
    },
    {
      "epoch": 0.7744,
      "grad_norm": 0.0954430103302002,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4840
    },
    {
      "epoch": 0.77456,
      "grad_norm": 0.11195167154073715,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 4841
    },
    {
      "epoch": 0.77472,
      "grad_norm": 0.07875260710716248,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4842
    },
    {
      "epoch": 0.77488,
      "grad_norm": 0.09428126364946365,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 4843
    },
    {
      "epoch": 0.77504,
      "grad_norm": 0.08424300700426102,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4844
    },
    {
      "epoch": 0.7752,
      "grad_norm": 0.10527181625366211,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4845
    },
    {
      "epoch": 0.77536,
      "grad_norm": 0.08693455904722214,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4846
    },
    {
      "epoch": 0.77552,
      "grad_norm": 0.10641183704137802,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4847
    },
    {
      "epoch": 0.77568,
      "grad_norm": 0.09504800289869308,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 4848
    },
    {
      "epoch": 0.77584,
      "grad_norm": 0.08139324188232422,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4849
    },
    {
      "epoch": 0.776,
      "grad_norm": 0.08724647760391235,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 4850
    },
    {
      "epoch": 0.77616,
      "grad_norm": 0.10173948854207993,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4851
    },
    {
      "epoch": 0.77632,
      "grad_norm": 0.08191876113414764,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4852
    },
    {
      "epoch": 0.77648,
      "grad_norm": 0.10899841785430908,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4853
    },
    {
      "epoch": 0.77664,
      "grad_norm": 0.08875498920679092,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4854
    },
    {
      "epoch": 0.7768,
      "grad_norm": 0.08759886026382446,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4855
    },
    {
      "epoch": 0.77696,
      "grad_norm": 0.0831461250782013,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4856
    },
    {
      "epoch": 0.77712,
      "grad_norm": 0.0897955521941185,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4857
    },
    {
      "epoch": 0.77728,
      "grad_norm": 0.08276011794805527,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4858
    },
    {
      "epoch": 0.77744,
      "grad_norm": 0.09695171564817429,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4859
    },
    {
      "epoch": 0.7776,
      "grad_norm": 0.10676033049821854,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4860
    },
    {
      "epoch": 0.77776,
      "grad_norm": 0.09499095380306244,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4861
    },
    {
      "epoch": 0.77792,
      "grad_norm": 0.08886317163705826,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4862
    },
    {
      "epoch": 0.77808,
      "grad_norm": 0.09403140097856522,
      "learning_rate": 0.0001,
      "loss": 0.3025,
      "step": 4863
    },
    {
      "epoch": 0.77824,
      "grad_norm": 0.09240177273750305,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 4864
    },
    {
      "epoch": 0.7784,
      "grad_norm": 0.09492512792348862,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4865
    },
    {
      "epoch": 0.77856,
      "grad_norm": 0.09817413985729218,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4866
    },
    {
      "epoch": 0.77872,
      "grad_norm": 0.08241602033376694,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4867
    },
    {
      "epoch": 0.77888,
      "grad_norm": 0.10418970882892609,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4868
    },
    {
      "epoch": 0.77904,
      "grad_norm": 0.09149368107318878,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4869
    },
    {
      "epoch": 0.7792,
      "grad_norm": 0.10188008844852448,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4870
    },
    {
      "epoch": 0.77936,
      "grad_norm": 0.08941265940666199,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 4871
    },
    {
      "epoch": 0.77952,
      "grad_norm": 0.08748690038919449,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4872
    },
    {
      "epoch": 0.77968,
      "grad_norm": 0.07913973927497864,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4873
    },
    {
      "epoch": 0.77984,
      "grad_norm": 0.07714512199163437,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4874
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.08777415007352829,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 4875
    },
    {
      "epoch": 0.78016,
      "grad_norm": 0.07889862358570099,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4876
    },
    {
      "epoch": 0.78032,
      "grad_norm": 0.08823138475418091,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4877
    },
    {
      "epoch": 0.78048,
      "grad_norm": 0.08461388200521469,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 4878
    },
    {
      "epoch": 0.78064,
      "grad_norm": 0.10134247690439224,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4879
    },
    {
      "epoch": 0.7808,
      "grad_norm": 0.08456723392009735,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 4880
    },
    {
      "epoch": 0.78096,
      "grad_norm": 0.08155820518732071,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 4881
    },
    {
      "epoch": 0.78112,
      "grad_norm": 0.07930285483598709,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 4882
    },
    {
      "epoch": 0.78128,
      "grad_norm": 0.09124910086393356,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4883
    },
    {
      "epoch": 0.78144,
      "grad_norm": 0.08323683589696884,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4884
    },
    {
      "epoch": 0.7816,
      "grad_norm": 0.09348209202289581,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 4885
    },
    {
      "epoch": 0.78176,
      "grad_norm": 0.10471218079328537,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4886
    },
    {
      "epoch": 0.78192,
      "grad_norm": 0.08309104293584824,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4887
    },
    {
      "epoch": 0.78208,
      "grad_norm": 0.07803390175104141,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4888
    },
    {
      "epoch": 0.78224,
      "grad_norm": 0.09413014352321625,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4889
    },
    {
      "epoch": 0.7824,
      "grad_norm": 0.09029475599527359,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4890
    },
    {
      "epoch": 0.78256,
      "grad_norm": 0.08223146945238113,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4891
    },
    {
      "epoch": 0.78272,
      "grad_norm": 0.0855419859290123,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4892
    },
    {
      "epoch": 0.78288,
      "grad_norm": 0.13001392781734467,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4893
    },
    {
      "epoch": 0.78304,
      "grad_norm": 0.09554099291563034,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 4894
    },
    {
      "epoch": 0.7832,
      "grad_norm": 0.09553646296262741,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 4895
    },
    {
      "epoch": 0.78336,
      "grad_norm": 0.08430814743041992,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 4896
    },
    {
      "epoch": 0.78352,
      "grad_norm": 0.07922805845737457,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4897
    },
    {
      "epoch": 0.78368,
      "grad_norm": 0.0907321497797966,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4898
    },
    {
      "epoch": 0.78384,
      "grad_norm": 0.10460870712995529,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4899
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.09677315503358841,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4900
    },
    {
      "epoch": 0.784,
      "eval_train_accuracy": 0.999,
      "eval_train_loss": 0.31782934069633484,
      "eval_train_runtime": 4.2137,
      "eval_train_samples_per_second": 1186.617,
      "eval_train_steps_per_second": 14.951,
      "step": 4900
    },
    {
      "epoch": 0.784,
      "eval_test_accuracy": 0.9992,
      "eval_test_loss": 0.3164985477924347,
      "eval_test_runtime": 4.7455,
      "eval_test_samples_per_second": 1053.639,
      "eval_test_steps_per_second": 13.276,
      "step": 4900
    },
    {
      "epoch": 0.78416,
      "grad_norm": 0.10185983031988144,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 4901
    },
    {
      "epoch": 0.78432,
      "grad_norm": 0.10032326728105545,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4902
    },
    {
      "epoch": 0.78448,
      "grad_norm": 0.08590216189622879,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 4903
    },
    {
      "epoch": 0.78464,
      "grad_norm": 0.10299061983823776,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4904
    },
    {
      "epoch": 0.7848,
      "grad_norm": 0.08911418914794922,
      "learning_rate": 0.0001,
      "loss": 0.3027,
      "step": 4905
    },
    {
      "epoch": 0.78496,
      "grad_norm": 0.08578881621360779,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4906
    },
    {
      "epoch": 0.78512,
      "grad_norm": 0.09303522109985352,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4907
    },
    {
      "epoch": 0.78528,
      "grad_norm": 0.0972108393907547,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4908
    },
    {
      "epoch": 0.78544,
      "grad_norm": 0.12027158588171005,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 4909
    },
    {
      "epoch": 0.7856,
      "grad_norm": 0.10774190723896027,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 4910
    },
    {
      "epoch": 0.78576,
      "grad_norm": 0.10412152856588364,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4911
    },
    {
      "epoch": 0.78592,
      "grad_norm": 0.08298066258430481,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4912
    },
    {
      "epoch": 0.78608,
      "grad_norm": 0.0846613198518753,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4913
    },
    {
      "epoch": 0.78624,
      "grad_norm": 0.08314985036849976,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4914
    },
    {
      "epoch": 0.7864,
      "grad_norm": 0.11544153094291687,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4915
    },
    {
      "epoch": 0.78656,
      "grad_norm": 0.10052710026502609,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4916
    },
    {
      "epoch": 0.78672,
      "grad_norm": 0.08746615052223206,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4917
    },
    {
      "epoch": 0.78688,
      "grad_norm": 0.12391967326402664,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 4918
    },
    {
      "epoch": 0.78704,
      "grad_norm": 0.08955474197864532,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 4919
    },
    {
      "epoch": 0.7872,
      "grad_norm": 0.08042719960212708,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 4920
    },
    {
      "epoch": 0.78736,
      "grad_norm": 0.08945850282907486,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 4921
    },
    {
      "epoch": 0.78752,
      "grad_norm": 0.09970130771398544,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 4922
    },
    {
      "epoch": 0.78768,
      "grad_norm": 0.08837755024433136,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 4923
    },
    {
      "epoch": 0.78784,
      "grad_norm": 0.08514130860567093,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4924
    },
    {
      "epoch": 0.788,
      "grad_norm": 0.08518324047327042,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4925
    },
    {
      "epoch": 0.78816,
      "grad_norm": 0.09182074666023254,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4926
    },
    {
      "epoch": 0.78832,
      "grad_norm": 0.08765412867069244,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4927
    },
    {
      "epoch": 0.78848,
      "grad_norm": 0.08440112322568893,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4928
    },
    {
      "epoch": 0.78864,
      "grad_norm": 0.0828220546245575,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4929
    },
    {
      "epoch": 0.7888,
      "grad_norm": 0.08884984254837036,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 4930
    },
    {
      "epoch": 0.78896,
      "grad_norm": 0.09566967934370041,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4931
    },
    {
      "epoch": 0.78912,
      "grad_norm": 0.100941501557827,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4932
    },
    {
      "epoch": 0.78928,
      "grad_norm": 0.09642868489027023,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4933
    },
    {
      "epoch": 0.78944,
      "grad_norm": 0.09030818939208984,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 4934
    },
    {
      "epoch": 0.7896,
      "grad_norm": 0.07900562882423401,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4935
    },
    {
      "epoch": 0.78976,
      "grad_norm": 0.08711912482976913,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 4936
    },
    {
      "epoch": 0.78992,
      "grad_norm": 0.09047558158636093,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4937
    },
    {
      "epoch": 0.79008,
      "grad_norm": 0.08639171719551086,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4938
    },
    {
      "epoch": 0.79024,
      "grad_norm": 0.09252894669771194,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 4939
    },
    {
      "epoch": 0.7904,
      "grad_norm": 0.08329755812883377,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4940
    },
    {
      "epoch": 0.79056,
      "grad_norm": 0.08056874573230743,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4941
    },
    {
      "epoch": 0.79072,
      "grad_norm": 0.08526380360126495,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4942
    },
    {
      "epoch": 0.79088,
      "grad_norm": 0.14726188778877258,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 4943
    },
    {
      "epoch": 0.79104,
      "grad_norm": 0.10509707033634186,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4944
    },
    {
      "epoch": 0.7912,
      "grad_norm": 0.08966390043497086,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4945
    },
    {
      "epoch": 0.79136,
      "grad_norm": 0.09555799514055252,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4946
    },
    {
      "epoch": 0.79152,
      "grad_norm": 0.089072585105896,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4947
    },
    {
      "epoch": 0.79168,
      "grad_norm": 0.08614581823348999,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4948
    },
    {
      "epoch": 0.79184,
      "grad_norm": 0.08130360394716263,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4949
    },
    {
      "epoch": 0.792,
      "grad_norm": 0.09423043578863144,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 4950
    },
    {
      "epoch": 0.79216,
      "grad_norm": 0.08800714462995529,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4951
    },
    {
      "epoch": 0.79232,
      "grad_norm": 0.13508395850658417,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 4952
    },
    {
      "epoch": 0.79248,
      "grad_norm": 0.10171888023614883,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4953
    },
    {
      "epoch": 0.79264,
      "grad_norm": 0.08572611957788467,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 4954
    },
    {
      "epoch": 0.7928,
      "grad_norm": 0.09354221820831299,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4955
    },
    {
      "epoch": 0.79296,
      "grad_norm": 0.08537409454584122,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4956
    },
    {
      "epoch": 0.79312,
      "grad_norm": 0.08222603797912598,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4957
    },
    {
      "epoch": 0.79328,
      "grad_norm": 0.08936991542577744,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4958
    },
    {
      "epoch": 0.79344,
      "grad_norm": 0.0869365930557251,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 4959
    },
    {
      "epoch": 0.7936,
      "grad_norm": 0.09130645543336868,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 4960
    },
    {
      "epoch": 0.79376,
      "grad_norm": 0.09362369775772095,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 4961
    },
    {
      "epoch": 0.79392,
      "grad_norm": 0.0866287425160408,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4962
    },
    {
      "epoch": 0.79408,
      "grad_norm": 0.08481336385011673,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4963
    },
    {
      "epoch": 0.79424,
      "grad_norm": 0.08323212713003159,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 4964
    },
    {
      "epoch": 0.7944,
      "grad_norm": 0.0797668993473053,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4965
    },
    {
      "epoch": 0.79456,
      "grad_norm": 0.08048320561647415,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4966
    },
    {
      "epoch": 0.79472,
      "grad_norm": 0.11239370703697205,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 4967
    },
    {
      "epoch": 0.79488,
      "grad_norm": 0.1003417819738388,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4968
    },
    {
      "epoch": 0.79504,
      "grad_norm": 0.08782367408275604,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4969
    },
    {
      "epoch": 0.7952,
      "grad_norm": 0.09872852265834808,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4970
    },
    {
      "epoch": 0.79536,
      "grad_norm": 0.08365321904420853,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4971
    },
    {
      "epoch": 0.79552,
      "grad_norm": 0.07615386694669724,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4972
    },
    {
      "epoch": 0.79568,
      "grad_norm": 0.08181657642126083,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 4973
    },
    {
      "epoch": 0.79584,
      "grad_norm": 0.0977020412683487,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4974
    },
    {
      "epoch": 0.796,
      "grad_norm": 0.08990862965583801,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 4975
    },
    {
      "epoch": 0.79616,
      "grad_norm": 0.08739979565143585,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4976
    },
    {
      "epoch": 0.79632,
      "grad_norm": 0.09073053300380707,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4977
    },
    {
      "epoch": 0.79648,
      "grad_norm": 0.08664493262767792,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4978
    },
    {
      "epoch": 0.79664,
      "grad_norm": 0.0981147289276123,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4979
    },
    {
      "epoch": 0.7968,
      "grad_norm": 0.08028954267501831,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4980
    },
    {
      "epoch": 0.79696,
      "grad_norm": 0.09966521710157394,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 4981
    },
    {
      "epoch": 0.79712,
      "grad_norm": 0.0908256396651268,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4982
    },
    {
      "epoch": 0.79728,
      "grad_norm": 0.12501442432403564,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4983
    },
    {
      "epoch": 0.79744,
      "grad_norm": 0.1728144735097885,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 4984
    },
    {
      "epoch": 0.7976,
      "grad_norm": 0.09930454939603806,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 4985
    },
    {
      "epoch": 0.79776,
      "grad_norm": 0.09073538333177567,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4986
    },
    {
      "epoch": 0.79792,
      "grad_norm": 0.11927742511034012,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4987
    },
    {
      "epoch": 0.79808,
      "grad_norm": 0.10692103952169418,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4988
    },
    {
      "epoch": 0.79824,
      "grad_norm": 0.08748091012239456,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4989
    },
    {
      "epoch": 0.7984,
      "grad_norm": 0.09611420333385468,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 4990
    },
    {
      "epoch": 0.79856,
      "grad_norm": 0.13240043818950653,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4991
    },
    {
      "epoch": 0.79872,
      "grad_norm": 0.11972174793481827,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4992
    },
    {
      "epoch": 0.79888,
      "grad_norm": 0.08192935585975647,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4993
    },
    {
      "epoch": 0.79904,
      "grad_norm": 0.0855807214975357,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4994
    },
    {
      "epoch": 0.7992,
      "grad_norm": 0.09323396533727646,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 4995
    },
    {
      "epoch": 0.79936,
      "grad_norm": 0.07834307849407196,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4996
    },
    {
      "epoch": 0.79952,
      "grad_norm": 0.0929289162158966,
      "learning_rate": 0.0001,
      "loss": 0.3028,
      "step": 4997
    },
    {
      "epoch": 0.79968,
      "grad_norm": 0.10356953740119934,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4998
    },
    {
      "epoch": 0.79984,
      "grad_norm": 0.10383547842502594,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4999
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.09076442569494247,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5000
    },
    {
      "epoch": 0.8,
      "eval_train_accuracy": 0.999,
      "eval_train_loss": 0.317947655916214,
      "eval_train_runtime": 4.1315,
      "eval_train_samples_per_second": 1210.218,
      "eval_train_steps_per_second": 15.249,
      "step": 5000
    },
    {
      "epoch": 0.8,
      "eval_test_accuracy": 0.9978,
      "eval_test_loss": 0.3167368173599243,
      "eval_test_runtime": 4.9656,
      "eval_test_samples_per_second": 1006.929,
      "eval_test_steps_per_second": 12.687,
      "step": 5000
    },
    {
      "epoch": 0.80016,
      "grad_norm": 0.09865077584981918,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5001
    },
    {
      "epoch": 0.80032,
      "grad_norm": 0.07974892854690552,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5002
    },
    {
      "epoch": 0.80048,
      "grad_norm": 0.09323906898498535,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 5003
    },
    {
      "epoch": 0.80064,
      "grad_norm": 0.09796729683876038,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5004
    },
    {
      "epoch": 0.8008,
      "grad_norm": 0.09896053373813629,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5005
    },
    {
      "epoch": 0.80096,
      "grad_norm": 0.11269761621952057,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5006
    },
    {
      "epoch": 0.80112,
      "grad_norm": 0.0807519480586052,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5007
    },
    {
      "epoch": 0.80128,
      "grad_norm": 0.08519124239683151,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5008
    },
    {
      "epoch": 0.80144,
      "grad_norm": 0.09220308065414429,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 5009
    },
    {
      "epoch": 0.8016,
      "grad_norm": 0.09818018227815628,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 5010
    },
    {
      "epoch": 0.80176,
      "grad_norm": 0.08702541887760162,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5011
    },
    {
      "epoch": 0.80192,
      "grad_norm": 0.22175392508506775,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5012
    },
    {
      "epoch": 0.80208,
      "grad_norm": 0.0840449258685112,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5013
    },
    {
      "epoch": 0.80224,
      "grad_norm": 0.08650463074445724,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5014
    },
    {
      "epoch": 0.8024,
      "grad_norm": 0.0924236923456192,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 5015
    },
    {
      "epoch": 0.80256,
      "grad_norm": 0.14440567791461945,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5016
    },
    {
      "epoch": 0.80272,
      "grad_norm": 0.0914321094751358,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5017
    },
    {
      "epoch": 0.80288,
      "grad_norm": 0.16215607523918152,
      "learning_rate": 0.0001,
      "loss": 0.2998,
      "step": 5018
    },
    {
      "epoch": 0.80304,
      "grad_norm": 0.09252069145441055,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 5019
    },
    {
      "epoch": 0.8032,
      "grad_norm": 0.12560835480690002,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5020
    },
    {
      "epoch": 0.80336,
      "grad_norm": 0.13223178684711456,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5021
    },
    {
      "epoch": 0.80352,
      "grad_norm": 0.10420873016119003,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5022
    },
    {
      "epoch": 0.80368,
      "grad_norm": 0.08677496016025543,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 5023
    },
    {
      "epoch": 0.80384,
      "grad_norm": 0.09514200687408447,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5024
    },
    {
      "epoch": 0.804,
      "grad_norm": 0.11867032945156097,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 5025
    },
    {
      "epoch": 0.80416,
      "grad_norm": 0.08029142022132874,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5026
    },
    {
      "epoch": 0.80432,
      "grad_norm": 0.08596596866846085,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5027
    },
    {
      "epoch": 0.80448,
      "grad_norm": 0.12171486765146255,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5028
    },
    {
      "epoch": 0.80464,
      "grad_norm": 0.088984914124012,
      "learning_rate": 0.0001,
      "loss": 0.2988,
      "step": 5029
    },
    {
      "epoch": 0.8048,
      "grad_norm": 0.10912458598613739,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 5030
    },
    {
      "epoch": 0.80496,
      "grad_norm": 0.09119132161140442,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5031
    },
    {
      "epoch": 0.80512,
      "grad_norm": 0.16670526564121246,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 5032
    },
    {
      "epoch": 0.80528,
      "grad_norm": 0.08979348838329315,
      "learning_rate": 0.0001,
      "loss": 0.3029,
      "step": 5033
    },
    {
      "epoch": 0.80544,
      "grad_norm": 0.11581910401582718,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5034
    },
    {
      "epoch": 0.8056,
      "grad_norm": 0.0954853743314743,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5035
    },
    {
      "epoch": 0.80576,
      "grad_norm": 0.1133570596575737,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5036
    },
    {
      "epoch": 0.80592,
      "grad_norm": 0.13264015316963196,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5037
    },
    {
      "epoch": 0.80608,
      "grad_norm": 0.108506940305233,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5038
    },
    {
      "epoch": 0.80624,
      "grad_norm": 0.08499585837125778,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5039
    },
    {
      "epoch": 0.8064,
      "grad_norm": 0.10191807150840759,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5040
    },
    {
      "epoch": 0.80656,
      "grad_norm": 0.0988861620426178,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5041
    },
    {
      "epoch": 0.80672,
      "grad_norm": 0.08427195250988007,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5042
    },
    {
      "epoch": 0.80688,
      "grad_norm": 0.07701604068279266,
      "learning_rate": 0.0001,
      "loss": 0.3001,
      "step": 5043
    },
    {
      "epoch": 0.80704,
      "grad_norm": 0.09965730458498001,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5044
    },
    {
      "epoch": 0.8072,
      "grad_norm": 0.0932433232665062,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5045
    },
    {
      "epoch": 0.80736,
      "grad_norm": 0.13495834171772003,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5046
    },
    {
      "epoch": 0.80752,
      "grad_norm": 0.1624630242586136,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5047
    },
    {
      "epoch": 0.80768,
      "grad_norm": 0.09751502424478531,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5048
    },
    {
      "epoch": 0.80784,
      "grad_norm": 0.1015244722366333,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5049
    },
    {
      "epoch": 0.808,
      "grad_norm": 0.09030548483133316,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5050
    },
    {
      "epoch": 0.80816,
      "grad_norm": 0.12733466923236847,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5051
    },
    {
      "epoch": 0.80832,
      "grad_norm": 0.10049954056739807,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5052
    },
    {
      "epoch": 0.80848,
      "grad_norm": 0.11783634126186371,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5053
    },
    {
      "epoch": 0.80864,
      "grad_norm": 0.09961539506912231,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 5054
    },
    {
      "epoch": 0.8088,
      "grad_norm": 0.13303664326667786,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5055
    },
    {
      "epoch": 0.80896,
      "grad_norm": 0.08185447007417679,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5056
    },
    {
      "epoch": 0.80912,
      "grad_norm": 0.10439363867044449,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 5057
    },
    {
      "epoch": 0.80928,
      "grad_norm": 0.1224546805024147,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5058
    },
    {
      "epoch": 0.80944,
      "grad_norm": 0.09169064462184906,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5059
    },
    {
      "epoch": 0.8096,
      "grad_norm": 0.09299400448799133,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5060
    },
    {
      "epoch": 0.80976,
      "grad_norm": 0.09133937209844589,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5061
    },
    {
      "epoch": 0.80992,
      "grad_norm": 0.08038719743490219,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5062
    },
    {
      "epoch": 0.81008,
      "grad_norm": 0.09391286224126816,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5063
    },
    {
      "epoch": 0.81024,
      "grad_norm": 0.08748192340135574,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5064
    },
    {
      "epoch": 0.8104,
      "grad_norm": 0.11556902527809143,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5065
    },
    {
      "epoch": 0.81056,
      "grad_norm": 0.11894078552722931,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5066
    },
    {
      "epoch": 0.81072,
      "grad_norm": 0.09494071453809738,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5067
    },
    {
      "epoch": 0.81088,
      "grad_norm": 0.11053194850683212,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 5068
    },
    {
      "epoch": 0.81104,
      "grad_norm": 0.0792020708322525,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5069
    },
    {
      "epoch": 0.8112,
      "grad_norm": 0.09630142897367477,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5070
    },
    {
      "epoch": 0.81136,
      "grad_norm": 0.1021367609500885,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5071
    },
    {
      "epoch": 0.81152,
      "grad_norm": 0.09612586349248886,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5072
    },
    {
      "epoch": 0.81168,
      "grad_norm": 0.10171480476856232,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5073
    },
    {
      "epoch": 0.81184,
      "grad_norm": 0.08768041431903839,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5074
    },
    {
      "epoch": 0.812,
      "grad_norm": 0.08950129896402359,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5075
    },
    {
      "epoch": 0.81216,
      "grad_norm": 0.08608464896678925,
      "learning_rate": 0.0001,
      "loss": 0.3018,
      "step": 5076
    },
    {
      "epoch": 0.81232,
      "grad_norm": 0.09059929102659225,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5077
    },
    {
      "epoch": 0.81248,
      "grad_norm": 0.08993927389383316,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5078
    },
    {
      "epoch": 0.81264,
      "grad_norm": 0.09697405993938446,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5079
    },
    {
      "epoch": 0.8128,
      "grad_norm": 0.08368620276451111,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 5080
    },
    {
      "epoch": 0.81296,
      "grad_norm": 0.08697747439146042,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5081
    },
    {
      "epoch": 0.81312,
      "grad_norm": 0.09285405278205872,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5082
    },
    {
      "epoch": 0.81328,
      "grad_norm": 0.08847616612911224,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 5083
    },
    {
      "epoch": 0.81344,
      "grad_norm": 0.08411356061697006,
      "learning_rate": 0.0001,
      "loss": 0.302,
      "step": 5084
    },
    {
      "epoch": 0.8136,
      "grad_norm": 0.11551974713802338,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5085
    },
    {
      "epoch": 0.81376,
      "grad_norm": 0.08778053522109985,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5086
    },
    {
      "epoch": 0.81392,
      "grad_norm": 0.09170190244913101,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5087
    },
    {
      "epoch": 0.81408,
      "grad_norm": 0.10783953219652176,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5088
    },
    {
      "epoch": 0.81424,
      "grad_norm": 0.09486955404281616,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5089
    },
    {
      "epoch": 0.8144,
      "grad_norm": 0.10669907182455063,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5090
    },
    {
      "epoch": 0.81456,
      "grad_norm": 0.10069075971841812,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 5091
    },
    {
      "epoch": 0.81472,
      "grad_norm": 0.09019264578819275,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5092
    },
    {
      "epoch": 0.81488,
      "grad_norm": 0.08262171596288681,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 5093
    },
    {
      "epoch": 0.81504,
      "grad_norm": 0.09556044638156891,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5094
    },
    {
      "epoch": 0.8152,
      "grad_norm": 0.08433869481086731,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 5095
    },
    {
      "epoch": 0.81536,
      "grad_norm": 0.08629722148180008,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 5096
    },
    {
      "epoch": 0.81552,
      "grad_norm": 0.08062512427568436,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 5097
    },
    {
      "epoch": 0.81568,
      "grad_norm": 0.09498633444309235,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 5098
    },
    {
      "epoch": 0.81584,
      "grad_norm": 0.08336515724658966,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5099
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.09216403216123581,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 5100
    },
    {
      "epoch": 0.816,
      "eval_train_accuracy": 0.9986,
      "eval_train_loss": 0.31785064935684204,
      "eval_train_runtime": 4.094,
      "eval_train_samples_per_second": 1221.302,
      "eval_train_steps_per_second": 15.388,
      "step": 5100
    },
    {
      "epoch": 0.816,
      "eval_test_accuracy": 0.999,
      "eval_test_loss": 0.3164266049861908,
      "eval_test_runtime": 4.9811,
      "eval_test_samples_per_second": 1003.799,
      "eval_test_steps_per_second": 12.648,
      "step": 5100
    },
    {
      "epoch": 0.81616,
      "grad_norm": 0.08604211360216141,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5101
    },
    {
      "epoch": 0.81632,
      "grad_norm": 0.08792625367641449,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 5102
    },
    {
      "epoch": 0.81648,
      "grad_norm": 0.08345948159694672,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5103
    },
    {
      "epoch": 0.81664,
      "grad_norm": 0.0776909738779068,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5104
    },
    {
      "epoch": 0.8168,
      "grad_norm": 0.09722542017698288,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5105
    },
    {
      "epoch": 0.81696,
      "grad_norm": 0.10452016443014145,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5106
    },
    {
      "epoch": 0.81712,
      "grad_norm": 0.10308758169412613,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5107
    },
    {
      "epoch": 0.81728,
      "grad_norm": 0.08456357568502426,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 5108
    },
    {
      "epoch": 0.81744,
      "grad_norm": 0.07911553978919983,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 5109
    },
    {
      "epoch": 0.8176,
      "grad_norm": 0.08280887454748154,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5110
    },
    {
      "epoch": 0.81776,
      "grad_norm": 0.09932220727205276,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 5111
    },
    {
      "epoch": 0.81792,
      "grad_norm": 0.08699894696474075,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5112
    },
    {
      "epoch": 0.81808,
      "grad_norm": 0.08527319878339767,
      "learning_rate": 0.0001,
      "loss": 0.306,
      "step": 5113
    },
    {
      "epoch": 0.81824,
      "grad_norm": 0.117393359541893,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 5114
    },
    {
      "epoch": 0.8184,
      "grad_norm": 0.10085949301719666,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5115
    },
    {
      "epoch": 0.81856,
      "grad_norm": 0.09110230952501297,
      "learning_rate": 0.0001,
      "loss": 0.2997,
      "step": 5116
    },
    {
      "epoch": 0.81872,
      "grad_norm": 0.09965334087610245,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5117
    },
    {
      "epoch": 0.81888,
      "grad_norm": 0.09646093100309372,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 5118
    },
    {
      "epoch": 0.81904,
      "grad_norm": 0.10136698931455612,
      "learning_rate": 0.0001,
      "loss": 0.3005,
      "step": 5119
    },
    {
      "epoch": 0.8192,
      "grad_norm": 0.08828577399253845,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 5120
    },
    {
      "epoch": 0.81936,
      "grad_norm": 0.07811138033866882,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5121
    },
    {
      "epoch": 0.81952,
      "grad_norm": 0.09105711430311203,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 5122
    },
    {
      "epoch": 0.81968,
      "grad_norm": 0.10701604932546616,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 5123
    },
    {
      "epoch": 0.81984,
      "grad_norm": 0.08525028079748154,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 5124
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.09347667545080185,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5125
    },
    {
      "epoch": 0.82016,
      "grad_norm": 0.09103657305240631,
      "learning_rate": 0.0001,
      "loss": 0.3004,
      "step": 5126
    },
    {
      "epoch": 0.82032,
      "grad_norm": 0.0824703574180603,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5127
    },
    {
      "epoch": 0.82048,
      "grad_norm": 0.18896573781967163,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5128
    },
    {
      "epoch": 0.82064,
      "grad_norm": 0.08928491920232773,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5129
    },
    {
      "epoch": 0.8208,
      "grad_norm": 0.1028972938656807,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 5130
    },
    {
      "epoch": 0.82096,
      "grad_norm": 0.09287220239639282,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 5131
    },
    {
      "epoch": 0.82112,
      "grad_norm": 0.08541052788496017,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5132
    },
    {
      "epoch": 0.82128,
      "grad_norm": 0.10913098603487015,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 5133
    },
    {
      "epoch": 0.82144,
      "grad_norm": 0.10127492994070053,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5134
    },
    {
      "epoch": 0.8216,
      "grad_norm": 0.08412835001945496,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5135
    },
    {
      "epoch": 0.82176,
      "grad_norm": 0.09163473546504974,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5136
    },
    {
      "epoch": 0.82192,
      "grad_norm": 0.09074093401432037,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5137
    },
    {
      "epoch": 0.82208,
      "grad_norm": 0.09669693559408188,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 5138
    },
    {
      "epoch": 0.82224,
      "grad_norm": 0.08900502324104309,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 5139
    },
    {
      "epoch": 0.8224,
      "grad_norm": 0.0900435671210289,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 5140
    },
    {
      "epoch": 0.82256,
      "grad_norm": 0.09176772087812424,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5141
    },
    {
      "epoch": 0.82272,
      "grad_norm": 0.0940905287861824,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5142
    },
    {
      "epoch": 0.82288,
      "grad_norm": 0.08938315510749817,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5143
    },
    {
      "epoch": 0.82304,
      "grad_norm": 0.08968272805213928,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5144
    },
    {
      "epoch": 0.8232,
      "grad_norm": 0.080335833132267,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5145
    },
    {
      "epoch": 0.82336,
      "grad_norm": 0.09684208780527115,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5146
    },
    {
      "epoch": 0.82352,
      "grad_norm": 0.08837808668613434,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5147
    },
    {
      "epoch": 0.82368,
      "grad_norm": 0.08971530199050903,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5148
    },
    {
      "epoch": 0.82384,
      "grad_norm": 0.0827176570892334,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5149
    },
    {
      "epoch": 0.824,
      "grad_norm": 0.08667916804552078,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5150
    },
    {
      "epoch": 0.82416,
      "grad_norm": 0.07757297903299332,
      "learning_rate": 0.0001,
      "loss": 0.3015,
      "step": 5151
    },
    {
      "epoch": 0.82432,
      "grad_norm": 0.08669645339250565,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 5152
    },
    {
      "epoch": 0.82448,
      "grad_norm": 0.08785837143659592,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5153
    },
    {
      "epoch": 0.82464,
      "grad_norm": 0.09146841615438461,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5154
    },
    {
      "epoch": 0.8248,
      "grad_norm": 0.08213765919208527,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 5155
    },
    {
      "epoch": 0.82496,
      "grad_norm": 0.08525179326534271,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5156
    },
    {
      "epoch": 0.82512,
      "grad_norm": 0.08380287885665894,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 5157
    },
    {
      "epoch": 0.82528,
      "grad_norm": 0.08436597138643265,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5158
    },
    {
      "epoch": 0.82544,
      "grad_norm": 0.08285000920295715,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5159
    },
    {
      "epoch": 0.8256,
      "grad_norm": 0.08802520483732224,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5160
    },
    {
      "epoch": 0.82576,
      "grad_norm": 0.09678132086992264,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5161
    },
    {
      "epoch": 0.82592,
      "grad_norm": 0.08538192510604858,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5162
    },
    {
      "epoch": 0.82608,
      "grad_norm": 0.09291491657495499,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5163
    },
    {
      "epoch": 0.82624,
      "grad_norm": 0.08889081329107285,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5164
    },
    {
      "epoch": 0.8264,
      "grad_norm": 0.09607432037591934,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5165
    },
    {
      "epoch": 0.82656,
      "grad_norm": 0.08439306169748306,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5166
    },
    {
      "epoch": 0.82672,
      "grad_norm": 0.0858217105269432,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 5167
    },
    {
      "epoch": 0.82688,
      "grad_norm": 0.08973438292741776,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5168
    },
    {
      "epoch": 0.82704,
      "grad_norm": 0.08547551929950714,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5169
    },
    {
      "epoch": 0.8272,
      "grad_norm": 0.09360873699188232,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5170
    },
    {
      "epoch": 0.82736,
      "grad_norm": 0.09033630043268204,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5171
    },
    {
      "epoch": 0.82752,
      "grad_norm": 0.0755566731095314,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5172
    },
    {
      "epoch": 0.82768,
      "grad_norm": 0.1205446869134903,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5173
    },
    {
      "epoch": 0.82784,
      "grad_norm": 0.08171968907117844,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5174
    },
    {
      "epoch": 0.828,
      "grad_norm": 0.07541840523481369,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5175
    },
    {
      "epoch": 0.82816,
      "grad_norm": 0.13340352475643158,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5176
    },
    {
      "epoch": 0.82832,
      "grad_norm": 0.09038161486387253,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5177
    },
    {
      "epoch": 0.82848,
      "grad_norm": 0.08334556967020035,
      "learning_rate": 0.0001,
      "loss": 0.3006,
      "step": 5178
    },
    {
      "epoch": 0.82864,
      "grad_norm": 0.1080857366323471,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5179
    },
    {
      "epoch": 0.8288,
      "grad_norm": 0.17817537486553192,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 5180
    },
    {
      "epoch": 0.82896,
      "grad_norm": 0.07794519513845444,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 5181
    },
    {
      "epoch": 0.82912,
      "grad_norm": 0.09578575938940048,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5182
    },
    {
      "epoch": 0.82928,
      "grad_norm": 0.09450113028287888,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5183
    },
    {
      "epoch": 0.82944,
      "grad_norm": 0.11241587996482849,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5184
    },
    {
      "epoch": 0.8296,
      "grad_norm": 0.11903110891580582,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5185
    },
    {
      "epoch": 0.82976,
      "grad_norm": 0.09405893087387085,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5186
    },
    {
      "epoch": 0.82992,
      "grad_norm": 0.08370094746351242,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5187
    },
    {
      "epoch": 0.83008,
      "grad_norm": 0.0975656658411026,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5188
    },
    {
      "epoch": 0.83024,
      "grad_norm": 0.1337975114583969,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 5189
    },
    {
      "epoch": 0.8304,
      "grad_norm": 0.09688027948141098,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5190
    },
    {
      "epoch": 0.83056,
      "grad_norm": 0.0960228443145752,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5191
    },
    {
      "epoch": 0.83072,
      "grad_norm": 0.10829973220825195,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5192
    },
    {
      "epoch": 0.83088,
      "grad_norm": 0.09726792573928833,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5193
    },
    {
      "epoch": 0.83104,
      "grad_norm": 0.10169006139039993,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5194
    },
    {
      "epoch": 0.8312,
      "grad_norm": 0.1024867594242096,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 5195
    },
    {
      "epoch": 0.83136,
      "grad_norm": 0.08462441712617874,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5196
    },
    {
      "epoch": 0.83152,
      "grad_norm": 0.11169711500406265,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5197
    },
    {
      "epoch": 0.83168,
      "grad_norm": 0.08959793299436569,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 5198
    },
    {
      "epoch": 0.83184,
      "grad_norm": 0.08958559483289719,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 5199
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.10098277777433395,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5200
    },
    {
      "epoch": 0.832,
      "eval_train_accuracy": 0.9984,
      "eval_train_loss": 0.3177601397037506,
      "eval_train_runtime": 4.2274,
      "eval_train_samples_per_second": 1182.754,
      "eval_train_steps_per_second": 14.903,
      "step": 5200
    },
    {
      "epoch": 0.832,
      "eval_test_accuracy": 0.9988,
      "eval_test_loss": 0.3166080415248871,
      "eval_test_runtime": 4.8801,
      "eval_test_samples_per_second": 1024.56,
      "eval_test_steps_per_second": 12.909,
      "step": 5200
    },
    {
      "epoch": 0.83216,
      "grad_norm": 0.08675993978977203,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5201
    },
    {
      "epoch": 0.83232,
      "grad_norm": 0.08698610216379166,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5202
    },
    {
      "epoch": 0.83248,
      "grad_norm": 0.08840823918581009,
      "learning_rate": 0.0001,
      "loss": 0.3027,
      "step": 5203
    },
    {
      "epoch": 0.83264,
      "grad_norm": 0.09918902814388275,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 5204
    },
    {
      "epoch": 0.8328,
      "grad_norm": 0.09730083495378494,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5205
    },
    {
      "epoch": 0.83296,
      "grad_norm": 0.0962328389286995,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5206
    },
    {
      "epoch": 0.83312,
      "grad_norm": 0.0841427892446518,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5207
    },
    {
      "epoch": 0.83328,
      "grad_norm": 0.08488868176937103,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5208
    },
    {
      "epoch": 0.83344,
      "grad_norm": 0.14290909469127655,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5209
    },
    {
      "epoch": 0.8336,
      "grad_norm": 0.0957610011100769,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5210
    },
    {
      "epoch": 0.83376,
      "grad_norm": 0.10697530955076218,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 5211
    },
    {
      "epoch": 0.83392,
      "grad_norm": 0.10230348259210587,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 5212
    },
    {
      "epoch": 0.83408,
      "grad_norm": 0.08384072780609131,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5213
    },
    {
      "epoch": 0.83424,
      "grad_norm": 0.08062706142663956,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5214
    },
    {
      "epoch": 0.8344,
      "grad_norm": 0.10445777326822281,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5215
    },
    {
      "epoch": 0.83456,
      "grad_norm": 0.09665912389755249,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5216
    },
    {
      "epoch": 0.83472,
      "grad_norm": 0.0830112025141716,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5217
    },
    {
      "epoch": 0.83488,
      "grad_norm": 0.10207729786634445,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5218
    },
    {
      "epoch": 0.83504,
      "grad_norm": 0.08197381347417831,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5219
    },
    {
      "epoch": 0.8352,
      "grad_norm": 0.10899922996759415,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 5220
    },
    {
      "epoch": 0.83536,
      "grad_norm": 0.08862383663654327,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5221
    },
    {
      "epoch": 0.83552,
      "grad_norm": 0.09984765946865082,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5222
    },
    {
      "epoch": 0.83568,
      "grad_norm": 0.09546259790658951,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5223
    },
    {
      "epoch": 0.83584,
      "grad_norm": 0.12167024612426758,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5224
    },
    {
      "epoch": 0.836,
      "grad_norm": 0.07930055260658264,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5225
    },
    {
      "epoch": 0.83616,
      "grad_norm": 0.09289433807134628,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5226
    },
    {
      "epoch": 0.83632,
      "grad_norm": 0.09005703032016754,
      "learning_rate": 0.0001,
      "loss": 0.3023,
      "step": 5227
    },
    {
      "epoch": 0.83648,
      "grad_norm": 0.10763604193925858,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5228
    },
    {
      "epoch": 0.83664,
      "grad_norm": 0.09811114519834518,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 5229
    },
    {
      "epoch": 0.8368,
      "grad_norm": 0.09591580182313919,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 5230
    },
    {
      "epoch": 0.83696,
      "grad_norm": 0.07771836221218109,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5231
    },
    {
      "epoch": 0.83712,
      "grad_norm": 0.08984102308750153,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5232
    },
    {
      "epoch": 0.83728,
      "grad_norm": 0.10401175916194916,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5233
    },
    {
      "epoch": 0.83744,
      "grad_norm": 0.11123688519001007,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5234
    },
    {
      "epoch": 0.8376,
      "grad_norm": 0.08509379625320435,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5235
    },
    {
      "epoch": 0.83776,
      "grad_norm": 0.08734672516584396,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5236
    },
    {
      "epoch": 0.83792,
      "grad_norm": 0.09891607612371445,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5237
    },
    {
      "epoch": 0.83808,
      "grad_norm": 0.092549167573452,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5238
    },
    {
      "epoch": 0.83824,
      "grad_norm": 0.09639161825180054,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5239
    },
    {
      "epoch": 0.8384,
      "grad_norm": 0.09399101138114929,
      "learning_rate": 0.0001,
      "loss": 0.3029,
      "step": 5240
    },
    {
      "epoch": 0.83856,
      "grad_norm": 0.08829733729362488,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5241
    },
    {
      "epoch": 0.83872,
      "grad_norm": 0.12971216440200806,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5242
    },
    {
      "epoch": 0.83888,
      "grad_norm": 0.07958570867776871,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5243
    },
    {
      "epoch": 0.83904,
      "grad_norm": 0.09221179783344269,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5244
    },
    {
      "epoch": 0.8392,
      "grad_norm": 0.09746633470058441,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5245
    },
    {
      "epoch": 0.83936,
      "grad_norm": 0.08642362058162689,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5246
    },
    {
      "epoch": 0.83952,
      "grad_norm": 0.0849427878856659,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5247
    },
    {
      "epoch": 0.83968,
      "grad_norm": 0.09854808449745178,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5248
    },
    {
      "epoch": 0.83984,
      "grad_norm": 0.09179044514894485,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5249
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.08789089322090149,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5250
    },
    {
      "epoch": 0.84016,
      "grad_norm": 0.09484479576349258,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5251
    },
    {
      "epoch": 0.84032,
      "grad_norm": 0.0863366425037384,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 5252
    },
    {
      "epoch": 0.84048,
      "grad_norm": 0.15011176466941833,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5253
    },
    {
      "epoch": 0.84064,
      "grad_norm": 0.11080773174762726,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 5254
    },
    {
      "epoch": 0.8408,
      "grad_norm": 0.08813636004924774,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5255
    },
    {
      "epoch": 0.84096,
      "grad_norm": 0.07737216353416443,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 5256
    },
    {
      "epoch": 0.84112,
      "grad_norm": 0.12861153483390808,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5257
    },
    {
      "epoch": 0.84128,
      "grad_norm": 0.1403771936893463,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5258
    },
    {
      "epoch": 0.84144,
      "grad_norm": 0.07560983300209045,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 5259
    },
    {
      "epoch": 0.8416,
      "grad_norm": 0.0926586166024208,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5260
    },
    {
      "epoch": 0.84176,
      "grad_norm": 0.0828508585691452,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5261
    },
    {
      "epoch": 0.84192,
      "grad_norm": 0.09213634580373764,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5262
    },
    {
      "epoch": 0.84208,
      "grad_norm": 0.0860881358385086,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 5263
    },
    {
      "epoch": 0.84224,
      "grad_norm": 0.08012578636407852,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5264
    },
    {
      "epoch": 0.8424,
      "grad_norm": 0.10032033920288086,
      "learning_rate": 0.0001,
      "loss": 0.3011,
      "step": 5265
    },
    {
      "epoch": 0.84256,
      "grad_norm": 0.09002503007650375,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5266
    },
    {
      "epoch": 0.84272,
      "grad_norm": 0.09087873250246048,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5267
    },
    {
      "epoch": 0.84288,
      "grad_norm": 0.1313062310218811,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5268
    },
    {
      "epoch": 0.84304,
      "grad_norm": 0.08296597003936768,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5269
    },
    {
      "epoch": 0.8432,
      "grad_norm": 0.1002461165189743,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5270
    },
    {
      "epoch": 0.84336,
      "grad_norm": 0.09691034257411957,
      "learning_rate": 0.0001,
      "loss": 0.2991,
      "step": 5271
    },
    {
      "epoch": 0.84352,
      "grad_norm": 0.10670231282711029,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5272
    },
    {
      "epoch": 0.84368,
      "grad_norm": 0.09061670303344727,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5273
    },
    {
      "epoch": 0.84384,
      "grad_norm": 0.08484773337841034,
      "learning_rate": 0.0001,
      "loss": 0.3018,
      "step": 5274
    },
    {
      "epoch": 0.844,
      "grad_norm": 0.08153655380010605,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5275
    },
    {
      "epoch": 0.84416,
      "grad_norm": 0.1036514863371849,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5276
    },
    {
      "epoch": 0.84432,
      "grad_norm": 0.08364763855934143,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 5277
    },
    {
      "epoch": 0.84448,
      "grad_norm": 0.09824992716312408,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5278
    },
    {
      "epoch": 0.84464,
      "grad_norm": 0.08094445616006851,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5279
    },
    {
      "epoch": 0.8448,
      "grad_norm": 0.09641021490097046,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5280
    },
    {
      "epoch": 0.84496,
      "grad_norm": 0.09866565465927124,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 5281
    },
    {
      "epoch": 0.84512,
      "grad_norm": 0.10971837490797043,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5282
    },
    {
      "epoch": 0.84528,
      "grad_norm": 0.09182460606098175,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 5283
    },
    {
      "epoch": 0.84544,
      "grad_norm": 0.09129247069358826,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 5284
    },
    {
      "epoch": 0.8456,
      "grad_norm": 0.08160796761512756,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 5285
    },
    {
      "epoch": 0.84576,
      "grad_norm": 0.08798468858003616,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5286
    },
    {
      "epoch": 0.84592,
      "grad_norm": 0.0986105352640152,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5287
    },
    {
      "epoch": 0.84608,
      "grad_norm": 0.09832214564085007,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5288
    },
    {
      "epoch": 0.84624,
      "grad_norm": 0.09405095875263214,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 5289
    },
    {
      "epoch": 0.8464,
      "grad_norm": 0.08788833767175674,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5290
    },
    {
      "epoch": 0.84656,
      "grad_norm": 0.07987421751022339,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5291
    },
    {
      "epoch": 0.84672,
      "grad_norm": 0.08979588001966476,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 5292
    },
    {
      "epoch": 0.84688,
      "grad_norm": 0.0903719812631607,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5293
    },
    {
      "epoch": 0.84704,
      "grad_norm": 0.08147973567247391,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 5294
    },
    {
      "epoch": 0.8472,
      "grad_norm": 0.07906532287597656,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5295
    },
    {
      "epoch": 0.84736,
      "grad_norm": 0.1219010129570961,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5296
    },
    {
      "epoch": 0.84752,
      "grad_norm": 0.08827068656682968,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 5297
    },
    {
      "epoch": 0.84768,
      "grad_norm": 0.08040911704301834,
      "learning_rate": 0.0001,
      "loss": 0.3015,
      "step": 5298
    },
    {
      "epoch": 0.84784,
      "grad_norm": 0.08359777927398682,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5299
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.09483711421489716,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5300
    },
    {
      "epoch": 0.848,
      "eval_train_accuracy": 0.9988,
      "eval_train_loss": 0.3177796006202698,
      "eval_train_runtime": 4.0781,
      "eval_train_samples_per_second": 1226.075,
      "eval_train_steps_per_second": 15.449,
      "step": 5300
    },
    {
      "epoch": 0.848,
      "eval_test_accuracy": 0.998,
      "eval_test_loss": 0.316503643989563,
      "eval_test_runtime": 5.0338,
      "eval_test_samples_per_second": 993.295,
      "eval_test_steps_per_second": 12.516,
      "step": 5300
    },
    {
      "epoch": 0.84816,
      "grad_norm": 0.09519629180431366,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 5301
    },
    {
      "epoch": 0.84832,
      "grad_norm": 0.08466795086860657,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 5302
    },
    {
      "epoch": 0.84848,
      "grad_norm": 0.09686113148927689,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5303
    },
    {
      "epoch": 0.84864,
      "grad_norm": 0.08427344262599945,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5304
    },
    {
      "epoch": 0.8488,
      "grad_norm": 0.12302862852811813,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5305
    },
    {
      "epoch": 0.84896,
      "grad_norm": 0.08524486422538757,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5306
    },
    {
      "epoch": 0.84912,
      "grad_norm": 0.08954738080501556,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5307
    },
    {
      "epoch": 0.84928,
      "grad_norm": 0.08204962313175201,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5308
    },
    {
      "epoch": 0.84944,
      "grad_norm": 0.07810484617948532,
      "learning_rate": 0.0001,
      "loss": 0.3056,
      "step": 5309
    },
    {
      "epoch": 0.8496,
      "grad_norm": 0.141062930226326,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5310
    },
    {
      "epoch": 0.84976,
      "grad_norm": 0.0791555717587471,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5311
    },
    {
      "epoch": 0.84992,
      "grad_norm": 0.08864470571279526,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5312
    },
    {
      "epoch": 0.85008,
      "grad_norm": 0.11212065815925598,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 5313
    },
    {
      "epoch": 0.85024,
      "grad_norm": 0.0999007299542427,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5314
    },
    {
      "epoch": 0.8504,
      "grad_norm": 0.09959642589092255,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 5315
    },
    {
      "epoch": 0.85056,
      "grad_norm": 0.09426034241914749,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 5316
    },
    {
      "epoch": 0.85072,
      "grad_norm": 0.08209392428398132,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5317
    },
    {
      "epoch": 0.85088,
      "grad_norm": 0.09918346256017685,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5318
    },
    {
      "epoch": 0.85104,
      "grad_norm": 0.11446348577737808,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5319
    },
    {
      "epoch": 0.8512,
      "grad_norm": 0.09537670016288757,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5320
    },
    {
      "epoch": 0.85136,
      "grad_norm": 0.08651477098464966,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5321
    },
    {
      "epoch": 0.85152,
      "grad_norm": 0.08321952819824219,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5322
    },
    {
      "epoch": 0.85168,
      "grad_norm": 0.08279109001159668,
      "learning_rate": 0.0001,
      "loss": 0.3009,
      "step": 5323
    },
    {
      "epoch": 0.85184,
      "grad_norm": 0.13304974138736725,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5324
    },
    {
      "epoch": 0.852,
      "grad_norm": 0.09401766955852509,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5325
    },
    {
      "epoch": 0.85216,
      "grad_norm": 0.09778303653001785,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 5326
    },
    {
      "epoch": 0.85232,
      "grad_norm": 0.08408898860216141,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5327
    },
    {
      "epoch": 0.85248,
      "grad_norm": 0.09113911539316177,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5328
    },
    {
      "epoch": 0.85264,
      "grad_norm": 0.07707909494638443,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 5329
    },
    {
      "epoch": 0.8528,
      "grad_norm": 0.09481526911258698,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5330
    },
    {
      "epoch": 0.85296,
      "grad_norm": 0.09652715921401978,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5331
    },
    {
      "epoch": 0.85312,
      "grad_norm": 0.10629844665527344,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 5332
    },
    {
      "epoch": 0.85328,
      "grad_norm": 0.08482354879379272,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5333
    },
    {
      "epoch": 0.85344,
      "grad_norm": 0.14116798341274261,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 5334
    },
    {
      "epoch": 0.8536,
      "grad_norm": 0.08713632822036743,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 5335
    },
    {
      "epoch": 0.85376,
      "grad_norm": 0.10105296969413757,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 5336
    },
    {
      "epoch": 0.85392,
      "grad_norm": 0.09592107683420181,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5337
    },
    {
      "epoch": 0.85408,
      "grad_norm": 0.08933404088020325,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5338
    },
    {
      "epoch": 0.85424,
      "grad_norm": 0.08343639224767685,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5339
    },
    {
      "epoch": 0.8544,
      "grad_norm": 0.08149667084217072,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5340
    },
    {
      "epoch": 0.85456,
      "grad_norm": 0.08779127895832062,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5341
    },
    {
      "epoch": 0.85472,
      "grad_norm": 0.09236486256122589,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5342
    },
    {
      "epoch": 0.85488,
      "grad_norm": 0.08264362066984177,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5343
    },
    {
      "epoch": 0.85504,
      "grad_norm": 0.09121723473072052,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 5344
    },
    {
      "epoch": 0.8552,
      "grad_norm": 0.09405183047056198,
      "learning_rate": 0.0001,
      "loss": 0.2983,
      "step": 5345
    },
    {
      "epoch": 0.85536,
      "grad_norm": 0.10753687471151352,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5346
    },
    {
      "epoch": 0.85552,
      "grad_norm": 0.08628629148006439,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 5347
    },
    {
      "epoch": 0.85568,
      "grad_norm": 0.08649484813213348,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 5348
    },
    {
      "epoch": 0.85584,
      "grad_norm": 0.11101022362709045,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5349
    },
    {
      "epoch": 0.856,
      "grad_norm": 0.09666956961154938,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5350
    },
    {
      "epoch": 0.85616,
      "grad_norm": 0.09708292037248611,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 5351
    },
    {
      "epoch": 0.85632,
      "grad_norm": 0.09507527202367783,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 5352
    },
    {
      "epoch": 0.85648,
      "grad_norm": 0.08629141002893448,
      "learning_rate": 0.0001,
      "loss": 0.2988,
      "step": 5353
    },
    {
      "epoch": 0.85664,
      "grad_norm": 0.09726215898990631,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5354
    },
    {
      "epoch": 0.8568,
      "grad_norm": 0.08187516778707504,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5355
    },
    {
      "epoch": 0.85696,
      "grad_norm": 0.086571604013443,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5356
    },
    {
      "epoch": 0.85712,
      "grad_norm": 0.11573248356580734,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5357
    },
    {
      "epoch": 0.85728,
      "grad_norm": 0.10488836467266083,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5358
    },
    {
      "epoch": 0.85744,
      "grad_norm": 0.11464790254831314,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 5359
    },
    {
      "epoch": 0.8576,
      "grad_norm": 0.08261249959468842,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 5360
    },
    {
      "epoch": 0.85776,
      "grad_norm": 0.09512591361999512,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5361
    },
    {
      "epoch": 0.85792,
      "grad_norm": 0.13387829065322876,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 5362
    },
    {
      "epoch": 0.85808,
      "grad_norm": 0.07838160544633865,
      "learning_rate": 0.0001,
      "loss": 0.2999,
      "step": 5363
    },
    {
      "epoch": 0.85824,
      "grad_norm": 0.11444973945617676,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 5364
    },
    {
      "epoch": 0.8584,
      "grad_norm": 0.10842520743608475,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5365
    },
    {
      "epoch": 0.85856,
      "grad_norm": 0.10652226209640503,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5366
    },
    {
      "epoch": 0.85872,
      "grad_norm": 0.08106549829244614,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5367
    },
    {
      "epoch": 0.85888,
      "grad_norm": 0.16356852650642395,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 5368
    },
    {
      "epoch": 0.85904,
      "grad_norm": 0.09236934036016464,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 5369
    },
    {
      "epoch": 0.8592,
      "grad_norm": 0.0959276631474495,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5370
    },
    {
      "epoch": 0.85936,
      "grad_norm": 0.08858204632997513,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5371
    },
    {
      "epoch": 0.85952,
      "grad_norm": 0.09285849332809448,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5372
    },
    {
      "epoch": 0.85968,
      "grad_norm": 0.08547348529100418,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5373
    },
    {
      "epoch": 0.85984,
      "grad_norm": 0.08841109275817871,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 5374
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.08176054060459137,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5375
    },
    {
      "epoch": 0.86016,
      "grad_norm": 0.08805148303508759,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5376
    },
    {
      "epoch": 0.86032,
      "grad_norm": 0.08016859740018845,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5377
    },
    {
      "epoch": 0.86048,
      "grad_norm": 0.1055602952837944,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5378
    },
    {
      "epoch": 0.86064,
      "grad_norm": 0.08619512617588043,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5379
    },
    {
      "epoch": 0.8608,
      "grad_norm": 0.08963567018508911,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 5380
    },
    {
      "epoch": 0.86096,
      "grad_norm": 0.08825001120567322,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5381
    },
    {
      "epoch": 0.86112,
      "grad_norm": 0.08151203393936157,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5382
    },
    {
      "epoch": 0.86128,
      "grad_norm": 0.08904440701007843,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5383
    },
    {
      "epoch": 0.86144,
      "grad_norm": 0.09561188519001007,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5384
    },
    {
      "epoch": 0.8616,
      "grad_norm": 0.08636950701475143,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 5385
    },
    {
      "epoch": 0.86176,
      "grad_norm": 0.0786575973033905,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5386
    },
    {
      "epoch": 0.86192,
      "grad_norm": 0.09084969013929367,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5387
    },
    {
      "epoch": 0.86208,
      "grad_norm": 0.08983341604471207,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5388
    },
    {
      "epoch": 0.86224,
      "grad_norm": 0.09778179228305817,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5389
    },
    {
      "epoch": 0.8624,
      "grad_norm": 0.07438591122627258,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5390
    },
    {
      "epoch": 0.86256,
      "grad_norm": 0.08594095706939697,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5391
    },
    {
      "epoch": 0.86272,
      "grad_norm": 0.08695933222770691,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 5392
    },
    {
      "epoch": 0.86288,
      "grad_norm": 0.08753395080566406,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5393
    },
    {
      "epoch": 0.86304,
      "grad_norm": 0.09704620391130447,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5394
    },
    {
      "epoch": 0.8632,
      "grad_norm": 0.14776714146137238,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5395
    },
    {
      "epoch": 0.86336,
      "grad_norm": 0.08265811204910278,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5396
    },
    {
      "epoch": 0.86352,
      "grad_norm": 0.08170726150274277,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5397
    },
    {
      "epoch": 0.86368,
      "grad_norm": 0.08763042092323303,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5398
    },
    {
      "epoch": 0.86384,
      "grad_norm": 0.14605194330215454,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 5399
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.09666063636541367,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5400
    },
    {
      "epoch": 0.864,
      "eval_train_accuracy": 0.9996,
      "eval_train_loss": 0.3177338242530823,
      "eval_train_runtime": 4.0981,
      "eval_train_samples_per_second": 1220.083,
      "eval_train_steps_per_second": 15.373,
      "step": 5400
    },
    {
      "epoch": 0.864,
      "eval_test_accuracy": 0.999,
      "eval_test_loss": 0.31656569242477417,
      "eval_test_runtime": 4.9762,
      "eval_test_samples_per_second": 1004.781,
      "eval_test_steps_per_second": 12.66,
      "step": 5400
    },
    {
      "epoch": 0.86416,
      "grad_norm": 0.10847808420658112,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 5401
    },
    {
      "epoch": 0.86432,
      "grad_norm": 0.07922032475471497,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5402
    },
    {
      "epoch": 0.86448,
      "grad_norm": 0.10175038129091263,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5403
    },
    {
      "epoch": 0.86464,
      "grad_norm": 0.09194397181272507,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 5404
    },
    {
      "epoch": 0.8648,
      "grad_norm": 0.10323946923017502,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 5405
    },
    {
      "epoch": 0.86496,
      "grad_norm": 0.0873616561293602,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5406
    },
    {
      "epoch": 0.86512,
      "grad_norm": 0.09081985801458359,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5407
    },
    {
      "epoch": 0.86528,
      "grad_norm": 0.12196256220340729,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5408
    },
    {
      "epoch": 0.86544,
      "grad_norm": 0.09973360598087311,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5409
    },
    {
      "epoch": 0.8656,
      "grad_norm": 0.08773458003997803,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 5410
    },
    {
      "epoch": 0.86576,
      "grad_norm": 0.09465950727462769,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5411
    },
    {
      "epoch": 0.86592,
      "grad_norm": 0.14515763521194458,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5412
    },
    {
      "epoch": 0.86608,
      "grad_norm": 0.09317148476839066,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 5413
    },
    {
      "epoch": 0.86624,
      "grad_norm": 0.1029491201043129,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5414
    },
    {
      "epoch": 0.8664,
      "grad_norm": 0.09639036655426025,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5415
    },
    {
      "epoch": 0.86656,
      "grad_norm": 0.10400059819221497,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5416
    },
    {
      "epoch": 0.86672,
      "grad_norm": 0.10100384056568146,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5417
    },
    {
      "epoch": 0.86688,
      "grad_norm": 0.10512933880090714,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 5418
    },
    {
      "epoch": 0.86704,
      "grad_norm": 0.11327888816595078,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5419
    },
    {
      "epoch": 0.8672,
      "grad_norm": 0.08555968105792999,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5420
    },
    {
      "epoch": 0.86736,
      "grad_norm": 0.09908642619848251,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5421
    },
    {
      "epoch": 0.86752,
      "grad_norm": 0.1363801211118698,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5422
    },
    {
      "epoch": 0.86768,
      "grad_norm": 0.10383063554763794,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5423
    },
    {
      "epoch": 0.86784,
      "grad_norm": 0.11801744997501373,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 5424
    },
    {
      "epoch": 0.868,
      "grad_norm": 0.09228405356407166,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5425
    },
    {
      "epoch": 0.86816,
      "grad_norm": 0.11570998281240463,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5426
    },
    {
      "epoch": 0.86832,
      "grad_norm": 0.11491327732801437,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5427
    },
    {
      "epoch": 0.86848,
      "grad_norm": 0.11267417669296265,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5428
    },
    {
      "epoch": 0.86864,
      "grad_norm": 0.08576233685016632,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5429
    },
    {
      "epoch": 0.8688,
      "grad_norm": 0.10249798744916916,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5430
    },
    {
      "epoch": 0.86896,
      "grad_norm": 0.08024081587791443,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5431
    },
    {
      "epoch": 0.86912,
      "grad_norm": 0.10936366766691208,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5432
    },
    {
      "epoch": 0.86928,
      "grad_norm": 0.1098913624882698,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5433
    },
    {
      "epoch": 0.86944,
      "grad_norm": 0.10863707959651947,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5434
    },
    {
      "epoch": 0.8696,
      "grad_norm": 0.11712721735239029,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5435
    },
    {
      "epoch": 0.86976,
      "grad_norm": 0.08686771988868713,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5436
    },
    {
      "epoch": 0.86992,
      "grad_norm": 0.12548430263996124,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 5437
    },
    {
      "epoch": 0.87008,
      "grad_norm": 0.08838821947574615,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 5438
    },
    {
      "epoch": 0.87024,
      "grad_norm": 0.09092921763658524,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5439
    },
    {
      "epoch": 0.8704,
      "grad_norm": 0.09147978574037552,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5440
    },
    {
      "epoch": 0.87056,
      "grad_norm": 0.13456587493419647,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 5441
    },
    {
      "epoch": 0.87072,
      "grad_norm": 0.0918162539601326,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 5442
    },
    {
      "epoch": 0.87088,
      "grad_norm": 0.09333208203315735,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5443
    },
    {
      "epoch": 0.87104,
      "grad_norm": 0.08547621965408325,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 5444
    },
    {
      "epoch": 0.8712,
      "grad_norm": 0.0976092591881752,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5445
    },
    {
      "epoch": 0.87136,
      "grad_norm": 0.08981939405202866,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5446
    },
    {
      "epoch": 0.87152,
      "grad_norm": 0.11572549492120743,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 5447
    },
    {
      "epoch": 0.87168,
      "grad_norm": 0.09267086535692215,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5448
    },
    {
      "epoch": 0.87184,
      "grad_norm": 0.10058706998825073,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 5449
    },
    {
      "epoch": 0.872,
      "grad_norm": 0.10419487208127975,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 5450
    },
    {
      "epoch": 0.87216,
      "grad_norm": 0.09496399760246277,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 5451
    },
    {
      "epoch": 0.87232,
      "grad_norm": 0.11927732080221176,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5452
    },
    {
      "epoch": 0.87248,
      "grad_norm": 0.07932137697935104,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5453
    },
    {
      "epoch": 0.87264,
      "grad_norm": 0.09509457647800446,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5454
    },
    {
      "epoch": 0.8728,
      "grad_norm": 0.10639644414186478,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5455
    },
    {
      "epoch": 0.87296,
      "grad_norm": 0.12054409086704254,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5456
    },
    {
      "epoch": 0.87312,
      "grad_norm": 0.09812632203102112,
      "learning_rate": 0.0001,
      "loss": 0.3019,
      "step": 5457
    },
    {
      "epoch": 0.87328,
      "grad_norm": 0.12249741703271866,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5458
    },
    {
      "epoch": 0.87344,
      "grad_norm": 0.08926620334386826,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5459
    },
    {
      "epoch": 0.8736,
      "grad_norm": 0.09722916036844254,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 5460
    },
    {
      "epoch": 0.87376,
      "grad_norm": 0.08701031655073166,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5461
    },
    {
      "epoch": 0.87392,
      "grad_norm": 0.09610049426555634,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5462
    },
    {
      "epoch": 0.87408,
      "grad_norm": 0.09588928520679474,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 5463
    },
    {
      "epoch": 0.87424,
      "grad_norm": 0.09143863618373871,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 5464
    },
    {
      "epoch": 0.8744,
      "grad_norm": 0.08191371709108353,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5465
    },
    {
      "epoch": 0.87456,
      "grad_norm": 0.09999964386224747,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5466
    },
    {
      "epoch": 0.87472,
      "grad_norm": 0.0891905352473259,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5467
    },
    {
      "epoch": 0.87488,
      "grad_norm": 0.0808398425579071,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5468
    },
    {
      "epoch": 0.87504,
      "grad_norm": 0.12166141718626022,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 5469
    },
    {
      "epoch": 0.8752,
      "grad_norm": 0.08376893401145935,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5470
    },
    {
      "epoch": 0.87536,
      "grad_norm": 0.09793296456336975,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5471
    },
    {
      "epoch": 0.87552,
      "grad_norm": 0.07719630002975464,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5472
    },
    {
      "epoch": 0.87568,
      "grad_norm": 0.0844469889998436,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5473
    },
    {
      "epoch": 0.87584,
      "grad_norm": 0.09222810715436935,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5474
    },
    {
      "epoch": 0.876,
      "grad_norm": 0.08540527522563934,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5475
    },
    {
      "epoch": 0.87616,
      "grad_norm": 0.0855371281504631,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 5476
    },
    {
      "epoch": 0.87632,
      "grad_norm": 0.07715268433094025,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5477
    },
    {
      "epoch": 0.87648,
      "grad_norm": 0.086629219353199,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5478
    },
    {
      "epoch": 0.87664,
      "grad_norm": 0.08959843963384628,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5479
    },
    {
      "epoch": 0.8768,
      "grad_norm": 0.10229996591806412,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5480
    },
    {
      "epoch": 0.87696,
      "grad_norm": 0.08258339762687683,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5481
    },
    {
      "epoch": 0.87712,
      "grad_norm": 0.0869075357913971,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5482
    },
    {
      "epoch": 0.87728,
      "grad_norm": 0.08220359683036804,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 5483
    },
    {
      "epoch": 0.87744,
      "grad_norm": 0.09054689854383469,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 5484
    },
    {
      "epoch": 0.8776,
      "grad_norm": 0.0917726680636406,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 5485
    },
    {
      "epoch": 0.87776,
      "grad_norm": 0.08152351528406143,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5486
    },
    {
      "epoch": 0.87792,
      "grad_norm": 0.08729390054941177,
      "learning_rate": 0.0001,
      "loss": 0.3068,
      "step": 5487
    },
    {
      "epoch": 0.87808,
      "grad_norm": 0.07910321652889252,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 5488
    },
    {
      "epoch": 0.87824,
      "grad_norm": 0.08605200797319412,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 5489
    },
    {
      "epoch": 0.8784,
      "grad_norm": 0.10173558443784714,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5490
    },
    {
      "epoch": 0.87856,
      "grad_norm": 0.08610538393259048,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 5491
    },
    {
      "epoch": 0.87872,
      "grad_norm": 0.09353560209274292,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5492
    },
    {
      "epoch": 0.87888,
      "grad_norm": 0.08077262341976166,
      "learning_rate": 0.0001,
      "loss": 0.3036,
      "step": 5493
    },
    {
      "epoch": 0.87904,
      "grad_norm": 0.08479871600866318,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 5494
    },
    {
      "epoch": 0.8792,
      "grad_norm": 0.09015140682458878,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 5495
    },
    {
      "epoch": 0.87936,
      "grad_norm": 0.13585181534290314,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5496
    },
    {
      "epoch": 0.87952,
      "grad_norm": 0.08856237679719925,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 5497
    },
    {
      "epoch": 0.87968,
      "grad_norm": 0.09846313297748566,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5498
    },
    {
      "epoch": 0.87984,
      "grad_norm": 0.07905055582523346,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5499
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.08263611048460007,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5500
    },
    {
      "epoch": 0.88,
      "eval_train_accuracy": 0.9994,
      "eval_train_loss": 0.3178356885910034,
      "eval_train_runtime": 4.0807,
      "eval_train_samples_per_second": 1225.283,
      "eval_train_steps_per_second": 15.439,
      "step": 5500
    },
    {
      "epoch": 0.88,
      "eval_test_accuracy": 0.9988,
      "eval_test_loss": 0.3165937066078186,
      "eval_test_runtime": 4.914,
      "eval_test_samples_per_second": 1017.497,
      "eval_test_steps_per_second": 12.82,
      "step": 5500
    },
    {
      "epoch": 0.88016,
      "grad_norm": 0.09122778475284576,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5501
    },
    {
      "epoch": 0.88032,
      "grad_norm": 0.0998411774635315,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 5502
    },
    {
      "epoch": 0.88048,
      "grad_norm": 0.09842787683010101,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5503
    },
    {
      "epoch": 0.88064,
      "grad_norm": 0.0970817431807518,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5504
    },
    {
      "epoch": 0.8808,
      "grad_norm": 0.09097132831811905,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 5505
    },
    {
      "epoch": 0.88096,
      "grad_norm": 0.08126076310873032,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5506
    },
    {
      "epoch": 0.88112,
      "grad_norm": 0.09374761581420898,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5507
    },
    {
      "epoch": 0.88128,
      "grad_norm": 0.09792662411928177,
      "learning_rate": 0.0001,
      "loss": 0.3068,
      "step": 5508
    },
    {
      "epoch": 0.88144,
      "grad_norm": 0.07987069338560104,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5509
    },
    {
      "epoch": 0.8816,
      "grad_norm": 0.0930027961730957,
      "learning_rate": 0.0001,
      "loss": 0.3014,
      "step": 5510
    },
    {
      "epoch": 0.88176,
      "grad_norm": 0.07597699016332626,
      "learning_rate": 0.0001,
      "loss": 0.2956,
      "step": 5511
    },
    {
      "epoch": 0.88192,
      "grad_norm": 0.08964931219816208,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5512
    },
    {
      "epoch": 0.88208,
      "grad_norm": 0.09478921443223953,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5513
    },
    {
      "epoch": 0.88224,
      "grad_norm": 0.08409681916236877,
      "learning_rate": 0.0001,
      "loss": 0.298,
      "step": 5514
    },
    {
      "epoch": 0.8824,
      "grad_norm": 0.0797552540898323,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 5515
    },
    {
      "epoch": 0.88256,
      "grad_norm": 0.08863338083028793,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5516
    },
    {
      "epoch": 0.88272,
      "grad_norm": 0.0774991363286972,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5517
    },
    {
      "epoch": 0.88288,
      "grad_norm": 0.08141718804836273,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5518
    },
    {
      "epoch": 0.88304,
      "grad_norm": 0.07778869569301605,
      "learning_rate": 0.0001,
      "loss": 0.3005,
      "step": 5519
    },
    {
      "epoch": 0.8832,
      "grad_norm": 0.09085496515035629,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5520
    },
    {
      "epoch": 0.88336,
      "grad_norm": 0.08861761540174484,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5521
    },
    {
      "epoch": 0.88352,
      "grad_norm": 0.07928785681724548,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 5522
    },
    {
      "epoch": 0.88368,
      "grad_norm": 0.08712685853242874,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5523
    },
    {
      "epoch": 0.88384,
      "grad_norm": 0.09771914035081863,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5524
    },
    {
      "epoch": 0.884,
      "grad_norm": 0.09297565370798111,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5525
    },
    {
      "epoch": 0.88416,
      "grad_norm": 0.08333183079957962,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5526
    },
    {
      "epoch": 0.88432,
      "grad_norm": 0.09233523905277252,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5527
    },
    {
      "epoch": 0.88448,
      "grad_norm": 0.08113993704319,
      "learning_rate": 0.0001,
      "loss": 0.2987,
      "step": 5528
    },
    {
      "epoch": 0.88464,
      "grad_norm": 0.08079405128955841,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5529
    },
    {
      "epoch": 0.8848,
      "grad_norm": 0.09780074656009674,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5530
    },
    {
      "epoch": 0.88496,
      "grad_norm": 0.08117731660604477,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5531
    },
    {
      "epoch": 0.88512,
      "grad_norm": 0.0867362916469574,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5532
    },
    {
      "epoch": 0.88528,
      "grad_norm": 0.10005506873130798,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5533
    },
    {
      "epoch": 0.88544,
      "grad_norm": 0.08969981223344803,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5534
    },
    {
      "epoch": 0.8856,
      "grad_norm": 0.08881130814552307,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5535
    },
    {
      "epoch": 0.88576,
      "grad_norm": 0.08096335083246231,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 5536
    },
    {
      "epoch": 0.88592,
      "grad_norm": 0.09429246932268143,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5537
    },
    {
      "epoch": 0.88608,
      "grad_norm": 0.10038460791110992,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5538
    },
    {
      "epoch": 0.88624,
      "grad_norm": 0.08347617834806442,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 5539
    },
    {
      "epoch": 0.8864,
      "grad_norm": 0.08713427186012268,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5540
    },
    {
      "epoch": 0.88656,
      "grad_norm": 0.08097092062234879,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5541
    },
    {
      "epoch": 0.88672,
      "grad_norm": 0.09239698946475983,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5542
    },
    {
      "epoch": 0.88688,
      "grad_norm": 0.08011557161808014,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5543
    },
    {
      "epoch": 0.88704,
      "grad_norm": 0.07753360271453857,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5544
    },
    {
      "epoch": 0.8872,
      "grad_norm": 0.08793426305055618,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5545
    },
    {
      "epoch": 0.88736,
      "grad_norm": 0.08114868402481079,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 5546
    },
    {
      "epoch": 0.88752,
      "grad_norm": 0.078170545399189,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 5547
    },
    {
      "epoch": 0.88768,
      "grad_norm": 0.07447439432144165,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5548
    },
    {
      "epoch": 0.88784,
      "grad_norm": 0.0818522572517395,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5549
    },
    {
      "epoch": 0.888,
      "grad_norm": 0.08631803840398788,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5550
    },
    {
      "epoch": 0.88816,
      "grad_norm": 0.0875052660703659,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5551
    },
    {
      "epoch": 0.88832,
      "grad_norm": 0.09305229783058167,
      "learning_rate": 0.0001,
      "loss": 0.2977,
      "step": 5552
    },
    {
      "epoch": 0.88848,
      "grad_norm": 0.09288562834262848,
      "learning_rate": 0.0001,
      "loss": 0.2988,
      "step": 5553
    },
    {
      "epoch": 0.88864,
      "grad_norm": 0.08904312551021576,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5554
    },
    {
      "epoch": 0.8888,
      "grad_norm": 0.08449738472700119,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5555
    },
    {
      "epoch": 0.88896,
      "grad_norm": 0.0805852860212326,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 5556
    },
    {
      "epoch": 0.88912,
      "grad_norm": 0.07465915381908417,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 5557
    },
    {
      "epoch": 0.88928,
      "grad_norm": 0.08041633665561676,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5558
    },
    {
      "epoch": 0.88944,
      "grad_norm": 0.09118810296058655,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5559
    },
    {
      "epoch": 0.8896,
      "grad_norm": 0.09683731198310852,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5560
    },
    {
      "epoch": 0.88976,
      "grad_norm": 0.09386790543794632,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 5561
    },
    {
      "epoch": 0.88992,
      "grad_norm": 0.08104360103607178,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5562
    },
    {
      "epoch": 0.89008,
      "grad_norm": 0.08031009137630463,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5563
    },
    {
      "epoch": 0.89024,
      "grad_norm": 0.09724611788988113,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5564
    },
    {
      "epoch": 0.8904,
      "grad_norm": 0.08579445630311966,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5565
    },
    {
      "epoch": 0.89056,
      "grad_norm": 0.0893411710858345,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5566
    },
    {
      "epoch": 0.89072,
      "grad_norm": 0.08800916373729706,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5567
    },
    {
      "epoch": 0.89088,
      "grad_norm": 0.07719672471284866,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5568
    },
    {
      "epoch": 0.89104,
      "grad_norm": 0.08338873088359833,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5569
    },
    {
      "epoch": 0.8912,
      "grad_norm": 0.08797293901443481,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5570
    },
    {
      "epoch": 0.89136,
      "grad_norm": 0.07533321529626846,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5571
    },
    {
      "epoch": 0.89152,
      "grad_norm": 0.10260361433029175,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5572
    },
    {
      "epoch": 0.89168,
      "grad_norm": 0.082119420170784,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5573
    },
    {
      "epoch": 0.89184,
      "grad_norm": 0.10778263211250305,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5574
    },
    {
      "epoch": 0.892,
      "grad_norm": 0.09188561141490936,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5575
    },
    {
      "epoch": 0.89216,
      "grad_norm": 0.08389946818351746,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5576
    },
    {
      "epoch": 0.89232,
      "grad_norm": 0.08334914594888687,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5577
    },
    {
      "epoch": 0.89248,
      "grad_norm": 0.10227081179618835,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5578
    },
    {
      "epoch": 0.89264,
      "grad_norm": 0.09787590801715851,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5579
    },
    {
      "epoch": 0.8928,
      "grad_norm": 0.08832091838121414,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5580
    },
    {
      "epoch": 0.89296,
      "grad_norm": 0.07846204191446304,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5581
    },
    {
      "epoch": 0.89312,
      "grad_norm": 0.073001928627491,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5582
    },
    {
      "epoch": 0.89328,
      "grad_norm": 0.08475954085588455,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5583
    },
    {
      "epoch": 0.89344,
      "grad_norm": 0.08594777435064316,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5584
    },
    {
      "epoch": 0.8936,
      "grad_norm": 0.102154441177845,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5585
    },
    {
      "epoch": 0.89376,
      "grad_norm": 0.07926834374666214,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 5586
    },
    {
      "epoch": 0.89392,
      "grad_norm": 0.0820750817656517,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5587
    },
    {
      "epoch": 0.89408,
      "grad_norm": 0.07724369317293167,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5588
    },
    {
      "epoch": 0.89424,
      "grad_norm": 0.08309347927570343,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5589
    },
    {
      "epoch": 0.8944,
      "grad_norm": 0.08686414361000061,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5590
    },
    {
      "epoch": 0.89456,
      "grad_norm": 0.09254230558872223,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5591
    },
    {
      "epoch": 0.89472,
      "grad_norm": 0.07736311107873917,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5592
    },
    {
      "epoch": 0.89488,
      "grad_norm": 0.0905795693397522,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5593
    },
    {
      "epoch": 0.89504,
      "grad_norm": 0.08218428492546082,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5594
    },
    {
      "epoch": 0.8952,
      "grad_norm": 0.08345019817352295,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5595
    },
    {
      "epoch": 0.89536,
      "grad_norm": 0.08648208528757095,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5596
    },
    {
      "epoch": 0.89552,
      "grad_norm": 0.09030075371265411,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5597
    },
    {
      "epoch": 0.89568,
      "grad_norm": 0.0821601077914238,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 5598
    },
    {
      "epoch": 0.89584,
      "grad_norm": 0.09052774310112,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 5599
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.0778145045042038,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5600
    },
    {
      "epoch": 0.896,
      "eval_train_accuracy": 0.9996,
      "eval_train_loss": 0.3172656297683716,
      "eval_train_runtime": 4.073,
      "eval_train_samples_per_second": 1227.594,
      "eval_train_steps_per_second": 15.468,
      "step": 5600
    },
    {
      "epoch": 0.896,
      "eval_test_accuracy": 0.9998,
      "eval_test_loss": 0.31628894805908203,
      "eval_test_runtime": 4.7152,
      "eval_test_samples_per_second": 1060.397,
      "eval_test_steps_per_second": 13.361,
      "step": 5600
    },
    {
      "epoch": 0.89616,
      "grad_norm": 0.07926998287439346,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5601
    },
    {
      "epoch": 0.89632,
      "grad_norm": 0.0902242437005043,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5602
    },
    {
      "epoch": 0.89648,
      "grad_norm": 0.07803912460803986,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 5603
    },
    {
      "epoch": 0.89664,
      "grad_norm": 0.08859147876501083,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5604
    },
    {
      "epoch": 0.8968,
      "grad_norm": 0.08249065279960632,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5605
    },
    {
      "epoch": 0.89696,
      "grad_norm": 0.0836406797170639,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 5606
    },
    {
      "epoch": 0.89712,
      "grad_norm": 0.08748682588338852,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5607
    },
    {
      "epoch": 0.89728,
      "grad_norm": 0.08924095332622528,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5608
    },
    {
      "epoch": 0.89744,
      "grad_norm": 0.0881505236029625,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5609
    },
    {
      "epoch": 0.8976,
      "grad_norm": 0.08874094486236572,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5610
    },
    {
      "epoch": 0.89776,
      "grad_norm": 0.08218029886484146,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5611
    },
    {
      "epoch": 0.89792,
      "grad_norm": 0.08605461567640305,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5612
    },
    {
      "epoch": 0.89808,
      "grad_norm": 0.08269466459751129,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 5613
    },
    {
      "epoch": 0.89824,
      "grad_norm": 0.086916483938694,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 5614
    },
    {
      "epoch": 0.8984,
      "grad_norm": 0.08595586568117142,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5615
    },
    {
      "epoch": 0.89856,
      "grad_norm": 0.08521352708339691,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5616
    },
    {
      "epoch": 0.89872,
      "grad_norm": 0.08658372610807419,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5617
    },
    {
      "epoch": 0.89888,
      "grad_norm": 0.09382570534944534,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5618
    },
    {
      "epoch": 0.89904,
      "grad_norm": 0.07507704943418503,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 5619
    },
    {
      "epoch": 0.8992,
      "grad_norm": 0.08460889011621475,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5620
    },
    {
      "epoch": 0.89936,
      "grad_norm": 0.08990679681301117,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5621
    },
    {
      "epoch": 0.89952,
      "grad_norm": 0.08632087707519531,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5622
    },
    {
      "epoch": 0.89968,
      "grad_norm": 0.08656429499387741,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5623
    },
    {
      "epoch": 0.89984,
      "grad_norm": 0.07762954384088516,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5624
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.08717072755098343,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5625
    },
    {
      "epoch": 0.90016,
      "grad_norm": 0.0815422385931015,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5626
    },
    {
      "epoch": 0.90032,
      "grad_norm": 0.09145725518465042,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5627
    },
    {
      "epoch": 0.90048,
      "grad_norm": 0.08553371578454971,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5628
    },
    {
      "epoch": 0.90064,
      "grad_norm": 0.09435241669416428,
      "learning_rate": 0.0001,
      "loss": 0.3009,
      "step": 5629
    },
    {
      "epoch": 0.9008,
      "grad_norm": 0.10995364189147949,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5630
    },
    {
      "epoch": 0.90096,
      "grad_norm": 0.07935933768749237,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5631
    },
    {
      "epoch": 0.90112,
      "grad_norm": 0.08419518917798996,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5632
    },
    {
      "epoch": 0.90128,
      "grad_norm": 0.08248413354158401,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5633
    },
    {
      "epoch": 0.90144,
      "grad_norm": 0.08376669138669968,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5634
    },
    {
      "epoch": 0.9016,
      "grad_norm": 0.08044375479221344,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5635
    },
    {
      "epoch": 0.90176,
      "grad_norm": 0.07689923048019409,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 5636
    },
    {
      "epoch": 0.90192,
      "grad_norm": 0.07562924921512604,
      "learning_rate": 0.0001,
      "loss": 0.2955,
      "step": 5637
    },
    {
      "epoch": 0.90208,
      "grad_norm": 0.08762169629335403,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5638
    },
    {
      "epoch": 0.90224,
      "grad_norm": 0.08701664209365845,
      "learning_rate": 0.0001,
      "loss": 0.2943,
      "step": 5639
    },
    {
      "epoch": 0.9024,
      "grad_norm": 0.09557587653398514,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5640
    },
    {
      "epoch": 0.90256,
      "grad_norm": 0.07628382742404938,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 5641
    },
    {
      "epoch": 0.90272,
      "grad_norm": 0.07700826972723007,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5642
    },
    {
      "epoch": 0.90288,
      "grad_norm": 0.07897850126028061,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5643
    },
    {
      "epoch": 0.90304,
      "grad_norm": 0.08477407693862915,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5644
    },
    {
      "epoch": 0.9032,
      "grad_norm": 0.07573284953832626,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5645
    },
    {
      "epoch": 0.90336,
      "grad_norm": 0.12070158123970032,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5646
    },
    {
      "epoch": 0.90352,
      "grad_norm": 0.08976438641548157,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5647
    },
    {
      "epoch": 0.90368,
      "grad_norm": 0.08796411752700806,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 5648
    },
    {
      "epoch": 0.90384,
      "grad_norm": 0.10442245006561279,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5649
    },
    {
      "epoch": 0.904,
      "grad_norm": 0.09291964769363403,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5650
    },
    {
      "epoch": 0.90416,
      "grad_norm": 0.08148638904094696,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5651
    },
    {
      "epoch": 0.90432,
      "grad_norm": 0.07758302241563797,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 5652
    },
    {
      "epoch": 0.90448,
      "grad_norm": 0.07921657711267471,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5653
    },
    {
      "epoch": 0.90464,
      "grad_norm": 0.10705193877220154,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5654
    },
    {
      "epoch": 0.9048,
      "grad_norm": 0.08335698395967484,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 5655
    },
    {
      "epoch": 0.90496,
      "grad_norm": 0.10527703166007996,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5656
    },
    {
      "epoch": 0.90512,
      "grad_norm": 0.08321749418973923,
      "learning_rate": 0.0001,
      "loss": 0.3044,
      "step": 5657
    },
    {
      "epoch": 0.90528,
      "grad_norm": 0.0744621679186821,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5658
    },
    {
      "epoch": 0.90544,
      "grad_norm": 0.10032813996076584,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5659
    },
    {
      "epoch": 0.9056,
      "grad_norm": 0.0909881591796875,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5660
    },
    {
      "epoch": 0.90576,
      "grad_norm": 0.09144013375043869,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5661
    },
    {
      "epoch": 0.90592,
      "grad_norm": 0.11323874443769455,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 5662
    },
    {
      "epoch": 0.90608,
      "grad_norm": 0.09189458191394806,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5663
    },
    {
      "epoch": 0.90624,
      "grad_norm": 0.08710989356040955,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5664
    },
    {
      "epoch": 0.9064,
      "grad_norm": 0.08272503316402435,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5665
    },
    {
      "epoch": 0.90656,
      "grad_norm": 0.09157605469226837,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5666
    },
    {
      "epoch": 0.90672,
      "grad_norm": 0.08306246995925903,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 5667
    },
    {
      "epoch": 0.90688,
      "grad_norm": 0.08891905099153519,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5668
    },
    {
      "epoch": 0.90704,
      "grad_norm": 0.0937180295586586,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5669
    },
    {
      "epoch": 0.9072,
      "grad_norm": 0.08823659271001816,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5670
    },
    {
      "epoch": 0.90736,
      "grad_norm": 0.1097971722483635,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5671
    },
    {
      "epoch": 0.90752,
      "grad_norm": 0.09677255898714066,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5672
    },
    {
      "epoch": 0.90768,
      "grad_norm": 0.08125439286231995,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5673
    },
    {
      "epoch": 0.90784,
      "grad_norm": 0.0858587771654129,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5674
    },
    {
      "epoch": 0.908,
      "grad_norm": 0.10851854830980301,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 5675
    },
    {
      "epoch": 0.90816,
      "grad_norm": 0.07726911455392838,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 5676
    },
    {
      "epoch": 0.90832,
      "grad_norm": 0.09004765003919601,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 5677
    },
    {
      "epoch": 0.90848,
      "grad_norm": 0.07612684369087219,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5678
    },
    {
      "epoch": 0.90864,
      "grad_norm": 0.10033383965492249,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5679
    },
    {
      "epoch": 0.9088,
      "grad_norm": 0.08284841477870941,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5680
    },
    {
      "epoch": 0.90896,
      "grad_norm": 0.0862927958369255,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 5681
    },
    {
      "epoch": 0.90912,
      "grad_norm": 0.07971875369548798,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5682
    },
    {
      "epoch": 0.90928,
      "grad_norm": 0.09985404461622238,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 5683
    },
    {
      "epoch": 0.90944,
      "grad_norm": 0.08366940170526505,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5684
    },
    {
      "epoch": 0.9096,
      "grad_norm": 0.08772940188646317,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 5685
    },
    {
      "epoch": 0.90976,
      "grad_norm": 0.0744204968214035,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5686
    },
    {
      "epoch": 0.90992,
      "grad_norm": 0.07805055379867554,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5687
    },
    {
      "epoch": 0.91008,
      "grad_norm": 0.0855938196182251,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 5688
    },
    {
      "epoch": 0.91024,
      "grad_norm": 0.13514290750026703,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 5689
    },
    {
      "epoch": 0.9104,
      "grad_norm": 0.09211628139019012,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5690
    },
    {
      "epoch": 0.91056,
      "grad_norm": 0.08691887557506561,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5691
    },
    {
      "epoch": 0.91072,
      "grad_norm": 0.08848975598812103,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5692
    },
    {
      "epoch": 0.91088,
      "grad_norm": 0.10132627934217453,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5693
    },
    {
      "epoch": 0.91104,
      "grad_norm": 0.08056633919477463,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5694
    },
    {
      "epoch": 0.9112,
      "grad_norm": 0.08286849409341812,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5695
    },
    {
      "epoch": 0.91136,
      "grad_norm": 0.0787317082285881,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 5696
    },
    {
      "epoch": 0.91152,
      "grad_norm": 0.09311271458864212,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5697
    },
    {
      "epoch": 0.91168,
      "grad_norm": 0.0975593626499176,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5698
    },
    {
      "epoch": 0.91184,
      "grad_norm": 0.08649515360593796,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5699
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.09647396951913834,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5700
    },
    {
      "epoch": 0.912,
      "eval_train_accuracy": 0.9998,
      "eval_train_loss": 0.31781837344169617,
      "eval_train_runtime": 4.3529,
      "eval_train_samples_per_second": 1148.649,
      "eval_train_steps_per_second": 14.473,
      "step": 5700
    },
    {
      "epoch": 0.912,
      "eval_test_accuracy": 0.9998,
      "eval_test_loss": 0.31655454635620117,
      "eval_test_runtime": 4.8157,
      "eval_test_samples_per_second": 1038.28,
      "eval_test_steps_per_second": 13.082,
      "step": 5700
    },
    {
      "epoch": 0.91216,
      "grad_norm": 0.09540203213691711,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5701
    },
    {
      "epoch": 0.91232,
      "grad_norm": 0.1073199063539505,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5702
    },
    {
      "epoch": 0.91248,
      "grad_norm": 0.08361383527517319,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5703
    },
    {
      "epoch": 0.91264,
      "grad_norm": 0.08289548754692078,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 5704
    },
    {
      "epoch": 0.9128,
      "grad_norm": 0.09718921780586243,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5705
    },
    {
      "epoch": 0.91296,
      "grad_norm": 0.07943406701087952,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 5706
    },
    {
      "epoch": 0.91312,
      "grad_norm": 0.09692005068063736,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 5707
    },
    {
      "epoch": 0.91328,
      "grad_norm": 0.08367767184972763,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5708
    },
    {
      "epoch": 0.91344,
      "grad_norm": 0.08946055918931961,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5709
    },
    {
      "epoch": 0.9136,
      "grad_norm": 0.08579400181770325,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5710
    },
    {
      "epoch": 0.91376,
      "grad_norm": 0.08885420113801956,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5711
    },
    {
      "epoch": 0.91392,
      "grad_norm": 0.10069026798009872,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5712
    },
    {
      "epoch": 0.91408,
      "grad_norm": 0.08353006094694138,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5713
    },
    {
      "epoch": 0.91424,
      "grad_norm": 0.07154947519302368,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5714
    },
    {
      "epoch": 0.9144,
      "grad_norm": 0.08268041163682938,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5715
    },
    {
      "epoch": 0.91456,
      "grad_norm": 0.08816834539175034,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 5716
    },
    {
      "epoch": 0.91472,
      "grad_norm": 0.08958322554826736,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5717
    },
    {
      "epoch": 0.91488,
      "grad_norm": 0.08313091844320297,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 5718
    },
    {
      "epoch": 0.91504,
      "grad_norm": 0.08520004153251648,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 5719
    },
    {
      "epoch": 0.9152,
      "grad_norm": 0.08771207928657532,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 5720
    },
    {
      "epoch": 0.91536,
      "grad_norm": 0.09060349315404892,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5721
    },
    {
      "epoch": 0.91552,
      "grad_norm": 0.1088571771979332,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 5722
    },
    {
      "epoch": 0.91568,
      "grad_norm": 0.09805075824260712,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5723
    },
    {
      "epoch": 0.91584,
      "grad_norm": 0.07425659149885178,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5724
    },
    {
      "epoch": 0.916,
      "grad_norm": 0.0752059742808342,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 5725
    },
    {
      "epoch": 0.91616,
      "grad_norm": 0.1764662116765976,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5726
    },
    {
      "epoch": 0.91632,
      "grad_norm": 0.07943377643823624,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5727
    },
    {
      "epoch": 0.91648,
      "grad_norm": 0.08915971964597702,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5728
    },
    {
      "epoch": 0.91664,
      "grad_norm": 0.14910806715488434,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5729
    },
    {
      "epoch": 0.9168,
      "grad_norm": 0.10376227647066116,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5730
    },
    {
      "epoch": 0.91696,
      "grad_norm": 0.09536313265562057,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5731
    },
    {
      "epoch": 0.91712,
      "grad_norm": 0.07571183890104294,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5732
    },
    {
      "epoch": 0.91728,
      "grad_norm": 0.08218225091695786,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5733
    },
    {
      "epoch": 0.91744,
      "grad_norm": 0.11298920214176178,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 5734
    },
    {
      "epoch": 0.9176,
      "grad_norm": 0.09609483927488327,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5735
    },
    {
      "epoch": 0.91776,
      "grad_norm": 0.08901642262935638,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5736
    },
    {
      "epoch": 0.91792,
      "grad_norm": 0.08399740606546402,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5737
    },
    {
      "epoch": 0.91808,
      "grad_norm": 0.08398018032312393,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5738
    },
    {
      "epoch": 0.91824,
      "grad_norm": 0.0932341143488884,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5739
    },
    {
      "epoch": 0.9184,
      "grad_norm": 0.08148226141929626,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5740
    },
    {
      "epoch": 0.91856,
      "grad_norm": 0.0864739790558815,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5741
    },
    {
      "epoch": 0.91872,
      "grad_norm": 0.07397229969501495,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5742
    },
    {
      "epoch": 0.91888,
      "grad_norm": 0.07960928976535797,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5743
    },
    {
      "epoch": 0.91904,
      "grad_norm": 0.07816685736179352,
      "learning_rate": 0.0001,
      "loss": 0.3024,
      "step": 5744
    },
    {
      "epoch": 0.9192,
      "grad_norm": 0.09183549880981445,
      "learning_rate": 0.0001,
      "loss": 0.3031,
      "step": 5745
    },
    {
      "epoch": 0.91936,
      "grad_norm": 0.0956883430480957,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 5746
    },
    {
      "epoch": 0.91952,
      "grad_norm": 0.07660546898841858,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5747
    },
    {
      "epoch": 0.91968,
      "grad_norm": 0.09561415016651154,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5748
    },
    {
      "epoch": 0.91984,
      "grad_norm": 0.08597252517938614,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 5749
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.07810483127832413,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5750
    },
    {
      "epoch": 0.92016,
      "grad_norm": 0.0807894915342331,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5751
    },
    {
      "epoch": 0.92032,
      "grad_norm": 0.07864438742399216,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 5752
    },
    {
      "epoch": 0.92048,
      "grad_norm": 0.08418060094118118,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5753
    },
    {
      "epoch": 0.92064,
      "grad_norm": 0.0829831212759018,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5754
    },
    {
      "epoch": 0.9208,
      "grad_norm": 0.09323637187480927,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5755
    },
    {
      "epoch": 0.92096,
      "grad_norm": 0.08686849474906921,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5756
    },
    {
      "epoch": 0.92112,
      "grad_norm": 0.08099962025880814,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5757
    },
    {
      "epoch": 0.92128,
      "grad_norm": 0.08102209120988846,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5758
    },
    {
      "epoch": 0.92144,
      "grad_norm": 0.08882096409797668,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5759
    },
    {
      "epoch": 0.9216,
      "grad_norm": 0.09315293282270432,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5760
    },
    {
      "epoch": 0.92176,
      "grad_norm": 0.11974351108074188,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5761
    },
    {
      "epoch": 0.92192,
      "grad_norm": 0.09999903291463852,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5762
    },
    {
      "epoch": 0.92208,
      "grad_norm": 0.082865871489048,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5763
    },
    {
      "epoch": 0.92224,
      "grad_norm": 0.08790658414363861,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 5764
    },
    {
      "epoch": 0.9224,
      "grad_norm": 0.08828337490558624,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5765
    },
    {
      "epoch": 0.92256,
      "grad_norm": 0.08252312988042831,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5766
    },
    {
      "epoch": 0.92272,
      "grad_norm": 0.0720483735203743,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5767
    },
    {
      "epoch": 0.92288,
      "grad_norm": 0.07413278520107269,
      "learning_rate": 0.0001,
      "loss": 0.2959,
      "step": 5768
    },
    {
      "epoch": 0.92304,
      "grad_norm": 0.08188674598932266,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 5769
    },
    {
      "epoch": 0.9232,
      "grad_norm": 0.07957219332456589,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5770
    },
    {
      "epoch": 0.92336,
      "grad_norm": 0.07199864089488983,
      "learning_rate": 0.0001,
      "loss": 0.3018,
      "step": 5771
    },
    {
      "epoch": 0.92352,
      "grad_norm": 0.085330531001091,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5772
    },
    {
      "epoch": 0.92368,
      "grad_norm": 0.08262919634580612,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 5773
    },
    {
      "epoch": 0.92384,
      "grad_norm": 0.07746957987546921,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5774
    },
    {
      "epoch": 0.924,
      "grad_norm": 0.08016084879636765,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5775
    },
    {
      "epoch": 0.92416,
      "grad_norm": 0.08494111895561218,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5776
    },
    {
      "epoch": 0.92432,
      "grad_norm": 0.09443158656358719,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5777
    },
    {
      "epoch": 0.92448,
      "grad_norm": 0.08836403489112854,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5778
    },
    {
      "epoch": 0.92464,
      "grad_norm": 0.09136256575584412,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 5779
    },
    {
      "epoch": 0.9248,
      "grad_norm": 0.08885511010885239,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5780
    },
    {
      "epoch": 0.92496,
      "grad_norm": 0.10453619062900543,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5781
    },
    {
      "epoch": 0.92512,
      "grad_norm": 0.07597178965806961,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 5782
    },
    {
      "epoch": 0.92528,
      "grad_norm": 0.09506749361753464,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5783
    },
    {
      "epoch": 0.92544,
      "grad_norm": 0.08943647891283035,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5784
    },
    {
      "epoch": 0.9256,
      "grad_norm": 0.08815798908472061,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5785
    },
    {
      "epoch": 0.92576,
      "grad_norm": 0.1000242754817009,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5786
    },
    {
      "epoch": 0.92592,
      "grad_norm": 0.08755402266979218,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 5787
    },
    {
      "epoch": 0.92608,
      "grad_norm": 0.08265919983386993,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 5788
    },
    {
      "epoch": 0.92624,
      "grad_norm": 0.09563323110342026,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5789
    },
    {
      "epoch": 0.9264,
      "grad_norm": 0.08253979682922363,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5790
    },
    {
      "epoch": 0.92656,
      "grad_norm": 0.09294717758893967,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5791
    },
    {
      "epoch": 0.92672,
      "grad_norm": 0.09865480661392212,
      "learning_rate": 0.0001,
      "loss": 0.2973,
      "step": 5792
    },
    {
      "epoch": 0.92688,
      "grad_norm": 0.09840010851621628,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5793
    },
    {
      "epoch": 0.92704,
      "grad_norm": 0.09352637827396393,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5794
    },
    {
      "epoch": 0.9272,
      "grad_norm": 0.10064635425806046,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 5795
    },
    {
      "epoch": 0.92736,
      "grad_norm": 0.11126632988452911,
      "learning_rate": 0.0001,
      "loss": 0.3029,
      "step": 5796
    },
    {
      "epoch": 0.92752,
      "grad_norm": 0.07585274428129196,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5797
    },
    {
      "epoch": 0.92768,
      "grad_norm": 0.082854263484478,
      "learning_rate": 0.0001,
      "loss": 0.3061,
      "step": 5798
    },
    {
      "epoch": 0.92784,
      "grad_norm": 0.12923409044742584,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5799
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.08615247905254364,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5800
    },
    {
      "epoch": 0.928,
      "eval_train_accuracy": 0.9996,
      "eval_train_loss": 0.31777504086494446,
      "eval_train_runtime": 4.2361,
      "eval_train_samples_per_second": 1180.332,
      "eval_train_steps_per_second": 14.872,
      "step": 5800
    },
    {
      "epoch": 0.928,
      "eval_test_accuracy": 0.9998,
      "eval_test_loss": 0.31638288497924805,
      "eval_test_runtime": 4.7926,
      "eval_test_samples_per_second": 1043.272,
      "eval_test_steps_per_second": 13.145,
      "step": 5800
    },
    {
      "epoch": 0.92816,
      "grad_norm": 0.10956552624702454,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 5801
    },
    {
      "epoch": 0.92832,
      "grad_norm": 0.09485840797424316,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 5802
    },
    {
      "epoch": 0.92848,
      "grad_norm": 0.09469304978847504,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5803
    },
    {
      "epoch": 0.92864,
      "grad_norm": 0.09343655407428741,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5804
    },
    {
      "epoch": 0.9288,
      "grad_norm": 0.08636145293712616,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5805
    },
    {
      "epoch": 0.92896,
      "grad_norm": 0.10345087945461273,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5806
    },
    {
      "epoch": 0.92912,
      "grad_norm": 0.16845564544200897,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5807
    },
    {
      "epoch": 0.92928,
      "grad_norm": 0.07890549302101135,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 5808
    },
    {
      "epoch": 0.92944,
      "grad_norm": 0.10762753337621689,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5809
    },
    {
      "epoch": 0.9296,
      "grad_norm": 0.09087121486663818,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5810
    },
    {
      "epoch": 0.92976,
      "grad_norm": 0.08245501667261124,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5811
    },
    {
      "epoch": 0.92992,
      "grad_norm": 0.09070659428834915,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5812
    },
    {
      "epoch": 0.93008,
      "grad_norm": 0.10641629248857498,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 5813
    },
    {
      "epoch": 0.93024,
      "grad_norm": 0.08278442174196243,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5814
    },
    {
      "epoch": 0.9304,
      "grad_norm": 0.09144127368927002,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 5815
    },
    {
      "epoch": 0.93056,
      "grad_norm": 0.154400035738945,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 5816
    },
    {
      "epoch": 0.93072,
      "grad_norm": 0.09859345108270645,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 5817
    },
    {
      "epoch": 0.93088,
      "grad_norm": 0.23257456719875336,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5818
    },
    {
      "epoch": 0.93104,
      "grad_norm": 0.16514839231967926,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5819
    },
    {
      "epoch": 0.9312,
      "grad_norm": 0.0971473976969719,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5820
    },
    {
      "epoch": 0.93136,
      "grad_norm": 0.08457682281732559,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 5821
    },
    {
      "epoch": 0.93152,
      "grad_norm": 0.08442966639995575,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5822
    },
    {
      "epoch": 0.93168,
      "grad_norm": 0.16954493522644043,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5823
    },
    {
      "epoch": 0.93184,
      "grad_norm": 0.3121563792228699,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5824
    },
    {
      "epoch": 0.932,
      "grad_norm": 0.23334161937236786,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5825
    },
    {
      "epoch": 0.93216,
      "grad_norm": 0.10080759972333908,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5826
    },
    {
      "epoch": 0.93232,
      "grad_norm": 0.08738095313310623,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5827
    },
    {
      "epoch": 0.93248,
      "grad_norm": 0.09371640533208847,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 5828
    },
    {
      "epoch": 0.93264,
      "grad_norm": 0.2654880881309509,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5829
    },
    {
      "epoch": 0.9328,
      "grad_norm": 0.20289301872253418,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5830
    },
    {
      "epoch": 0.93296,
      "grad_norm": 0.11006610095500946,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5831
    },
    {
      "epoch": 0.93312,
      "grad_norm": 0.15686063468456268,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 5832
    },
    {
      "epoch": 0.93328,
      "grad_norm": 0.32174158096313477,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 5833
    },
    {
      "epoch": 0.93344,
      "grad_norm": 0.09444864839315414,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5834
    },
    {
      "epoch": 0.9336,
      "grad_norm": 0.3195042312145233,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5835
    },
    {
      "epoch": 0.93376,
      "grad_norm": 0.11646377295255661,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5836
    },
    {
      "epoch": 0.93392,
      "grad_norm": 0.6653323173522949,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 5837
    },
    {
      "epoch": 0.93408,
      "grad_norm": 0.09046539664268494,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 5838
    },
    {
      "epoch": 0.93424,
      "grad_norm": 0.23377464711666107,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 5839
    },
    {
      "epoch": 0.9344,
      "grad_norm": 0.2145880162715912,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5840
    },
    {
      "epoch": 0.93456,
      "grad_norm": 0.19977548718452454,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5841
    },
    {
      "epoch": 0.93472,
      "grad_norm": 0.33272528648376465,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 5842
    },
    {
      "epoch": 0.93488,
      "grad_norm": 0.1978895366191864,
      "learning_rate": 0.0001,
      "loss": 0.2996,
      "step": 5843
    },
    {
      "epoch": 0.93504,
      "grad_norm": 0.1538737267255783,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 5844
    },
    {
      "epoch": 0.9352,
      "grad_norm": 0.11037563532590866,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5845
    },
    {
      "epoch": 0.93536,
      "grad_norm": 0.14419853687286377,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5846
    },
    {
      "epoch": 0.93552,
      "grad_norm": 0.1225387305021286,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 5847
    },
    {
      "epoch": 0.93568,
      "grad_norm": 0.12132585048675537,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 5848
    },
    {
      "epoch": 0.93584,
      "grad_norm": 0.10380306094884872,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 5849
    },
    {
      "epoch": 0.936,
      "grad_norm": 0.16554780304431915,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5850
    },
    {
      "epoch": 0.93616,
      "grad_norm": 0.12953875958919525,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5851
    },
    {
      "epoch": 0.93632,
      "grad_norm": 0.3942616283893585,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 5852
    },
    {
      "epoch": 0.93648,
      "grad_norm": 0.20272982120513916,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5853
    },
    {
      "epoch": 0.93664,
      "grad_norm": 0.09512468427419662,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 5854
    },
    {
      "epoch": 0.9368,
      "grad_norm": 0.18875491619110107,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 5855
    },
    {
      "epoch": 0.93696,
      "grad_norm": 0.23326249420642853,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5856
    },
    {
      "epoch": 0.93712,
      "grad_norm": 0.1319834589958191,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 5857
    },
    {
      "epoch": 0.93728,
      "grad_norm": 0.132438525557518,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5858
    },
    {
      "epoch": 0.93744,
      "grad_norm": 0.10572052747011185,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5859
    },
    {
      "epoch": 0.9376,
      "grad_norm": 0.1393509954214096,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5860
    },
    {
      "epoch": 0.93776,
      "grad_norm": 0.11207720637321472,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5861
    },
    {
      "epoch": 0.93792,
      "grad_norm": 0.15865272283554077,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 5862
    },
    {
      "epoch": 0.93808,
      "grad_norm": 0.0934220403432846,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5863
    },
    {
      "epoch": 0.93824,
      "grad_norm": 0.20997007191181183,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5864
    },
    {
      "epoch": 0.9384,
      "grad_norm": 0.11197765916585922,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 5865
    },
    {
      "epoch": 0.93856,
      "grad_norm": 0.18461626768112183,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 5866
    },
    {
      "epoch": 0.93872,
      "grad_norm": 0.1804390698671341,
      "learning_rate": 0.0001,
      "loss": 0.3056,
      "step": 5867
    },
    {
      "epoch": 0.93888,
      "grad_norm": 0.15733477473258972,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5868
    },
    {
      "epoch": 0.93904,
      "grad_norm": 0.09995907545089722,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 5869
    },
    {
      "epoch": 0.9392,
      "grad_norm": 0.09866936504840851,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5870
    },
    {
      "epoch": 0.93936,
      "grad_norm": 0.09358377009630203,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 5871
    },
    {
      "epoch": 0.93952,
      "grad_norm": 0.16813401877880096,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5872
    },
    {
      "epoch": 0.93968,
      "grad_norm": 0.17051953077316284,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5873
    },
    {
      "epoch": 0.93984,
      "grad_norm": 0.12345436960458755,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5874
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.09357291460037231,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5875
    },
    {
      "epoch": 0.94016,
      "grad_norm": 0.09907826036214828,
      "learning_rate": 0.0001,
      "loss": 0.3044,
      "step": 5876
    },
    {
      "epoch": 0.94032,
      "grad_norm": 0.11666824668645859,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 5877
    },
    {
      "epoch": 0.94048,
      "grad_norm": 0.09627460688352585,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5878
    },
    {
      "epoch": 0.94064,
      "grad_norm": 0.12267818301916122,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 5879
    },
    {
      "epoch": 0.9408,
      "grad_norm": 0.1366855651140213,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5880
    },
    {
      "epoch": 0.94096,
      "grad_norm": 0.08951010555028915,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5881
    },
    {
      "epoch": 0.94112,
      "grad_norm": 0.11044913530349731,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 5882
    },
    {
      "epoch": 0.94128,
      "grad_norm": 0.09243524819612503,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 5883
    },
    {
      "epoch": 0.94144,
      "grad_norm": 0.10450965166091919,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5884
    },
    {
      "epoch": 0.9416,
      "grad_norm": 0.10549711436033249,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5885
    },
    {
      "epoch": 0.94176,
      "grad_norm": 0.1373836249113083,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 5886
    },
    {
      "epoch": 0.94192,
      "grad_norm": 0.193313866853714,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5887
    },
    {
      "epoch": 0.94208,
      "grad_norm": 0.09776986390352249,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 5888
    },
    {
      "epoch": 0.94224,
      "grad_norm": 0.11360521614551544,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 5889
    },
    {
      "epoch": 0.9424,
      "grad_norm": 0.09560741484165192,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5890
    },
    {
      "epoch": 0.94256,
      "grad_norm": 0.09292493760585785,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 5891
    },
    {
      "epoch": 0.94272,
      "grad_norm": 0.10316029191017151,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5892
    },
    {
      "epoch": 0.94288,
      "grad_norm": 0.10858598351478577,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5893
    },
    {
      "epoch": 0.94304,
      "grad_norm": 0.11175889521837234,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5894
    },
    {
      "epoch": 0.9432,
      "grad_norm": 0.08994050323963165,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5895
    },
    {
      "epoch": 0.94336,
      "grad_norm": 0.09353756159543991,
      "learning_rate": 0.0001,
      "loss": 0.301,
      "step": 5896
    },
    {
      "epoch": 0.94352,
      "grad_norm": 0.08412650972604752,
      "learning_rate": 0.0001,
      "loss": 0.2949,
      "step": 5897
    },
    {
      "epoch": 0.94368,
      "grad_norm": 0.10272740572690964,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5898
    },
    {
      "epoch": 0.94384,
      "grad_norm": 0.10741401463747025,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5899
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.09221510589122772,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 5900
    },
    {
      "epoch": 0.944,
      "eval_train_accuracy": 0.9988,
      "eval_train_loss": 0.31812798976898193,
      "eval_train_runtime": 4.1591,
      "eval_train_samples_per_second": 1202.175,
      "eval_train_steps_per_second": 15.147,
      "step": 5900
    },
    {
      "epoch": 0.944,
      "eval_test_accuracy": 0.9982,
      "eval_test_loss": 0.31700751185417175,
      "eval_test_runtime": 4.6804,
      "eval_test_samples_per_second": 1068.29,
      "eval_test_steps_per_second": 13.46,
      "step": 5900
    },
    {
      "epoch": 0.94416,
      "grad_norm": 0.16241756081581116,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 5901
    },
    {
      "epoch": 0.94432,
      "grad_norm": 0.09909366071224213,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5902
    },
    {
      "epoch": 0.94448,
      "grad_norm": 0.09433957934379578,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 5903
    },
    {
      "epoch": 0.94464,
      "grad_norm": 0.08359366655349731,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 5904
    },
    {
      "epoch": 0.9448,
      "grad_norm": 0.10422409325838089,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 5905
    },
    {
      "epoch": 0.94496,
      "grad_norm": 0.09213367104530334,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5906
    },
    {
      "epoch": 0.94512,
      "grad_norm": 0.12452831864356995,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5907
    },
    {
      "epoch": 0.94528,
      "grad_norm": 0.08562955260276794,
      "learning_rate": 0.0001,
      "loss": 0.2999,
      "step": 5908
    },
    {
      "epoch": 0.94544,
      "grad_norm": 0.08491618186235428,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5909
    },
    {
      "epoch": 0.9456,
      "grad_norm": 0.09420395642518997,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5910
    },
    {
      "epoch": 0.94576,
      "grad_norm": 0.13844387233257294,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5911
    },
    {
      "epoch": 0.94592,
      "grad_norm": 0.09931934624910355,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5912
    },
    {
      "epoch": 0.94608,
      "grad_norm": 0.11644850671291351,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5913
    },
    {
      "epoch": 0.94624,
      "grad_norm": 0.133933886885643,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5914
    },
    {
      "epoch": 0.9464,
      "grad_norm": 0.1319446861743927,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5915
    },
    {
      "epoch": 0.94656,
      "grad_norm": 0.10612697154283524,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5916
    },
    {
      "epoch": 0.94672,
      "grad_norm": 0.1004270389676094,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5917
    },
    {
      "epoch": 0.94688,
      "grad_norm": 0.10014412552118301,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5918
    },
    {
      "epoch": 0.94704,
      "grad_norm": 0.1262933760881424,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5919
    },
    {
      "epoch": 0.9472,
      "grad_norm": 0.10252002626657486,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5920
    },
    {
      "epoch": 0.94736,
      "grad_norm": 0.08741670101881027,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5921
    },
    {
      "epoch": 0.94752,
      "grad_norm": 0.10170897096395493,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5922
    },
    {
      "epoch": 0.94768,
      "grad_norm": 0.1069023609161377,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5923
    },
    {
      "epoch": 0.94784,
      "grad_norm": 0.10737670958042145,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5924
    },
    {
      "epoch": 0.948,
      "grad_norm": 0.15213653445243835,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 5925
    },
    {
      "epoch": 0.94816,
      "grad_norm": 0.09666179120540619,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5926
    },
    {
      "epoch": 0.94832,
      "grad_norm": 0.1116388812661171,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5927
    },
    {
      "epoch": 0.94848,
      "grad_norm": 0.09502170234918594,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5928
    },
    {
      "epoch": 0.94864,
      "grad_norm": 0.08516286313533783,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5929
    },
    {
      "epoch": 0.9488,
      "grad_norm": 0.12075810879468918,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5930
    },
    {
      "epoch": 0.94896,
      "grad_norm": 0.10069594532251358,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5931
    },
    {
      "epoch": 0.94912,
      "grad_norm": 0.10060416907072067,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 5932
    },
    {
      "epoch": 0.94928,
      "grad_norm": 0.09242594242095947,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5933
    },
    {
      "epoch": 0.94944,
      "grad_norm": 0.09891099482774734,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5934
    },
    {
      "epoch": 0.9496,
      "grad_norm": 0.09885016828775406,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5935
    },
    {
      "epoch": 0.94976,
      "grad_norm": 0.11104302108287811,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5936
    },
    {
      "epoch": 0.94992,
      "grad_norm": 0.08340033888816833,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5937
    },
    {
      "epoch": 0.95008,
      "grad_norm": 0.09074219316244125,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 5938
    },
    {
      "epoch": 0.95024,
      "grad_norm": 0.11542193591594696,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5939
    },
    {
      "epoch": 0.9504,
      "grad_norm": 0.10056662559509277,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5940
    },
    {
      "epoch": 0.95056,
      "grad_norm": 0.08596141636371613,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5941
    },
    {
      "epoch": 0.95072,
      "grad_norm": 0.10506408661603928,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5942
    },
    {
      "epoch": 0.95088,
      "grad_norm": 0.08894610404968262,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5943
    },
    {
      "epoch": 0.95104,
      "grad_norm": 0.08235054463148117,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5944
    },
    {
      "epoch": 0.9512,
      "grad_norm": 0.09917742758989334,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5945
    },
    {
      "epoch": 0.95136,
      "grad_norm": 0.10393226891756058,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 5946
    },
    {
      "epoch": 0.95152,
      "grad_norm": 0.09633792191743851,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5947
    },
    {
      "epoch": 0.95168,
      "grad_norm": 0.07026941329240799,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 5948
    },
    {
      "epoch": 0.95184,
      "grad_norm": 0.08050895482301712,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 5949
    },
    {
      "epoch": 0.952,
      "grad_norm": 0.09571180492639542,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5950
    },
    {
      "epoch": 0.95216,
      "grad_norm": 0.09245540201663971,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5951
    },
    {
      "epoch": 0.95232,
      "grad_norm": 0.11562227457761765,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5952
    },
    {
      "epoch": 0.95248,
      "grad_norm": 0.08876253664493561,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5953
    },
    {
      "epoch": 0.95264,
      "grad_norm": 0.08830007910728455,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5954
    },
    {
      "epoch": 0.9528,
      "grad_norm": 0.08353841304779053,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 5955
    },
    {
      "epoch": 0.95296,
      "grad_norm": 0.08289951086044312,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 5956
    },
    {
      "epoch": 0.95312,
      "grad_norm": 0.10015631467103958,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 5957
    },
    {
      "epoch": 0.95328,
      "grad_norm": 0.09492377191781998,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 5958
    },
    {
      "epoch": 0.95344,
      "grad_norm": 0.08843503147363663,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5959
    },
    {
      "epoch": 0.9536,
      "grad_norm": 0.092600017786026,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5960
    },
    {
      "epoch": 0.95376,
      "grad_norm": 0.09045563638210297,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5961
    },
    {
      "epoch": 0.95392,
      "grad_norm": 0.07946770638227463,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5962
    },
    {
      "epoch": 0.95408,
      "grad_norm": 0.08931073546409607,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5963
    },
    {
      "epoch": 0.95424,
      "grad_norm": 0.09661801904439926,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5964
    },
    {
      "epoch": 0.9544,
      "grad_norm": 0.1041756197810173,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5965
    },
    {
      "epoch": 0.95456,
      "grad_norm": 0.09360243380069733,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 5966
    },
    {
      "epoch": 0.95472,
      "grad_norm": 0.10839494317770004,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5967
    },
    {
      "epoch": 0.95488,
      "grad_norm": 0.09672130644321442,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 5968
    },
    {
      "epoch": 0.95504,
      "grad_norm": 0.11235356330871582,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5969
    },
    {
      "epoch": 0.9552,
      "grad_norm": 0.0825294628739357,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5970
    },
    {
      "epoch": 0.95536,
      "grad_norm": 0.08955452591180801,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 5971
    },
    {
      "epoch": 0.95552,
      "grad_norm": 0.07420255988836288,
      "learning_rate": 0.0001,
      "loss": 0.2982,
      "step": 5972
    },
    {
      "epoch": 0.95568,
      "grad_norm": 0.09320063143968582,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5973
    },
    {
      "epoch": 0.95584,
      "grad_norm": 0.08276752382516861,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5974
    },
    {
      "epoch": 0.956,
      "grad_norm": 0.0830707848072052,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5975
    },
    {
      "epoch": 0.95616,
      "grad_norm": 0.08344367891550064,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5976
    },
    {
      "epoch": 0.95632,
      "grad_norm": 0.07757938653230667,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 5977
    },
    {
      "epoch": 0.95648,
      "grad_norm": 0.08080700784921646,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 5978
    },
    {
      "epoch": 0.95664,
      "grad_norm": 0.07808160781860352,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 5979
    },
    {
      "epoch": 0.9568,
      "grad_norm": 0.10220425575971603,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5980
    },
    {
      "epoch": 0.95696,
      "grad_norm": 0.12972992658615112,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5981
    },
    {
      "epoch": 0.95712,
      "grad_norm": 0.1029336228966713,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 5982
    },
    {
      "epoch": 0.95728,
      "grad_norm": 0.10352443903684616,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5983
    },
    {
      "epoch": 0.95744,
      "grad_norm": 0.08659608662128448,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 5984
    },
    {
      "epoch": 0.9576,
      "grad_norm": 0.09761366248130798,
      "learning_rate": 0.0001,
      "loss": 0.303,
      "step": 5985
    },
    {
      "epoch": 0.95776,
      "grad_norm": 0.10657287389039993,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5986
    },
    {
      "epoch": 0.95792,
      "grad_norm": 0.08050891757011414,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5987
    },
    {
      "epoch": 0.95808,
      "grad_norm": 0.08039245754480362,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 5988
    },
    {
      "epoch": 0.95824,
      "grad_norm": 0.08618246018886566,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5989
    },
    {
      "epoch": 0.9584,
      "grad_norm": 0.09489086270332336,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 5990
    },
    {
      "epoch": 0.95856,
      "grad_norm": 0.10620497912168503,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5991
    },
    {
      "epoch": 0.95872,
      "grad_norm": 0.1055523157119751,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5992
    },
    {
      "epoch": 0.95888,
      "grad_norm": 0.08333726227283478,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5993
    },
    {
      "epoch": 0.95904,
      "grad_norm": 0.07644887268543243,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5994
    },
    {
      "epoch": 0.9592,
      "grad_norm": 0.0742870643734932,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5995
    },
    {
      "epoch": 0.95936,
      "grad_norm": 0.09215705841779709,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5996
    },
    {
      "epoch": 0.95952,
      "grad_norm": 0.0893615111708641,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5997
    },
    {
      "epoch": 0.95968,
      "grad_norm": 0.10280978679656982,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5998
    },
    {
      "epoch": 0.95984,
      "grad_norm": 0.11150134354829788,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5999
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.09230491518974304,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 6000
    },
    {
      "epoch": 0.96,
      "eval_train_accuracy": 0.9994,
      "eval_train_loss": 0.3175499141216278,
      "eval_train_runtime": 4.1093,
      "eval_train_samples_per_second": 1216.763,
      "eval_train_steps_per_second": 15.331,
      "step": 6000
    },
    {
      "epoch": 0.96,
      "eval_test_accuracy": 1.0,
      "eval_test_loss": 0.31637540459632874,
      "eval_test_runtime": 5.1116,
      "eval_test_samples_per_second": 978.166,
      "eval_test_steps_per_second": 12.325,
      "step": 6000
    },
    {
      "epoch": 0.96016,
      "grad_norm": 0.08276960253715515,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 6001
    },
    {
      "epoch": 0.96032,
      "grad_norm": 0.09326394647359848,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 6002
    },
    {
      "epoch": 0.96048,
      "grad_norm": 0.08453796058893204,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 6003
    },
    {
      "epoch": 0.96064,
      "grad_norm": 0.08336455374956131,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 6004
    },
    {
      "epoch": 0.9608,
      "grad_norm": 0.13776619732379913,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 6005
    },
    {
      "epoch": 0.96096,
      "grad_norm": 0.10754814743995667,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 6006
    },
    {
      "epoch": 0.96112,
      "grad_norm": 0.09199176728725433,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 6007
    },
    {
      "epoch": 0.96128,
      "grad_norm": 0.08188609778881073,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 6008
    },
    {
      "epoch": 0.96144,
      "grad_norm": 0.08200982958078384,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 6009
    },
    {
      "epoch": 0.9616,
      "grad_norm": 0.09245569258928299,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 6010
    },
    {
      "epoch": 0.96176,
      "grad_norm": 0.08350342512130737,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 6011
    },
    {
      "epoch": 0.96192,
      "grad_norm": 0.09609096497297287,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 6012
    },
    {
      "epoch": 0.96208,
      "grad_norm": 0.08178264647722244,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 6013
    },
    {
      "epoch": 0.96224,
      "grad_norm": 0.08217157423496246,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 6014
    },
    {
      "epoch": 0.9624,
      "grad_norm": 0.0908178836107254,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 6015
    },
    {
      "epoch": 0.96256,
      "grad_norm": 0.10259605199098587,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 6016
    },
    {
      "epoch": 0.96272,
      "grad_norm": 0.08714061230421066,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 6017
    },
    {
      "epoch": 0.96288,
      "grad_norm": 0.08283484727144241,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 6018
    },
    {
      "epoch": 0.96304,
      "grad_norm": 0.09315157681703568,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 6019
    },
    {
      "epoch": 0.9632,
      "grad_norm": 0.18636852502822876,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 6020
    },
    {
      "epoch": 0.96336,
      "grad_norm": 0.08530091494321823,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 6021
    },
    {
      "epoch": 0.96352,
      "grad_norm": 0.08667546510696411,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 6022
    },
    {
      "epoch": 0.96368,
      "grad_norm": 0.10403755307197571,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 6023
    },
    {
      "epoch": 0.96384,
      "grad_norm": 0.09632760286331177,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 6024
    },
    {
      "epoch": 0.964,
      "grad_norm": 0.09384018182754517,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 6025
    },
    {
      "epoch": 0.96416,
      "grad_norm": 0.08024037629365921,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 6026
    },
    {
      "epoch": 0.96432,
      "grad_norm": 0.09166581928730011,
      "learning_rate": 0.0001,
      "loss": 0.3018,
      "step": 6027
    },
    {
      "epoch": 0.96448,
      "grad_norm": 0.09205878525972366,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 6028
    },
    {
      "epoch": 0.96464,
      "grad_norm": 0.08906946331262589,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 6029
    },
    {
      "epoch": 0.9648,
      "grad_norm": 0.09204176068305969,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 6030
    },
    {
      "epoch": 0.96496,
      "grad_norm": 0.09814025461673737,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 6031
    },
    {
      "epoch": 0.96512,
      "grad_norm": 0.0844966322183609,
      "learning_rate": 0.0001,
      "loss": 0.3002,
      "step": 6032
    },
    {
      "epoch": 0.96528,
      "grad_norm": 0.08420779556035995,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 6033
    },
    {
      "epoch": 0.96544,
      "grad_norm": 0.08649449050426483,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 6034
    },
    {
      "epoch": 0.9656,
      "grad_norm": 0.0958976000547409,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 6035
    },
    {
      "epoch": 0.96576,
      "grad_norm": 0.08573576807975769,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 6036
    },
    {
      "epoch": 0.96592,
      "grad_norm": 0.09657838195562363,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 6037
    },
    {
      "epoch": 0.96608,
      "grad_norm": 0.09187351167201996,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 6038
    },
    {
      "epoch": 0.96624,
      "grad_norm": 0.08229304105043411,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 6039
    },
    {
      "epoch": 0.9664,
      "grad_norm": 0.08366386592388153,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 6040
    },
    {
      "epoch": 0.96656,
      "grad_norm": 0.07772044092416763,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 6041
    },
    {
      "epoch": 0.96672,
      "grad_norm": 0.0741516500711441,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 6042
    },
    {
      "epoch": 0.96688,
      "grad_norm": 0.08567586541175842,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 6043
    },
    {
      "epoch": 0.96704,
      "grad_norm": 0.08606399595737457,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 6044
    },
    {
      "epoch": 0.9672,
      "grad_norm": 0.10017181187868118,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 6045
    },
    {
      "epoch": 0.96736,
      "grad_norm": 0.1723548024892807,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 6046
    },
    {
      "epoch": 0.96752,
      "grad_norm": 0.08987519890069962,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 6047
    },
    {
      "epoch": 0.96768,
      "grad_norm": 0.10389523208141327,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 6048
    },
    {
      "epoch": 0.96784,
      "grad_norm": 0.12834860384464264,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 6049
    },
    {
      "epoch": 0.968,
      "grad_norm": 0.10352084040641785,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 6050
    },
    {
      "epoch": 0.96816,
      "grad_norm": 0.08638914674520493,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 6051
    },
    {
      "epoch": 0.96832,
      "grad_norm": 0.09519590437412262,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 6052
    },
    {
      "epoch": 0.96848,
      "grad_norm": 0.11983156949281693,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 6053
    },
    {
      "epoch": 0.96864,
      "grad_norm": 0.12567327916622162,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6054
    },
    {
      "epoch": 0.9688,
      "grad_norm": 0.08406682312488556,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 6055
    },
    {
      "epoch": 0.96896,
      "grad_norm": 0.07588779181241989,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 6056
    },
    {
      "epoch": 0.96912,
      "grad_norm": 0.0958547294139862,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 6057
    },
    {
      "epoch": 0.96928,
      "grad_norm": 0.08519822359085083,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 6058
    },
    {
      "epoch": 0.96944,
      "grad_norm": 0.09079954773187637,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 6059
    },
    {
      "epoch": 0.9696,
      "grad_norm": 0.07478600740432739,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 6060
    },
    {
      "epoch": 0.96976,
      "grad_norm": 0.07760085165500641,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 6061
    },
    {
      "epoch": 0.96992,
      "grad_norm": 0.0992833599448204,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 6062
    },
    {
      "epoch": 0.97008,
      "grad_norm": 0.3356141746044159,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 6063
    },
    {
      "epoch": 0.97024,
      "grad_norm": 0.10400563478469849,
      "learning_rate": 0.0001,
      "loss": 0.3014,
      "step": 6064
    },
    {
      "epoch": 0.9704,
      "grad_norm": 0.08218354731798172,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6065
    },
    {
      "epoch": 0.97056,
      "grad_norm": 0.14317359030246735,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6066
    },
    {
      "epoch": 0.97072,
      "grad_norm": 0.1506987363100052,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 6067
    },
    {
      "epoch": 0.97088,
      "grad_norm": 0.154082790017128,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6068
    },
    {
      "epoch": 0.97104,
      "grad_norm": 0.10028346627950668,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 6069
    },
    {
      "epoch": 0.9712,
      "grad_norm": 0.11463584750890732,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 6070
    },
    {
      "epoch": 0.97136,
      "grad_norm": 0.11166146397590637,
      "learning_rate": 0.0001,
      "loss": 0.3049,
      "step": 6071
    },
    {
      "epoch": 0.97152,
      "grad_norm": 0.09725833684206009,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 6072
    },
    {
      "epoch": 0.97168,
      "grad_norm": 0.08921479433774948,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 6073
    },
    {
      "epoch": 0.97184,
      "grad_norm": 0.08974198997020721,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 6074
    },
    {
      "epoch": 0.972,
      "grad_norm": 0.08645264059305191,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 6075
    },
    {
      "epoch": 0.97216,
      "grad_norm": 0.08385800570249557,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 6076
    },
    {
      "epoch": 0.97232,
      "grad_norm": 0.10817433893680573,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 6077
    },
    {
      "epoch": 0.97248,
      "grad_norm": 0.09441688656806946,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 6078
    },
    {
      "epoch": 0.97264,
      "grad_norm": 0.08745339512825012,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 6079
    },
    {
      "epoch": 0.9728,
      "grad_norm": 0.09391998499631882,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 6080
    },
    {
      "epoch": 0.97296,
      "grad_norm": 0.09292677789926529,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 6081
    },
    {
      "epoch": 0.97312,
      "grad_norm": 0.08321597427129745,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 6082
    },
    {
      "epoch": 0.97328,
      "grad_norm": 0.08735956996679306,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 6083
    },
    {
      "epoch": 0.97344,
      "grad_norm": 0.09416098892688751,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 6084
    },
    {
      "epoch": 0.9736,
      "grad_norm": 0.12748457491397858,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 6085
    },
    {
      "epoch": 0.97376,
      "grad_norm": 0.13446930050849915,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 6086
    },
    {
      "epoch": 0.97392,
      "grad_norm": 0.0903104841709137,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 6087
    },
    {
      "epoch": 0.97408,
      "grad_norm": 0.08764573931694031,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 6088
    },
    {
      "epoch": 0.97424,
      "grad_norm": 0.09869587421417236,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 6089
    },
    {
      "epoch": 0.9744,
      "grad_norm": 0.10655760020017624,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 6090
    },
    {
      "epoch": 0.97456,
      "grad_norm": 0.08288385719060898,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 6091
    },
    {
      "epoch": 0.97472,
      "grad_norm": 0.10397886484861374,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 6092
    },
    {
      "epoch": 0.97488,
      "grad_norm": 0.09095887094736099,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 6093
    },
    {
      "epoch": 0.97504,
      "grad_norm": 0.07917013764381409,
      "learning_rate": 0.0001,
      "loss": 0.3025,
      "step": 6094
    },
    {
      "epoch": 0.9752,
      "grad_norm": 0.11023017019033432,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 6095
    },
    {
      "epoch": 0.97536,
      "grad_norm": 0.10923111438751221,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 6096
    },
    {
      "epoch": 0.97552,
      "grad_norm": 0.09761960804462433,
      "learning_rate": 0.0001,
      "loss": 0.2967,
      "step": 6097
    },
    {
      "epoch": 0.97568,
      "grad_norm": 0.10557695478200912,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 6098
    },
    {
      "epoch": 0.97584,
      "grad_norm": 0.0847620815038681,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6099
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.07013962417840958,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 6100
    },
    {
      "epoch": 0.976,
      "eval_train_accuracy": 0.9998,
      "eval_train_loss": 0.3176165521144867,
      "eval_train_runtime": 4.069,
      "eval_train_samples_per_second": 1228.811,
      "eval_train_steps_per_second": 15.483,
      "step": 6100
    },
    {
      "epoch": 0.976,
      "eval_test_accuracy": 1.0,
      "eval_test_loss": 0.31659603118896484,
      "eval_test_runtime": 4.7859,
      "eval_test_samples_per_second": 1044.733,
      "eval_test_steps_per_second": 13.164,
      "step": 6100
    },
    {
      "epoch": 0.97616,
      "grad_norm": 0.09450574964284897,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 6101
    },
    {
      "epoch": 0.97632,
      "grad_norm": 0.09928937256336212,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6102
    },
    {
      "epoch": 0.97648,
      "grad_norm": 0.0793810486793518,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 6103
    },
    {
      "epoch": 0.97664,
      "grad_norm": 0.09750436246395111,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 6104
    },
    {
      "epoch": 0.9768,
      "grad_norm": 0.1037895679473877,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 6105
    },
    {
      "epoch": 0.97696,
      "grad_norm": 0.09963840991258621,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 6106
    },
    {
      "epoch": 0.97712,
      "grad_norm": 0.08219756931066513,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 6107
    },
    {
      "epoch": 0.97728,
      "grad_norm": 0.08108042925596237,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 6108
    },
    {
      "epoch": 0.97744,
      "grad_norm": 0.08700882643461227,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6109
    },
    {
      "epoch": 0.9776,
      "grad_norm": 0.09678967297077179,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6110
    },
    {
      "epoch": 0.97776,
      "grad_norm": 0.09036623686552048,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 6111
    },
    {
      "epoch": 0.97792,
      "grad_norm": 0.12981058657169342,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 6112
    },
    {
      "epoch": 0.97808,
      "grad_norm": 0.10662483423948288,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 6113
    },
    {
      "epoch": 0.97824,
      "grad_norm": 0.1007574051618576,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 6114
    },
    {
      "epoch": 0.9784,
      "grad_norm": 0.08822315186262131,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 6115
    },
    {
      "epoch": 0.97856,
      "grad_norm": 0.0865001529455185,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 6116
    },
    {
      "epoch": 0.97872,
      "grad_norm": 0.08830724656581879,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 6117
    },
    {
      "epoch": 0.97888,
      "grad_norm": 0.08826618641614914,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 6118
    },
    {
      "epoch": 0.97904,
      "grad_norm": 0.08578178286552429,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 6119
    },
    {
      "epoch": 0.9792,
      "grad_norm": 0.13478560745716095,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 6120
    },
    {
      "epoch": 0.97936,
      "grad_norm": 0.08648238331079483,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 6121
    },
    {
      "epoch": 0.97952,
      "grad_norm": 0.10302754491567612,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 6122
    },
    {
      "epoch": 0.97968,
      "grad_norm": 0.27503401041030884,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 6123
    },
    {
      "epoch": 0.97984,
      "grad_norm": 0.10653632879257202,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 6124
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.13481557369232178,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 6125
    },
    {
      "epoch": 0.98016,
      "grad_norm": 0.14068295061588287,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 6126
    },
    {
      "epoch": 0.98032,
      "grad_norm": 0.11186925321817398,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 6127
    },
    {
      "epoch": 0.98048,
      "grad_norm": 0.10305041819810867,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 6128
    },
    {
      "epoch": 0.98064,
      "grad_norm": 0.1113201156258583,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 6129
    },
    {
      "epoch": 0.9808,
      "grad_norm": 0.11100631952285767,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 6130
    },
    {
      "epoch": 0.98096,
      "grad_norm": 0.11145254224538803,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 6131
    },
    {
      "epoch": 0.98112,
      "grad_norm": 0.15196679532527924,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 6132
    },
    {
      "epoch": 0.98128,
      "grad_norm": 0.11488157510757446,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 6133
    },
    {
      "epoch": 0.98144,
      "grad_norm": 0.12304610013961792,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 6134
    },
    {
      "epoch": 0.9816,
      "grad_norm": 0.1233387142419815,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6135
    },
    {
      "epoch": 0.98176,
      "grad_norm": 0.10768047720193863,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 6136
    },
    {
      "epoch": 0.98192,
      "grad_norm": 0.10552674531936646,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 6137
    },
    {
      "epoch": 0.98208,
      "grad_norm": 0.09829072654247284,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 6138
    },
    {
      "epoch": 0.98224,
      "grad_norm": 0.10902664065361023,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 6139
    },
    {
      "epoch": 0.9824,
      "grad_norm": 0.15560440719127655,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 6140
    },
    {
      "epoch": 0.98256,
      "grad_norm": 0.15721017122268677,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6141
    },
    {
      "epoch": 0.98272,
      "grad_norm": 0.09200943261384964,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 6142
    },
    {
      "epoch": 0.98288,
      "grad_norm": 0.19045975804328918,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 6143
    },
    {
      "epoch": 0.98304,
      "grad_norm": 0.09935064613819122,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 6144
    },
    {
      "epoch": 0.9832,
      "grad_norm": 0.3363020718097687,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 6145
    },
    {
      "epoch": 0.98336,
      "grad_norm": 0.09602189064025879,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 6146
    },
    {
      "epoch": 0.98352,
      "grad_norm": 0.09844254702329636,
      "learning_rate": 0.0001,
      "loss": 0.302,
      "step": 6147
    },
    {
      "epoch": 0.98368,
      "grad_norm": 0.29513248801231384,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6148
    },
    {
      "epoch": 0.98384,
      "grad_norm": 0.23537054657936096,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 6149
    },
    {
      "epoch": 0.984,
      "grad_norm": 0.0919889286160469,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 6150
    },
    {
      "epoch": 0.98416,
      "grad_norm": 0.1262245625257492,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6151
    },
    {
      "epoch": 0.98432,
      "grad_norm": 0.10826246440410614,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 6152
    },
    {
      "epoch": 0.98448,
      "grad_norm": 0.26253828406333923,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6153
    },
    {
      "epoch": 0.98464,
      "grad_norm": 0.1330917477607727,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 6154
    },
    {
      "epoch": 0.9848,
      "grad_norm": 0.11032970994710922,
      "learning_rate": 0.0001,
      "loss": 0.2975,
      "step": 6155
    },
    {
      "epoch": 0.98496,
      "grad_norm": 0.24051503837108612,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 6156
    },
    {
      "epoch": 0.98512,
      "grad_norm": 0.1817351132631302,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 6157
    },
    {
      "epoch": 0.98528,
      "grad_norm": 0.1269799768924713,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 6158
    },
    {
      "epoch": 0.98544,
      "grad_norm": 0.1472412794828415,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 6159
    },
    {
      "epoch": 0.9856,
      "grad_norm": 0.12558278441429138,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 6160
    },
    {
      "epoch": 0.98576,
      "grad_norm": 0.11322414129972458,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 6161
    },
    {
      "epoch": 0.98592,
      "grad_norm": 0.1207236722111702,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 6162
    },
    {
      "epoch": 0.98608,
      "grad_norm": 0.09774525463581085,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 6163
    },
    {
      "epoch": 0.98624,
      "grad_norm": 0.15573059022426605,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 6164
    },
    {
      "epoch": 0.9864,
      "grad_norm": 0.09810976684093475,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 6165
    },
    {
      "epoch": 0.98656,
      "grad_norm": 0.1272764503955841,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 6166
    },
    {
      "epoch": 0.98672,
      "grad_norm": 0.13067150115966797,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 6167
    },
    {
      "epoch": 0.98688,
      "grad_norm": 0.16999363899230957,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 6168
    },
    {
      "epoch": 0.98704,
      "grad_norm": 0.10030487924814224,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 6169
    },
    {
      "epoch": 0.9872,
      "grad_norm": 0.10076528042554855,
      "learning_rate": 0.0001,
      "loss": 0.3011,
      "step": 6170
    },
    {
      "epoch": 0.98736,
      "grad_norm": 0.17636170983314514,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 6171
    },
    {
      "epoch": 0.98752,
      "grad_norm": 0.11476939171552658,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6172
    },
    {
      "epoch": 0.98768,
      "grad_norm": 0.10386504977941513,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 6173
    },
    {
      "epoch": 0.98784,
      "grad_norm": 0.08981077373027802,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 6174
    },
    {
      "epoch": 0.988,
      "grad_norm": 0.08128612488508224,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6175
    },
    {
      "epoch": 0.98816,
      "grad_norm": 0.15280264616012573,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 6176
    },
    {
      "epoch": 0.98832,
      "grad_norm": 0.08873968571424484,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 6177
    },
    {
      "epoch": 0.98848,
      "grad_norm": 0.12760911881923676,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 6178
    },
    {
      "epoch": 0.98864,
      "grad_norm": 0.15733474493026733,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 6179
    },
    {
      "epoch": 0.9888,
      "grad_norm": 0.08856159448623657,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 6180
    },
    {
      "epoch": 0.98896,
      "grad_norm": 0.10531647503376007,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 6181
    },
    {
      "epoch": 0.98912,
      "grad_norm": 0.09936071932315826,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 6182
    },
    {
      "epoch": 0.98928,
      "grad_norm": 0.10169737040996552,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 6183
    },
    {
      "epoch": 0.98944,
      "grad_norm": 0.22252337634563446,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 6184
    },
    {
      "epoch": 0.9896,
      "grad_norm": 0.10892479866743088,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 6185
    },
    {
      "epoch": 0.98976,
      "grad_norm": 0.15926223993301392,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 6186
    },
    {
      "epoch": 0.98992,
      "grad_norm": 0.13125932216644287,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 6187
    },
    {
      "epoch": 0.99008,
      "grad_norm": 0.12009050697088242,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 6188
    },
    {
      "epoch": 0.99024,
      "grad_norm": 0.17489010095596313,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 6189
    },
    {
      "epoch": 0.9904,
      "grad_norm": 0.11446789652109146,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 6190
    },
    {
      "epoch": 0.99056,
      "grad_norm": 0.15123602747917175,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 6191
    },
    {
      "epoch": 0.99072,
      "grad_norm": 0.08995494991540909,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 6192
    },
    {
      "epoch": 0.99088,
      "grad_norm": 0.10059061646461487,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 6193
    },
    {
      "epoch": 0.99104,
      "grad_norm": 0.13536140322685242,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 6194
    },
    {
      "epoch": 0.9912,
      "grad_norm": 0.1086760014295578,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 6195
    },
    {
      "epoch": 0.99136,
      "grad_norm": 0.24369895458221436,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 6196
    },
    {
      "epoch": 0.99152,
      "grad_norm": 0.09328737109899521,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 6197
    },
    {
      "epoch": 0.99168,
      "grad_norm": 0.11237797886133194,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6198
    },
    {
      "epoch": 0.99184,
      "grad_norm": 0.1982058435678482,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 6199
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.0860741138458252,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 6200
    },
    {
      "epoch": 0.992,
      "eval_train_accuracy": 0.9998,
      "eval_train_loss": 0.31824570894241333,
      "eval_train_runtime": 4.1374,
      "eval_train_samples_per_second": 1208.488,
      "eval_train_steps_per_second": 15.227,
      "step": 6200
    },
    {
      "epoch": 0.992,
      "eval_test_accuracy": 0.9996,
      "eval_test_loss": 0.3169545829296112,
      "eval_test_runtime": 4.9111,
      "eval_test_samples_per_second": 1018.107,
      "eval_test_steps_per_second": 12.828,
      "step": 6200
    },
    {
      "epoch": 0.99216,
      "grad_norm": 0.09509537369012833,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 6201
    },
    {
      "epoch": 0.99232,
      "grad_norm": 0.18441206216812134,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6202
    },
    {
      "epoch": 0.99248,
      "grad_norm": 0.11097081750631332,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 6203
    },
    {
      "epoch": 0.99264,
      "grad_norm": 0.11259624361991882,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6204
    },
    {
      "epoch": 0.9928,
      "grad_norm": 0.09391707926988602,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 6205
    },
    {
      "epoch": 0.99296,
      "grad_norm": 0.15487553179264069,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 6206
    },
    {
      "epoch": 0.99312,
      "grad_norm": 0.14004814624786377,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 6207
    },
    {
      "epoch": 0.99328,
      "grad_norm": 0.1285140961408615,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 6208
    },
    {
      "epoch": 0.99344,
      "grad_norm": 0.10075578093528748,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 6209
    },
    {
      "epoch": 0.9936,
      "grad_norm": 0.1619998961687088,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 6210
    },
    {
      "epoch": 0.99376,
      "grad_norm": 0.12904980778694153,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 6211
    },
    {
      "epoch": 0.99392,
      "grad_norm": 0.11548171937465668,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 6212
    },
    {
      "epoch": 0.99408,
      "grad_norm": 0.08789188414812088,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 6213
    },
    {
      "epoch": 0.99424,
      "grad_norm": 0.08565019816160202,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 6214
    },
    {
      "epoch": 0.9944,
      "grad_norm": 0.0880507156252861,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 6215
    },
    {
      "epoch": 0.99456,
      "grad_norm": 0.0990375280380249,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 6216
    },
    {
      "epoch": 0.99472,
      "grad_norm": 0.10260118544101715,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 6217
    },
    {
      "epoch": 0.99488,
      "grad_norm": 0.11523490399122238,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 6218
    },
    {
      "epoch": 0.99504,
      "grad_norm": 0.10252978652715683,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6219
    },
    {
      "epoch": 0.9952,
      "grad_norm": 0.10348653048276901,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 6220
    },
    {
      "epoch": 0.99536,
      "grad_norm": 0.10083141922950745,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 6221
    },
    {
      "epoch": 0.99552,
      "grad_norm": 0.08585628122091293,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 6222
    },
    {
      "epoch": 0.99568,
      "grad_norm": 0.12407236546278,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 6223
    },
    {
      "epoch": 0.99584,
      "grad_norm": 0.09845121949911118,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 6224
    },
    {
      "epoch": 0.996,
      "grad_norm": 0.10850360989570618,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 6225
    },
    {
      "epoch": 0.99616,
      "grad_norm": 0.10242019593715668,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 6226
    },
    {
      "epoch": 0.99632,
      "grad_norm": 0.09575928002595901,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 6227
    },
    {
      "epoch": 0.99648,
      "grad_norm": 0.09945736080408096,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 6228
    },
    {
      "epoch": 0.99664,
      "grad_norm": 0.10033655911684036,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 6229
    },
    {
      "epoch": 0.9968,
      "grad_norm": 0.10493739694356918,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 6230
    },
    {
      "epoch": 0.99696,
      "grad_norm": 0.10617823898792267,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 6231
    },
    {
      "epoch": 0.99712,
      "grad_norm": 0.09723355621099472,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 6232
    },
    {
      "epoch": 0.99728,
      "grad_norm": 0.09663315862417221,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 6233
    },
    {
      "epoch": 0.99744,
      "grad_norm": 0.08700912445783615,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 6234
    },
    {
      "epoch": 0.9976,
      "grad_norm": 0.09308379888534546,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 6235
    },
    {
      "epoch": 0.99776,
      "grad_norm": 0.07947546988725662,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 6236
    },
    {
      "epoch": 0.99792,
      "grad_norm": 0.0896592065691948,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 6237
    },
    {
      "epoch": 0.99808,
      "grad_norm": 0.09790924936532974,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 6238
    },
    {
      "epoch": 0.99824,
      "grad_norm": 0.09101049602031708,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 6239
    },
    {
      "epoch": 0.9984,
      "grad_norm": 0.08285271376371384,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 6240
    },
    {
      "epoch": 0.99856,
      "grad_norm": 0.08636684715747833,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 6241
    },
    {
      "epoch": 0.99872,
      "grad_norm": 0.10920526832342148,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 6242
    },
    {
      "epoch": 0.99888,
      "grad_norm": 0.09146081656217575,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 6243
    },
    {
      "epoch": 0.99904,
      "grad_norm": 0.09512586891651154,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 6244
    },
    {
      "epoch": 0.9992,
      "grad_norm": 0.08358017355203629,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6245
    },
    {
      "epoch": 0.99936,
      "grad_norm": 0.1026267409324646,
      "learning_rate": 0.0001,
      "loss": 0.2992,
      "step": 6246
    },
    {
      "epoch": 0.99952,
      "grad_norm": 0.09837280958890915,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 6247
    },
    {
      "epoch": 0.99968,
      "grad_norm": 0.0884619653224945,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 6248
    },
    {
      "epoch": 0.99984,
      "grad_norm": 0.09335175901651382,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 6249
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.10669976472854614,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 6250
    },
    {
      "before_init_mem_cpu": 1302065152,
      "before_init_mem_gpu": 596095488,
      "epoch": 1.0,
      "init_mem_cpu_alloc_delta": 655360,
      "init_mem_cpu_peaked_delta": 0,
      "init_mem_gpu_alloc_delta": 0,
      "init_mem_gpu_peaked_delta": 0,
      "step": 6250,
      "total_flos": 6.53232308224e+16,
      "train_loss": 0.3607722604894638,
      "train_mem_cpu_alloc_delta": 1975193600,
      "train_mem_cpu_peaked_delta": 619790336,
      "train_mem_gpu_alloc_delta": 1540037632,
      "train_mem_gpu_peaked_delta": 5834735616,
      "train_runtime": 3523.3884,
      "train_samples_per_second": 141.909,
      "train_steps_per_second": 1.774
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.53232308224e+16,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}
