{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 6250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00016,
      "grad_norm": 80.39911651611328,
      "learning_rate": 0.0001,
      "loss": 10.5586,
      "step": 1
    },
    {
      "epoch": 0.00032,
      "grad_norm": 15.671442985534668,
      "learning_rate": 0.0001,
      "loss": 6.9402,
      "step": 2
    },
    {
      "epoch": 0.00048,
      "grad_norm": 12.67354965209961,
      "learning_rate": 0.0001,
      "loss": 5.6504,
      "step": 3
    },
    {
      "epoch": 0.00064,
      "grad_norm": 44.184486389160156,
      "learning_rate": 0.0001,
      "loss": 7.3502,
      "step": 4
    },
    {
      "epoch": 0.0008,
      "grad_norm": 19.673355102539062,
      "learning_rate": 0.0001,
      "loss": 5.82,
      "step": 5
    },
    {
      "epoch": 0.00096,
      "grad_norm": 5.060791015625,
      "learning_rate": 0.0001,
      "loss": 4.903,
      "step": 6
    },
    {
      "epoch": 0.00112,
      "grad_norm": 10.707590103149414,
      "learning_rate": 0.0001,
      "loss": 4.6905,
      "step": 7
    },
    {
      "epoch": 0.00128,
      "grad_norm": 3.606316089630127,
      "learning_rate": 0.0001,
      "loss": 4.4144,
      "step": 8
    },
    {
      "epoch": 0.00144,
      "grad_norm": 4.1366286277771,
      "learning_rate": 0.0001,
      "loss": 4.3461,
      "step": 9
    },
    {
      "epoch": 0.0016,
      "grad_norm": 3.5893471240997314,
      "learning_rate": 0.0001,
      "loss": 4.3901,
      "step": 10
    },
    {
      "epoch": 0.00176,
      "grad_norm": 3.547670602798462,
      "learning_rate": 0.0001,
      "loss": 4.2081,
      "step": 11
    },
    {
      "epoch": 0.00192,
      "grad_norm": 3.9063162803649902,
      "learning_rate": 0.0001,
      "loss": 4.0759,
      "step": 12
    },
    {
      "epoch": 0.00208,
      "grad_norm": 3.756638765335083,
      "learning_rate": 0.0001,
      "loss": 3.6603,
      "step": 13
    },
    {
      "epoch": 0.00224,
      "grad_norm": 23.16211700439453,
      "learning_rate": 0.0001,
      "loss": 3.7763,
      "step": 14
    },
    {
      "epoch": 0.0024,
      "grad_norm": 4.716541290283203,
      "learning_rate": 0.0001,
      "loss": 3.3283,
      "step": 15
    },
    {
      "epoch": 0.00256,
      "grad_norm": 6.814342498779297,
      "learning_rate": 0.0001,
      "loss": 3.2384,
      "step": 16
    },
    {
      "epoch": 0.00272,
      "grad_norm": 10.047076225280762,
      "learning_rate": 0.0001,
      "loss": 3.0703,
      "step": 17
    },
    {
      "epoch": 0.00288,
      "grad_norm": 6.14913272857666,
      "learning_rate": 0.0001,
      "loss": 2.8087,
      "step": 18
    },
    {
      "epoch": 0.00304,
      "grad_norm": 13.1141939163208,
      "learning_rate": 0.0001,
      "loss": 2.9996,
      "step": 19
    },
    {
      "epoch": 0.0032,
      "grad_norm": 10.180819511413574,
      "learning_rate": 0.0001,
      "loss": 2.8109,
      "step": 20
    },
    {
      "epoch": 0.00336,
      "grad_norm": 3.622014284133911,
      "learning_rate": 0.0001,
      "loss": 2.631,
      "step": 21
    },
    {
      "epoch": 0.00352,
      "grad_norm": 6.580256462097168,
      "learning_rate": 0.0001,
      "loss": 2.503,
      "step": 22
    },
    {
      "epoch": 0.00368,
      "grad_norm": 4.537111282348633,
      "learning_rate": 0.0001,
      "loss": 2.473,
      "step": 23
    },
    {
      "epoch": 0.00384,
      "grad_norm": 3.412684202194214,
      "learning_rate": 0.0001,
      "loss": 2.4312,
      "step": 24
    },
    {
      "epoch": 0.004,
      "grad_norm": 2.941577911376953,
      "learning_rate": 0.0001,
      "loss": 2.2721,
      "step": 25
    },
    {
      "epoch": 0.00416,
      "grad_norm": 3.690960645675659,
      "learning_rate": 0.0001,
      "loss": 2.259,
      "step": 26
    },
    {
      "epoch": 0.00432,
      "grad_norm": 2.1140716075897217,
      "learning_rate": 0.0001,
      "loss": 2.0257,
      "step": 27
    },
    {
      "epoch": 0.00448,
      "grad_norm": 2.423288583755493,
      "learning_rate": 0.0001,
      "loss": 1.996,
      "step": 28
    },
    {
      "epoch": 0.00464,
      "grad_norm": 2.7935056686401367,
      "learning_rate": 0.0001,
      "loss": 2.0,
      "step": 29
    },
    {
      "epoch": 0.0048,
      "grad_norm": 2.039024829864502,
      "learning_rate": 0.0001,
      "loss": 1.9211,
      "step": 30
    },
    {
      "epoch": 0.00496,
      "grad_norm": 1.9066983461380005,
      "learning_rate": 0.0001,
      "loss": 1.721,
      "step": 31
    },
    {
      "epoch": 0.00512,
      "grad_norm": 3.0360565185546875,
      "learning_rate": 0.0001,
      "loss": 1.7979,
      "step": 32
    },
    {
      "epoch": 0.00528,
      "grad_norm": 1.4364055395126343,
      "learning_rate": 0.0001,
      "loss": 1.7331,
      "step": 33
    },
    {
      "epoch": 0.00544,
      "grad_norm": 1.8469412326812744,
      "learning_rate": 0.0001,
      "loss": 1.5526,
      "step": 34
    },
    {
      "epoch": 0.0056,
      "grad_norm": 1.2137219905853271,
      "learning_rate": 0.0001,
      "loss": 1.489,
      "step": 35
    },
    {
      "epoch": 0.00576,
      "grad_norm": 1.3117668628692627,
      "learning_rate": 0.0001,
      "loss": 1.5585,
      "step": 36
    },
    {
      "epoch": 0.00592,
      "grad_norm": 1.1020876169204712,
      "learning_rate": 0.0001,
      "loss": 1.4766,
      "step": 37
    },
    {
      "epoch": 0.00608,
      "grad_norm": 1.0487780570983887,
      "learning_rate": 0.0001,
      "loss": 1.3695,
      "step": 38
    },
    {
      "epoch": 0.00624,
      "grad_norm": 0.8585880994796753,
      "learning_rate": 0.0001,
      "loss": 1.4066,
      "step": 39
    },
    {
      "epoch": 0.0064,
      "grad_norm": 1.0977801084518433,
      "learning_rate": 0.0001,
      "loss": 1.3736,
      "step": 40
    },
    {
      "epoch": 0.00656,
      "grad_norm": 1.2098522186279297,
      "learning_rate": 0.0001,
      "loss": 1.2897,
      "step": 41
    },
    {
      "epoch": 0.00672,
      "grad_norm": 1.3049031496047974,
      "learning_rate": 0.0001,
      "loss": 1.2628,
      "step": 42
    },
    {
      "epoch": 0.00688,
      "grad_norm": 1.0860329866409302,
      "learning_rate": 0.0001,
      "loss": 1.2333,
      "step": 43
    },
    {
      "epoch": 0.00704,
      "grad_norm": 0.8776577115058899,
      "learning_rate": 0.0001,
      "loss": 1.1816,
      "step": 44
    },
    {
      "epoch": 0.0072,
      "grad_norm": 0.9778422117233276,
      "learning_rate": 0.0001,
      "loss": 1.1006,
      "step": 45
    },
    {
      "epoch": 0.00736,
      "grad_norm": 0.9108880758285522,
      "learning_rate": 0.0001,
      "loss": 1.1075,
      "step": 46
    },
    {
      "epoch": 0.00752,
      "grad_norm": 1.031628966331482,
      "learning_rate": 0.0001,
      "loss": 1.1093,
      "step": 47
    },
    {
      "epoch": 0.00768,
      "grad_norm": 1.455325961112976,
      "learning_rate": 0.0001,
      "loss": 1.1046,
      "step": 48
    },
    {
      "epoch": 0.00784,
      "grad_norm": 0.9179613590240479,
      "learning_rate": 0.0001,
      "loss": 1.0923,
      "step": 49
    },
    {
      "epoch": 0.008,
      "grad_norm": 1.778253197669983,
      "learning_rate": 0.0001,
      "loss": 1.0364,
      "step": 50
    },
    {
      "epoch": 0.00816,
      "grad_norm": 1.2888579368591309,
      "learning_rate": 0.0001,
      "loss": 1.0164,
      "step": 51
    },
    {
      "epoch": 0.00832,
      "grad_norm": 1.3279385566711426,
      "learning_rate": 0.0001,
      "loss": 0.9379,
      "step": 52
    },
    {
      "epoch": 0.00848,
      "grad_norm": 1.7570205926895142,
      "learning_rate": 0.0001,
      "loss": 1.0125,
      "step": 53
    },
    {
      "epoch": 0.00864,
      "grad_norm": 0.9762410521507263,
      "learning_rate": 0.0001,
      "loss": 0.9492,
      "step": 54
    },
    {
      "epoch": 0.0088,
      "grad_norm": 1.7329506874084473,
      "learning_rate": 0.0001,
      "loss": 0.9128,
      "step": 55
    },
    {
      "epoch": 0.00896,
      "grad_norm": 1.6175631284713745,
      "learning_rate": 0.0001,
      "loss": 0.9056,
      "step": 56
    },
    {
      "epoch": 0.00912,
      "grad_norm": 1.3171113729476929,
      "learning_rate": 0.0001,
      "loss": 0.912,
      "step": 57
    },
    {
      "epoch": 0.00928,
      "grad_norm": 2.0482373237609863,
      "learning_rate": 0.0001,
      "loss": 0.9274,
      "step": 58
    },
    {
      "epoch": 0.00944,
      "grad_norm": 1.6833858489990234,
      "learning_rate": 0.0001,
      "loss": 0.8778,
      "step": 59
    },
    {
      "epoch": 0.0096,
      "grad_norm": 2.5913352966308594,
      "learning_rate": 0.0001,
      "loss": 0.9276,
      "step": 60
    },
    {
      "epoch": 0.00976,
      "grad_norm": 1.2974921464920044,
      "learning_rate": 0.0001,
      "loss": 0.8586,
      "step": 61
    },
    {
      "epoch": 0.00992,
      "grad_norm": 2.942783832550049,
      "learning_rate": 0.0001,
      "loss": 0.8918,
      "step": 62
    },
    {
      "epoch": 0.01008,
      "grad_norm": 2.3180718421936035,
      "learning_rate": 0.0001,
      "loss": 0.9002,
      "step": 63
    },
    {
      "epoch": 0.01024,
      "grad_norm": 2.254426956176758,
      "learning_rate": 0.0001,
      "loss": 0.8358,
      "step": 64
    },
    {
      "epoch": 0.0104,
      "grad_norm": 2.508162498474121,
      "learning_rate": 0.0001,
      "loss": 0.8418,
      "step": 65
    },
    {
      "epoch": 0.01056,
      "grad_norm": 2.2751739025115967,
      "learning_rate": 0.0001,
      "loss": 0.8474,
      "step": 66
    },
    {
      "epoch": 0.01072,
      "grad_norm": 1.4414390325546265,
      "learning_rate": 0.0001,
      "loss": 0.8043,
      "step": 67
    },
    {
      "epoch": 0.01088,
      "grad_norm": 1.5287507772445679,
      "learning_rate": 0.0001,
      "loss": 0.8365,
      "step": 68
    },
    {
      "epoch": 0.01104,
      "grad_norm": 0.9758228659629822,
      "learning_rate": 0.0001,
      "loss": 0.7858,
      "step": 69
    },
    {
      "epoch": 0.0112,
      "grad_norm": 2.078364610671997,
      "learning_rate": 0.0001,
      "loss": 0.8512,
      "step": 70
    },
    {
      "epoch": 0.01136,
      "grad_norm": 2.050605535507202,
      "learning_rate": 0.0001,
      "loss": 0.8335,
      "step": 71
    },
    {
      "epoch": 0.01152,
      "grad_norm": 1.4331215620040894,
      "learning_rate": 0.0001,
      "loss": 0.7704,
      "step": 72
    },
    {
      "epoch": 0.01168,
      "grad_norm": 1.215245246887207,
      "learning_rate": 0.0001,
      "loss": 0.7907,
      "step": 73
    },
    {
      "epoch": 0.01184,
      "grad_norm": 1.1511173248291016,
      "learning_rate": 0.0001,
      "loss": 0.7584,
      "step": 74
    },
    {
      "epoch": 0.012,
      "grad_norm": 0.8133059144020081,
      "learning_rate": 0.0001,
      "loss": 0.7616,
      "step": 75
    },
    {
      "epoch": 0.01216,
      "grad_norm": 1.3076802492141724,
      "learning_rate": 0.0001,
      "loss": 0.7818,
      "step": 76
    },
    {
      "epoch": 0.01232,
      "grad_norm": 0.8781341314315796,
      "learning_rate": 0.0001,
      "loss": 0.7752,
      "step": 77
    },
    {
      "epoch": 0.01248,
      "grad_norm": 1.229236364364624,
      "learning_rate": 0.0001,
      "loss": 0.7424,
      "step": 78
    },
    {
      "epoch": 0.01264,
      "grad_norm": 1.8056045770645142,
      "learning_rate": 0.0001,
      "loss": 0.7666,
      "step": 79
    },
    {
      "epoch": 0.0128,
      "grad_norm": 0.7361927032470703,
      "learning_rate": 0.0001,
      "loss": 0.7163,
      "step": 80
    },
    {
      "epoch": 0.01296,
      "grad_norm": 1.857187032699585,
      "learning_rate": 0.0001,
      "loss": 0.719,
      "step": 81
    },
    {
      "epoch": 0.01312,
      "grad_norm": 1.2613064050674438,
      "learning_rate": 0.0001,
      "loss": 0.7555,
      "step": 82
    },
    {
      "epoch": 0.01328,
      "grad_norm": 2.080594778060913,
      "learning_rate": 0.0001,
      "loss": 0.7487,
      "step": 83
    },
    {
      "epoch": 0.01344,
      "grad_norm": 0.7482504844665527,
      "learning_rate": 0.0001,
      "loss": 0.7509,
      "step": 84
    },
    {
      "epoch": 0.0136,
      "grad_norm": 1.42410409450531,
      "learning_rate": 0.0001,
      "loss": 0.7071,
      "step": 85
    },
    {
      "epoch": 0.01376,
      "grad_norm": 0.6582646369934082,
      "learning_rate": 0.0001,
      "loss": 0.6994,
      "step": 86
    },
    {
      "epoch": 0.01392,
      "grad_norm": 1.510961651802063,
      "learning_rate": 0.0001,
      "loss": 0.7273,
      "step": 87
    },
    {
      "epoch": 0.01408,
      "grad_norm": 0.8712301254272461,
      "learning_rate": 0.0001,
      "loss": 0.6734,
      "step": 88
    },
    {
      "epoch": 0.01424,
      "grad_norm": 1.0457253456115723,
      "learning_rate": 0.0001,
      "loss": 0.7105,
      "step": 89
    },
    {
      "epoch": 0.0144,
      "grad_norm": 0.7951797842979431,
      "learning_rate": 0.0001,
      "loss": 0.696,
      "step": 90
    },
    {
      "epoch": 0.01456,
      "grad_norm": 1.0288506746292114,
      "learning_rate": 0.0001,
      "loss": 0.7306,
      "step": 91
    },
    {
      "epoch": 0.01472,
      "grad_norm": 0.8065840005874634,
      "learning_rate": 0.0001,
      "loss": 0.703,
      "step": 92
    },
    {
      "epoch": 0.01488,
      "grad_norm": 0.7804507613182068,
      "learning_rate": 0.0001,
      "loss": 0.6995,
      "step": 93
    },
    {
      "epoch": 0.01504,
      "grad_norm": 0.9631140232086182,
      "learning_rate": 0.0001,
      "loss": 0.6967,
      "step": 94
    },
    {
      "epoch": 0.0152,
      "grad_norm": 0.6027512550354004,
      "learning_rate": 0.0001,
      "loss": 0.6854,
      "step": 95
    },
    {
      "epoch": 0.01536,
      "grad_norm": 0.9171997904777527,
      "learning_rate": 0.0001,
      "loss": 0.691,
      "step": 96
    },
    {
      "epoch": 0.01552,
      "grad_norm": 0.8984583616256714,
      "learning_rate": 0.0001,
      "loss": 0.7026,
      "step": 97
    },
    {
      "epoch": 0.01568,
      "grad_norm": 0.7166235446929932,
      "learning_rate": 0.0001,
      "loss": 0.6757,
      "step": 98
    },
    {
      "epoch": 0.01584,
      "grad_norm": 0.9572327733039856,
      "learning_rate": 0.0001,
      "loss": 0.6729,
      "step": 99
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.9919624924659729,
      "learning_rate": 0.0001,
      "loss": 0.677,
      "step": 100
    },
    {
      "epoch": 0.016,
      "eval_train_accuracy": 0.492,
      "eval_train_loss": 0.659761905670166,
      "eval_train_runtime": 4.5011,
      "eval_train_samples_per_second": 1110.829,
      "eval_train_steps_per_second": 13.996,
      "step": 100
    },
    {
      "epoch": 0.016,
      "eval_test_accuracy": 0.5006,
      "eval_test_loss": 0.657030463218689,
      "eval_test_runtime": 4.5941,
      "eval_test_samples_per_second": 1088.361,
      "eval_test_steps_per_second": 13.713,
      "step": 100
    },
    {
      "epoch": 0.01616,
      "grad_norm": 0.6352452039718628,
      "learning_rate": 0.0001,
      "loss": 0.6506,
      "step": 101
    },
    {
      "epoch": 0.01632,
      "grad_norm": 1.8396220207214355,
      "learning_rate": 0.0001,
      "loss": 0.6993,
      "step": 102
    },
    {
      "epoch": 0.01648,
      "grad_norm": 0.7576035261154175,
      "learning_rate": 0.0001,
      "loss": 0.6716,
      "step": 103
    },
    {
      "epoch": 0.01664,
      "grad_norm": 0.8698413372039795,
      "learning_rate": 0.0001,
      "loss": 0.6732,
      "step": 104
    },
    {
      "epoch": 0.0168,
      "grad_norm": 0.8577501177787781,
      "learning_rate": 0.0001,
      "loss": 0.6649,
      "step": 105
    },
    {
      "epoch": 0.01696,
      "grad_norm": 0.7041000127792358,
      "learning_rate": 0.0001,
      "loss": 0.6573,
      "step": 106
    },
    {
      "epoch": 0.01712,
      "grad_norm": 0.7019261121749878,
      "learning_rate": 0.0001,
      "loss": 0.6631,
      "step": 107
    },
    {
      "epoch": 0.01728,
      "grad_norm": 0.7171713709831238,
      "learning_rate": 0.0001,
      "loss": 0.6628,
      "step": 108
    },
    {
      "epoch": 0.01744,
      "grad_norm": 0.7027986645698547,
      "learning_rate": 0.0001,
      "loss": 0.6721,
      "step": 109
    },
    {
      "epoch": 0.0176,
      "grad_norm": 0.9410023093223572,
      "learning_rate": 0.0001,
      "loss": 0.6577,
      "step": 110
    },
    {
      "epoch": 0.01776,
      "grad_norm": 1.2989214658737183,
      "learning_rate": 0.0001,
      "loss": 0.6753,
      "step": 111
    },
    {
      "epoch": 0.01792,
      "grad_norm": 1.181657075881958,
      "learning_rate": 0.0001,
      "loss": 0.6563,
      "step": 112
    },
    {
      "epoch": 0.01808,
      "grad_norm": 1.1706501245498657,
      "learning_rate": 0.0001,
      "loss": 0.6472,
      "step": 113
    },
    {
      "epoch": 0.01824,
      "grad_norm": 0.5766515731811523,
      "learning_rate": 0.0001,
      "loss": 0.6032,
      "step": 114
    },
    {
      "epoch": 0.0184,
      "grad_norm": 1.0594301223754883,
      "learning_rate": 0.0001,
      "loss": 0.6586,
      "step": 115
    },
    {
      "epoch": 0.01856,
      "grad_norm": 0.868653416633606,
      "learning_rate": 0.0001,
      "loss": 0.6323,
      "step": 116
    },
    {
      "epoch": 0.01872,
      "grad_norm": 0.7323935627937317,
      "learning_rate": 0.0001,
      "loss": 0.6617,
      "step": 117
    },
    {
      "epoch": 0.01888,
      "grad_norm": 1.3119251728057861,
      "learning_rate": 0.0001,
      "loss": 0.6413,
      "step": 118
    },
    {
      "epoch": 0.01904,
      "grad_norm": 0.6519016027450562,
      "learning_rate": 0.0001,
      "loss": 0.6336,
      "step": 119
    },
    {
      "epoch": 0.0192,
      "grad_norm": 0.8117568492889404,
      "learning_rate": 0.0001,
      "loss": 0.6371,
      "step": 120
    },
    {
      "epoch": 0.01936,
      "grad_norm": 0.8189674019813538,
      "learning_rate": 0.0001,
      "loss": 0.6022,
      "step": 121
    },
    {
      "epoch": 0.01952,
      "grad_norm": 0.9155477285385132,
      "learning_rate": 0.0001,
      "loss": 0.6581,
      "step": 122
    },
    {
      "epoch": 0.01968,
      "grad_norm": 1.3931384086608887,
      "learning_rate": 0.0001,
      "loss": 0.6268,
      "step": 123
    },
    {
      "epoch": 0.01984,
      "grad_norm": 0.6608943939208984,
      "learning_rate": 0.0001,
      "loss": 0.6358,
      "step": 124
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.1930882930755615,
      "learning_rate": 0.0001,
      "loss": 0.6352,
      "step": 125
    },
    {
      "epoch": 0.02016,
      "grad_norm": 0.5051792860031128,
      "learning_rate": 0.0001,
      "loss": 0.6364,
      "step": 126
    },
    {
      "epoch": 0.02032,
      "grad_norm": 0.9110549092292786,
      "learning_rate": 0.0001,
      "loss": 0.6304,
      "step": 127
    },
    {
      "epoch": 0.02048,
      "grad_norm": 0.47985485196113586,
      "learning_rate": 0.0001,
      "loss": 0.6354,
      "step": 128
    },
    {
      "epoch": 0.02064,
      "grad_norm": 1.0370982885360718,
      "learning_rate": 0.0001,
      "loss": 0.6324,
      "step": 129
    },
    {
      "epoch": 0.0208,
      "grad_norm": 0.7392075061798096,
      "learning_rate": 0.0001,
      "loss": 0.6294,
      "step": 130
    },
    {
      "epoch": 0.02096,
      "grad_norm": 0.7855930924415588,
      "learning_rate": 0.0001,
      "loss": 0.6163,
      "step": 131
    },
    {
      "epoch": 0.02112,
      "grad_norm": 1.0568698644638062,
      "learning_rate": 0.0001,
      "loss": 0.6206,
      "step": 132
    },
    {
      "epoch": 0.02128,
      "grad_norm": 0.4767407774925232,
      "learning_rate": 0.0001,
      "loss": 0.6463,
      "step": 133
    },
    {
      "epoch": 0.02144,
      "grad_norm": 0.8539654612541199,
      "learning_rate": 0.0001,
      "loss": 0.6075,
      "step": 134
    },
    {
      "epoch": 0.0216,
      "grad_norm": 0.8813937306404114,
      "learning_rate": 0.0001,
      "loss": 0.6449,
      "step": 135
    },
    {
      "epoch": 0.02176,
      "grad_norm": 1.2799016237258911,
      "learning_rate": 0.0001,
      "loss": 0.6063,
      "step": 136
    },
    {
      "epoch": 0.02192,
      "grad_norm": 0.5358788967132568,
      "learning_rate": 0.0001,
      "loss": 0.6224,
      "step": 137
    },
    {
      "epoch": 0.02208,
      "grad_norm": 0.9495546221733093,
      "learning_rate": 0.0001,
      "loss": 0.626,
      "step": 138
    },
    {
      "epoch": 0.02224,
      "grad_norm": 0.5213519334793091,
      "learning_rate": 0.0001,
      "loss": 0.5987,
      "step": 139
    },
    {
      "epoch": 0.0224,
      "grad_norm": 0.6972806453704834,
      "learning_rate": 0.0001,
      "loss": 0.6147,
      "step": 140
    },
    {
      "epoch": 0.02256,
      "grad_norm": 0.6652224063873291,
      "learning_rate": 0.0001,
      "loss": 0.6248,
      "step": 141
    },
    {
      "epoch": 0.02272,
      "grad_norm": 0.5580384135246277,
      "learning_rate": 0.0001,
      "loss": 0.6019,
      "step": 142
    },
    {
      "epoch": 0.02288,
      "grad_norm": 0.7884476780891418,
      "learning_rate": 0.0001,
      "loss": 0.6124,
      "step": 143
    },
    {
      "epoch": 0.02304,
      "grad_norm": 0.41915982961654663,
      "learning_rate": 0.0001,
      "loss": 0.6175,
      "step": 144
    },
    {
      "epoch": 0.0232,
      "grad_norm": 0.9808709025382996,
      "learning_rate": 0.0001,
      "loss": 0.6443,
      "step": 145
    },
    {
      "epoch": 0.02336,
      "grad_norm": 0.6034143567085266,
      "learning_rate": 0.0001,
      "loss": 0.6118,
      "step": 146
    },
    {
      "epoch": 0.02352,
      "grad_norm": 0.5453589558601379,
      "learning_rate": 0.0001,
      "loss": 0.5669,
      "step": 147
    },
    {
      "epoch": 0.02368,
      "grad_norm": 0.4634958803653717,
      "learning_rate": 0.0001,
      "loss": 0.5971,
      "step": 148
    },
    {
      "epoch": 0.02384,
      "grad_norm": 0.6407926082611084,
      "learning_rate": 0.0001,
      "loss": 0.608,
      "step": 149
    },
    {
      "epoch": 0.024,
      "grad_norm": 0.5494604706764221,
      "learning_rate": 0.0001,
      "loss": 0.5823,
      "step": 150
    },
    {
      "epoch": 0.02416,
      "grad_norm": 0.5147597193717957,
      "learning_rate": 0.0001,
      "loss": 0.6048,
      "step": 151
    },
    {
      "epoch": 0.02432,
      "grad_norm": 0.4736330211162567,
      "learning_rate": 0.0001,
      "loss": 0.5899,
      "step": 152
    },
    {
      "epoch": 0.02448,
      "grad_norm": 0.45500102639198303,
      "learning_rate": 0.0001,
      "loss": 0.5902,
      "step": 153
    },
    {
      "epoch": 0.02464,
      "grad_norm": 0.43775123357772827,
      "learning_rate": 0.0001,
      "loss": 0.5876,
      "step": 154
    },
    {
      "epoch": 0.0248,
      "grad_norm": 0.46121856570243835,
      "learning_rate": 0.0001,
      "loss": 0.5912,
      "step": 155
    },
    {
      "epoch": 0.02496,
      "grad_norm": 0.4929647743701935,
      "learning_rate": 0.0001,
      "loss": 0.5956,
      "step": 156
    },
    {
      "epoch": 0.02512,
      "grad_norm": 0.6370607614517212,
      "learning_rate": 0.0001,
      "loss": 0.5883,
      "step": 157
    },
    {
      "epoch": 0.02528,
      "grad_norm": 0.4546813368797302,
      "learning_rate": 0.0001,
      "loss": 0.5935,
      "step": 158
    },
    {
      "epoch": 0.02544,
      "grad_norm": 0.6525943279266357,
      "learning_rate": 0.0001,
      "loss": 0.5465,
      "step": 159
    },
    {
      "epoch": 0.0256,
      "grad_norm": 0.6564240455627441,
      "learning_rate": 0.0001,
      "loss": 0.5727,
      "step": 160
    },
    {
      "epoch": 0.02576,
      "grad_norm": 0.4169841706752777,
      "learning_rate": 0.0001,
      "loss": 0.5758,
      "step": 161
    },
    {
      "epoch": 0.02592,
      "grad_norm": 0.6781461238861084,
      "learning_rate": 0.0001,
      "loss": 0.5968,
      "step": 162
    },
    {
      "epoch": 0.02608,
      "grad_norm": 0.5149124264717102,
      "learning_rate": 0.0001,
      "loss": 0.578,
      "step": 163
    },
    {
      "epoch": 0.02624,
      "grad_norm": 0.4813004434108734,
      "learning_rate": 0.0001,
      "loss": 0.5773,
      "step": 164
    },
    {
      "epoch": 0.0264,
      "grad_norm": 0.7043523192405701,
      "learning_rate": 0.0001,
      "loss": 0.5555,
      "step": 165
    },
    {
      "epoch": 0.02656,
      "grad_norm": 0.6648368835449219,
      "learning_rate": 0.0001,
      "loss": 0.5829,
      "step": 166
    },
    {
      "epoch": 0.02672,
      "grad_norm": 0.43357375264167786,
      "learning_rate": 0.0001,
      "loss": 0.5444,
      "step": 167
    },
    {
      "epoch": 0.02688,
      "grad_norm": 0.6134164929389954,
      "learning_rate": 0.0001,
      "loss": 0.5786,
      "step": 168
    },
    {
      "epoch": 0.02704,
      "grad_norm": 0.46206340193748474,
      "learning_rate": 0.0001,
      "loss": 0.5713,
      "step": 169
    },
    {
      "epoch": 0.0272,
      "grad_norm": 0.4207701086997986,
      "learning_rate": 0.0001,
      "loss": 0.6005,
      "step": 170
    },
    {
      "epoch": 0.02736,
      "grad_norm": 0.5122266411781311,
      "learning_rate": 0.0001,
      "loss": 0.5526,
      "step": 171
    },
    {
      "epoch": 0.02752,
      "grad_norm": 0.5077720284461975,
      "learning_rate": 0.0001,
      "loss": 0.578,
      "step": 172
    },
    {
      "epoch": 0.02768,
      "grad_norm": 0.4792933762073517,
      "learning_rate": 0.0001,
      "loss": 0.5823,
      "step": 173
    },
    {
      "epoch": 0.02784,
      "grad_norm": 0.6402912735939026,
      "learning_rate": 0.0001,
      "loss": 0.5729,
      "step": 174
    },
    {
      "epoch": 0.028,
      "grad_norm": 1.2318906784057617,
      "learning_rate": 0.0001,
      "loss": 0.5883,
      "step": 175
    },
    {
      "epoch": 0.02816,
      "grad_norm": 0.7542862892150879,
      "learning_rate": 0.0001,
      "loss": 0.5688,
      "step": 176
    },
    {
      "epoch": 0.02832,
      "grad_norm": 0.5659726858139038,
      "learning_rate": 0.0001,
      "loss": 0.5537,
      "step": 177
    },
    {
      "epoch": 0.02848,
      "grad_norm": 1.8289365768432617,
      "learning_rate": 0.0001,
      "loss": 0.5791,
      "step": 178
    },
    {
      "epoch": 0.02864,
      "grad_norm": 0.33832868933677673,
      "learning_rate": 0.0001,
      "loss": 0.5497,
      "step": 179
    },
    {
      "epoch": 0.0288,
      "grad_norm": 0.6187235116958618,
      "learning_rate": 0.0001,
      "loss": 0.5737,
      "step": 180
    },
    {
      "epoch": 0.02896,
      "grad_norm": 0.3687836527824402,
      "learning_rate": 0.0001,
      "loss": 0.5586,
      "step": 181
    },
    {
      "epoch": 0.02912,
      "grad_norm": 0.712786078453064,
      "learning_rate": 0.0001,
      "loss": 0.5552,
      "step": 182
    },
    {
      "epoch": 0.02928,
      "grad_norm": 0.5396907925605774,
      "learning_rate": 0.0001,
      "loss": 0.5529,
      "step": 183
    },
    {
      "epoch": 0.02944,
      "grad_norm": 0.49168047308921814,
      "learning_rate": 0.0001,
      "loss": 0.5646,
      "step": 184
    },
    {
      "epoch": 0.0296,
      "grad_norm": 0.5053112506866455,
      "learning_rate": 0.0001,
      "loss": 0.5559,
      "step": 185
    },
    {
      "epoch": 0.02976,
      "grad_norm": 0.5016252994537354,
      "learning_rate": 0.0001,
      "loss": 0.5532,
      "step": 186
    },
    {
      "epoch": 0.02992,
      "grad_norm": 0.4382171332836151,
      "learning_rate": 0.0001,
      "loss": 0.5541,
      "step": 187
    },
    {
      "epoch": 0.03008,
      "grad_norm": 0.5168175101280212,
      "learning_rate": 0.0001,
      "loss": 0.5826,
      "step": 188
    },
    {
      "epoch": 0.03024,
      "grad_norm": 0.3593970239162445,
      "learning_rate": 0.0001,
      "loss": 0.5686,
      "step": 189
    },
    {
      "epoch": 0.0304,
      "grad_norm": 0.38974612951278687,
      "learning_rate": 0.0001,
      "loss": 0.543,
      "step": 190
    },
    {
      "epoch": 0.03056,
      "grad_norm": 0.38404202461242676,
      "learning_rate": 0.0001,
      "loss": 0.5314,
      "step": 191
    },
    {
      "epoch": 0.03072,
      "grad_norm": 0.46663182973861694,
      "learning_rate": 0.0001,
      "loss": 0.5298,
      "step": 192
    },
    {
      "epoch": 0.03088,
      "grad_norm": 0.4511803090572357,
      "learning_rate": 0.0001,
      "loss": 0.5438,
      "step": 193
    },
    {
      "epoch": 0.03104,
      "grad_norm": 0.44206342101097107,
      "learning_rate": 0.0001,
      "loss": 0.5291,
      "step": 194
    },
    {
      "epoch": 0.0312,
      "grad_norm": 0.5212641358375549,
      "learning_rate": 0.0001,
      "loss": 0.569,
      "step": 195
    },
    {
      "epoch": 0.03136,
      "grad_norm": 0.4077339172363281,
      "learning_rate": 0.0001,
      "loss": 0.5473,
      "step": 196
    },
    {
      "epoch": 0.03152,
      "grad_norm": 0.4238377511501312,
      "learning_rate": 0.0001,
      "loss": 0.55,
      "step": 197
    },
    {
      "epoch": 0.03168,
      "grad_norm": 0.5286641716957092,
      "learning_rate": 0.0001,
      "loss": 0.5449,
      "step": 198
    },
    {
      "epoch": 0.03184,
      "grad_norm": 0.34756311774253845,
      "learning_rate": 0.0001,
      "loss": 0.5453,
      "step": 199
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.43276897072792053,
      "learning_rate": 0.0001,
      "loss": 0.5419,
      "step": 200
    },
    {
      "epoch": 0.032,
      "eval_train_accuracy": 0.4994,
      "eval_train_loss": 0.5302751660346985,
      "eval_train_runtime": 4.4765,
      "eval_train_samples_per_second": 1116.934,
      "eval_train_steps_per_second": 14.073,
      "step": 200
    },
    {
      "epoch": 0.032,
      "eval_test_accuracy": 0.5032,
      "eval_test_loss": 0.5280587673187256,
      "eval_test_runtime": 4.5597,
      "eval_test_samples_per_second": 1096.57,
      "eval_test_steps_per_second": 13.817,
      "step": 200
    },
    {
      "epoch": 0.03216,
      "grad_norm": 0.39006882905960083,
      "learning_rate": 0.0001,
      "loss": 0.5071,
      "step": 201
    },
    {
      "epoch": 0.03232,
      "grad_norm": 0.451017826795578,
      "learning_rate": 0.0001,
      "loss": 0.5454,
      "step": 202
    },
    {
      "epoch": 0.03248,
      "grad_norm": 0.3380140960216522,
      "learning_rate": 0.0001,
      "loss": 0.5271,
      "step": 203
    },
    {
      "epoch": 0.03264,
      "grad_norm": 0.4852125346660614,
      "learning_rate": 0.0001,
      "loss": 0.5782,
      "step": 204
    },
    {
      "epoch": 0.0328,
      "grad_norm": 0.36828136444091797,
      "learning_rate": 0.0001,
      "loss": 0.5312,
      "step": 205
    },
    {
      "epoch": 0.03296,
      "grad_norm": 0.39801540970802307,
      "learning_rate": 0.0001,
      "loss": 0.526,
      "step": 206
    },
    {
      "epoch": 0.03312,
      "grad_norm": 0.4463099539279938,
      "learning_rate": 0.0001,
      "loss": 0.5608,
      "step": 207
    },
    {
      "epoch": 0.03328,
      "grad_norm": 0.3352719247341156,
      "learning_rate": 0.0001,
      "loss": 0.5279,
      "step": 208
    },
    {
      "epoch": 0.03344,
      "grad_norm": 0.41555145382881165,
      "learning_rate": 0.0001,
      "loss": 0.5307,
      "step": 209
    },
    {
      "epoch": 0.0336,
      "grad_norm": 0.3737277090549469,
      "learning_rate": 0.0001,
      "loss": 0.5247,
      "step": 210
    },
    {
      "epoch": 0.03376,
      "grad_norm": 0.5082979202270508,
      "learning_rate": 0.0001,
      "loss": 0.5344,
      "step": 211
    },
    {
      "epoch": 0.03392,
      "grad_norm": 0.4219430983066559,
      "learning_rate": 0.0001,
      "loss": 0.5361,
      "step": 212
    },
    {
      "epoch": 0.03408,
      "grad_norm": 0.4409572184085846,
      "learning_rate": 0.0001,
      "loss": 0.4805,
      "step": 213
    },
    {
      "epoch": 0.03424,
      "grad_norm": 0.4411924481391907,
      "learning_rate": 0.0001,
      "loss": 0.5395,
      "step": 214
    },
    {
      "epoch": 0.0344,
      "grad_norm": 0.43601253628730774,
      "learning_rate": 0.0001,
      "loss": 0.5348,
      "step": 215
    },
    {
      "epoch": 0.03456,
      "grad_norm": 0.5705487728118896,
      "learning_rate": 0.0001,
      "loss": 0.5382,
      "step": 216
    },
    {
      "epoch": 0.03472,
      "grad_norm": 0.4621902108192444,
      "learning_rate": 0.0001,
      "loss": 0.5295,
      "step": 217
    },
    {
      "epoch": 0.03488,
      "grad_norm": 0.47124987840652466,
      "learning_rate": 0.0001,
      "loss": 0.5062,
      "step": 218
    },
    {
      "epoch": 0.03504,
      "grad_norm": 0.5038824081420898,
      "learning_rate": 0.0001,
      "loss": 0.5174,
      "step": 219
    },
    {
      "epoch": 0.0352,
      "grad_norm": 0.42893165349960327,
      "learning_rate": 0.0001,
      "loss": 0.521,
      "step": 220
    },
    {
      "epoch": 0.03536,
      "grad_norm": 0.46081987023353577,
      "learning_rate": 0.0001,
      "loss": 0.5141,
      "step": 221
    },
    {
      "epoch": 0.03552,
      "grad_norm": 0.40484297275543213,
      "learning_rate": 0.0001,
      "loss": 0.5092,
      "step": 222
    },
    {
      "epoch": 0.03568,
      "grad_norm": 0.6141584515571594,
      "learning_rate": 0.0001,
      "loss": 0.5165,
      "step": 223
    },
    {
      "epoch": 0.03584,
      "grad_norm": 0.4234760105609894,
      "learning_rate": 0.0001,
      "loss": 0.5234,
      "step": 224
    },
    {
      "epoch": 0.036,
      "grad_norm": 0.8658497333526611,
      "learning_rate": 0.0001,
      "loss": 0.5032,
      "step": 225
    },
    {
      "epoch": 0.03616,
      "grad_norm": 0.5159129500389099,
      "learning_rate": 0.0001,
      "loss": 0.5321,
      "step": 226
    },
    {
      "epoch": 0.03632,
      "grad_norm": 0.6330617666244507,
      "learning_rate": 0.0001,
      "loss": 0.5234,
      "step": 227
    },
    {
      "epoch": 0.03648,
      "grad_norm": 0.5916549563407898,
      "learning_rate": 0.0001,
      "loss": 0.5111,
      "step": 228
    },
    {
      "epoch": 0.03664,
      "grad_norm": 0.4383091926574707,
      "learning_rate": 0.0001,
      "loss": 0.5039,
      "step": 229
    },
    {
      "epoch": 0.0368,
      "grad_norm": 0.5641090869903564,
      "learning_rate": 0.0001,
      "loss": 0.5243,
      "step": 230
    },
    {
      "epoch": 0.03696,
      "grad_norm": 0.377345472574234,
      "learning_rate": 0.0001,
      "loss": 0.5072,
      "step": 231
    },
    {
      "epoch": 0.03712,
      "grad_norm": 0.4786352515220642,
      "learning_rate": 0.0001,
      "loss": 0.5127,
      "step": 232
    },
    {
      "epoch": 0.03728,
      "grad_norm": 0.4703376889228821,
      "learning_rate": 0.0001,
      "loss": 0.5348,
      "step": 233
    },
    {
      "epoch": 0.03744,
      "grad_norm": 0.4025973677635193,
      "learning_rate": 0.0001,
      "loss": 0.5167,
      "step": 234
    },
    {
      "epoch": 0.0376,
      "grad_norm": 0.5061396956443787,
      "learning_rate": 0.0001,
      "loss": 0.4944,
      "step": 235
    },
    {
      "epoch": 0.03776,
      "grad_norm": 0.38022950291633606,
      "learning_rate": 0.0001,
      "loss": 0.4876,
      "step": 236
    },
    {
      "epoch": 0.03792,
      "grad_norm": 0.3865484297275543,
      "learning_rate": 0.0001,
      "loss": 0.4868,
      "step": 237
    },
    {
      "epoch": 0.03808,
      "grad_norm": 0.4807250201702118,
      "learning_rate": 0.0001,
      "loss": 0.4868,
      "step": 238
    },
    {
      "epoch": 0.03824,
      "grad_norm": 0.4414023756980896,
      "learning_rate": 0.0001,
      "loss": 0.5241,
      "step": 239
    },
    {
      "epoch": 0.0384,
      "grad_norm": 0.4205949306488037,
      "learning_rate": 0.0001,
      "loss": 0.5296,
      "step": 240
    },
    {
      "epoch": 0.03856,
      "grad_norm": 0.48418062925338745,
      "learning_rate": 0.0001,
      "loss": 0.5089,
      "step": 241
    },
    {
      "epoch": 0.03872,
      "grad_norm": 0.5170714259147644,
      "learning_rate": 0.0001,
      "loss": 0.4992,
      "step": 242
    },
    {
      "epoch": 0.03888,
      "grad_norm": 0.3944830894470215,
      "learning_rate": 0.0001,
      "loss": 0.5104,
      "step": 243
    },
    {
      "epoch": 0.03904,
      "grad_norm": 0.5142833590507507,
      "learning_rate": 0.0001,
      "loss": 0.4883,
      "step": 244
    },
    {
      "epoch": 0.0392,
      "grad_norm": 0.421736478805542,
      "learning_rate": 0.0001,
      "loss": 0.4983,
      "step": 245
    },
    {
      "epoch": 0.03936,
      "grad_norm": 0.3617413341999054,
      "learning_rate": 0.0001,
      "loss": 0.4943,
      "step": 246
    },
    {
      "epoch": 0.03952,
      "grad_norm": 0.45868027210235596,
      "learning_rate": 0.0001,
      "loss": 0.53,
      "step": 247
    },
    {
      "epoch": 0.03968,
      "grad_norm": 0.3752826452255249,
      "learning_rate": 0.0001,
      "loss": 0.4857,
      "step": 248
    },
    {
      "epoch": 0.03984,
      "grad_norm": 0.4667113721370697,
      "learning_rate": 0.0001,
      "loss": 0.5023,
      "step": 249
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.4036247134208679,
      "learning_rate": 0.0001,
      "loss": 0.5139,
      "step": 250
    },
    {
      "epoch": 0.04016,
      "grad_norm": 0.4269551634788513,
      "learning_rate": 0.0001,
      "loss": 0.479,
      "step": 251
    },
    {
      "epoch": 0.04032,
      "grad_norm": 0.4471624493598938,
      "learning_rate": 0.0001,
      "loss": 0.5066,
      "step": 252
    },
    {
      "epoch": 0.04048,
      "grad_norm": 0.41622793674468994,
      "learning_rate": 0.0001,
      "loss": 0.4941,
      "step": 253
    },
    {
      "epoch": 0.04064,
      "grad_norm": 0.4290021061897278,
      "learning_rate": 0.0001,
      "loss": 0.4909,
      "step": 254
    },
    {
      "epoch": 0.0408,
      "grad_norm": 0.45290419459342957,
      "learning_rate": 0.0001,
      "loss": 0.5024,
      "step": 255
    },
    {
      "epoch": 0.04096,
      "grad_norm": 0.38887321949005127,
      "learning_rate": 0.0001,
      "loss": 0.488,
      "step": 256
    },
    {
      "epoch": 0.04112,
      "grad_norm": 0.4391036331653595,
      "learning_rate": 0.0001,
      "loss": 0.4913,
      "step": 257
    },
    {
      "epoch": 0.04128,
      "grad_norm": 0.3244125247001648,
      "learning_rate": 0.0001,
      "loss": 0.4921,
      "step": 258
    },
    {
      "epoch": 0.04144,
      "grad_norm": 0.5841965675354004,
      "learning_rate": 0.0001,
      "loss": 0.4944,
      "step": 259
    },
    {
      "epoch": 0.0416,
      "grad_norm": 0.8114727139472961,
      "learning_rate": 0.0001,
      "loss": 0.5036,
      "step": 260
    },
    {
      "epoch": 0.04176,
      "grad_norm": 1.1177241802215576,
      "learning_rate": 0.0001,
      "loss": 0.4998,
      "step": 261
    },
    {
      "epoch": 0.04192,
      "grad_norm": 0.6121273040771484,
      "learning_rate": 0.0001,
      "loss": 0.4896,
      "step": 262
    },
    {
      "epoch": 0.04208,
      "grad_norm": 0.8098846673965454,
      "learning_rate": 0.0001,
      "loss": 0.4997,
      "step": 263
    },
    {
      "epoch": 0.04224,
      "grad_norm": 0.8312938809394836,
      "learning_rate": 0.0001,
      "loss": 0.4714,
      "step": 264
    },
    {
      "epoch": 0.0424,
      "grad_norm": 0.5831167101860046,
      "learning_rate": 0.0001,
      "loss": 0.4921,
      "step": 265
    },
    {
      "epoch": 0.04256,
      "grad_norm": 0.8057853579521179,
      "learning_rate": 0.0001,
      "loss": 0.4996,
      "step": 266
    },
    {
      "epoch": 0.04272,
      "grad_norm": 0.41096732020378113,
      "learning_rate": 0.0001,
      "loss": 0.4586,
      "step": 267
    },
    {
      "epoch": 0.04288,
      "grad_norm": 0.5974665284156799,
      "learning_rate": 0.0001,
      "loss": 0.4838,
      "step": 268
    },
    {
      "epoch": 0.04304,
      "grad_norm": 0.49775686860084534,
      "learning_rate": 0.0001,
      "loss": 0.4864,
      "step": 269
    },
    {
      "epoch": 0.0432,
      "grad_norm": 0.5885093212127686,
      "learning_rate": 0.0001,
      "loss": 0.4955,
      "step": 270
    },
    {
      "epoch": 0.04336,
      "grad_norm": 0.43409430980682373,
      "learning_rate": 0.0001,
      "loss": 0.453,
      "step": 271
    },
    {
      "epoch": 0.04352,
      "grad_norm": 0.5909115076065063,
      "learning_rate": 0.0001,
      "loss": 0.4687,
      "step": 272
    },
    {
      "epoch": 0.04368,
      "grad_norm": 0.5279764533042908,
      "learning_rate": 0.0001,
      "loss": 0.4993,
      "step": 273
    },
    {
      "epoch": 0.04384,
      "grad_norm": 0.5829859375953674,
      "learning_rate": 0.0001,
      "loss": 0.4812,
      "step": 274
    },
    {
      "epoch": 0.044,
      "grad_norm": 0.5025613903999329,
      "learning_rate": 0.0001,
      "loss": 0.4795,
      "step": 275
    },
    {
      "epoch": 0.04416,
      "grad_norm": 0.47510185837745667,
      "learning_rate": 0.0001,
      "loss": 0.4738,
      "step": 276
    },
    {
      "epoch": 0.04432,
      "grad_norm": 0.6862363815307617,
      "learning_rate": 0.0001,
      "loss": 0.4917,
      "step": 277
    },
    {
      "epoch": 0.04448,
      "grad_norm": 0.6176837682723999,
      "learning_rate": 0.0001,
      "loss": 0.4825,
      "step": 278
    },
    {
      "epoch": 0.04464,
      "grad_norm": 0.3829985558986664,
      "learning_rate": 0.0001,
      "loss": 0.466,
      "step": 279
    },
    {
      "epoch": 0.0448,
      "grad_norm": 0.5115876197814941,
      "learning_rate": 0.0001,
      "loss": 0.4795,
      "step": 280
    },
    {
      "epoch": 0.04496,
      "grad_norm": 0.5673558712005615,
      "learning_rate": 0.0001,
      "loss": 0.4595,
      "step": 281
    },
    {
      "epoch": 0.04512,
      "grad_norm": 0.34696146845817566,
      "learning_rate": 0.0001,
      "loss": 0.4546,
      "step": 282
    },
    {
      "epoch": 0.04528,
      "grad_norm": 0.8831761479377747,
      "learning_rate": 0.0001,
      "loss": 0.4778,
      "step": 283
    },
    {
      "epoch": 0.04544,
      "grad_norm": 0.9309988617897034,
      "learning_rate": 0.0001,
      "loss": 0.4956,
      "step": 284
    },
    {
      "epoch": 0.0456,
      "grad_norm": 0.9458814859390259,
      "learning_rate": 0.0001,
      "loss": 0.4845,
      "step": 285
    },
    {
      "epoch": 0.04576,
      "grad_norm": 0.5393334031105042,
      "learning_rate": 0.0001,
      "loss": 0.4644,
      "step": 286
    },
    {
      "epoch": 0.04592,
      "grad_norm": 0.9739812612533569,
      "learning_rate": 0.0001,
      "loss": 0.5174,
      "step": 287
    },
    {
      "epoch": 0.04608,
      "grad_norm": 0.46353188157081604,
      "learning_rate": 0.0001,
      "loss": 0.4685,
      "step": 288
    },
    {
      "epoch": 0.04624,
      "grad_norm": 0.5349130630493164,
      "learning_rate": 0.0001,
      "loss": 0.4756,
      "step": 289
    },
    {
      "epoch": 0.0464,
      "grad_norm": 0.6596806645393372,
      "learning_rate": 0.0001,
      "loss": 0.4852,
      "step": 290
    },
    {
      "epoch": 0.04656,
      "grad_norm": 0.3644371032714844,
      "learning_rate": 0.0001,
      "loss": 0.4485,
      "step": 291
    },
    {
      "epoch": 0.04672,
      "grad_norm": 0.8390160202980042,
      "learning_rate": 0.0001,
      "loss": 0.4718,
      "step": 292
    },
    {
      "epoch": 0.04688,
      "grad_norm": 0.5926034450531006,
      "learning_rate": 0.0001,
      "loss": 0.5022,
      "step": 293
    },
    {
      "epoch": 0.04704,
      "grad_norm": 0.511601984500885,
      "learning_rate": 0.0001,
      "loss": 0.4769,
      "step": 294
    },
    {
      "epoch": 0.0472,
      "grad_norm": 0.43532317876815796,
      "learning_rate": 0.0001,
      "loss": 0.4678,
      "step": 295
    },
    {
      "epoch": 0.04736,
      "grad_norm": 0.46975913643836975,
      "learning_rate": 0.0001,
      "loss": 0.4689,
      "step": 296
    },
    {
      "epoch": 0.04752,
      "grad_norm": 0.45361122488975525,
      "learning_rate": 0.0001,
      "loss": 0.4716,
      "step": 297
    },
    {
      "epoch": 0.04768,
      "grad_norm": 0.5125935077667236,
      "learning_rate": 0.0001,
      "loss": 0.464,
      "step": 298
    },
    {
      "epoch": 0.04784,
      "grad_norm": 0.3529023826122284,
      "learning_rate": 0.0001,
      "loss": 0.4521,
      "step": 299
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.4654410481452942,
      "learning_rate": 0.0001,
      "loss": 0.4748,
      "step": 300
    },
    {
      "epoch": 0.048,
      "eval_train_accuracy": 0.4992,
      "eval_train_loss": 0.45201006531715393,
      "eval_train_runtime": 4.6494,
      "eval_train_samples_per_second": 1075.396,
      "eval_train_steps_per_second": 13.55,
      "step": 300
    },
    {
      "epoch": 0.048,
      "eval_test_accuracy": 0.4946,
      "eval_test_loss": 0.4504285156726837,
      "eval_test_runtime": 4.5123,
      "eval_test_samples_per_second": 1108.092,
      "eval_test_steps_per_second": 13.962,
      "step": 300
    },
    {
      "epoch": 0.04816,
      "grad_norm": 0.39608582854270935,
      "learning_rate": 0.0001,
      "loss": 0.4683,
      "step": 301
    },
    {
      "epoch": 0.04832,
      "grad_norm": 0.37864506244659424,
      "learning_rate": 0.0001,
      "loss": 0.4617,
      "step": 302
    },
    {
      "epoch": 0.04848,
      "grad_norm": 0.3966837525367737,
      "learning_rate": 0.0001,
      "loss": 0.4716,
      "step": 303
    },
    {
      "epoch": 0.04864,
      "grad_norm": 0.484833687543869,
      "learning_rate": 0.0001,
      "loss": 0.447,
      "step": 304
    },
    {
      "epoch": 0.0488,
      "grad_norm": 0.4249228239059448,
      "learning_rate": 0.0001,
      "loss": 0.4232,
      "step": 305
    },
    {
      "epoch": 0.04896,
      "grad_norm": 0.4392454922199249,
      "learning_rate": 0.0001,
      "loss": 0.4345,
      "step": 306
    },
    {
      "epoch": 0.04912,
      "grad_norm": 0.4436151683330536,
      "learning_rate": 0.0001,
      "loss": 0.4657,
      "step": 307
    },
    {
      "epoch": 0.04928,
      "grad_norm": 0.39629262685775757,
      "learning_rate": 0.0001,
      "loss": 0.4194,
      "step": 308
    },
    {
      "epoch": 0.04944,
      "grad_norm": 0.41944947838783264,
      "learning_rate": 0.0001,
      "loss": 0.4423,
      "step": 309
    },
    {
      "epoch": 0.0496,
      "grad_norm": 0.3854512870311737,
      "learning_rate": 0.0001,
      "loss": 0.4423,
      "step": 310
    },
    {
      "epoch": 0.04976,
      "grad_norm": 0.3877759277820587,
      "learning_rate": 0.0001,
      "loss": 0.4623,
      "step": 311
    },
    {
      "epoch": 0.04992,
      "grad_norm": 0.43150660395622253,
      "learning_rate": 0.0001,
      "loss": 0.4637,
      "step": 312
    },
    {
      "epoch": 0.05008,
      "grad_norm": 0.4201996624469757,
      "learning_rate": 0.0001,
      "loss": 0.4314,
      "step": 313
    },
    {
      "epoch": 0.05024,
      "grad_norm": 0.4303872585296631,
      "learning_rate": 0.0001,
      "loss": 0.455,
      "step": 314
    },
    {
      "epoch": 0.0504,
      "grad_norm": 0.4381087124347687,
      "learning_rate": 0.0001,
      "loss": 0.4547,
      "step": 315
    },
    {
      "epoch": 0.05056,
      "grad_norm": 0.5735623836517334,
      "learning_rate": 0.0001,
      "loss": 0.4478,
      "step": 316
    },
    {
      "epoch": 0.05072,
      "grad_norm": 0.28142213821411133,
      "learning_rate": 0.0001,
      "loss": 0.4437,
      "step": 317
    },
    {
      "epoch": 0.05088,
      "grad_norm": 0.5816413164138794,
      "learning_rate": 0.0001,
      "loss": 0.456,
      "step": 318
    },
    {
      "epoch": 0.05104,
      "grad_norm": 0.7212444543838501,
      "learning_rate": 0.0001,
      "loss": 0.4497,
      "step": 319
    },
    {
      "epoch": 0.0512,
      "grad_norm": 0.33331599831581116,
      "learning_rate": 0.0001,
      "loss": 0.4343,
      "step": 320
    },
    {
      "epoch": 0.05136,
      "grad_norm": 0.5896322131156921,
      "learning_rate": 0.0001,
      "loss": 0.4425,
      "step": 321
    },
    {
      "epoch": 0.05152,
      "grad_norm": 0.49074244499206543,
      "learning_rate": 0.0001,
      "loss": 0.4487,
      "step": 322
    },
    {
      "epoch": 0.05168,
      "grad_norm": 0.39206865429878235,
      "learning_rate": 0.0001,
      "loss": 0.4498,
      "step": 323
    },
    {
      "epoch": 0.05184,
      "grad_norm": 0.3676196336746216,
      "learning_rate": 0.0001,
      "loss": 0.4329,
      "step": 324
    },
    {
      "epoch": 0.052,
      "grad_norm": 0.36346590518951416,
      "learning_rate": 0.0001,
      "loss": 0.4429,
      "step": 325
    },
    {
      "epoch": 0.05216,
      "grad_norm": 0.313772588968277,
      "learning_rate": 0.0001,
      "loss": 0.4411,
      "step": 326
    },
    {
      "epoch": 0.05232,
      "grad_norm": 0.32552823424339294,
      "learning_rate": 0.0001,
      "loss": 0.4278,
      "step": 327
    },
    {
      "epoch": 0.05248,
      "grad_norm": 0.33760005235671997,
      "learning_rate": 0.0001,
      "loss": 0.4552,
      "step": 328
    },
    {
      "epoch": 0.05264,
      "grad_norm": 0.4326048791408539,
      "learning_rate": 0.0001,
      "loss": 0.4804,
      "step": 329
    },
    {
      "epoch": 0.0528,
      "grad_norm": 0.5904853940010071,
      "learning_rate": 0.0001,
      "loss": 0.4424,
      "step": 330
    },
    {
      "epoch": 0.05296,
      "grad_norm": 0.32799577713012695,
      "learning_rate": 0.0001,
      "loss": 0.4526,
      "step": 331
    },
    {
      "epoch": 0.05312,
      "grad_norm": 0.4347635507583618,
      "learning_rate": 0.0001,
      "loss": 0.4456,
      "step": 332
    },
    {
      "epoch": 0.05328,
      "grad_norm": 0.3390342593193054,
      "learning_rate": 0.0001,
      "loss": 0.4443,
      "step": 333
    },
    {
      "epoch": 0.05344,
      "grad_norm": 0.41235384345054626,
      "learning_rate": 0.0001,
      "loss": 0.4409,
      "step": 334
    },
    {
      "epoch": 0.0536,
      "grad_norm": 0.2686103284358978,
      "learning_rate": 0.0001,
      "loss": 0.4441,
      "step": 335
    },
    {
      "epoch": 0.05376,
      "grad_norm": 0.36405062675476074,
      "learning_rate": 0.0001,
      "loss": 0.4264,
      "step": 336
    },
    {
      "epoch": 0.05392,
      "grad_norm": 0.3664514124393463,
      "learning_rate": 0.0001,
      "loss": 0.4376,
      "step": 337
    },
    {
      "epoch": 0.05408,
      "grad_norm": 0.24981167912483215,
      "learning_rate": 0.0001,
      "loss": 0.4311,
      "step": 338
    },
    {
      "epoch": 0.05424,
      "grad_norm": 0.3342618942260742,
      "learning_rate": 0.0001,
      "loss": 0.4336,
      "step": 339
    },
    {
      "epoch": 0.0544,
      "grad_norm": 0.27844882011413574,
      "learning_rate": 0.0001,
      "loss": 0.407,
      "step": 340
    },
    {
      "epoch": 0.05456,
      "grad_norm": 0.3276371657848358,
      "learning_rate": 0.0001,
      "loss": 0.4388,
      "step": 341
    },
    {
      "epoch": 0.05472,
      "grad_norm": 0.3253667652606964,
      "learning_rate": 0.0001,
      "loss": 0.4023,
      "step": 342
    },
    {
      "epoch": 0.05488,
      "grad_norm": 0.35169485211372375,
      "learning_rate": 0.0001,
      "loss": 0.4419,
      "step": 343
    },
    {
      "epoch": 0.05504,
      "grad_norm": 0.323783814907074,
      "learning_rate": 0.0001,
      "loss": 0.408,
      "step": 344
    },
    {
      "epoch": 0.0552,
      "grad_norm": 0.44585442543029785,
      "learning_rate": 0.0001,
      "loss": 0.4411,
      "step": 345
    },
    {
      "epoch": 0.05536,
      "grad_norm": 0.42144107818603516,
      "learning_rate": 0.0001,
      "loss": 0.4392,
      "step": 346
    },
    {
      "epoch": 0.05552,
      "grad_norm": 0.28269192576408386,
      "learning_rate": 0.0001,
      "loss": 0.4384,
      "step": 347
    },
    {
      "epoch": 0.05568,
      "grad_norm": 0.2902885377407074,
      "learning_rate": 0.0001,
      "loss": 0.4248,
      "step": 348
    },
    {
      "epoch": 0.05584,
      "grad_norm": 0.34484532475471497,
      "learning_rate": 0.0001,
      "loss": 0.4313,
      "step": 349
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.296975702047348,
      "learning_rate": 0.0001,
      "loss": 0.4196,
      "step": 350
    },
    {
      "epoch": 0.05616,
      "grad_norm": 0.3426573574542999,
      "learning_rate": 0.0001,
      "loss": 0.4143,
      "step": 351
    },
    {
      "epoch": 0.05632,
      "grad_norm": 0.3550752103328705,
      "learning_rate": 0.0001,
      "loss": 0.437,
      "step": 352
    },
    {
      "epoch": 0.05648,
      "grad_norm": 0.3979922831058502,
      "learning_rate": 0.0001,
      "loss": 0.429,
      "step": 353
    },
    {
      "epoch": 0.05664,
      "grad_norm": 0.2757110595703125,
      "learning_rate": 0.0001,
      "loss": 0.4439,
      "step": 354
    },
    {
      "epoch": 0.0568,
      "grad_norm": 0.28511959314346313,
      "learning_rate": 0.0001,
      "loss": 0.4623,
      "step": 355
    },
    {
      "epoch": 0.05696,
      "grad_norm": 0.4241056442260742,
      "learning_rate": 0.0001,
      "loss": 0.4388,
      "step": 356
    },
    {
      "epoch": 0.05712,
      "grad_norm": 0.34657251834869385,
      "learning_rate": 0.0001,
      "loss": 0.4212,
      "step": 357
    },
    {
      "epoch": 0.05728,
      "grad_norm": 0.3662041425704956,
      "learning_rate": 0.0001,
      "loss": 0.4298,
      "step": 358
    },
    {
      "epoch": 0.05744,
      "grad_norm": 0.3156830072402954,
      "learning_rate": 0.0001,
      "loss": 0.4342,
      "step": 359
    },
    {
      "epoch": 0.0576,
      "grad_norm": 0.33679160475730896,
      "learning_rate": 0.0001,
      "loss": 0.4332,
      "step": 360
    },
    {
      "epoch": 0.05776,
      "grad_norm": 0.32869285345077515,
      "learning_rate": 0.0001,
      "loss": 0.4526,
      "step": 361
    },
    {
      "epoch": 0.05792,
      "grad_norm": 0.5099242925643921,
      "learning_rate": 0.0001,
      "loss": 0.4314,
      "step": 362
    },
    {
      "epoch": 0.05808,
      "grad_norm": 0.36387762427330017,
      "learning_rate": 0.0001,
      "loss": 0.4298,
      "step": 363
    },
    {
      "epoch": 0.05824,
      "grad_norm": 0.2838766276836395,
      "learning_rate": 0.0001,
      "loss": 0.4331,
      "step": 364
    },
    {
      "epoch": 0.0584,
      "grad_norm": 0.38314253091812134,
      "learning_rate": 0.0001,
      "loss": 0.4447,
      "step": 365
    },
    {
      "epoch": 0.05856,
      "grad_norm": 0.24991796910762787,
      "learning_rate": 0.0001,
      "loss": 0.4296,
      "step": 366
    },
    {
      "epoch": 0.05872,
      "grad_norm": 0.30235755443573,
      "learning_rate": 0.0001,
      "loss": 0.4247,
      "step": 367
    },
    {
      "epoch": 0.05888,
      "grad_norm": 0.29059481620788574,
      "learning_rate": 0.0001,
      "loss": 0.4331,
      "step": 368
    },
    {
      "epoch": 0.05904,
      "grad_norm": 0.2713058292865753,
      "learning_rate": 0.0001,
      "loss": 0.4284,
      "step": 369
    },
    {
      "epoch": 0.0592,
      "grad_norm": 0.2826420068740845,
      "learning_rate": 0.0001,
      "loss": 0.4219,
      "step": 370
    },
    {
      "epoch": 0.05936,
      "grad_norm": 0.28609699010849,
      "learning_rate": 0.0001,
      "loss": 0.4172,
      "step": 371
    },
    {
      "epoch": 0.05952,
      "grad_norm": 0.3107950687408447,
      "learning_rate": 0.0001,
      "loss": 0.4152,
      "step": 372
    },
    {
      "epoch": 0.05968,
      "grad_norm": 0.26041895151138306,
      "learning_rate": 0.0001,
      "loss": 0.4278,
      "step": 373
    },
    {
      "epoch": 0.05984,
      "grad_norm": 0.3465251922607422,
      "learning_rate": 0.0001,
      "loss": 0.4129,
      "step": 374
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.29337278008461,
      "learning_rate": 0.0001,
      "loss": 0.4378,
      "step": 375
    },
    {
      "epoch": 0.06016,
      "grad_norm": 0.2655937075614929,
      "learning_rate": 0.0001,
      "loss": 0.4203,
      "step": 376
    },
    {
      "epoch": 0.06032,
      "grad_norm": 0.24435940384864807,
      "learning_rate": 0.0001,
      "loss": 0.4049,
      "step": 377
    },
    {
      "epoch": 0.06048,
      "grad_norm": 0.31150516867637634,
      "learning_rate": 0.0001,
      "loss": 0.4192,
      "step": 378
    },
    {
      "epoch": 0.06064,
      "grad_norm": 0.2610114812850952,
      "learning_rate": 0.0001,
      "loss": 0.4266,
      "step": 379
    },
    {
      "epoch": 0.0608,
      "grad_norm": 0.357187420129776,
      "learning_rate": 0.0001,
      "loss": 0.4269,
      "step": 380
    },
    {
      "epoch": 0.06096,
      "grad_norm": 0.36019402742385864,
      "learning_rate": 0.0001,
      "loss": 0.4351,
      "step": 381
    },
    {
      "epoch": 0.06112,
      "grad_norm": 0.3273111581802368,
      "learning_rate": 0.0001,
      "loss": 0.441,
      "step": 382
    },
    {
      "epoch": 0.06128,
      "grad_norm": 0.3188781142234802,
      "learning_rate": 0.0001,
      "loss": 0.4295,
      "step": 383
    },
    {
      "epoch": 0.06144,
      "grad_norm": 0.36239463090896606,
      "learning_rate": 0.0001,
      "loss": 0.4276,
      "step": 384
    },
    {
      "epoch": 0.0616,
      "grad_norm": 0.3151351809501648,
      "learning_rate": 0.0001,
      "loss": 0.4162,
      "step": 385
    },
    {
      "epoch": 0.06176,
      "grad_norm": 0.2884153127670288,
      "learning_rate": 0.0001,
      "loss": 0.437,
      "step": 386
    },
    {
      "epoch": 0.06192,
      "grad_norm": 0.48708662390708923,
      "learning_rate": 0.0001,
      "loss": 0.4238,
      "step": 387
    },
    {
      "epoch": 0.06208,
      "grad_norm": 0.25806134939193726,
      "learning_rate": 0.0001,
      "loss": 0.4122,
      "step": 388
    },
    {
      "epoch": 0.06224,
      "grad_norm": 0.3410314619541168,
      "learning_rate": 0.0001,
      "loss": 0.4231,
      "step": 389
    },
    {
      "epoch": 0.0624,
      "grad_norm": 0.3207263946533203,
      "learning_rate": 0.0001,
      "loss": 0.4193,
      "step": 390
    },
    {
      "epoch": 0.06256,
      "grad_norm": 0.36167675256729126,
      "learning_rate": 0.0001,
      "loss": 0.4194,
      "step": 391
    },
    {
      "epoch": 0.06272,
      "grad_norm": 0.38055479526519775,
      "learning_rate": 0.0001,
      "loss": 0.4181,
      "step": 392
    },
    {
      "epoch": 0.06288,
      "grad_norm": 0.3512444496154785,
      "learning_rate": 0.0001,
      "loss": 0.4181,
      "step": 393
    },
    {
      "epoch": 0.06304,
      "grad_norm": 0.5028431415557861,
      "learning_rate": 0.0001,
      "loss": 0.4276,
      "step": 394
    },
    {
      "epoch": 0.0632,
      "grad_norm": 0.4454989433288574,
      "learning_rate": 0.0001,
      "loss": 0.4115,
      "step": 395
    },
    {
      "epoch": 0.06336,
      "grad_norm": 0.2984643280506134,
      "learning_rate": 0.0001,
      "loss": 0.4453,
      "step": 396
    },
    {
      "epoch": 0.06352,
      "grad_norm": 0.37938594818115234,
      "learning_rate": 0.0001,
      "loss": 0.4141,
      "step": 397
    },
    {
      "epoch": 0.06368,
      "grad_norm": 0.37617871165275574,
      "learning_rate": 0.0001,
      "loss": 0.4221,
      "step": 398
    },
    {
      "epoch": 0.06384,
      "grad_norm": 0.3089243471622467,
      "learning_rate": 0.0001,
      "loss": 0.4359,
      "step": 399
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.45868200063705444,
      "learning_rate": 0.0001,
      "loss": 0.4208,
      "step": 400
    },
    {
      "epoch": 0.064,
      "eval_train_accuracy": 0.5026,
      "eval_train_loss": 0.41689184308052063,
      "eval_train_runtime": 4.5997,
      "eval_train_samples_per_second": 1087.023,
      "eval_train_steps_per_second": 13.696,
      "step": 400
    },
    {
      "epoch": 0.064,
      "eval_test_accuracy": 0.5062,
      "eval_test_loss": 0.4150881767272949,
      "eval_test_runtime": 4.2542,
      "eval_test_samples_per_second": 1175.299,
      "eval_test_steps_per_second": 14.809,
      "step": 400
    },
    {
      "epoch": 0.06416,
      "grad_norm": 0.3588743507862091,
      "learning_rate": 0.0001,
      "loss": 0.4336,
      "step": 401
    },
    {
      "epoch": 0.06432,
      "grad_norm": 0.6174024939537048,
      "learning_rate": 0.0001,
      "loss": 0.4441,
      "step": 402
    },
    {
      "epoch": 0.06448,
      "grad_norm": 0.6371780633926392,
      "learning_rate": 0.0001,
      "loss": 0.4222,
      "step": 403
    },
    {
      "epoch": 0.06464,
      "grad_norm": 0.3349703848361969,
      "learning_rate": 0.0001,
      "loss": 0.428,
      "step": 404
    },
    {
      "epoch": 0.0648,
      "grad_norm": 0.3605439066886902,
      "learning_rate": 0.0001,
      "loss": 0.4381,
      "step": 405
    },
    {
      "epoch": 0.06496,
      "grad_norm": 0.46087101101875305,
      "learning_rate": 0.0001,
      "loss": 0.4066,
      "step": 406
    },
    {
      "epoch": 0.06512,
      "grad_norm": 0.32155805826187134,
      "learning_rate": 0.0001,
      "loss": 0.4164,
      "step": 407
    },
    {
      "epoch": 0.06528,
      "grad_norm": 0.4627705216407776,
      "learning_rate": 0.0001,
      "loss": 0.4262,
      "step": 408
    },
    {
      "epoch": 0.06544,
      "grad_norm": 0.33495891094207764,
      "learning_rate": 0.0001,
      "loss": 0.4228,
      "step": 409
    },
    {
      "epoch": 0.0656,
      "grad_norm": 0.3592594265937805,
      "learning_rate": 0.0001,
      "loss": 0.4254,
      "step": 410
    },
    {
      "epoch": 0.06576,
      "grad_norm": 0.3565671145915985,
      "learning_rate": 0.0001,
      "loss": 0.4176,
      "step": 411
    },
    {
      "epoch": 0.06592,
      "grad_norm": 0.2897014021873474,
      "learning_rate": 0.0001,
      "loss": 0.4162,
      "step": 412
    },
    {
      "epoch": 0.06608,
      "grad_norm": 0.39404943585395813,
      "learning_rate": 0.0001,
      "loss": 0.4086,
      "step": 413
    },
    {
      "epoch": 0.06624,
      "grad_norm": 0.49006187915802,
      "learning_rate": 0.0001,
      "loss": 0.3994,
      "step": 414
    },
    {
      "epoch": 0.0664,
      "grad_norm": 0.31414374709129333,
      "learning_rate": 0.0001,
      "loss": 0.409,
      "step": 415
    },
    {
      "epoch": 0.06656,
      "grad_norm": 0.28306445479393005,
      "learning_rate": 0.0001,
      "loss": 0.393,
      "step": 416
    },
    {
      "epoch": 0.06672,
      "grad_norm": 0.4233446419239044,
      "learning_rate": 0.0001,
      "loss": 0.4169,
      "step": 417
    },
    {
      "epoch": 0.06688,
      "grad_norm": 0.4762818217277527,
      "learning_rate": 0.0001,
      "loss": 0.4055,
      "step": 418
    },
    {
      "epoch": 0.06704,
      "grad_norm": 0.6615303158760071,
      "learning_rate": 0.0001,
      "loss": 0.4401,
      "step": 419
    },
    {
      "epoch": 0.0672,
      "grad_norm": 0.5682222843170166,
      "learning_rate": 0.0001,
      "loss": 0.4268,
      "step": 420
    },
    {
      "epoch": 0.06736,
      "grad_norm": 0.3217273950576782,
      "learning_rate": 0.0001,
      "loss": 0.389,
      "step": 421
    },
    {
      "epoch": 0.06752,
      "grad_norm": 0.427307665348053,
      "learning_rate": 0.0001,
      "loss": 0.4072,
      "step": 422
    },
    {
      "epoch": 0.06768,
      "grad_norm": 0.30891624093055725,
      "learning_rate": 0.0001,
      "loss": 0.4152,
      "step": 423
    },
    {
      "epoch": 0.06784,
      "grad_norm": 0.4222661852836609,
      "learning_rate": 0.0001,
      "loss": 0.4305,
      "step": 424
    },
    {
      "epoch": 0.068,
      "grad_norm": 0.3990730047225952,
      "learning_rate": 0.0001,
      "loss": 0.4206,
      "step": 425
    },
    {
      "epoch": 0.06816,
      "grad_norm": 0.5071967840194702,
      "learning_rate": 0.0001,
      "loss": 0.4092,
      "step": 426
    },
    {
      "epoch": 0.06832,
      "grad_norm": 0.27156588435173035,
      "learning_rate": 0.0001,
      "loss": 0.3875,
      "step": 427
    },
    {
      "epoch": 0.06848,
      "grad_norm": 0.7659735083580017,
      "learning_rate": 0.0001,
      "loss": 0.4258,
      "step": 428
    },
    {
      "epoch": 0.06864,
      "grad_norm": 0.4803389310836792,
      "learning_rate": 0.0001,
      "loss": 0.4161,
      "step": 429
    },
    {
      "epoch": 0.0688,
      "grad_norm": 0.35857951641082764,
      "learning_rate": 0.0001,
      "loss": 0.4086,
      "step": 430
    },
    {
      "epoch": 0.06896,
      "grad_norm": 0.37848424911499023,
      "learning_rate": 0.0001,
      "loss": 0.3957,
      "step": 431
    },
    {
      "epoch": 0.06912,
      "grad_norm": 0.3585664629936218,
      "learning_rate": 0.0001,
      "loss": 0.4014,
      "step": 432
    },
    {
      "epoch": 0.06928,
      "grad_norm": 0.3871194124221802,
      "learning_rate": 0.0001,
      "loss": 0.4086,
      "step": 433
    },
    {
      "epoch": 0.06944,
      "grad_norm": 0.4065229296684265,
      "learning_rate": 0.0001,
      "loss": 0.4121,
      "step": 434
    },
    {
      "epoch": 0.0696,
      "grad_norm": 0.4423487186431885,
      "learning_rate": 0.0001,
      "loss": 0.4204,
      "step": 435
    },
    {
      "epoch": 0.06976,
      "grad_norm": 0.36958304047584534,
      "learning_rate": 0.0001,
      "loss": 0.4323,
      "step": 436
    },
    {
      "epoch": 0.06992,
      "grad_norm": 0.2802828550338745,
      "learning_rate": 0.0001,
      "loss": 0.4086,
      "step": 437
    },
    {
      "epoch": 0.07008,
      "grad_norm": 0.33124932646751404,
      "learning_rate": 0.0001,
      "loss": 0.3976,
      "step": 438
    },
    {
      "epoch": 0.07024,
      "grad_norm": 0.37175875902175903,
      "learning_rate": 0.0001,
      "loss": 0.4158,
      "step": 439
    },
    {
      "epoch": 0.0704,
      "grad_norm": 0.3475583791732788,
      "learning_rate": 0.0001,
      "loss": 0.4056,
      "step": 440
    },
    {
      "epoch": 0.07056,
      "grad_norm": 0.3261139988899231,
      "learning_rate": 0.0001,
      "loss": 0.3995,
      "step": 441
    },
    {
      "epoch": 0.07072,
      "grad_norm": 0.4166032373905182,
      "learning_rate": 0.0001,
      "loss": 0.4201,
      "step": 442
    },
    {
      "epoch": 0.07088,
      "grad_norm": 0.3673945665359497,
      "learning_rate": 0.0001,
      "loss": 0.4032,
      "step": 443
    },
    {
      "epoch": 0.07104,
      "grad_norm": 0.4136391580104828,
      "learning_rate": 0.0001,
      "loss": 0.4049,
      "step": 444
    },
    {
      "epoch": 0.0712,
      "grad_norm": 0.31725844740867615,
      "learning_rate": 0.0001,
      "loss": 0.4196,
      "step": 445
    },
    {
      "epoch": 0.07136,
      "grad_norm": 0.5902746915817261,
      "learning_rate": 0.0001,
      "loss": 0.4384,
      "step": 446
    },
    {
      "epoch": 0.07152,
      "grad_norm": 0.41507887840270996,
      "learning_rate": 0.0001,
      "loss": 0.4123,
      "step": 447
    },
    {
      "epoch": 0.07168,
      "grad_norm": 0.389284610748291,
      "learning_rate": 0.0001,
      "loss": 0.3988,
      "step": 448
    },
    {
      "epoch": 0.07184,
      "grad_norm": 0.4878309667110443,
      "learning_rate": 0.0001,
      "loss": 0.4139,
      "step": 449
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.3326282799243927,
      "learning_rate": 0.0001,
      "loss": 0.399,
      "step": 450
    },
    {
      "epoch": 0.07216,
      "grad_norm": 0.3981621563434601,
      "learning_rate": 0.0001,
      "loss": 0.4209,
      "step": 451
    },
    {
      "epoch": 0.07232,
      "grad_norm": 0.3123423159122467,
      "learning_rate": 0.0001,
      "loss": 0.3999,
      "step": 452
    },
    {
      "epoch": 0.07248,
      "grad_norm": 0.5596675276756287,
      "learning_rate": 0.0001,
      "loss": 0.4053,
      "step": 453
    },
    {
      "epoch": 0.07264,
      "grad_norm": 0.633022665977478,
      "learning_rate": 0.0001,
      "loss": 0.4026,
      "step": 454
    },
    {
      "epoch": 0.0728,
      "grad_norm": 0.42577916383743286,
      "learning_rate": 0.0001,
      "loss": 0.4014,
      "step": 455
    },
    {
      "epoch": 0.07296,
      "grad_norm": 0.6897281408309937,
      "learning_rate": 0.0001,
      "loss": 0.4207,
      "step": 456
    },
    {
      "epoch": 0.07312,
      "grad_norm": 0.3153611421585083,
      "learning_rate": 0.0001,
      "loss": 0.401,
      "step": 457
    },
    {
      "epoch": 0.07328,
      "grad_norm": 0.3908311426639557,
      "learning_rate": 0.0001,
      "loss": 0.4154,
      "step": 458
    },
    {
      "epoch": 0.07344,
      "grad_norm": 0.34539148211479187,
      "learning_rate": 0.0001,
      "loss": 0.4297,
      "step": 459
    },
    {
      "epoch": 0.0736,
      "grad_norm": 0.38096678256988525,
      "learning_rate": 0.0001,
      "loss": 0.3995,
      "step": 460
    },
    {
      "epoch": 0.07376,
      "grad_norm": 0.3629778027534485,
      "learning_rate": 0.0001,
      "loss": 0.4175,
      "step": 461
    },
    {
      "epoch": 0.07392,
      "grad_norm": 0.39260977506637573,
      "learning_rate": 0.0001,
      "loss": 0.4191,
      "step": 462
    },
    {
      "epoch": 0.07408,
      "grad_norm": 0.3504531681537628,
      "learning_rate": 0.0001,
      "loss": 0.4313,
      "step": 463
    },
    {
      "epoch": 0.07424,
      "grad_norm": 0.42774689197540283,
      "learning_rate": 0.0001,
      "loss": 0.4112,
      "step": 464
    },
    {
      "epoch": 0.0744,
      "grad_norm": 0.31379154324531555,
      "learning_rate": 0.0001,
      "loss": 0.4155,
      "step": 465
    },
    {
      "epoch": 0.07456,
      "grad_norm": 0.4618278443813324,
      "learning_rate": 0.0001,
      "loss": 0.4074,
      "step": 466
    },
    {
      "epoch": 0.07472,
      "grad_norm": 0.3518960475921631,
      "learning_rate": 0.0001,
      "loss": 0.399,
      "step": 467
    },
    {
      "epoch": 0.07488,
      "grad_norm": 0.449810653924942,
      "learning_rate": 0.0001,
      "loss": 0.401,
      "step": 468
    },
    {
      "epoch": 0.07504,
      "grad_norm": 0.32783520221710205,
      "learning_rate": 0.0001,
      "loss": 0.4,
      "step": 469
    },
    {
      "epoch": 0.0752,
      "grad_norm": 0.2640914022922516,
      "learning_rate": 0.0001,
      "loss": 0.41,
      "step": 470
    },
    {
      "epoch": 0.07536,
      "grad_norm": 0.3746854066848755,
      "learning_rate": 0.0001,
      "loss": 0.3916,
      "step": 471
    },
    {
      "epoch": 0.07552,
      "grad_norm": 0.3039800822734833,
      "learning_rate": 0.0001,
      "loss": 0.4015,
      "step": 472
    },
    {
      "epoch": 0.07568,
      "grad_norm": 0.32904374599456787,
      "learning_rate": 0.0001,
      "loss": 0.4102,
      "step": 473
    },
    {
      "epoch": 0.07584,
      "grad_norm": 0.25924062728881836,
      "learning_rate": 0.0001,
      "loss": 0.4067,
      "step": 474
    },
    {
      "epoch": 0.076,
      "grad_norm": 0.3489011228084564,
      "learning_rate": 0.0001,
      "loss": 0.3992,
      "step": 475
    },
    {
      "epoch": 0.07616,
      "grad_norm": 0.28101029992103577,
      "learning_rate": 0.0001,
      "loss": 0.4073,
      "step": 476
    },
    {
      "epoch": 0.07632,
      "grad_norm": 0.556891679763794,
      "learning_rate": 0.0001,
      "loss": 0.4315,
      "step": 477
    },
    {
      "epoch": 0.07648,
      "grad_norm": 0.43482673168182373,
      "learning_rate": 0.0001,
      "loss": 0.4046,
      "step": 478
    },
    {
      "epoch": 0.07664,
      "grad_norm": 0.33881092071533203,
      "learning_rate": 0.0001,
      "loss": 0.3984,
      "step": 479
    },
    {
      "epoch": 0.0768,
      "grad_norm": 0.3662671446800232,
      "learning_rate": 0.0001,
      "loss": 0.3924,
      "step": 480
    },
    {
      "epoch": 0.07696,
      "grad_norm": 0.31615185737609863,
      "learning_rate": 0.0001,
      "loss": 0.4018,
      "step": 481
    },
    {
      "epoch": 0.07712,
      "grad_norm": 0.3333168923854828,
      "learning_rate": 0.0001,
      "loss": 0.4174,
      "step": 482
    },
    {
      "epoch": 0.07728,
      "grad_norm": 0.4963904321193695,
      "learning_rate": 0.0001,
      "loss": 0.4131,
      "step": 483
    },
    {
      "epoch": 0.07744,
      "grad_norm": 0.2746158242225647,
      "learning_rate": 0.0001,
      "loss": 0.4074,
      "step": 484
    },
    {
      "epoch": 0.0776,
      "grad_norm": 0.3996340334415436,
      "learning_rate": 0.0001,
      "loss": 0.4002,
      "step": 485
    },
    {
      "epoch": 0.07776,
      "grad_norm": 0.36148208379745483,
      "learning_rate": 0.0001,
      "loss": 0.4077,
      "step": 486
    },
    {
      "epoch": 0.07792,
      "grad_norm": 0.284506618976593,
      "learning_rate": 0.0001,
      "loss": 0.4003,
      "step": 487
    },
    {
      "epoch": 0.07808,
      "grad_norm": 0.4031594395637512,
      "learning_rate": 0.0001,
      "loss": 0.3953,
      "step": 488
    },
    {
      "epoch": 0.07824,
      "grad_norm": 0.3097582459449768,
      "learning_rate": 0.0001,
      "loss": 0.4061,
      "step": 489
    },
    {
      "epoch": 0.0784,
      "grad_norm": 0.34305089712142944,
      "learning_rate": 0.0001,
      "loss": 0.4005,
      "step": 490
    },
    {
      "epoch": 0.07856,
      "grad_norm": 0.33289992809295654,
      "learning_rate": 0.0001,
      "loss": 0.4167,
      "step": 491
    },
    {
      "epoch": 0.07872,
      "grad_norm": 0.24289070069789886,
      "learning_rate": 0.0001,
      "loss": 0.3981,
      "step": 492
    },
    {
      "epoch": 0.07888,
      "grad_norm": 0.290189266204834,
      "learning_rate": 0.0001,
      "loss": 0.4003,
      "step": 493
    },
    {
      "epoch": 0.07904,
      "grad_norm": 0.36432743072509766,
      "learning_rate": 0.0001,
      "loss": 0.4051,
      "step": 494
    },
    {
      "epoch": 0.0792,
      "grad_norm": 0.30526259541511536,
      "learning_rate": 0.0001,
      "loss": 0.3988,
      "step": 495
    },
    {
      "epoch": 0.07936,
      "grad_norm": 0.3498801290988922,
      "learning_rate": 0.0001,
      "loss": 0.3909,
      "step": 496
    },
    {
      "epoch": 0.07952,
      "grad_norm": 0.40753743052482605,
      "learning_rate": 0.0001,
      "loss": 0.4025,
      "step": 497
    },
    {
      "epoch": 0.07968,
      "grad_norm": 0.44530734419822693,
      "learning_rate": 0.0001,
      "loss": 0.3879,
      "step": 498
    },
    {
      "epoch": 0.07984,
      "grad_norm": 0.3743400275707245,
      "learning_rate": 0.0001,
      "loss": 0.3867,
      "step": 499
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.3411090672016144,
      "learning_rate": 0.0001,
      "loss": 0.3913,
      "step": 500
    },
    {
      "epoch": 0.08,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.39610645174980164,
      "eval_train_runtime": 4.4833,
      "eval_train_samples_per_second": 1115.249,
      "eval_train_steps_per_second": 14.052,
      "step": 500
    },
    {
      "epoch": 0.08,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3941706717014313,
      "eval_test_runtime": 4.4271,
      "eval_test_samples_per_second": 1129.42,
      "eval_test_steps_per_second": 14.231,
      "step": 500
    },
    {
      "epoch": 0.08016,
      "grad_norm": 0.31933295726776123,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 501
    },
    {
      "epoch": 0.08032,
      "grad_norm": 0.24778825044631958,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 502
    },
    {
      "epoch": 0.08048,
      "grad_norm": 0.3248530328273773,
      "learning_rate": 0.0001,
      "loss": 0.4142,
      "step": 503
    },
    {
      "epoch": 0.08064,
      "grad_norm": 0.40680280327796936,
      "learning_rate": 0.0001,
      "loss": 0.3953,
      "step": 504
    },
    {
      "epoch": 0.0808,
      "grad_norm": 0.3622770309448242,
      "learning_rate": 0.0001,
      "loss": 0.401,
      "step": 505
    },
    {
      "epoch": 0.08096,
      "grad_norm": 0.36521250009536743,
      "learning_rate": 0.0001,
      "loss": 0.4038,
      "step": 506
    },
    {
      "epoch": 0.08112,
      "grad_norm": 0.5655940771102905,
      "learning_rate": 0.0001,
      "loss": 0.4226,
      "step": 507
    },
    {
      "epoch": 0.08128,
      "grad_norm": 0.26198405027389526,
      "learning_rate": 0.0001,
      "loss": 0.3969,
      "step": 508
    },
    {
      "epoch": 0.08144,
      "grad_norm": 0.3553069829940796,
      "learning_rate": 0.0001,
      "loss": 0.4014,
      "step": 509
    },
    {
      "epoch": 0.0816,
      "grad_norm": 0.2919515371322632,
      "learning_rate": 0.0001,
      "loss": 0.3928,
      "step": 510
    },
    {
      "epoch": 0.08176,
      "grad_norm": 0.2754402458667755,
      "learning_rate": 0.0001,
      "loss": 0.3749,
      "step": 511
    },
    {
      "epoch": 0.08192,
      "grad_norm": 0.25546908378601074,
      "learning_rate": 0.0001,
      "loss": 0.3743,
      "step": 512
    },
    {
      "epoch": 0.08208,
      "grad_norm": 0.34833312034606934,
      "learning_rate": 0.0001,
      "loss": 0.4105,
      "step": 513
    },
    {
      "epoch": 0.08224,
      "grad_norm": 0.3219130337238312,
      "learning_rate": 0.0001,
      "loss": 0.4012,
      "step": 514
    },
    {
      "epoch": 0.0824,
      "grad_norm": 0.24857757985591888,
      "learning_rate": 0.0001,
      "loss": 0.3764,
      "step": 515
    },
    {
      "epoch": 0.08256,
      "grad_norm": 0.408819317817688,
      "learning_rate": 0.0001,
      "loss": 0.4068,
      "step": 516
    },
    {
      "epoch": 0.08272,
      "grad_norm": 0.3873292803764343,
      "learning_rate": 0.0001,
      "loss": 0.3889,
      "step": 517
    },
    {
      "epoch": 0.08288,
      "grad_norm": 0.257574200630188,
      "learning_rate": 0.0001,
      "loss": 0.3748,
      "step": 518
    },
    {
      "epoch": 0.08304,
      "grad_norm": 0.37581515312194824,
      "learning_rate": 0.0001,
      "loss": 0.4115,
      "step": 519
    },
    {
      "epoch": 0.0832,
      "grad_norm": 0.29069656133651733,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 520
    },
    {
      "epoch": 0.08336,
      "grad_norm": 0.5106526017189026,
      "learning_rate": 0.0001,
      "loss": 0.3898,
      "step": 521
    },
    {
      "epoch": 0.08352,
      "grad_norm": 0.2974855303764343,
      "learning_rate": 0.0001,
      "loss": 0.3964,
      "step": 522
    },
    {
      "epoch": 0.08368,
      "grad_norm": 0.2723519206047058,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 523
    },
    {
      "epoch": 0.08384,
      "grad_norm": 0.24492108821868896,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 524
    },
    {
      "epoch": 0.084,
      "grad_norm": 0.31639808416366577,
      "learning_rate": 0.0001,
      "loss": 0.3997,
      "step": 525
    },
    {
      "epoch": 0.08416,
      "grad_norm": 0.3364642262458801,
      "learning_rate": 0.0001,
      "loss": 0.3895,
      "step": 526
    },
    {
      "epoch": 0.08432,
      "grad_norm": 0.27398771047592163,
      "learning_rate": 0.0001,
      "loss": 0.3886,
      "step": 527
    },
    {
      "epoch": 0.08448,
      "grad_norm": 0.303357869386673,
      "learning_rate": 0.0001,
      "loss": 0.4031,
      "step": 528
    },
    {
      "epoch": 0.08464,
      "grad_norm": 0.37443453073501587,
      "learning_rate": 0.0001,
      "loss": 0.4064,
      "step": 529
    },
    {
      "epoch": 0.0848,
      "grad_norm": 0.5950832962989807,
      "learning_rate": 0.0001,
      "loss": 0.3974,
      "step": 530
    },
    {
      "epoch": 0.08496,
      "grad_norm": 0.26710957288742065,
      "learning_rate": 0.0001,
      "loss": 0.4025,
      "step": 531
    },
    {
      "epoch": 0.08512,
      "grad_norm": 0.4710867404937744,
      "learning_rate": 0.0001,
      "loss": 0.3967,
      "step": 532
    },
    {
      "epoch": 0.08528,
      "grad_norm": 0.30684003233909607,
      "learning_rate": 0.0001,
      "loss": 0.4016,
      "step": 533
    },
    {
      "epoch": 0.08544,
      "grad_norm": 0.6120350360870361,
      "learning_rate": 0.0001,
      "loss": 0.3897,
      "step": 534
    },
    {
      "epoch": 0.0856,
      "grad_norm": 0.24179904162883759,
      "learning_rate": 0.0001,
      "loss": 0.3798,
      "step": 535
    },
    {
      "epoch": 0.08576,
      "grad_norm": 0.30523112416267395,
      "learning_rate": 0.0001,
      "loss": 0.3996,
      "step": 536
    },
    {
      "epoch": 0.08592,
      "grad_norm": 0.2909696400165558,
      "learning_rate": 0.0001,
      "loss": 0.3975,
      "step": 537
    },
    {
      "epoch": 0.08608,
      "grad_norm": 0.2705025374889374,
      "learning_rate": 0.0001,
      "loss": 0.3922,
      "step": 538
    },
    {
      "epoch": 0.08624,
      "grad_norm": 0.3158877491950989,
      "learning_rate": 0.0001,
      "loss": 0.3935,
      "step": 539
    },
    {
      "epoch": 0.0864,
      "grad_norm": 0.2488931566476822,
      "learning_rate": 0.0001,
      "loss": 0.3925,
      "step": 540
    },
    {
      "epoch": 0.08656,
      "grad_norm": 0.2742263376712799,
      "learning_rate": 0.0001,
      "loss": 0.3856,
      "step": 541
    },
    {
      "epoch": 0.08672,
      "grad_norm": 0.21622316539287567,
      "learning_rate": 0.0001,
      "loss": 0.3707,
      "step": 542
    },
    {
      "epoch": 0.08688,
      "grad_norm": 0.31771907210350037,
      "learning_rate": 0.0001,
      "loss": 0.3887,
      "step": 543
    },
    {
      "epoch": 0.08704,
      "grad_norm": 0.2939183712005615,
      "learning_rate": 0.0001,
      "loss": 0.3933,
      "step": 544
    },
    {
      "epoch": 0.0872,
      "grad_norm": 0.2707746922969818,
      "learning_rate": 0.0001,
      "loss": 0.3942,
      "step": 545
    },
    {
      "epoch": 0.08736,
      "grad_norm": 0.3316631615161896,
      "learning_rate": 0.0001,
      "loss": 0.3988,
      "step": 546
    },
    {
      "epoch": 0.08752,
      "grad_norm": 0.31262943148612976,
      "learning_rate": 0.0001,
      "loss": 0.3814,
      "step": 547
    },
    {
      "epoch": 0.08768,
      "grad_norm": 0.332658052444458,
      "learning_rate": 0.0001,
      "loss": 0.3756,
      "step": 548
    },
    {
      "epoch": 0.08784,
      "grad_norm": 0.3290672302246094,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 549
    },
    {
      "epoch": 0.088,
      "grad_norm": 0.43659156560897827,
      "learning_rate": 0.0001,
      "loss": 0.4052,
      "step": 550
    },
    {
      "epoch": 0.08816,
      "grad_norm": 0.2936922609806061,
      "learning_rate": 0.0001,
      "loss": 0.3865,
      "step": 551
    },
    {
      "epoch": 0.08832,
      "grad_norm": 0.4590161144733429,
      "learning_rate": 0.0001,
      "loss": 0.3891,
      "step": 552
    },
    {
      "epoch": 0.08848,
      "grad_norm": 0.2440461963415146,
      "learning_rate": 0.0001,
      "loss": 0.3939,
      "step": 553
    },
    {
      "epoch": 0.08864,
      "grad_norm": 0.507917582988739,
      "learning_rate": 0.0001,
      "loss": 0.4033,
      "step": 554
    },
    {
      "epoch": 0.0888,
      "grad_norm": 0.2944832146167755,
      "learning_rate": 0.0001,
      "loss": 0.3885,
      "step": 555
    },
    {
      "epoch": 0.08896,
      "grad_norm": 0.19917814433574677,
      "learning_rate": 0.0001,
      "loss": 0.3811,
      "step": 556
    },
    {
      "epoch": 0.08912,
      "grad_norm": 0.3040201961994171,
      "learning_rate": 0.0001,
      "loss": 0.397,
      "step": 557
    },
    {
      "epoch": 0.08928,
      "grad_norm": 0.3470746576786041,
      "learning_rate": 0.0001,
      "loss": 0.3917,
      "step": 558
    },
    {
      "epoch": 0.08944,
      "grad_norm": 0.22198386490345,
      "learning_rate": 0.0001,
      "loss": 0.3834,
      "step": 559
    },
    {
      "epoch": 0.0896,
      "grad_norm": 0.28437086939811707,
      "learning_rate": 0.0001,
      "loss": 0.3948,
      "step": 560
    },
    {
      "epoch": 0.08976,
      "grad_norm": 0.2996319830417633,
      "learning_rate": 0.0001,
      "loss": 0.4024,
      "step": 561
    },
    {
      "epoch": 0.08992,
      "grad_norm": 0.3593040406703949,
      "learning_rate": 0.0001,
      "loss": 0.4025,
      "step": 562
    },
    {
      "epoch": 0.09008,
      "grad_norm": 0.28561335802078247,
      "learning_rate": 0.0001,
      "loss": 0.389,
      "step": 563
    },
    {
      "epoch": 0.09024,
      "grad_norm": 0.2773100733757019,
      "learning_rate": 0.0001,
      "loss": 0.3885,
      "step": 564
    },
    {
      "epoch": 0.0904,
      "grad_norm": 0.23215824365615845,
      "learning_rate": 0.0001,
      "loss": 0.3865,
      "step": 565
    },
    {
      "epoch": 0.09056,
      "grad_norm": 0.2260044813156128,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 566
    },
    {
      "epoch": 0.09072,
      "grad_norm": 0.26643145084381104,
      "learning_rate": 0.0001,
      "loss": 0.3885,
      "step": 567
    },
    {
      "epoch": 0.09088,
      "grad_norm": 0.22798721492290497,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 568
    },
    {
      "epoch": 0.09104,
      "grad_norm": 0.2829216718673706,
      "learning_rate": 0.0001,
      "loss": 0.3831,
      "step": 569
    },
    {
      "epoch": 0.0912,
      "grad_norm": 0.3524191379547119,
      "learning_rate": 0.0001,
      "loss": 0.4001,
      "step": 570
    },
    {
      "epoch": 0.09136,
      "grad_norm": 0.2852359414100647,
      "learning_rate": 0.0001,
      "loss": 0.3869,
      "step": 571
    },
    {
      "epoch": 0.09152,
      "grad_norm": 0.24045075476169586,
      "learning_rate": 0.0001,
      "loss": 0.389,
      "step": 572
    },
    {
      "epoch": 0.09168,
      "grad_norm": 0.2976721525192261,
      "learning_rate": 0.0001,
      "loss": 0.3872,
      "step": 573
    },
    {
      "epoch": 0.09184,
      "grad_norm": 0.2530525028705597,
      "learning_rate": 0.0001,
      "loss": 0.3811,
      "step": 574
    },
    {
      "epoch": 0.092,
      "grad_norm": 0.2963438630104065,
      "learning_rate": 0.0001,
      "loss": 0.3896,
      "step": 575
    },
    {
      "epoch": 0.09216,
      "grad_norm": 0.3237655758857727,
      "learning_rate": 0.0001,
      "loss": 0.396,
      "step": 576
    },
    {
      "epoch": 0.09232,
      "grad_norm": 0.4140378534793854,
      "learning_rate": 0.0001,
      "loss": 0.4035,
      "step": 577
    },
    {
      "epoch": 0.09248,
      "grad_norm": 0.24215160310268402,
      "learning_rate": 0.0001,
      "loss": 0.3946,
      "step": 578
    },
    {
      "epoch": 0.09264,
      "grad_norm": 0.3636932075023651,
      "learning_rate": 0.0001,
      "loss": 0.3944,
      "step": 579
    },
    {
      "epoch": 0.0928,
      "grad_norm": 0.261457234621048,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 580
    },
    {
      "epoch": 0.09296,
      "grad_norm": 0.25116220116615295,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 581
    },
    {
      "epoch": 0.09312,
      "grad_norm": 0.2688573896884918,
      "learning_rate": 0.0001,
      "loss": 0.3838,
      "step": 582
    },
    {
      "epoch": 0.09328,
      "grad_norm": 0.2987321615219116,
      "learning_rate": 0.0001,
      "loss": 0.3956,
      "step": 583
    },
    {
      "epoch": 0.09344,
      "grad_norm": 0.26205992698669434,
      "learning_rate": 0.0001,
      "loss": 0.3911,
      "step": 584
    },
    {
      "epoch": 0.0936,
      "grad_norm": 0.24196119606494904,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 585
    },
    {
      "epoch": 0.09376,
      "grad_norm": 0.4131149649620056,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 586
    },
    {
      "epoch": 0.09392,
      "grad_norm": 0.24619817733764648,
      "learning_rate": 0.0001,
      "loss": 0.3925,
      "step": 587
    },
    {
      "epoch": 0.09408,
      "grad_norm": 0.4473784267902374,
      "learning_rate": 0.0001,
      "loss": 0.3805,
      "step": 588
    },
    {
      "epoch": 0.09424,
      "grad_norm": 0.20317816734313965,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 589
    },
    {
      "epoch": 0.0944,
      "grad_norm": 0.40600907802581787,
      "learning_rate": 0.0001,
      "loss": 0.3966,
      "step": 590
    },
    {
      "epoch": 0.09456,
      "grad_norm": 0.2461300939321518,
      "learning_rate": 0.0001,
      "loss": 0.3946,
      "step": 591
    },
    {
      "epoch": 0.09472,
      "grad_norm": 0.34788158535957336,
      "learning_rate": 0.0001,
      "loss": 0.3845,
      "step": 592
    },
    {
      "epoch": 0.09488,
      "grad_norm": 0.2532917857170105,
      "learning_rate": 0.0001,
      "loss": 0.3859,
      "step": 593
    },
    {
      "epoch": 0.09504,
      "grad_norm": 0.36009812355041504,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 594
    },
    {
      "epoch": 0.0952,
      "grad_norm": 0.23452746868133545,
      "learning_rate": 0.0001,
      "loss": 0.3727,
      "step": 595
    },
    {
      "epoch": 0.09536,
      "grad_norm": 0.24842289090156555,
      "learning_rate": 0.0001,
      "loss": 0.3879,
      "step": 596
    },
    {
      "epoch": 0.09552,
      "grad_norm": 0.2732473909854889,
      "learning_rate": 0.0001,
      "loss": 0.3874,
      "step": 597
    },
    {
      "epoch": 0.09568,
      "grad_norm": 0.2156975418329239,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 598
    },
    {
      "epoch": 0.09584,
      "grad_norm": 0.3442114293575287,
      "learning_rate": 0.0001,
      "loss": 0.3853,
      "step": 599
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.22375106811523438,
      "learning_rate": 0.0001,
      "loss": 0.3744,
      "step": 600
    },
    {
      "epoch": 0.096,
      "eval_train_accuracy": 0.5082,
      "eval_train_loss": 0.37965747714042664,
      "eval_train_runtime": 4.3356,
      "eval_train_samples_per_second": 1153.246,
      "eval_train_steps_per_second": 14.531,
      "step": 600
    },
    {
      "epoch": 0.096,
      "eval_test_accuracy": 0.4992,
      "eval_test_loss": 0.3777756989002228,
      "eval_test_runtime": 4.4853,
      "eval_test_samples_per_second": 1114.742,
      "eval_test_steps_per_second": 14.046,
      "step": 600
    },
    {
      "epoch": 0.09616,
      "grad_norm": 0.28498461842536926,
      "learning_rate": 0.0001,
      "loss": 0.3821,
      "step": 601
    },
    {
      "epoch": 0.09632,
      "grad_norm": 0.2562367618083954,
      "learning_rate": 0.0001,
      "loss": 0.3821,
      "step": 602
    },
    {
      "epoch": 0.09648,
      "grad_norm": 0.22461602091789246,
      "learning_rate": 0.0001,
      "loss": 0.3717,
      "step": 603
    },
    {
      "epoch": 0.09664,
      "grad_norm": 0.22230228781700134,
      "learning_rate": 0.0001,
      "loss": 0.3799,
      "step": 604
    },
    {
      "epoch": 0.0968,
      "grad_norm": 0.30086541175842285,
      "learning_rate": 0.0001,
      "loss": 0.3848,
      "step": 605
    },
    {
      "epoch": 0.09696,
      "grad_norm": 0.24780195951461792,
      "learning_rate": 0.0001,
      "loss": 0.3844,
      "step": 606
    },
    {
      "epoch": 0.09712,
      "grad_norm": 0.2125576138496399,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 607
    },
    {
      "epoch": 0.09728,
      "grad_norm": 0.21180376410484314,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 608
    },
    {
      "epoch": 0.09744,
      "grad_norm": 0.2618860900402069,
      "learning_rate": 0.0001,
      "loss": 0.3872,
      "step": 609
    },
    {
      "epoch": 0.0976,
      "grad_norm": 0.2503820061683655,
      "learning_rate": 0.0001,
      "loss": 0.3755,
      "step": 610
    },
    {
      "epoch": 0.09776,
      "grad_norm": 0.22476992011070251,
      "learning_rate": 0.0001,
      "loss": 0.3757,
      "step": 611
    },
    {
      "epoch": 0.09792,
      "grad_norm": 0.26968643069267273,
      "learning_rate": 0.0001,
      "loss": 0.3824,
      "step": 612
    },
    {
      "epoch": 0.09808,
      "grad_norm": 0.2053574174642563,
      "learning_rate": 0.0001,
      "loss": 0.3604,
      "step": 613
    },
    {
      "epoch": 0.09824,
      "grad_norm": 0.2757457494735718,
      "learning_rate": 0.0001,
      "loss": 0.3933,
      "step": 614
    },
    {
      "epoch": 0.0984,
      "grad_norm": 0.22127999365329742,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 615
    },
    {
      "epoch": 0.09856,
      "grad_norm": 0.33097758889198303,
      "learning_rate": 0.0001,
      "loss": 0.3737,
      "step": 616
    },
    {
      "epoch": 0.09872,
      "grad_norm": 0.2269030213356018,
      "learning_rate": 0.0001,
      "loss": 0.3855,
      "step": 617
    },
    {
      "epoch": 0.09888,
      "grad_norm": 0.377057284116745,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 618
    },
    {
      "epoch": 0.09904,
      "grad_norm": 0.32939502596855164,
      "learning_rate": 0.0001,
      "loss": 0.4004,
      "step": 619
    },
    {
      "epoch": 0.0992,
      "grad_norm": 0.5145142078399658,
      "learning_rate": 0.0001,
      "loss": 0.3966,
      "step": 620
    },
    {
      "epoch": 0.09936,
      "grad_norm": 0.23963625729084015,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 621
    },
    {
      "epoch": 0.09952,
      "grad_norm": 0.3699982762336731,
      "learning_rate": 0.0001,
      "loss": 0.3771,
      "step": 622
    },
    {
      "epoch": 0.09968,
      "grad_norm": 0.297508180141449,
      "learning_rate": 0.0001,
      "loss": 0.3687,
      "step": 623
    },
    {
      "epoch": 0.09984,
      "grad_norm": 0.2488584816455841,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 624
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.39559608697891235,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 625
    },
    {
      "epoch": 0.10016,
      "grad_norm": 0.24790343642234802,
      "learning_rate": 0.0001,
      "loss": 0.3855,
      "step": 626
    },
    {
      "epoch": 0.10032,
      "grad_norm": 0.3992924988269806,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 627
    },
    {
      "epoch": 0.10048,
      "grad_norm": 0.2852700352668762,
      "learning_rate": 0.0001,
      "loss": 0.3936,
      "step": 628
    },
    {
      "epoch": 0.10064,
      "grad_norm": 0.21514096856117249,
      "learning_rate": 0.0001,
      "loss": 0.3833,
      "step": 629
    },
    {
      "epoch": 0.1008,
      "grad_norm": 0.31381651759147644,
      "learning_rate": 0.0001,
      "loss": 0.4008,
      "step": 630
    },
    {
      "epoch": 0.10096,
      "grad_norm": 0.228826105594635,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 631
    },
    {
      "epoch": 0.10112,
      "grad_norm": 0.20824399590492249,
      "learning_rate": 0.0001,
      "loss": 0.3647,
      "step": 632
    },
    {
      "epoch": 0.10128,
      "grad_norm": 0.21563827991485596,
      "learning_rate": 0.0001,
      "loss": 0.3712,
      "step": 633
    },
    {
      "epoch": 0.10144,
      "grad_norm": 0.27104806900024414,
      "learning_rate": 0.0001,
      "loss": 0.3908,
      "step": 634
    },
    {
      "epoch": 0.1016,
      "grad_norm": 0.24534805119037628,
      "learning_rate": 0.0001,
      "loss": 0.383,
      "step": 635
    },
    {
      "epoch": 0.10176,
      "grad_norm": 0.26342087984085083,
      "learning_rate": 0.0001,
      "loss": 0.3963,
      "step": 636
    },
    {
      "epoch": 0.10192,
      "grad_norm": 0.2490401715040207,
      "learning_rate": 0.0001,
      "loss": 0.3795,
      "step": 637
    },
    {
      "epoch": 0.10208,
      "grad_norm": 0.27435460686683655,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 638
    },
    {
      "epoch": 0.10224,
      "grad_norm": 0.2967737913131714,
      "learning_rate": 0.0001,
      "loss": 0.396,
      "step": 639
    },
    {
      "epoch": 0.1024,
      "grad_norm": 0.24575795233249664,
      "learning_rate": 0.0001,
      "loss": 0.377,
      "step": 640
    },
    {
      "epoch": 0.10256,
      "grad_norm": 0.22917360067367554,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 641
    },
    {
      "epoch": 0.10272,
      "grad_norm": 0.3023170530796051,
      "learning_rate": 0.0001,
      "loss": 0.3857,
      "step": 642
    },
    {
      "epoch": 0.10288,
      "grad_norm": 0.2022523432970047,
      "learning_rate": 0.0001,
      "loss": 0.3704,
      "step": 643
    },
    {
      "epoch": 0.10304,
      "grad_norm": 0.2954541742801666,
      "learning_rate": 0.0001,
      "loss": 0.393,
      "step": 644
    },
    {
      "epoch": 0.1032,
      "grad_norm": 0.29288461804389954,
      "learning_rate": 0.0001,
      "loss": 0.3821,
      "step": 645
    },
    {
      "epoch": 0.10336,
      "grad_norm": 0.23109526932239532,
      "learning_rate": 0.0001,
      "loss": 0.377,
      "step": 646
    },
    {
      "epoch": 0.10352,
      "grad_norm": 0.4487464129924774,
      "learning_rate": 0.0001,
      "loss": 0.3767,
      "step": 647
    },
    {
      "epoch": 0.10368,
      "grad_norm": 0.211373969912529,
      "learning_rate": 0.0001,
      "loss": 0.3657,
      "step": 648
    },
    {
      "epoch": 0.10384,
      "grad_norm": 0.27885475754737854,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 649
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.3203275203704834,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 650
    },
    {
      "epoch": 0.10416,
      "grad_norm": 0.227715864777565,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 651
    },
    {
      "epoch": 0.10432,
      "grad_norm": 0.20979906618595123,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 652
    },
    {
      "epoch": 0.10448,
      "grad_norm": 0.24647265672683716,
      "learning_rate": 0.0001,
      "loss": 0.353,
      "step": 653
    },
    {
      "epoch": 0.10464,
      "grad_norm": 0.2125198096036911,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 654
    },
    {
      "epoch": 0.1048,
      "grad_norm": 0.2766295075416565,
      "learning_rate": 0.0001,
      "loss": 0.393,
      "step": 655
    },
    {
      "epoch": 0.10496,
      "grad_norm": 0.18871699273586273,
      "learning_rate": 0.0001,
      "loss": 0.3591,
      "step": 656
    },
    {
      "epoch": 0.10512,
      "grad_norm": 0.2245245724916458,
      "learning_rate": 0.0001,
      "loss": 0.3903,
      "step": 657
    },
    {
      "epoch": 0.10528,
      "grad_norm": 0.24621370434761047,
      "learning_rate": 0.0001,
      "loss": 0.3725,
      "step": 658
    },
    {
      "epoch": 0.10544,
      "grad_norm": 0.21277840435504913,
      "learning_rate": 0.0001,
      "loss": 0.3939,
      "step": 659
    },
    {
      "epoch": 0.1056,
      "grad_norm": 0.1987389624118805,
      "learning_rate": 0.0001,
      "loss": 0.3756,
      "step": 660
    },
    {
      "epoch": 0.10576,
      "grad_norm": 0.18840008974075317,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 661
    },
    {
      "epoch": 0.10592,
      "grad_norm": 0.24629083275794983,
      "learning_rate": 0.0001,
      "loss": 0.387,
      "step": 662
    },
    {
      "epoch": 0.10608,
      "grad_norm": 0.20911063253879547,
      "learning_rate": 0.0001,
      "loss": 0.3827,
      "step": 663
    },
    {
      "epoch": 0.10624,
      "grad_norm": 0.19741949439048767,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 664
    },
    {
      "epoch": 0.1064,
      "grad_norm": 0.2629906237125397,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 665
    },
    {
      "epoch": 0.10656,
      "grad_norm": 0.20730113983154297,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 666
    },
    {
      "epoch": 0.10672,
      "grad_norm": 0.2589152157306671,
      "learning_rate": 0.0001,
      "loss": 0.3897,
      "step": 667
    },
    {
      "epoch": 0.10688,
      "grad_norm": 0.240905299782753,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 668
    },
    {
      "epoch": 0.10704,
      "grad_norm": 0.29287126660346985,
      "learning_rate": 0.0001,
      "loss": 0.3826,
      "step": 669
    },
    {
      "epoch": 0.1072,
      "grad_norm": 0.22127820551395416,
      "learning_rate": 0.0001,
      "loss": 0.3672,
      "step": 670
    },
    {
      "epoch": 0.10736,
      "grad_norm": 0.3184696435928345,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 671
    },
    {
      "epoch": 0.10752,
      "grad_norm": 0.19822999835014343,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 672
    },
    {
      "epoch": 0.10768,
      "grad_norm": 0.35981690883636475,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 673
    },
    {
      "epoch": 0.10784,
      "grad_norm": 0.2576318383216858,
      "learning_rate": 0.0001,
      "loss": 0.3805,
      "step": 674
    },
    {
      "epoch": 0.108,
      "grad_norm": 0.2275446057319641,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 675
    },
    {
      "epoch": 0.10816,
      "grad_norm": 0.3319064676761627,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 676
    },
    {
      "epoch": 0.10832,
      "grad_norm": 0.2631102502346039,
      "learning_rate": 0.0001,
      "loss": 0.3949,
      "step": 677
    },
    {
      "epoch": 0.10848,
      "grad_norm": 0.2844587564468384,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 678
    },
    {
      "epoch": 0.10864,
      "grad_norm": 0.4516178071498871,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 679
    },
    {
      "epoch": 0.1088,
      "grad_norm": 0.2748478353023529,
      "learning_rate": 0.0001,
      "loss": 0.3831,
      "step": 680
    },
    {
      "epoch": 0.10896,
      "grad_norm": 0.38585326075553894,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 681
    },
    {
      "epoch": 0.10912,
      "grad_norm": 0.35145875811576843,
      "learning_rate": 0.0001,
      "loss": 0.3919,
      "step": 682
    },
    {
      "epoch": 0.10928,
      "grad_norm": 0.23991656303405762,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 683
    },
    {
      "epoch": 0.10944,
      "grad_norm": 0.27897772192955017,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 684
    },
    {
      "epoch": 0.1096,
      "grad_norm": 0.3011065125465393,
      "learning_rate": 0.0001,
      "loss": 0.3908,
      "step": 685
    },
    {
      "epoch": 0.10976,
      "grad_norm": 0.23445194959640503,
      "learning_rate": 0.0001,
      "loss": 0.4008,
      "step": 686
    },
    {
      "epoch": 0.10992,
      "grad_norm": 0.19255471229553223,
      "learning_rate": 0.0001,
      "loss": 0.4014,
      "step": 687
    },
    {
      "epoch": 0.11008,
      "grad_norm": 0.2601206302642822,
      "learning_rate": 0.0001,
      "loss": 0.3852,
      "step": 688
    },
    {
      "epoch": 0.11024,
      "grad_norm": 0.2155575305223465,
      "learning_rate": 0.0001,
      "loss": 0.369,
      "step": 689
    },
    {
      "epoch": 0.1104,
      "grad_norm": 0.2522868812084198,
      "learning_rate": 0.0001,
      "loss": 0.3599,
      "step": 690
    },
    {
      "epoch": 0.11056,
      "grad_norm": 0.1948661208152771,
      "learning_rate": 0.0001,
      "loss": 0.3735,
      "step": 691
    },
    {
      "epoch": 0.11072,
      "grad_norm": 0.20662547647953033,
      "learning_rate": 0.0001,
      "loss": 0.378,
      "step": 692
    },
    {
      "epoch": 0.11088,
      "grad_norm": 0.23828625679016113,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 693
    },
    {
      "epoch": 0.11104,
      "grad_norm": 0.3048519492149353,
      "learning_rate": 0.0001,
      "loss": 0.3659,
      "step": 694
    },
    {
      "epoch": 0.1112,
      "grad_norm": 0.20365358889102936,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 695
    },
    {
      "epoch": 0.11136,
      "grad_norm": 0.22496379911899567,
      "learning_rate": 0.0001,
      "loss": 0.3795,
      "step": 696
    },
    {
      "epoch": 0.11152,
      "grad_norm": 0.2727571129798889,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 697
    },
    {
      "epoch": 0.11168,
      "grad_norm": 0.22598110139369965,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 698
    },
    {
      "epoch": 0.11184,
      "grad_norm": 0.25066787004470825,
      "learning_rate": 0.0001,
      "loss": 0.3699,
      "step": 699
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.19491368532180786,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 700
    },
    {
      "epoch": 0.112,
      "eval_train_accuracy": 0.5016,
      "eval_train_loss": 0.37492650747299194,
      "eval_train_runtime": 4.7623,
      "eval_train_samples_per_second": 1049.918,
      "eval_train_steps_per_second": 13.229,
      "step": 700
    },
    {
      "epoch": 0.112,
      "eval_test_accuracy": 0.502,
      "eval_test_loss": 0.37313541769981384,
      "eval_test_runtime": 4.2232,
      "eval_test_samples_per_second": 1183.942,
      "eval_test_steps_per_second": 14.918,
      "step": 700
    },
    {
      "epoch": 0.11216,
      "grad_norm": 0.23917970061302185,
      "learning_rate": 0.0001,
      "loss": 0.3629,
      "step": 701
    },
    {
      "epoch": 0.11232,
      "grad_norm": 0.18801143765449524,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 702
    },
    {
      "epoch": 0.11248,
      "grad_norm": 0.32207193970680237,
      "learning_rate": 0.0001,
      "loss": 0.389,
      "step": 703
    },
    {
      "epoch": 0.11264,
      "grad_norm": 0.18840773403644562,
      "learning_rate": 0.0001,
      "loss": 0.3845,
      "step": 704
    },
    {
      "epoch": 0.1128,
      "grad_norm": 0.26525601744651794,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 705
    },
    {
      "epoch": 0.11296,
      "grad_norm": 0.2232830971479416,
      "learning_rate": 0.0001,
      "loss": 0.3882,
      "step": 706
    },
    {
      "epoch": 0.11312,
      "grad_norm": 0.17605619132518768,
      "learning_rate": 0.0001,
      "loss": 0.3832,
      "step": 707
    },
    {
      "epoch": 0.11328,
      "grad_norm": 0.22673974931240082,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 708
    },
    {
      "epoch": 0.11344,
      "grad_norm": 0.29780733585357666,
      "learning_rate": 0.0001,
      "loss": 0.3888,
      "step": 709
    },
    {
      "epoch": 0.1136,
      "grad_norm": 0.19367535412311554,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 710
    },
    {
      "epoch": 0.11376,
      "grad_norm": 0.1947614699602127,
      "learning_rate": 0.0001,
      "loss": 0.3811,
      "step": 711
    },
    {
      "epoch": 0.11392,
      "grad_norm": 0.1677674949169159,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 712
    },
    {
      "epoch": 0.11408,
      "grad_norm": 0.235146164894104,
      "learning_rate": 0.0001,
      "loss": 0.3835,
      "step": 713
    },
    {
      "epoch": 0.11424,
      "grad_norm": 0.20696838200092316,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 714
    },
    {
      "epoch": 0.1144,
      "grad_norm": 0.247110977768898,
      "learning_rate": 0.0001,
      "loss": 0.3707,
      "step": 715
    },
    {
      "epoch": 0.11456,
      "grad_norm": 0.19738513231277466,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 716
    },
    {
      "epoch": 0.11472,
      "grad_norm": 0.22791779041290283,
      "learning_rate": 0.0001,
      "loss": 0.3683,
      "step": 717
    },
    {
      "epoch": 0.11488,
      "grad_norm": 0.25628238916397095,
      "learning_rate": 0.0001,
      "loss": 0.3796,
      "step": 718
    },
    {
      "epoch": 0.11504,
      "grad_norm": 0.16960804164409637,
      "learning_rate": 0.0001,
      "loss": 0.3862,
      "step": 719
    },
    {
      "epoch": 0.1152,
      "grad_norm": 0.22661741077899933,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 720
    },
    {
      "epoch": 0.11536,
      "grad_norm": 0.19606322050094604,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 721
    },
    {
      "epoch": 0.11552,
      "grad_norm": 0.23355674743652344,
      "learning_rate": 0.0001,
      "loss": 0.3716,
      "step": 722
    },
    {
      "epoch": 0.11568,
      "grad_norm": 0.18877379596233368,
      "learning_rate": 0.0001,
      "loss": 0.3679,
      "step": 723
    },
    {
      "epoch": 0.11584,
      "grad_norm": 0.2888208329677582,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 724
    },
    {
      "epoch": 0.116,
      "grad_norm": 0.1982152760028839,
      "learning_rate": 0.0001,
      "loss": 0.3841,
      "step": 725
    },
    {
      "epoch": 0.11616,
      "grad_norm": 0.2558135688304901,
      "learning_rate": 0.0001,
      "loss": 0.3882,
      "step": 726
    },
    {
      "epoch": 0.11632,
      "grad_norm": 0.20424909889698029,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 727
    },
    {
      "epoch": 0.11648,
      "grad_norm": 0.19040675461292267,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 728
    },
    {
      "epoch": 0.11664,
      "grad_norm": 0.21199767291545868,
      "learning_rate": 0.0001,
      "loss": 0.3836,
      "step": 729
    },
    {
      "epoch": 0.1168,
      "grad_norm": 0.20719248056411743,
      "learning_rate": 0.0001,
      "loss": 0.3786,
      "step": 730
    },
    {
      "epoch": 0.11696,
      "grad_norm": 0.24009911715984344,
      "learning_rate": 0.0001,
      "loss": 0.3785,
      "step": 731
    },
    {
      "epoch": 0.11712,
      "grad_norm": 0.2379245162010193,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 732
    },
    {
      "epoch": 0.11728,
      "grad_norm": 0.18476936221122742,
      "learning_rate": 0.0001,
      "loss": 0.3554,
      "step": 733
    },
    {
      "epoch": 0.11744,
      "grad_norm": 0.22908717393875122,
      "learning_rate": 0.0001,
      "loss": 0.3853,
      "step": 734
    },
    {
      "epoch": 0.1176,
      "grad_norm": 0.28002864122390747,
      "learning_rate": 0.0001,
      "loss": 0.3884,
      "step": 735
    },
    {
      "epoch": 0.11776,
      "grad_norm": 0.22620679438114166,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 736
    },
    {
      "epoch": 0.11792,
      "grad_norm": 0.236639603972435,
      "learning_rate": 0.0001,
      "loss": 0.3808,
      "step": 737
    },
    {
      "epoch": 0.11808,
      "grad_norm": 0.17751775681972504,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 738
    },
    {
      "epoch": 0.11824,
      "grad_norm": 0.21755382418632507,
      "learning_rate": 0.0001,
      "loss": 0.3795,
      "step": 739
    },
    {
      "epoch": 0.1184,
      "grad_norm": 0.18590007722377777,
      "learning_rate": 0.0001,
      "loss": 0.3854,
      "step": 740
    },
    {
      "epoch": 0.11856,
      "grad_norm": 0.20173877477645874,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 741
    },
    {
      "epoch": 0.11872,
      "grad_norm": 0.23270493745803833,
      "learning_rate": 0.0001,
      "loss": 0.3869,
      "step": 742
    },
    {
      "epoch": 0.11888,
      "grad_norm": 0.23396949470043182,
      "learning_rate": 0.0001,
      "loss": 0.3735,
      "step": 743
    },
    {
      "epoch": 0.11904,
      "grad_norm": 0.16316130757331848,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 744
    },
    {
      "epoch": 0.1192,
      "grad_norm": 0.19373059272766113,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 745
    },
    {
      "epoch": 0.11936,
      "grad_norm": 0.19710996747016907,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 746
    },
    {
      "epoch": 0.11952,
      "grad_norm": 0.18086138367652893,
      "learning_rate": 0.0001,
      "loss": 0.3644,
      "step": 747
    },
    {
      "epoch": 0.11968,
      "grad_norm": 0.2243780791759491,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 748
    },
    {
      "epoch": 0.11984,
      "grad_norm": 0.19079038500785828,
      "learning_rate": 0.0001,
      "loss": 0.3668,
      "step": 749
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.18954145908355713,
      "learning_rate": 0.0001,
      "loss": 0.3758,
      "step": 750
    },
    {
      "epoch": 0.12016,
      "grad_norm": 0.22903238236904144,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 751
    },
    {
      "epoch": 0.12032,
      "grad_norm": 0.15484684705734253,
      "learning_rate": 0.0001,
      "loss": 0.3883,
      "step": 752
    },
    {
      "epoch": 0.12048,
      "grad_norm": 0.2167891263961792,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 753
    },
    {
      "epoch": 0.12064,
      "grad_norm": 0.19526593387126923,
      "learning_rate": 0.0001,
      "loss": 0.3765,
      "step": 754
    },
    {
      "epoch": 0.1208,
      "grad_norm": 0.20005609095096588,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 755
    },
    {
      "epoch": 0.12096,
      "grad_norm": 0.17769812047481537,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 756
    },
    {
      "epoch": 0.12112,
      "grad_norm": 0.2037198692560196,
      "learning_rate": 0.0001,
      "loss": 0.3749,
      "step": 757
    },
    {
      "epoch": 0.12128,
      "grad_norm": 0.20597490668296814,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 758
    },
    {
      "epoch": 0.12144,
      "grad_norm": 0.22308474779129028,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 759
    },
    {
      "epoch": 0.1216,
      "grad_norm": 0.16345226764678955,
      "learning_rate": 0.0001,
      "loss": 0.3716,
      "step": 760
    },
    {
      "epoch": 0.12176,
      "grad_norm": 0.1873359978199005,
      "learning_rate": 0.0001,
      "loss": 0.3756,
      "step": 761
    },
    {
      "epoch": 0.12192,
      "grad_norm": 0.17530401051044464,
      "learning_rate": 0.0001,
      "loss": 0.3775,
      "step": 762
    },
    {
      "epoch": 0.12208,
      "grad_norm": 0.29124486446380615,
      "learning_rate": 0.0001,
      "loss": 0.3669,
      "step": 763
    },
    {
      "epoch": 0.12224,
      "grad_norm": 0.2064061313867569,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 764
    },
    {
      "epoch": 0.1224,
      "grad_norm": 0.24030354619026184,
      "learning_rate": 0.0001,
      "loss": 0.3716,
      "step": 765
    },
    {
      "epoch": 0.12256,
      "grad_norm": 0.22174066305160522,
      "learning_rate": 0.0001,
      "loss": 0.3909,
      "step": 766
    },
    {
      "epoch": 0.12272,
      "grad_norm": 0.22601407766342163,
      "learning_rate": 0.0001,
      "loss": 0.3654,
      "step": 767
    },
    {
      "epoch": 0.12288,
      "grad_norm": 0.22567051649093628,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 768
    },
    {
      "epoch": 0.12304,
      "grad_norm": 0.2334422916173935,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 769
    },
    {
      "epoch": 0.1232,
      "grad_norm": 0.1684255450963974,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 770
    },
    {
      "epoch": 0.12336,
      "grad_norm": 0.23633569478988647,
      "learning_rate": 0.0001,
      "loss": 0.3683,
      "step": 771
    },
    {
      "epoch": 0.12352,
      "grad_norm": 0.23555965721607208,
      "learning_rate": 0.0001,
      "loss": 0.3825,
      "step": 772
    },
    {
      "epoch": 0.12368,
      "grad_norm": 0.19941598176956177,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 773
    },
    {
      "epoch": 0.12384,
      "grad_norm": 0.2512306869029999,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 774
    },
    {
      "epoch": 0.124,
      "grad_norm": 0.2717626392841339,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 775
    },
    {
      "epoch": 0.12416,
      "grad_norm": 0.20216883718967438,
      "learning_rate": 0.0001,
      "loss": 0.3821,
      "step": 776
    },
    {
      "epoch": 0.12432,
      "grad_norm": 0.20350457727909088,
      "learning_rate": 0.0001,
      "loss": 0.3758,
      "step": 777
    },
    {
      "epoch": 0.12448,
      "grad_norm": 0.3131468892097473,
      "learning_rate": 0.0001,
      "loss": 0.3571,
      "step": 778
    },
    {
      "epoch": 0.12464,
      "grad_norm": 0.28507688641548157,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 779
    },
    {
      "epoch": 0.1248,
      "grad_norm": 0.24969618022441864,
      "learning_rate": 0.0001,
      "loss": 0.3814,
      "step": 780
    },
    {
      "epoch": 0.12496,
      "grad_norm": 0.23418013751506805,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 781
    },
    {
      "epoch": 0.12512,
      "grad_norm": 0.28683656454086304,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 782
    },
    {
      "epoch": 0.12528,
      "grad_norm": 0.21802657842636108,
      "learning_rate": 0.0001,
      "loss": 0.3969,
      "step": 783
    },
    {
      "epoch": 0.12544,
      "grad_norm": 0.2529509365558624,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 784
    },
    {
      "epoch": 0.1256,
      "grad_norm": 0.21357113122940063,
      "learning_rate": 0.0001,
      "loss": 0.3647,
      "step": 785
    },
    {
      "epoch": 0.12576,
      "grad_norm": 0.22278812527656555,
      "learning_rate": 0.0001,
      "loss": 0.3734,
      "step": 786
    },
    {
      "epoch": 0.12592,
      "grad_norm": 0.23629671335220337,
      "learning_rate": 0.0001,
      "loss": 0.3852,
      "step": 787
    },
    {
      "epoch": 0.12608,
      "grad_norm": 0.1845715492963791,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 788
    },
    {
      "epoch": 0.12624,
      "grad_norm": 0.24301913380622864,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 789
    },
    {
      "epoch": 0.1264,
      "grad_norm": 0.2324075996875763,
      "learning_rate": 0.0001,
      "loss": 0.3838,
      "step": 790
    },
    {
      "epoch": 0.12656,
      "grad_norm": 0.21498391032218933,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 791
    },
    {
      "epoch": 0.12672,
      "grad_norm": 0.22634056210517883,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 792
    },
    {
      "epoch": 0.12688,
      "grad_norm": 0.15284323692321777,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 793
    },
    {
      "epoch": 0.12704,
      "grad_norm": 0.3215557336807251,
      "learning_rate": 0.0001,
      "loss": 0.3772,
      "step": 794
    },
    {
      "epoch": 0.1272,
      "grad_norm": 0.18419016897678375,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 795
    },
    {
      "epoch": 0.12736,
      "grad_norm": 0.243992418050766,
      "learning_rate": 0.0001,
      "loss": 0.3737,
      "step": 796
    },
    {
      "epoch": 0.12752,
      "grad_norm": 0.22724808752536774,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 797
    },
    {
      "epoch": 0.12768,
      "grad_norm": 0.19413691759109497,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 798
    },
    {
      "epoch": 0.12784,
      "grad_norm": 0.18621672689914703,
      "learning_rate": 0.0001,
      "loss": 0.3781,
      "step": 799
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.27990153431892395,
      "learning_rate": 0.0001,
      "loss": 0.3755,
      "step": 800
    },
    {
      "epoch": 0.128,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3732067346572876,
      "eval_train_runtime": 4.6363,
      "eval_train_samples_per_second": 1078.454,
      "eval_train_steps_per_second": 13.589,
      "step": 800
    },
    {
      "epoch": 0.128,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.37147799134254456,
      "eval_test_runtime": 4.3044,
      "eval_test_samples_per_second": 1161.607,
      "eval_test_steps_per_second": 14.636,
      "step": 800
    },
    {
      "epoch": 0.12816,
      "grad_norm": 0.20581260323524475,
      "learning_rate": 0.0001,
      "loss": 0.3814,
      "step": 801
    },
    {
      "epoch": 0.12832,
      "grad_norm": 0.18426808714866638,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 802
    },
    {
      "epoch": 0.12848,
      "grad_norm": 0.4075886309146881,
      "learning_rate": 0.0001,
      "loss": 0.3809,
      "step": 803
    },
    {
      "epoch": 0.12864,
      "grad_norm": 0.1992615908384323,
      "learning_rate": 0.0001,
      "loss": 0.373,
      "step": 804
    },
    {
      "epoch": 0.1288,
      "grad_norm": 0.312221884727478,
      "learning_rate": 0.0001,
      "loss": 0.3807,
      "step": 805
    },
    {
      "epoch": 0.12896,
      "grad_norm": 0.21224342286586761,
      "learning_rate": 0.0001,
      "loss": 0.3747,
      "step": 806
    },
    {
      "epoch": 0.12912,
      "grad_norm": 0.19991406798362732,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 807
    },
    {
      "epoch": 0.12928,
      "grad_norm": 0.23613645136356354,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 808
    },
    {
      "epoch": 0.12944,
      "grad_norm": 0.19819875061511993,
      "learning_rate": 0.0001,
      "loss": 0.3726,
      "step": 809
    },
    {
      "epoch": 0.1296,
      "grad_norm": 0.22938233613967896,
      "learning_rate": 0.0001,
      "loss": 0.3754,
      "step": 810
    },
    {
      "epoch": 0.12976,
      "grad_norm": 0.17925158143043518,
      "learning_rate": 0.0001,
      "loss": 0.3828,
      "step": 811
    },
    {
      "epoch": 0.12992,
      "grad_norm": 0.18155677616596222,
      "learning_rate": 0.0001,
      "loss": 0.3695,
      "step": 812
    },
    {
      "epoch": 0.13008,
      "grad_norm": 0.1874728500843048,
      "learning_rate": 0.0001,
      "loss": 0.3854,
      "step": 813
    },
    {
      "epoch": 0.13024,
      "grad_norm": 0.17921976745128632,
      "learning_rate": 0.0001,
      "loss": 0.3721,
      "step": 814
    },
    {
      "epoch": 0.1304,
      "grad_norm": 0.17115852236747742,
      "learning_rate": 0.0001,
      "loss": 0.3807,
      "step": 815
    },
    {
      "epoch": 0.13056,
      "grad_norm": 0.20942330360412598,
      "learning_rate": 0.0001,
      "loss": 0.3726,
      "step": 816
    },
    {
      "epoch": 0.13072,
      "grad_norm": 0.20812910795211792,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 817
    },
    {
      "epoch": 0.13088,
      "grad_norm": 0.15691852569580078,
      "learning_rate": 0.0001,
      "loss": 0.3744,
      "step": 818
    },
    {
      "epoch": 0.13104,
      "grad_norm": 0.19714416563510895,
      "learning_rate": 0.0001,
      "loss": 0.3748,
      "step": 819
    },
    {
      "epoch": 0.1312,
      "grad_norm": 0.17352412641048431,
      "learning_rate": 0.0001,
      "loss": 0.3699,
      "step": 820
    },
    {
      "epoch": 0.13136,
      "grad_norm": 0.1867276281118393,
      "learning_rate": 0.0001,
      "loss": 0.3753,
      "step": 821
    },
    {
      "epoch": 0.13152,
      "grad_norm": 0.18978436291217804,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 822
    },
    {
      "epoch": 0.13168,
      "grad_norm": 0.20492197573184967,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 823
    },
    {
      "epoch": 0.13184,
      "grad_norm": 0.18029889464378357,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 824
    },
    {
      "epoch": 0.132,
      "grad_norm": 0.19226931035518646,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 825
    },
    {
      "epoch": 0.13216,
      "grad_norm": 0.25349193811416626,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 826
    },
    {
      "epoch": 0.13232,
      "grad_norm": 0.16944792866706848,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 827
    },
    {
      "epoch": 0.13248,
      "grad_norm": 0.18532375991344452,
      "learning_rate": 0.0001,
      "loss": 0.3648,
      "step": 828
    },
    {
      "epoch": 0.13264,
      "grad_norm": 0.2271190732717514,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 829
    },
    {
      "epoch": 0.1328,
      "grad_norm": 0.1883034110069275,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 830
    },
    {
      "epoch": 0.13296,
      "grad_norm": 0.16051509976387024,
      "learning_rate": 0.0001,
      "loss": 0.3722,
      "step": 831
    },
    {
      "epoch": 0.13312,
      "grad_norm": 0.3149360716342926,
      "learning_rate": 0.0001,
      "loss": 0.3802,
      "step": 832
    },
    {
      "epoch": 0.13328,
      "grad_norm": 0.14373250305652618,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 833
    },
    {
      "epoch": 0.13344,
      "grad_norm": 0.1705411821603775,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 834
    },
    {
      "epoch": 0.1336,
      "grad_norm": 0.22864897549152374,
      "learning_rate": 0.0001,
      "loss": 0.381,
      "step": 835
    },
    {
      "epoch": 0.13376,
      "grad_norm": 0.1883888989686966,
      "learning_rate": 0.0001,
      "loss": 0.3881,
      "step": 836
    },
    {
      "epoch": 0.13392,
      "grad_norm": 0.1768132895231247,
      "learning_rate": 0.0001,
      "loss": 0.3622,
      "step": 837
    },
    {
      "epoch": 0.13408,
      "grad_norm": 0.24991318583488464,
      "learning_rate": 0.0001,
      "loss": 0.3697,
      "step": 838
    },
    {
      "epoch": 0.13424,
      "grad_norm": 0.2001308649778366,
      "learning_rate": 0.0001,
      "loss": 0.3727,
      "step": 839
    },
    {
      "epoch": 0.1344,
      "grad_norm": 0.21781834959983826,
      "learning_rate": 0.0001,
      "loss": 0.3747,
      "step": 840
    },
    {
      "epoch": 0.13456,
      "grad_norm": 0.19454284012317657,
      "learning_rate": 0.0001,
      "loss": 0.3713,
      "step": 841
    },
    {
      "epoch": 0.13472,
      "grad_norm": 0.19514958560466766,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 842
    },
    {
      "epoch": 0.13488,
      "grad_norm": 0.20924012362957,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 843
    },
    {
      "epoch": 0.13504,
      "grad_norm": 0.19991780817508698,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 844
    },
    {
      "epoch": 0.1352,
      "grad_norm": 0.19018453359603882,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 845
    },
    {
      "epoch": 0.13536,
      "grad_norm": 0.23387084901332855,
      "learning_rate": 0.0001,
      "loss": 0.3804,
      "step": 846
    },
    {
      "epoch": 0.13552,
      "grad_norm": 0.15519510209560394,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 847
    },
    {
      "epoch": 0.13568,
      "grad_norm": 0.2851606607437134,
      "learning_rate": 0.0001,
      "loss": 0.3591,
      "step": 848
    },
    {
      "epoch": 0.13584,
      "grad_norm": 0.20212456583976746,
      "learning_rate": 0.0001,
      "loss": 0.3827,
      "step": 849
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.16300515830516815,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 850
    },
    {
      "epoch": 0.13616,
      "grad_norm": 0.1838582158088684,
      "learning_rate": 0.0001,
      "loss": 0.3759,
      "step": 851
    },
    {
      "epoch": 0.13632,
      "grad_norm": 0.1925700157880783,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 852
    },
    {
      "epoch": 0.13648,
      "grad_norm": 0.1987312138080597,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 853
    },
    {
      "epoch": 0.13664,
      "grad_norm": 0.16517019271850586,
      "learning_rate": 0.0001,
      "loss": 0.3833,
      "step": 854
    },
    {
      "epoch": 0.1368,
      "grad_norm": 0.22360557317733765,
      "learning_rate": 0.0001,
      "loss": 0.3704,
      "step": 855
    },
    {
      "epoch": 0.13696,
      "grad_norm": 0.15423625707626343,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 856
    },
    {
      "epoch": 0.13712,
      "grad_norm": 0.14938649535179138,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 857
    },
    {
      "epoch": 0.13728,
      "grad_norm": 0.17067530751228333,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 858
    },
    {
      "epoch": 0.13744,
      "grad_norm": 0.20264247059822083,
      "learning_rate": 0.0001,
      "loss": 0.3659,
      "step": 859
    },
    {
      "epoch": 0.1376,
      "grad_norm": 0.1750536412000656,
      "learning_rate": 0.0001,
      "loss": 0.3798,
      "step": 860
    },
    {
      "epoch": 0.13776,
      "grad_norm": 0.1618185192346573,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 861
    },
    {
      "epoch": 0.13792,
      "grad_norm": 0.30298155546188354,
      "learning_rate": 0.0001,
      "loss": 0.3799,
      "step": 862
    },
    {
      "epoch": 0.13808,
      "grad_norm": 0.20009639859199524,
      "learning_rate": 0.0001,
      "loss": 0.3851,
      "step": 863
    },
    {
      "epoch": 0.13824,
      "grad_norm": 0.1875382512807846,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 864
    },
    {
      "epoch": 0.1384,
      "grad_norm": 0.16743899881839752,
      "learning_rate": 0.0001,
      "loss": 0.3656,
      "step": 865
    },
    {
      "epoch": 0.13856,
      "grad_norm": 0.19503755867481232,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 866
    },
    {
      "epoch": 0.13872,
      "grad_norm": 0.1870070993900299,
      "learning_rate": 0.0001,
      "loss": 0.3777,
      "step": 867
    },
    {
      "epoch": 0.13888,
      "grad_norm": 0.15434890985488892,
      "learning_rate": 0.0001,
      "loss": 0.3739,
      "step": 868
    },
    {
      "epoch": 0.13904,
      "grad_norm": 0.16039985418319702,
      "learning_rate": 0.0001,
      "loss": 0.3778,
      "step": 869
    },
    {
      "epoch": 0.1392,
      "grad_norm": 0.16572701930999756,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 870
    },
    {
      "epoch": 0.13936,
      "grad_norm": 0.16422826051712036,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 871
    },
    {
      "epoch": 0.13952,
      "grad_norm": 0.14224690198898315,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 872
    },
    {
      "epoch": 0.13968,
      "grad_norm": 0.18542666733264923,
      "learning_rate": 0.0001,
      "loss": 0.3758,
      "step": 873
    },
    {
      "epoch": 0.13984,
      "grad_norm": 0.1666616052389145,
      "learning_rate": 0.0001,
      "loss": 0.3731,
      "step": 874
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.18763062357902527,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 875
    },
    {
      "epoch": 0.14016,
      "grad_norm": 0.16101861000061035,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 876
    },
    {
      "epoch": 0.14032,
      "grad_norm": 0.14203451573848724,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 877
    },
    {
      "epoch": 0.14048,
      "grad_norm": 0.15412181615829468,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 878
    },
    {
      "epoch": 0.14064,
      "grad_norm": 0.21869930624961853,
      "learning_rate": 0.0001,
      "loss": 0.3685,
      "step": 879
    },
    {
      "epoch": 0.1408,
      "grad_norm": 0.21262864768505096,
      "learning_rate": 0.0001,
      "loss": 0.3859,
      "step": 880
    },
    {
      "epoch": 0.14096,
      "grad_norm": 0.201859712600708,
      "learning_rate": 0.0001,
      "loss": 0.388,
      "step": 881
    },
    {
      "epoch": 0.14112,
      "grad_norm": 0.17705099284648895,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 882
    },
    {
      "epoch": 0.14128,
      "grad_norm": 0.1787026971578598,
      "learning_rate": 0.0001,
      "loss": 0.3679,
      "step": 883
    },
    {
      "epoch": 0.14144,
      "grad_norm": 0.1717889904975891,
      "learning_rate": 0.0001,
      "loss": 0.3906,
      "step": 884
    },
    {
      "epoch": 0.1416,
      "grad_norm": 0.17861080169677734,
      "learning_rate": 0.0001,
      "loss": 0.3811,
      "step": 885
    },
    {
      "epoch": 0.14176,
      "grad_norm": 0.15057824552059174,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 886
    },
    {
      "epoch": 0.14192,
      "grad_norm": 0.23312978446483612,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 887
    },
    {
      "epoch": 0.14208,
      "grad_norm": 0.14646071195602417,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 888
    },
    {
      "epoch": 0.14224,
      "grad_norm": 0.17173615097999573,
      "learning_rate": 0.0001,
      "loss": 0.3727,
      "step": 889
    },
    {
      "epoch": 0.1424,
      "grad_norm": 0.17907093465328217,
      "learning_rate": 0.0001,
      "loss": 0.3893,
      "step": 890
    },
    {
      "epoch": 0.14256,
      "grad_norm": 0.18731355667114258,
      "learning_rate": 0.0001,
      "loss": 0.3855,
      "step": 891
    },
    {
      "epoch": 0.14272,
      "grad_norm": 0.1967649608850479,
      "learning_rate": 0.0001,
      "loss": 0.3754,
      "step": 892
    },
    {
      "epoch": 0.14288,
      "grad_norm": 0.20007000863552094,
      "learning_rate": 0.0001,
      "loss": 0.3751,
      "step": 893
    },
    {
      "epoch": 0.14304,
      "grad_norm": 0.1485624760389328,
      "learning_rate": 0.0001,
      "loss": 0.358,
      "step": 894
    },
    {
      "epoch": 0.1432,
      "grad_norm": 0.14492009580135345,
      "learning_rate": 0.0001,
      "loss": 0.3487,
      "step": 895
    },
    {
      "epoch": 0.14336,
      "grad_norm": 0.17129802703857422,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 896
    },
    {
      "epoch": 0.14352,
      "grad_norm": 0.17564205825328827,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 897
    },
    {
      "epoch": 0.14368,
      "grad_norm": 0.1549040973186493,
      "learning_rate": 0.0001,
      "loss": 0.3633,
      "step": 898
    },
    {
      "epoch": 0.14384,
      "grad_norm": 0.18636177480220795,
      "learning_rate": 0.0001,
      "loss": 0.377,
      "step": 899
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.17704512178897858,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 900
    },
    {
      "epoch": 0.144,
      "eval_train_accuracy": 0.5022,
      "eval_train_loss": 0.37065601348876953,
      "eval_train_runtime": 4.5283,
      "eval_train_samples_per_second": 1104.157,
      "eval_train_steps_per_second": 13.912,
      "step": 900
    },
    {
      "epoch": 0.144,
      "eval_test_accuracy": 0.5064,
      "eval_test_loss": 0.3691280484199524,
      "eval_test_runtime": 4.6297,
      "eval_test_samples_per_second": 1079.987,
      "eval_test_steps_per_second": 13.608,
      "step": 900
    },
    {
      "epoch": 0.14416,
      "grad_norm": 0.15036799013614655,
      "learning_rate": 0.0001,
      "loss": 0.3663,
      "step": 901
    },
    {
      "epoch": 0.14432,
      "grad_norm": 0.24304813146591187,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 902
    },
    {
      "epoch": 0.14448,
      "grad_norm": 0.19262580573558807,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 903
    },
    {
      "epoch": 0.14464,
      "grad_norm": 0.2722340524196625,
      "learning_rate": 0.0001,
      "loss": 0.3778,
      "step": 904
    },
    {
      "epoch": 0.1448,
      "grad_norm": 0.20259219408035278,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 905
    },
    {
      "epoch": 0.14496,
      "grad_norm": 0.1589525192975998,
      "learning_rate": 0.0001,
      "loss": 0.3634,
      "step": 906
    },
    {
      "epoch": 0.14512,
      "grad_norm": 0.21281974017620087,
      "learning_rate": 0.0001,
      "loss": 0.3811,
      "step": 907
    },
    {
      "epoch": 0.14528,
      "grad_norm": 0.26933610439300537,
      "learning_rate": 0.0001,
      "loss": 0.3688,
      "step": 908
    },
    {
      "epoch": 0.14544,
      "grad_norm": 0.18119513988494873,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 909
    },
    {
      "epoch": 0.1456,
      "grad_norm": 0.1765950471162796,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 910
    },
    {
      "epoch": 0.14576,
      "grad_norm": 0.22304175794124603,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 911
    },
    {
      "epoch": 0.14592,
      "grad_norm": 0.19182230532169342,
      "learning_rate": 0.0001,
      "loss": 0.3881,
      "step": 912
    },
    {
      "epoch": 0.14608,
      "grad_norm": 0.1654471606016159,
      "learning_rate": 0.0001,
      "loss": 0.3712,
      "step": 913
    },
    {
      "epoch": 0.14624,
      "grad_norm": 0.16559521853923798,
      "learning_rate": 0.0001,
      "loss": 0.3799,
      "step": 914
    },
    {
      "epoch": 0.1464,
      "grad_norm": 0.14919336140155792,
      "learning_rate": 0.0001,
      "loss": 0.3539,
      "step": 915
    },
    {
      "epoch": 0.14656,
      "grad_norm": 0.1935916244983673,
      "learning_rate": 0.0001,
      "loss": 0.3817,
      "step": 916
    },
    {
      "epoch": 0.14672,
      "grad_norm": 0.17732807993888855,
      "learning_rate": 0.0001,
      "loss": 0.3799,
      "step": 917
    },
    {
      "epoch": 0.14688,
      "grad_norm": 0.25952383875846863,
      "learning_rate": 0.0001,
      "loss": 0.3764,
      "step": 918
    },
    {
      "epoch": 0.14704,
      "grad_norm": 0.18685434758663177,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 919
    },
    {
      "epoch": 0.1472,
      "grad_norm": 0.19747227430343628,
      "learning_rate": 0.0001,
      "loss": 0.382,
      "step": 920
    },
    {
      "epoch": 0.14736,
      "grad_norm": 0.19452805817127228,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 921
    },
    {
      "epoch": 0.14752,
      "grad_norm": 0.17381760478019714,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 922
    },
    {
      "epoch": 0.14768,
      "grad_norm": 0.21663647890090942,
      "learning_rate": 0.0001,
      "loss": 0.3719,
      "step": 923
    },
    {
      "epoch": 0.14784,
      "grad_norm": 0.2010987102985382,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "step": 924
    },
    {
      "epoch": 0.148,
      "grad_norm": 0.3009086549282074,
      "learning_rate": 0.0001,
      "loss": 0.3597,
      "step": 925
    },
    {
      "epoch": 0.14816,
      "grad_norm": 0.16225503385066986,
      "learning_rate": 0.0001,
      "loss": 0.3907,
      "step": 926
    },
    {
      "epoch": 0.14832,
      "grad_norm": 0.3113621473312378,
      "learning_rate": 0.0001,
      "loss": 0.369,
      "step": 927
    },
    {
      "epoch": 0.14848,
      "grad_norm": 0.16110184788703918,
      "learning_rate": 0.0001,
      "loss": 0.3722,
      "step": 928
    },
    {
      "epoch": 0.14864,
      "grad_norm": 0.2779788076877594,
      "learning_rate": 0.0001,
      "loss": 0.3777,
      "step": 929
    },
    {
      "epoch": 0.1488,
      "grad_norm": 0.15740947425365448,
      "learning_rate": 0.0001,
      "loss": 0.3755,
      "step": 930
    },
    {
      "epoch": 0.14896,
      "grad_norm": 0.27901574969291687,
      "learning_rate": 0.0001,
      "loss": 0.3926,
      "step": 931
    },
    {
      "epoch": 0.14912,
      "grad_norm": 0.19498424232006073,
      "learning_rate": 0.0001,
      "loss": 0.3798,
      "step": 932
    },
    {
      "epoch": 0.14928,
      "grad_norm": 0.17310117185115814,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 933
    },
    {
      "epoch": 0.14944,
      "grad_norm": 0.20489360392093658,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 934
    },
    {
      "epoch": 0.1496,
      "grad_norm": 0.17051288485527039,
      "learning_rate": 0.0001,
      "loss": 0.3706,
      "step": 935
    },
    {
      "epoch": 0.14976,
      "grad_norm": 0.20200985670089722,
      "learning_rate": 0.0001,
      "loss": 0.3787,
      "step": 936
    },
    {
      "epoch": 0.14992,
      "grad_norm": 0.21063071489334106,
      "learning_rate": 0.0001,
      "loss": 0.3793,
      "step": 937
    },
    {
      "epoch": 0.15008,
      "grad_norm": 0.1590990126132965,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 938
    },
    {
      "epoch": 0.15024,
      "grad_norm": 0.15295925736427307,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 939
    },
    {
      "epoch": 0.1504,
      "grad_norm": 0.16949696838855743,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 940
    },
    {
      "epoch": 0.15056,
      "grad_norm": 0.17740309238433838,
      "learning_rate": 0.0001,
      "loss": 0.359,
      "step": 941
    },
    {
      "epoch": 0.15072,
      "grad_norm": 0.15461613237857819,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 942
    },
    {
      "epoch": 0.15088,
      "grad_norm": 0.1566152125597,
      "learning_rate": 0.0001,
      "loss": 0.3633,
      "step": 943
    },
    {
      "epoch": 0.15104,
      "grad_norm": 0.16196642816066742,
      "learning_rate": 0.0001,
      "loss": 0.3759,
      "step": 944
    },
    {
      "epoch": 0.1512,
      "grad_norm": 0.1816895306110382,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 945
    },
    {
      "epoch": 0.15136,
      "grad_norm": 0.16260926425457,
      "learning_rate": 0.0001,
      "loss": 0.3776,
      "step": 946
    },
    {
      "epoch": 0.15152,
      "grad_norm": 0.1610676348209381,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 947
    },
    {
      "epoch": 0.15168,
      "grad_norm": 0.16281947493553162,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 948
    },
    {
      "epoch": 0.15184,
      "grad_norm": 0.15723586082458496,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 949
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.15860316157341003,
      "learning_rate": 0.0001,
      "loss": 0.3765,
      "step": 950
    },
    {
      "epoch": 0.15216,
      "grad_norm": 0.1914493590593338,
      "learning_rate": 0.0001,
      "loss": 0.3689,
      "step": 951
    },
    {
      "epoch": 0.15232,
      "grad_norm": 0.16254928708076477,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 952
    },
    {
      "epoch": 0.15248,
      "grad_norm": 0.1854812502861023,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 953
    },
    {
      "epoch": 0.15264,
      "grad_norm": 0.17479562759399414,
      "learning_rate": 0.0001,
      "loss": 0.3657,
      "step": 954
    },
    {
      "epoch": 0.1528,
      "grad_norm": 0.15159371495246887,
      "learning_rate": 0.0001,
      "loss": 0.3709,
      "step": 955
    },
    {
      "epoch": 0.15296,
      "grad_norm": 0.15071769058704376,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 956
    },
    {
      "epoch": 0.15312,
      "grad_norm": 0.17582674324512482,
      "learning_rate": 0.0001,
      "loss": 0.3642,
      "step": 957
    },
    {
      "epoch": 0.15328,
      "grad_norm": 0.16190814971923828,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 958
    },
    {
      "epoch": 0.15344,
      "grad_norm": 0.15195409953594208,
      "learning_rate": 0.0001,
      "loss": 0.3868,
      "step": 959
    },
    {
      "epoch": 0.1536,
      "grad_norm": 0.19567671418190002,
      "learning_rate": 0.0001,
      "loss": 0.3781,
      "step": 960
    },
    {
      "epoch": 0.15376,
      "grad_norm": 0.15911319851875305,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 961
    },
    {
      "epoch": 0.15392,
      "grad_norm": 0.1516372114419937,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 962
    },
    {
      "epoch": 0.15408,
      "grad_norm": 0.15768206119537354,
      "learning_rate": 0.0001,
      "loss": 0.382,
      "step": 963
    },
    {
      "epoch": 0.15424,
      "grad_norm": 0.13835395872592926,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 964
    },
    {
      "epoch": 0.1544,
      "grad_norm": 0.15345440804958344,
      "learning_rate": 0.0001,
      "loss": 0.3774,
      "step": 965
    },
    {
      "epoch": 0.15456,
      "grad_norm": 0.14603960514068604,
      "learning_rate": 0.0001,
      "loss": 0.3775,
      "step": 966
    },
    {
      "epoch": 0.15472,
      "grad_norm": 0.16103483736515045,
      "learning_rate": 0.0001,
      "loss": 0.3822,
      "step": 967
    },
    {
      "epoch": 0.15488,
      "grad_norm": 0.1813780963420868,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 968
    },
    {
      "epoch": 0.15504,
      "grad_norm": 0.15540467202663422,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 969
    },
    {
      "epoch": 0.1552,
      "grad_norm": 0.1636764258146286,
      "learning_rate": 0.0001,
      "loss": 0.386,
      "step": 970
    },
    {
      "epoch": 0.15536,
      "grad_norm": 0.21697160601615906,
      "learning_rate": 0.0001,
      "loss": 0.3834,
      "step": 971
    },
    {
      "epoch": 0.15552,
      "grad_norm": 0.15157149732112885,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 972
    },
    {
      "epoch": 0.15568,
      "grad_norm": 0.19589944183826447,
      "learning_rate": 0.0001,
      "loss": 0.369,
      "step": 973
    },
    {
      "epoch": 0.15584,
      "grad_norm": 0.2317388653755188,
      "learning_rate": 0.0001,
      "loss": 0.3735,
      "step": 974
    },
    {
      "epoch": 0.156,
      "grad_norm": 0.15211215615272522,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 975
    },
    {
      "epoch": 0.15616,
      "grad_norm": 0.1546478271484375,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 976
    },
    {
      "epoch": 0.15632,
      "grad_norm": 0.19486834108829498,
      "learning_rate": 0.0001,
      "loss": 0.3701,
      "step": 977
    },
    {
      "epoch": 0.15648,
      "grad_norm": 0.16448502242565155,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 978
    },
    {
      "epoch": 0.15664,
      "grad_norm": 0.20384085178375244,
      "learning_rate": 0.0001,
      "loss": 0.3658,
      "step": 979
    },
    {
      "epoch": 0.1568,
      "grad_norm": 0.20655691623687744,
      "learning_rate": 0.0001,
      "loss": 0.3661,
      "step": 980
    },
    {
      "epoch": 0.15696,
      "grad_norm": 0.16594597697257996,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 981
    },
    {
      "epoch": 0.15712,
      "grad_norm": 0.16671554744243622,
      "learning_rate": 0.0001,
      "loss": 0.3748,
      "step": 982
    },
    {
      "epoch": 0.15728,
      "grad_norm": 0.1776733100414276,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 983
    },
    {
      "epoch": 0.15744,
      "grad_norm": 0.17205742001533508,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 984
    },
    {
      "epoch": 0.1576,
      "grad_norm": 0.1473296582698822,
      "learning_rate": 0.0001,
      "loss": 0.3602,
      "step": 985
    },
    {
      "epoch": 0.15776,
      "grad_norm": 0.19107824563980103,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 986
    },
    {
      "epoch": 0.15792,
      "grad_norm": 0.16542239487171173,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 987
    },
    {
      "epoch": 0.15808,
      "grad_norm": 0.2052556425333023,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 988
    },
    {
      "epoch": 0.15824,
      "grad_norm": 0.14822182059288025,
      "learning_rate": 0.0001,
      "loss": 0.3707,
      "step": 989
    },
    {
      "epoch": 0.1584,
      "grad_norm": 0.18102909624576569,
      "learning_rate": 0.0001,
      "loss": 0.3744,
      "step": 990
    },
    {
      "epoch": 0.15856,
      "grad_norm": 0.154232457280159,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 991
    },
    {
      "epoch": 0.15872,
      "grad_norm": 0.16609734296798706,
      "learning_rate": 0.0001,
      "loss": 0.3749,
      "step": 992
    },
    {
      "epoch": 0.15888,
      "grad_norm": 0.1436215341091156,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 993
    },
    {
      "epoch": 0.15904,
      "grad_norm": 0.1589873731136322,
      "learning_rate": 0.0001,
      "loss": 0.3864,
      "step": 994
    },
    {
      "epoch": 0.1592,
      "grad_norm": 0.16241472959518433,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 995
    },
    {
      "epoch": 0.15936,
      "grad_norm": 0.16462893784046173,
      "learning_rate": 0.0001,
      "loss": 0.3642,
      "step": 996
    },
    {
      "epoch": 0.15952,
      "grad_norm": 0.15397800505161285,
      "learning_rate": 0.0001,
      "loss": 0.359,
      "step": 997
    },
    {
      "epoch": 0.15968,
      "grad_norm": 0.15016065537929535,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 998
    },
    {
      "epoch": 0.15984,
      "grad_norm": 0.1568789929151535,
      "learning_rate": 0.0001,
      "loss": 0.3669,
      "step": 999
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.16351941227912903,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3691727817058563,
      "eval_train_runtime": 4.7559,
      "eval_train_samples_per_second": 1051.318,
      "eval_train_steps_per_second": 13.247,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3674299120903015,
      "eval_test_runtime": 4.2814,
      "eval_test_samples_per_second": 1167.85,
      "eval_test_steps_per_second": 14.715,
      "step": 1000
    },
    {
      "epoch": 0.16016,
      "grad_norm": 0.14168870449066162,
      "learning_rate": 0.0001,
      "loss": 0.3676,
      "step": 1001
    },
    {
      "epoch": 0.16032,
      "grad_norm": 0.1628342717885971,
      "learning_rate": 0.0001,
      "loss": 0.3781,
      "step": 1002
    },
    {
      "epoch": 0.16048,
      "grad_norm": 0.16815710067749023,
      "learning_rate": 0.0001,
      "loss": 0.3572,
      "step": 1003
    },
    {
      "epoch": 0.16064,
      "grad_norm": 0.18539394438266754,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 1004
    },
    {
      "epoch": 0.1608,
      "grad_norm": 0.13908100128173828,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 1005
    },
    {
      "epoch": 0.16096,
      "grad_norm": 0.15073497593402863,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 1006
    },
    {
      "epoch": 0.16112,
      "grad_norm": 0.170275017619133,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 1007
    },
    {
      "epoch": 0.16128,
      "grad_norm": 0.16815027594566345,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 1008
    },
    {
      "epoch": 0.16144,
      "grad_norm": 0.17042435705661774,
      "learning_rate": 0.0001,
      "loss": 0.3586,
      "step": 1009
    },
    {
      "epoch": 0.1616,
      "grad_norm": 0.1631857454776764,
      "learning_rate": 0.0001,
      "loss": 0.3726,
      "step": 1010
    },
    {
      "epoch": 0.16176,
      "grad_norm": 0.14621052145957947,
      "learning_rate": 0.0001,
      "loss": 0.3712,
      "step": 1011
    },
    {
      "epoch": 0.16192,
      "grad_norm": 0.15610606968402863,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 1012
    },
    {
      "epoch": 0.16208,
      "grad_norm": 0.16627392172813416,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1013
    },
    {
      "epoch": 0.16224,
      "grad_norm": 0.18554817140102386,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 1014
    },
    {
      "epoch": 0.1624,
      "grad_norm": 0.1418088674545288,
      "learning_rate": 0.0001,
      "loss": 0.3604,
      "step": 1015
    },
    {
      "epoch": 0.16256,
      "grad_norm": 0.12338781356811523,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 1016
    },
    {
      "epoch": 0.16272,
      "grad_norm": 0.1614808589220047,
      "learning_rate": 0.0001,
      "loss": 0.3685,
      "step": 1017
    },
    {
      "epoch": 0.16288,
      "grad_norm": 0.14826424419879913,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 1018
    },
    {
      "epoch": 0.16304,
      "grad_norm": 0.1924430876970291,
      "learning_rate": 0.0001,
      "loss": 0.3734,
      "step": 1019
    },
    {
      "epoch": 0.1632,
      "grad_norm": 0.13483896851539612,
      "learning_rate": 0.0001,
      "loss": 0.3767,
      "step": 1020
    },
    {
      "epoch": 0.16336,
      "grad_norm": 0.14857864379882812,
      "learning_rate": 0.0001,
      "loss": 0.3706,
      "step": 1021
    },
    {
      "epoch": 0.16352,
      "grad_norm": 0.19772396981716156,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 1022
    },
    {
      "epoch": 0.16368,
      "grad_norm": 0.15332932770252228,
      "learning_rate": 0.0001,
      "loss": 0.3747,
      "step": 1023
    },
    {
      "epoch": 0.16384,
      "grad_norm": 0.14484748244285583,
      "learning_rate": 0.0001,
      "loss": 0.3791,
      "step": 1024
    },
    {
      "epoch": 0.164,
      "grad_norm": 0.13959500193595886,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1025
    },
    {
      "epoch": 0.16416,
      "grad_norm": 0.17070460319519043,
      "learning_rate": 0.0001,
      "loss": 0.3786,
      "step": 1026
    },
    {
      "epoch": 0.16432,
      "grad_norm": 0.1437705159187317,
      "learning_rate": 0.0001,
      "loss": 0.3597,
      "step": 1027
    },
    {
      "epoch": 0.16448,
      "grad_norm": 0.1549587845802307,
      "learning_rate": 0.0001,
      "loss": 0.3839,
      "step": 1028
    },
    {
      "epoch": 0.16464,
      "grad_norm": 0.20221716165542603,
      "learning_rate": 0.0001,
      "loss": 0.3928,
      "step": 1029
    },
    {
      "epoch": 0.1648,
      "grad_norm": 0.1507492959499359,
      "learning_rate": 0.0001,
      "loss": 0.3941,
      "step": 1030
    },
    {
      "epoch": 0.16496,
      "grad_norm": 0.1453893929719925,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 1031
    },
    {
      "epoch": 0.16512,
      "grad_norm": 0.24747368693351746,
      "learning_rate": 0.0001,
      "loss": 0.38,
      "step": 1032
    },
    {
      "epoch": 0.16528,
      "grad_norm": 0.14740747213363647,
      "learning_rate": 0.0001,
      "loss": 0.3596,
      "step": 1033
    },
    {
      "epoch": 0.16544,
      "grad_norm": 0.15852245688438416,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 1034
    },
    {
      "epoch": 0.1656,
      "grad_norm": 0.16946257650852203,
      "learning_rate": 0.0001,
      "loss": 0.3651,
      "step": 1035
    },
    {
      "epoch": 0.16576,
      "grad_norm": 0.24311502277851105,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 1036
    },
    {
      "epoch": 0.16592,
      "grad_norm": 0.18289346992969513,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 1037
    },
    {
      "epoch": 0.16608,
      "grad_norm": 0.16661256551742554,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 1038
    },
    {
      "epoch": 0.16624,
      "grad_norm": 0.27619075775146484,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 1039
    },
    {
      "epoch": 0.1664,
      "grad_norm": 0.17931775748729706,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 1040
    },
    {
      "epoch": 0.16656,
      "grad_norm": 0.33621877431869507,
      "learning_rate": 0.0001,
      "loss": 0.3789,
      "step": 1041
    },
    {
      "epoch": 0.16672,
      "grad_norm": 0.15914921462535858,
      "learning_rate": 0.0001,
      "loss": 0.3838,
      "step": 1042
    },
    {
      "epoch": 0.16688,
      "grad_norm": 0.18978658318519592,
      "learning_rate": 0.0001,
      "loss": 0.3698,
      "step": 1043
    },
    {
      "epoch": 0.16704,
      "grad_norm": 0.26422280073165894,
      "learning_rate": 0.0001,
      "loss": 0.3863,
      "step": 1044
    },
    {
      "epoch": 0.1672,
      "grad_norm": 0.1647745966911316,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 1045
    },
    {
      "epoch": 0.16736,
      "grad_norm": 0.184989795088768,
      "learning_rate": 0.0001,
      "loss": 0.3677,
      "step": 1046
    },
    {
      "epoch": 0.16752,
      "grad_norm": 0.3278777301311493,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 1047
    },
    {
      "epoch": 0.16768,
      "grad_norm": 0.18581083416938782,
      "learning_rate": 0.0001,
      "loss": 0.3657,
      "step": 1048
    },
    {
      "epoch": 0.16784,
      "grad_norm": 0.17968808114528656,
      "learning_rate": 0.0001,
      "loss": 0.3832,
      "step": 1049
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.2106018364429474,
      "learning_rate": 0.0001,
      "loss": 0.3659,
      "step": 1050
    },
    {
      "epoch": 0.16816,
      "grad_norm": 0.2690993845462799,
      "learning_rate": 0.0001,
      "loss": 0.3794,
      "step": 1051
    },
    {
      "epoch": 0.16832,
      "grad_norm": 0.20564594864845276,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 1052
    },
    {
      "epoch": 0.16848,
      "grad_norm": 0.27932778000831604,
      "learning_rate": 0.0001,
      "loss": 0.3748,
      "step": 1053
    },
    {
      "epoch": 0.16864,
      "grad_norm": 0.18345926702022552,
      "learning_rate": 0.0001,
      "loss": 0.3672,
      "step": 1054
    },
    {
      "epoch": 0.1688,
      "grad_norm": 0.3082685172557831,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 1055
    },
    {
      "epoch": 0.16896,
      "grad_norm": 0.2463383823633194,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 1056
    },
    {
      "epoch": 0.16912,
      "grad_norm": 0.18842476606369019,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 1057
    },
    {
      "epoch": 0.16928,
      "grad_norm": 0.3329630196094513,
      "learning_rate": 0.0001,
      "loss": 0.3711,
      "step": 1058
    },
    {
      "epoch": 0.16944,
      "grad_norm": 0.18918965756893158,
      "learning_rate": 0.0001,
      "loss": 0.3791,
      "step": 1059
    },
    {
      "epoch": 0.1696,
      "grad_norm": 0.18829849362373352,
      "learning_rate": 0.0001,
      "loss": 0.3762,
      "step": 1060
    },
    {
      "epoch": 0.16976,
      "grad_norm": 0.30108213424682617,
      "learning_rate": 0.0001,
      "loss": 0.3652,
      "step": 1061
    },
    {
      "epoch": 0.16992,
      "grad_norm": 0.17584028840065002,
      "learning_rate": 0.0001,
      "loss": 0.3608,
      "step": 1062
    },
    {
      "epoch": 0.17008,
      "grad_norm": 0.17970946431159973,
      "learning_rate": 0.0001,
      "loss": 0.384,
      "step": 1063
    },
    {
      "epoch": 0.17024,
      "grad_norm": 0.16987882554531097,
      "learning_rate": 0.0001,
      "loss": 0.3787,
      "step": 1064
    },
    {
      "epoch": 0.1704,
      "grad_norm": 0.2841412425041199,
      "learning_rate": 0.0001,
      "loss": 0.3756,
      "step": 1065
    },
    {
      "epoch": 0.17056,
      "grad_norm": 0.15117324888706207,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 1066
    },
    {
      "epoch": 0.17072,
      "grad_norm": 0.14223922789096832,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 1067
    },
    {
      "epoch": 0.17088,
      "grad_norm": 0.19961008429527283,
      "learning_rate": 0.0001,
      "loss": 0.3635,
      "step": 1068
    },
    {
      "epoch": 0.17104,
      "grad_norm": 0.1945384293794632,
      "learning_rate": 0.0001,
      "loss": 0.3826,
      "step": 1069
    },
    {
      "epoch": 0.1712,
      "grad_norm": 0.14931528270244598,
      "learning_rate": 0.0001,
      "loss": 0.3663,
      "step": 1070
    },
    {
      "epoch": 0.17136,
      "grad_norm": 0.14445120096206665,
      "learning_rate": 0.0001,
      "loss": 0.3527,
      "step": 1071
    },
    {
      "epoch": 0.17152,
      "grad_norm": 0.17150245606899261,
      "learning_rate": 0.0001,
      "loss": 0.3812,
      "step": 1072
    },
    {
      "epoch": 0.17168,
      "grad_norm": 0.21853682398796082,
      "learning_rate": 0.0001,
      "loss": 0.3748,
      "step": 1073
    },
    {
      "epoch": 0.17184,
      "grad_norm": 0.1497858464717865,
      "learning_rate": 0.0001,
      "loss": 0.3764,
      "step": 1074
    },
    {
      "epoch": 0.172,
      "grad_norm": 0.1505279392004013,
      "learning_rate": 0.0001,
      "loss": 0.3628,
      "step": 1075
    },
    {
      "epoch": 0.17216,
      "grad_norm": 0.17145703732967377,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 1076
    },
    {
      "epoch": 0.17232,
      "grad_norm": 0.22394254803657532,
      "learning_rate": 0.0001,
      "loss": 0.3784,
      "step": 1077
    },
    {
      "epoch": 0.17248,
      "grad_norm": 0.1756603866815567,
      "learning_rate": 0.0001,
      "loss": 0.379,
      "step": 1078
    },
    {
      "epoch": 0.17264,
      "grad_norm": 0.1391804963350296,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1079
    },
    {
      "epoch": 0.1728,
      "grad_norm": 0.2009211778640747,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 1080
    },
    {
      "epoch": 0.17296,
      "grad_norm": 0.18606583774089813,
      "learning_rate": 0.0001,
      "loss": 0.3712,
      "step": 1081
    },
    {
      "epoch": 0.17312,
      "grad_norm": 0.15487739443778992,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 1082
    },
    {
      "epoch": 0.17328,
      "grad_norm": 0.14357440173625946,
      "learning_rate": 0.0001,
      "loss": 0.3574,
      "step": 1083
    },
    {
      "epoch": 0.17344,
      "grad_norm": 0.19789251685142517,
      "learning_rate": 0.0001,
      "loss": 0.3963,
      "step": 1084
    },
    {
      "epoch": 0.1736,
      "grad_norm": 0.1483965963125229,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 1085
    },
    {
      "epoch": 0.17376,
      "grad_norm": 0.16348473727703094,
      "learning_rate": 0.0001,
      "loss": 0.3613,
      "step": 1086
    },
    {
      "epoch": 0.17392,
      "grad_norm": 0.19141939282417297,
      "learning_rate": 0.0001,
      "loss": 0.3785,
      "step": 1087
    },
    {
      "epoch": 0.17408,
      "grad_norm": 0.13239245116710663,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 1088
    },
    {
      "epoch": 0.17424,
      "grad_norm": 0.15503886342048645,
      "learning_rate": 0.0001,
      "loss": 0.3812,
      "step": 1089
    },
    {
      "epoch": 0.1744,
      "grad_norm": 0.14857444167137146,
      "learning_rate": 0.0001,
      "loss": 0.3558,
      "step": 1090
    },
    {
      "epoch": 0.17456,
      "grad_norm": 0.2002396285533905,
      "learning_rate": 0.0001,
      "loss": 0.3885,
      "step": 1091
    },
    {
      "epoch": 0.17472,
      "grad_norm": 0.17586390674114227,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 1092
    },
    {
      "epoch": 0.17488,
      "grad_norm": 0.17588989436626434,
      "learning_rate": 0.0001,
      "loss": 0.3677,
      "step": 1093
    },
    {
      "epoch": 0.17504,
      "grad_norm": 0.1700982004404068,
      "learning_rate": 0.0001,
      "loss": 0.3604,
      "step": 1094
    },
    {
      "epoch": 0.1752,
      "grad_norm": 0.1602507382631302,
      "learning_rate": 0.0001,
      "loss": 0.3556,
      "step": 1095
    },
    {
      "epoch": 0.17536,
      "grad_norm": 0.20575395226478577,
      "learning_rate": 0.0001,
      "loss": 0.3831,
      "step": 1096
    },
    {
      "epoch": 0.17552,
      "grad_norm": 0.15979249775409698,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 1097
    },
    {
      "epoch": 0.17568,
      "grad_norm": 0.14688582718372345,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1098
    },
    {
      "epoch": 0.17584,
      "grad_norm": 0.19512587785720825,
      "learning_rate": 0.0001,
      "loss": 0.3557,
      "step": 1099
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.16108037531375885,
      "learning_rate": 0.0001,
      "loss": 0.3681,
      "step": 1100
    },
    {
      "epoch": 0.176,
      "eval_train_accuracy": 0.501,
      "eval_train_loss": 0.3676755428314209,
      "eval_train_runtime": 4.7255,
      "eval_train_samples_per_second": 1058.095,
      "eval_train_steps_per_second": 13.332,
      "step": 1100
    },
    {
      "epoch": 0.176,
      "eval_test_accuracy": 0.5052,
      "eval_test_loss": 0.3660951554775238,
      "eval_test_runtime": 4.3934,
      "eval_test_samples_per_second": 1138.068,
      "eval_test_steps_per_second": 14.34,
      "step": 1100
    },
    {
      "epoch": 0.17616,
      "grad_norm": 0.16061437129974365,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 1101
    },
    {
      "epoch": 0.17632,
      "grad_norm": 0.13428868353366852,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 1102
    },
    {
      "epoch": 0.17648,
      "grad_norm": 0.15264077484607697,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1103
    },
    {
      "epoch": 0.17664,
      "grad_norm": 0.15566527843475342,
      "learning_rate": 0.0001,
      "loss": 0.3835,
      "step": 1104
    },
    {
      "epoch": 0.1768,
      "grad_norm": 0.15575581789016724,
      "learning_rate": 0.0001,
      "loss": 0.3573,
      "step": 1105
    },
    {
      "epoch": 0.17696,
      "grad_norm": 0.15039509534835815,
      "learning_rate": 0.0001,
      "loss": 0.3604,
      "step": 1106
    },
    {
      "epoch": 0.17712,
      "grad_norm": 0.16519267857074738,
      "learning_rate": 0.0001,
      "loss": 0.3735,
      "step": 1107
    },
    {
      "epoch": 0.17728,
      "grad_norm": 0.17352691292762756,
      "learning_rate": 0.0001,
      "loss": 0.3676,
      "step": 1108
    },
    {
      "epoch": 0.17744,
      "grad_norm": 0.17877063155174255,
      "learning_rate": 0.0001,
      "loss": 0.3765,
      "step": 1109
    },
    {
      "epoch": 0.1776,
      "grad_norm": 0.15189431607723236,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 1110
    },
    {
      "epoch": 0.17776,
      "grad_norm": 0.14697380363941193,
      "learning_rate": 0.0001,
      "loss": 0.3704,
      "step": 1111
    },
    {
      "epoch": 0.17792,
      "grad_norm": 0.1676197350025177,
      "learning_rate": 0.0001,
      "loss": 0.3749,
      "step": 1112
    },
    {
      "epoch": 0.17808,
      "grad_norm": 0.15819771587848663,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 1113
    },
    {
      "epoch": 0.17824,
      "grad_norm": 0.15663190186023712,
      "learning_rate": 0.0001,
      "loss": 0.3842,
      "step": 1114
    },
    {
      "epoch": 0.1784,
      "grad_norm": 0.14353984594345093,
      "learning_rate": 0.0001,
      "loss": 0.3765,
      "step": 1115
    },
    {
      "epoch": 0.17856,
      "grad_norm": 0.13937295973300934,
      "learning_rate": 0.0001,
      "loss": 0.3803,
      "step": 1116
    },
    {
      "epoch": 0.17872,
      "grad_norm": 0.17032469809055328,
      "learning_rate": 0.0001,
      "loss": 0.3689,
      "step": 1117
    },
    {
      "epoch": 0.17888,
      "grad_norm": 0.16237111389636993,
      "learning_rate": 0.0001,
      "loss": 0.3571,
      "step": 1118
    },
    {
      "epoch": 0.17904,
      "grad_norm": 0.15620912611484528,
      "learning_rate": 0.0001,
      "loss": 0.3514,
      "step": 1119
    },
    {
      "epoch": 0.1792,
      "grad_norm": 0.144023135304451,
      "learning_rate": 0.0001,
      "loss": 0.3803,
      "step": 1120
    },
    {
      "epoch": 0.17936,
      "grad_norm": 0.19193924963474274,
      "learning_rate": 0.0001,
      "loss": 0.3774,
      "step": 1121
    },
    {
      "epoch": 0.17952,
      "grad_norm": 0.16224835813045502,
      "learning_rate": 0.0001,
      "loss": 0.3699,
      "step": 1122
    },
    {
      "epoch": 0.17968,
      "grad_norm": 0.15544164180755615,
      "learning_rate": 0.0001,
      "loss": 0.3879,
      "step": 1123
    },
    {
      "epoch": 0.17984,
      "grad_norm": 0.1521776020526886,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 1124
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.15043136477470398,
      "learning_rate": 0.0001,
      "loss": 0.3744,
      "step": 1125
    },
    {
      "epoch": 0.18016,
      "grad_norm": 0.19758611917495728,
      "learning_rate": 0.0001,
      "loss": 0.3611,
      "step": 1126
    },
    {
      "epoch": 0.18032,
      "grad_norm": 0.14096693694591522,
      "learning_rate": 0.0001,
      "loss": 0.355,
      "step": 1127
    },
    {
      "epoch": 0.18048,
      "grad_norm": 0.2018500715494156,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 1128
    },
    {
      "epoch": 0.18064,
      "grad_norm": 0.15796367824077606,
      "learning_rate": 0.0001,
      "loss": 0.3778,
      "step": 1129
    },
    {
      "epoch": 0.1808,
      "grad_norm": 0.26184460520744324,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 1130
    },
    {
      "epoch": 0.18096,
      "grad_norm": 0.17904239892959595,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 1131
    },
    {
      "epoch": 0.18112,
      "grad_norm": 0.18495948612689972,
      "learning_rate": 0.0001,
      "loss": 0.3698,
      "step": 1132
    },
    {
      "epoch": 0.18128,
      "grad_norm": 0.2295284867286682,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1133
    },
    {
      "epoch": 0.18144,
      "grad_norm": 0.26142245531082153,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 1134
    },
    {
      "epoch": 0.1816,
      "grad_norm": 0.1824701428413391,
      "learning_rate": 0.0001,
      "loss": 0.3797,
      "step": 1135
    },
    {
      "epoch": 0.18176,
      "grad_norm": 0.17220914363861084,
      "learning_rate": 0.0001,
      "loss": 0.3701,
      "step": 1136
    },
    {
      "epoch": 0.18192,
      "grad_norm": 0.22869086265563965,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1137
    },
    {
      "epoch": 0.18208,
      "grad_norm": 0.15459884703159332,
      "learning_rate": 0.0001,
      "loss": 0.3799,
      "step": 1138
    },
    {
      "epoch": 0.18224,
      "grad_norm": 0.19034916162490845,
      "learning_rate": 0.0001,
      "loss": 0.3688,
      "step": 1139
    },
    {
      "epoch": 0.1824,
      "grad_norm": 0.1827193945646286,
      "learning_rate": 0.0001,
      "loss": 0.3626,
      "step": 1140
    },
    {
      "epoch": 0.18256,
      "grad_norm": 0.1622348427772522,
      "learning_rate": 0.0001,
      "loss": 0.3692,
      "step": 1141
    },
    {
      "epoch": 0.18272,
      "grad_norm": 0.20962679386138916,
      "learning_rate": 0.0001,
      "loss": 0.3735,
      "step": 1142
    },
    {
      "epoch": 0.18288,
      "grad_norm": 0.1669769287109375,
      "learning_rate": 0.0001,
      "loss": 0.3682,
      "step": 1143
    },
    {
      "epoch": 0.18304,
      "grad_norm": 0.13723039627075195,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 1144
    },
    {
      "epoch": 0.1832,
      "grad_norm": 0.13958291709423065,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 1145
    },
    {
      "epoch": 0.18336,
      "grad_norm": 0.22193561494350433,
      "learning_rate": 0.0001,
      "loss": 0.3782,
      "step": 1146
    },
    {
      "epoch": 0.18352,
      "grad_norm": 0.16672378778457642,
      "learning_rate": 0.0001,
      "loss": 0.3741,
      "step": 1147
    },
    {
      "epoch": 0.18368,
      "grad_norm": 0.14235886931419373,
      "learning_rate": 0.0001,
      "loss": 0.3504,
      "step": 1148
    },
    {
      "epoch": 0.18384,
      "grad_norm": 0.14734366536140442,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 1149
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.14260266721248627,
      "learning_rate": 0.0001,
      "loss": 0.3715,
      "step": 1150
    },
    {
      "epoch": 0.18416,
      "grad_norm": 0.18159326910972595,
      "learning_rate": 0.0001,
      "loss": 0.3864,
      "step": 1151
    },
    {
      "epoch": 0.18432,
      "grad_norm": 0.15880201756954193,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1152
    },
    {
      "epoch": 0.18448,
      "grad_norm": 0.1701117753982544,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 1153
    },
    {
      "epoch": 0.18464,
      "grad_norm": 0.1387382298707962,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 1154
    },
    {
      "epoch": 0.1848,
      "grad_norm": 0.14935605227947235,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 1155
    },
    {
      "epoch": 0.18496,
      "grad_norm": 0.15221165120601654,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 1156
    },
    {
      "epoch": 0.18512,
      "grad_norm": 0.14759178459644318,
      "learning_rate": 0.0001,
      "loss": 0.3709,
      "step": 1157
    },
    {
      "epoch": 0.18528,
      "grad_norm": 0.13188333809375763,
      "learning_rate": 0.0001,
      "loss": 0.3558,
      "step": 1158
    },
    {
      "epoch": 0.18544,
      "grad_norm": 0.13402260839939117,
      "learning_rate": 0.0001,
      "loss": 0.3623,
      "step": 1159
    },
    {
      "epoch": 0.1856,
      "grad_norm": 0.17331886291503906,
      "learning_rate": 0.0001,
      "loss": 0.3774,
      "step": 1160
    },
    {
      "epoch": 0.18576,
      "grad_norm": 0.15575629472732544,
      "learning_rate": 0.0001,
      "loss": 0.3725,
      "step": 1161
    },
    {
      "epoch": 0.18592,
      "grad_norm": 0.1687924861907959,
      "learning_rate": 0.0001,
      "loss": 0.3813,
      "step": 1162
    },
    {
      "epoch": 0.18608,
      "grad_norm": 0.18565219640731812,
      "learning_rate": 0.0001,
      "loss": 0.3722,
      "step": 1163
    },
    {
      "epoch": 0.18624,
      "grad_norm": 0.14164580404758453,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 1164
    },
    {
      "epoch": 0.1864,
      "grad_norm": 0.18750706315040588,
      "learning_rate": 0.0001,
      "loss": 0.3661,
      "step": 1165
    },
    {
      "epoch": 0.18656,
      "grad_norm": 0.19594869017601013,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 1166
    },
    {
      "epoch": 0.18672,
      "grad_norm": 0.15738405287265778,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 1167
    },
    {
      "epoch": 0.18688,
      "grad_norm": 0.15702322125434875,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1168
    },
    {
      "epoch": 0.18704,
      "grad_norm": 0.1557425558567047,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1169
    },
    {
      "epoch": 0.1872,
      "grad_norm": 0.19467096030712128,
      "learning_rate": 0.0001,
      "loss": 0.362,
      "step": 1170
    },
    {
      "epoch": 0.18736,
      "grad_norm": 0.1462082415819168,
      "learning_rate": 0.0001,
      "loss": 0.3731,
      "step": 1171
    },
    {
      "epoch": 0.18752,
      "grad_norm": 0.19774314761161804,
      "learning_rate": 0.0001,
      "loss": 0.3683,
      "step": 1172
    },
    {
      "epoch": 0.18768,
      "grad_norm": 0.13867820799350739,
      "learning_rate": 0.0001,
      "loss": 0.3673,
      "step": 1173
    },
    {
      "epoch": 0.18784,
      "grad_norm": 0.17070259153842926,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1174
    },
    {
      "epoch": 0.188,
      "grad_norm": 0.16153669357299805,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 1175
    },
    {
      "epoch": 0.18816,
      "grad_norm": 0.16808053851127625,
      "learning_rate": 0.0001,
      "loss": 0.3783,
      "step": 1176
    },
    {
      "epoch": 0.18832,
      "grad_norm": 0.1544555127620697,
      "learning_rate": 0.0001,
      "loss": 0.3628,
      "step": 1177
    },
    {
      "epoch": 0.18848,
      "grad_norm": 0.17782752215862274,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 1178
    },
    {
      "epoch": 0.18864,
      "grad_norm": 0.1451413929462433,
      "learning_rate": 0.0001,
      "loss": 0.3575,
      "step": 1179
    },
    {
      "epoch": 0.1888,
      "grad_norm": 0.15101225674152374,
      "learning_rate": 0.0001,
      "loss": 0.3574,
      "step": 1180
    },
    {
      "epoch": 0.18896,
      "grad_norm": 0.16654132306575775,
      "learning_rate": 0.0001,
      "loss": 0.3692,
      "step": 1181
    },
    {
      "epoch": 0.18912,
      "grad_norm": 0.15543173253536224,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 1182
    },
    {
      "epoch": 0.18928,
      "grad_norm": 0.1452939510345459,
      "learning_rate": 0.0001,
      "loss": 0.3781,
      "step": 1183
    },
    {
      "epoch": 0.18944,
      "grad_norm": 0.23462462425231934,
      "learning_rate": 0.0001,
      "loss": 0.3803,
      "step": 1184
    },
    {
      "epoch": 0.1896,
      "grad_norm": 0.2010422945022583,
      "learning_rate": 0.0001,
      "loss": 0.368,
      "step": 1185
    },
    {
      "epoch": 0.18976,
      "grad_norm": 0.2086213082075119,
      "learning_rate": 0.0001,
      "loss": 0.3927,
      "step": 1186
    },
    {
      "epoch": 0.18992,
      "grad_norm": 0.1740254908800125,
      "learning_rate": 0.0001,
      "loss": 0.378,
      "step": 1187
    },
    {
      "epoch": 0.19008,
      "grad_norm": 0.18367476761341095,
      "learning_rate": 0.0001,
      "loss": 0.3685,
      "step": 1188
    },
    {
      "epoch": 0.19024,
      "grad_norm": 0.1543489396572113,
      "learning_rate": 0.0001,
      "loss": 0.3725,
      "step": 1189
    },
    {
      "epoch": 0.1904,
      "grad_norm": 0.1685195118188858,
      "learning_rate": 0.0001,
      "loss": 0.3755,
      "step": 1190
    },
    {
      "epoch": 0.19056,
      "grad_norm": 0.13690949976444244,
      "learning_rate": 0.0001,
      "loss": 0.3663,
      "step": 1191
    },
    {
      "epoch": 0.19072,
      "grad_norm": 0.13156268000602722,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 1192
    },
    {
      "epoch": 0.19088,
      "grad_norm": 0.1639792025089264,
      "learning_rate": 0.0001,
      "loss": 0.3561,
      "step": 1193
    },
    {
      "epoch": 0.19104,
      "grad_norm": 0.15040075778961182,
      "learning_rate": 0.0001,
      "loss": 0.378,
      "step": 1194
    },
    {
      "epoch": 0.1912,
      "grad_norm": 0.13685128092765808,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 1195
    },
    {
      "epoch": 0.19136,
      "grad_norm": 0.151362344622612,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 1196
    },
    {
      "epoch": 0.19152,
      "grad_norm": 0.13724280893802643,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1197
    },
    {
      "epoch": 0.19168,
      "grad_norm": 0.15553994476795197,
      "learning_rate": 0.0001,
      "loss": 0.3875,
      "step": 1198
    },
    {
      "epoch": 0.19184,
      "grad_norm": 0.12465572357177734,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 1199
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.19378620386123657,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 1200
    },
    {
      "epoch": 0.192,
      "eval_train_accuracy": 0.4984,
      "eval_train_loss": 0.36689385771751404,
      "eval_train_runtime": 4.5512,
      "eval_train_samples_per_second": 1098.6,
      "eval_train_steps_per_second": 13.842,
      "step": 1200
    },
    {
      "epoch": 0.192,
      "eval_test_accuracy": 0.499,
      "eval_test_loss": 0.36541929841041565,
      "eval_test_runtime": 4.6325,
      "eval_test_samples_per_second": 1079.327,
      "eval_test_steps_per_second": 13.6,
      "step": 1200
    },
    {
      "epoch": 0.19216,
      "grad_norm": 0.12279855459928513,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 1201
    },
    {
      "epoch": 0.19232,
      "grad_norm": 0.1513526439666748,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 1202
    },
    {
      "epoch": 0.19248,
      "grad_norm": 0.16487929224967957,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 1203
    },
    {
      "epoch": 0.19264,
      "grad_norm": 0.13468630611896515,
      "learning_rate": 0.0001,
      "loss": 0.3635,
      "step": 1204
    },
    {
      "epoch": 0.1928,
      "grad_norm": 0.12051457911729813,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1205
    },
    {
      "epoch": 0.19296,
      "grad_norm": 0.14052143692970276,
      "learning_rate": 0.0001,
      "loss": 0.3689,
      "step": 1206
    },
    {
      "epoch": 0.19312,
      "grad_norm": 0.15201465785503387,
      "learning_rate": 0.0001,
      "loss": 0.3733,
      "step": 1207
    },
    {
      "epoch": 0.19328,
      "grad_norm": 0.15284867584705353,
      "learning_rate": 0.0001,
      "loss": 0.3642,
      "step": 1208
    },
    {
      "epoch": 0.19344,
      "grad_norm": 0.16164593398571014,
      "learning_rate": 0.0001,
      "loss": 0.3778,
      "step": 1209
    },
    {
      "epoch": 0.1936,
      "grad_norm": 0.13639363646507263,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 1210
    },
    {
      "epoch": 0.19376,
      "grad_norm": 0.13734707236289978,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 1211
    },
    {
      "epoch": 0.19392,
      "grad_norm": 0.14813372492790222,
      "learning_rate": 0.0001,
      "loss": 0.3706,
      "step": 1212
    },
    {
      "epoch": 0.19408,
      "grad_norm": 0.1585320085287094,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1213
    },
    {
      "epoch": 0.19424,
      "grad_norm": 0.13629303872585297,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 1214
    },
    {
      "epoch": 0.1944,
      "grad_norm": 0.1365976780653,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 1215
    },
    {
      "epoch": 0.19456,
      "grad_norm": 0.13238780200481415,
      "learning_rate": 0.0001,
      "loss": 0.3637,
      "step": 1216
    },
    {
      "epoch": 0.19472,
      "grad_norm": 0.1594698131084442,
      "learning_rate": 0.0001,
      "loss": 0.3804,
      "step": 1217
    },
    {
      "epoch": 0.19488,
      "grad_norm": 0.1420021653175354,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 1218
    },
    {
      "epoch": 0.19504,
      "grad_norm": 0.18834450840950012,
      "learning_rate": 0.0001,
      "loss": 0.3576,
      "step": 1219
    },
    {
      "epoch": 0.1952,
      "grad_norm": 0.13310466706752777,
      "learning_rate": 0.0001,
      "loss": 0.3619,
      "step": 1220
    },
    {
      "epoch": 0.19536,
      "grad_norm": 0.1814686506986618,
      "learning_rate": 0.0001,
      "loss": 0.3828,
      "step": 1221
    },
    {
      "epoch": 0.19552,
      "grad_norm": 0.1364152580499649,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 1222
    },
    {
      "epoch": 0.19568,
      "grad_norm": 0.14705771207809448,
      "learning_rate": 0.0001,
      "loss": 0.3522,
      "step": 1223
    },
    {
      "epoch": 0.19584,
      "grad_norm": 0.1606043130159378,
      "learning_rate": 0.0001,
      "loss": 0.3719,
      "step": 1224
    },
    {
      "epoch": 0.196,
      "grad_norm": 0.18249666690826416,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 1225
    },
    {
      "epoch": 0.19616,
      "grad_norm": 0.15926524996757507,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1226
    },
    {
      "epoch": 0.19632,
      "grad_norm": 0.15353022515773773,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 1227
    },
    {
      "epoch": 0.19648,
      "grad_norm": 0.22417952120304108,
      "learning_rate": 0.0001,
      "loss": 0.3712,
      "step": 1228
    },
    {
      "epoch": 0.19664,
      "grad_norm": 0.158924400806427,
      "learning_rate": 0.0001,
      "loss": 0.3669,
      "step": 1229
    },
    {
      "epoch": 0.1968,
      "grad_norm": 0.14016495645046234,
      "learning_rate": 0.0001,
      "loss": 0.3819,
      "step": 1230
    },
    {
      "epoch": 0.19696,
      "grad_norm": 0.1581455022096634,
      "learning_rate": 0.0001,
      "loss": 0.3746,
      "step": 1231
    },
    {
      "epoch": 0.19712,
      "grad_norm": 0.2602314054965973,
      "learning_rate": 0.0001,
      "loss": 0.3659,
      "step": 1232
    },
    {
      "epoch": 0.19728,
      "grad_norm": 0.14352403581142426,
      "learning_rate": 0.0001,
      "loss": 0.351,
      "step": 1233
    },
    {
      "epoch": 0.19744,
      "grad_norm": 0.1812640279531479,
      "learning_rate": 0.0001,
      "loss": 0.3644,
      "step": 1234
    },
    {
      "epoch": 0.1976,
      "grad_norm": 0.32981881499290466,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 1235
    },
    {
      "epoch": 0.19776,
      "grad_norm": 0.13163095712661743,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 1236
    },
    {
      "epoch": 0.19792,
      "grad_norm": 0.1442926675081253,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 1237
    },
    {
      "epoch": 0.19808,
      "grad_norm": 0.23474131524562836,
      "learning_rate": 0.0001,
      "loss": 0.3688,
      "step": 1238
    },
    {
      "epoch": 0.19824,
      "grad_norm": 0.2565016448497772,
      "learning_rate": 0.0001,
      "loss": 0.3628,
      "step": 1239
    },
    {
      "epoch": 0.1984,
      "grad_norm": 0.15506143867969513,
      "learning_rate": 0.0001,
      "loss": 0.3889,
      "step": 1240
    },
    {
      "epoch": 0.19856,
      "grad_norm": 0.14199048280715942,
      "learning_rate": 0.0001,
      "loss": 0.3714,
      "step": 1241
    },
    {
      "epoch": 0.19872,
      "grad_norm": 0.21575629711151123,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 1242
    },
    {
      "epoch": 0.19888,
      "grad_norm": 0.16296352446079254,
      "learning_rate": 0.0001,
      "loss": 0.3606,
      "step": 1243
    },
    {
      "epoch": 0.19904,
      "grad_norm": 0.1444777250289917,
      "learning_rate": 0.0001,
      "loss": 0.3737,
      "step": 1244
    },
    {
      "epoch": 0.1992,
      "grad_norm": 0.12928256392478943,
      "learning_rate": 0.0001,
      "loss": 0.3633,
      "step": 1245
    },
    {
      "epoch": 0.19936,
      "grad_norm": 0.16341160237789154,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1246
    },
    {
      "epoch": 0.19952,
      "grad_norm": 0.1379290074110031,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 1247
    },
    {
      "epoch": 0.19968,
      "grad_norm": 0.14538009464740753,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1248
    },
    {
      "epoch": 0.19984,
      "grad_norm": 0.15615896880626678,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 1249
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.13652965426445007,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1250
    },
    {
      "epoch": 0.20016,
      "grad_norm": 0.14447560906410217,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 1251
    },
    {
      "epoch": 0.20032,
      "grad_norm": 0.15065298974514008,
      "learning_rate": 0.0001,
      "loss": 0.3798,
      "step": 1252
    },
    {
      "epoch": 0.20048,
      "grad_norm": 0.16355343163013458,
      "learning_rate": 0.0001,
      "loss": 0.3778,
      "step": 1253
    },
    {
      "epoch": 0.20064,
      "grad_norm": 0.12842394411563873,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 1254
    },
    {
      "epoch": 0.2008,
      "grad_norm": 0.13819290697574615,
      "learning_rate": 0.0001,
      "loss": 0.3673,
      "step": 1255
    },
    {
      "epoch": 0.20096,
      "grad_norm": 0.15856637060642242,
      "learning_rate": 0.0001,
      "loss": 0.3783,
      "step": 1256
    },
    {
      "epoch": 0.20112,
      "grad_norm": 0.13044588267803192,
      "learning_rate": 0.0001,
      "loss": 0.361,
      "step": 1257
    },
    {
      "epoch": 0.20128,
      "grad_norm": 0.13950498402118683,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 1258
    },
    {
      "epoch": 0.20144,
      "grad_norm": 0.186458021402359,
      "learning_rate": 0.0001,
      "loss": 0.3816,
      "step": 1259
    },
    {
      "epoch": 0.2016,
      "grad_norm": 0.13451656699180603,
      "learning_rate": 0.0001,
      "loss": 0.3658,
      "step": 1260
    },
    {
      "epoch": 0.20176,
      "grad_norm": 0.15606309473514557,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 1261
    },
    {
      "epoch": 0.20192,
      "grad_norm": 0.1546175181865692,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 1262
    },
    {
      "epoch": 0.20208,
      "grad_norm": 0.1405663788318634,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 1263
    },
    {
      "epoch": 0.20224,
      "grad_norm": 0.15079325437545776,
      "learning_rate": 0.0001,
      "loss": 0.365,
      "step": 1264
    },
    {
      "epoch": 0.2024,
      "grad_norm": 0.15368817746639252,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 1265
    },
    {
      "epoch": 0.20256,
      "grad_norm": 0.1623399555683136,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 1266
    },
    {
      "epoch": 0.20272,
      "grad_norm": 0.13507552444934845,
      "learning_rate": 0.0001,
      "loss": 0.3628,
      "step": 1267
    },
    {
      "epoch": 0.20288,
      "grad_norm": 0.14205440878868103,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1268
    },
    {
      "epoch": 0.20304,
      "grad_norm": 0.13297632336616516,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 1269
    },
    {
      "epoch": 0.2032,
      "grad_norm": 0.13612088561058044,
      "learning_rate": 0.0001,
      "loss": 0.3676,
      "step": 1270
    },
    {
      "epoch": 0.20336,
      "grad_norm": 0.1541633903980255,
      "learning_rate": 0.0001,
      "loss": 0.3711,
      "step": 1271
    },
    {
      "epoch": 0.20352,
      "grad_norm": 0.12934976816177368,
      "learning_rate": 0.0001,
      "loss": 0.3698,
      "step": 1272
    },
    {
      "epoch": 0.20368,
      "grad_norm": 0.157146155834198,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1273
    },
    {
      "epoch": 0.20384,
      "grad_norm": 0.13328340649604797,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 1274
    },
    {
      "epoch": 0.204,
      "grad_norm": 0.13629363477230072,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 1275
    },
    {
      "epoch": 0.20416,
      "grad_norm": 0.17790894210338593,
      "learning_rate": 0.0001,
      "loss": 0.3676,
      "step": 1276
    },
    {
      "epoch": 0.20432,
      "grad_norm": 0.12866486608982086,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1277
    },
    {
      "epoch": 0.20448,
      "grad_norm": 0.18450616300106049,
      "learning_rate": 0.0001,
      "loss": 0.3597,
      "step": 1278
    },
    {
      "epoch": 0.20464,
      "grad_norm": 0.17041203379631042,
      "learning_rate": 0.0001,
      "loss": 0.3706,
      "step": 1279
    },
    {
      "epoch": 0.2048,
      "grad_norm": 0.12148156762123108,
      "learning_rate": 0.0001,
      "loss": 0.3542,
      "step": 1280
    },
    {
      "epoch": 0.20496,
      "grad_norm": 0.12415722012519836,
      "learning_rate": 0.0001,
      "loss": 0.3683,
      "step": 1281
    },
    {
      "epoch": 0.20512,
      "grad_norm": 0.1553071290254593,
      "learning_rate": 0.0001,
      "loss": 0.3634,
      "step": 1282
    },
    {
      "epoch": 0.20528,
      "grad_norm": 0.12647083401679993,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 1283
    },
    {
      "epoch": 0.20544,
      "grad_norm": 0.15285621583461761,
      "learning_rate": 0.0001,
      "loss": 0.3779,
      "step": 1284
    },
    {
      "epoch": 0.2056,
      "grad_norm": 0.20005162060260773,
      "learning_rate": 0.0001,
      "loss": 0.3717,
      "step": 1285
    },
    {
      "epoch": 0.20576,
      "grad_norm": 0.1609719693660736,
      "learning_rate": 0.0001,
      "loss": 0.3775,
      "step": 1286
    },
    {
      "epoch": 0.20592,
      "grad_norm": 0.15616540610790253,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1287
    },
    {
      "epoch": 0.20608,
      "grad_norm": 0.17511200904846191,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 1288
    },
    {
      "epoch": 0.20624,
      "grad_norm": 0.1622336357831955,
      "learning_rate": 0.0001,
      "loss": 0.3843,
      "step": 1289
    },
    {
      "epoch": 0.2064,
      "grad_norm": 0.2228599637746811,
      "learning_rate": 0.0001,
      "loss": 0.3806,
      "step": 1290
    },
    {
      "epoch": 0.20656,
      "grad_norm": 0.14653848111629486,
      "learning_rate": 0.0001,
      "loss": 0.3639,
      "step": 1291
    },
    {
      "epoch": 0.20672,
      "grad_norm": 0.13327793776988983,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1292
    },
    {
      "epoch": 0.20688,
      "grad_norm": 0.13943645358085632,
      "learning_rate": 0.0001,
      "loss": 0.3659,
      "step": 1293
    },
    {
      "epoch": 0.20704,
      "grad_norm": 0.2528870701789856,
      "learning_rate": 0.0001,
      "loss": 0.349,
      "step": 1294
    },
    {
      "epoch": 0.2072,
      "grad_norm": 0.14119704067707062,
      "learning_rate": 0.0001,
      "loss": 0.3697,
      "step": 1295
    },
    {
      "epoch": 0.20736,
      "grad_norm": 0.18999627232551575,
      "learning_rate": 0.0001,
      "loss": 0.3559,
      "step": 1296
    },
    {
      "epoch": 0.20752,
      "grad_norm": 0.18318809568881989,
      "learning_rate": 0.0001,
      "loss": 0.3801,
      "step": 1297
    },
    {
      "epoch": 0.20768,
      "grad_norm": 0.1516823172569275,
      "learning_rate": 0.0001,
      "loss": 0.3544,
      "step": 1298
    },
    {
      "epoch": 0.20784,
      "grad_norm": 0.21356719732284546,
      "learning_rate": 0.0001,
      "loss": 0.3673,
      "step": 1299
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.14812013506889343,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1300
    },
    {
      "epoch": 0.208,
      "eval_train_accuracy": 0.5068,
      "eval_train_loss": 0.36482667922973633,
      "eval_train_runtime": 4.6634,
      "eval_train_samples_per_second": 1072.185,
      "eval_train_steps_per_second": 13.51,
      "step": 1300
    },
    {
      "epoch": 0.208,
      "eval_test_accuracy": 0.5118,
      "eval_test_loss": 0.36338332295417786,
      "eval_test_runtime": 4.2717,
      "eval_test_samples_per_second": 1170.506,
      "eval_test_steps_per_second": 14.748,
      "step": 1300
    },
    {
      "epoch": 0.20816,
      "grad_norm": 0.15621329843997955,
      "learning_rate": 0.0001,
      "loss": 0.3691,
      "step": 1301
    },
    {
      "epoch": 0.20832,
      "grad_norm": 0.1294126957654953,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 1302
    },
    {
      "epoch": 0.20848,
      "grad_norm": 0.1537076085805893,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 1303
    },
    {
      "epoch": 0.20864,
      "grad_norm": 0.15258724987506866,
      "learning_rate": 0.0001,
      "loss": 0.3749,
      "step": 1304
    },
    {
      "epoch": 0.2088,
      "grad_norm": 0.13405004143714905,
      "learning_rate": 0.0001,
      "loss": 0.3721,
      "step": 1305
    },
    {
      "epoch": 0.20896,
      "grad_norm": 0.15356901288032532,
      "learning_rate": 0.0001,
      "loss": 0.3836,
      "step": 1306
    },
    {
      "epoch": 0.20912,
      "grad_norm": 0.13739128410816193,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 1307
    },
    {
      "epoch": 0.20928,
      "grad_norm": 0.13757522404193878,
      "learning_rate": 0.0001,
      "loss": 0.3687,
      "step": 1308
    },
    {
      "epoch": 0.20944,
      "grad_norm": 0.13916075229644775,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 1309
    },
    {
      "epoch": 0.2096,
      "grad_norm": 0.1476927399635315,
      "learning_rate": 0.0001,
      "loss": 0.3696,
      "step": 1310
    },
    {
      "epoch": 0.20976,
      "grad_norm": 0.1698295921087265,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 1311
    },
    {
      "epoch": 0.20992,
      "grad_norm": 0.15640830993652344,
      "learning_rate": 0.0001,
      "loss": 0.3736,
      "step": 1312
    },
    {
      "epoch": 0.21008,
      "grad_norm": 0.16966192424297333,
      "learning_rate": 0.0001,
      "loss": 0.3692,
      "step": 1313
    },
    {
      "epoch": 0.21024,
      "grad_norm": 0.15144149959087372,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1314
    },
    {
      "epoch": 0.2104,
      "grad_norm": 0.1608903408050537,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 1315
    },
    {
      "epoch": 0.21056,
      "grad_norm": 0.2039736807346344,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 1316
    },
    {
      "epoch": 0.21072,
      "grad_norm": 0.15922759473323822,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1317
    },
    {
      "epoch": 0.21088,
      "grad_norm": 0.1586785912513733,
      "learning_rate": 0.0001,
      "loss": 0.3846,
      "step": 1318
    },
    {
      "epoch": 0.21104,
      "grad_norm": 0.20147137343883514,
      "learning_rate": 0.0001,
      "loss": 0.3608,
      "step": 1319
    },
    {
      "epoch": 0.2112,
      "grad_norm": 0.19637353718280792,
      "learning_rate": 0.0001,
      "loss": 0.3634,
      "step": 1320
    },
    {
      "epoch": 0.21136,
      "grad_norm": 0.12941433489322662,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 1321
    },
    {
      "epoch": 0.21152,
      "grad_norm": 0.3205054998397827,
      "learning_rate": 0.0001,
      "loss": 0.356,
      "step": 1322
    },
    {
      "epoch": 0.21168,
      "grad_norm": 0.12922698259353638,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 1323
    },
    {
      "epoch": 0.21184,
      "grad_norm": 0.13886789977550507,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 1324
    },
    {
      "epoch": 0.212,
      "grad_norm": 0.16605804860591888,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 1325
    },
    {
      "epoch": 0.21216,
      "grad_norm": 0.22645203769207,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 1326
    },
    {
      "epoch": 0.21232,
      "grad_norm": 0.1275641918182373,
      "learning_rate": 0.0001,
      "loss": 0.3716,
      "step": 1327
    },
    {
      "epoch": 0.21248,
      "grad_norm": 0.14871151745319366,
      "learning_rate": 0.0001,
      "loss": 0.3718,
      "step": 1328
    },
    {
      "epoch": 0.21264,
      "grad_norm": 0.14957071840763092,
      "learning_rate": 0.0001,
      "loss": 0.3788,
      "step": 1329
    },
    {
      "epoch": 0.2128,
      "grad_norm": 0.17145588994026184,
      "learning_rate": 0.0001,
      "loss": 0.3585,
      "step": 1330
    },
    {
      "epoch": 0.21296,
      "grad_norm": 0.14237019419670105,
      "learning_rate": 0.0001,
      "loss": 0.3602,
      "step": 1331
    },
    {
      "epoch": 0.21312,
      "grad_norm": 0.15287019312381744,
      "learning_rate": 0.0001,
      "loss": 0.3783,
      "step": 1332
    },
    {
      "epoch": 0.21328,
      "grad_norm": 0.14125275611877441,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 1333
    },
    {
      "epoch": 0.21344,
      "grad_norm": 0.17835602164268494,
      "learning_rate": 0.0001,
      "loss": 0.3745,
      "step": 1334
    },
    {
      "epoch": 0.2136,
      "grad_norm": 0.13984814286231995,
      "learning_rate": 0.0001,
      "loss": 0.3632,
      "step": 1335
    },
    {
      "epoch": 0.21376,
      "grad_norm": 0.16715940833091736,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 1336
    },
    {
      "epoch": 0.21392,
      "grad_norm": 0.15252456068992615,
      "learning_rate": 0.0001,
      "loss": 0.3743,
      "step": 1337
    },
    {
      "epoch": 0.21408,
      "grad_norm": 0.14807552099227905,
      "learning_rate": 0.0001,
      "loss": 0.3625,
      "step": 1338
    },
    {
      "epoch": 0.21424,
      "grad_norm": 0.18730074167251587,
      "learning_rate": 0.0001,
      "loss": 0.3792,
      "step": 1339
    },
    {
      "epoch": 0.2144,
      "grad_norm": 0.1595245599746704,
      "learning_rate": 0.0001,
      "loss": 0.3683,
      "step": 1340
    },
    {
      "epoch": 0.21456,
      "grad_norm": 0.17754106223583221,
      "learning_rate": 0.0001,
      "loss": 0.3724,
      "step": 1341
    },
    {
      "epoch": 0.21472,
      "grad_norm": 0.13335007429122925,
      "learning_rate": 0.0001,
      "loss": 0.3482,
      "step": 1342
    },
    {
      "epoch": 0.21488,
      "grad_norm": 0.11947204917669296,
      "learning_rate": 0.0001,
      "loss": 0.3499,
      "step": 1343
    },
    {
      "epoch": 0.21504,
      "grad_norm": 0.1455477625131607,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 1344
    },
    {
      "epoch": 0.2152,
      "grad_norm": 0.16822609305381775,
      "learning_rate": 0.0001,
      "loss": 0.3597,
      "step": 1345
    },
    {
      "epoch": 0.21536,
      "grad_norm": 0.1685643494129181,
      "learning_rate": 0.0001,
      "loss": 0.3797,
      "step": 1346
    },
    {
      "epoch": 0.21552,
      "grad_norm": 0.15934796631336212,
      "learning_rate": 0.0001,
      "loss": 0.373,
      "step": 1347
    },
    {
      "epoch": 0.21568,
      "grad_norm": 0.14385060966014862,
      "learning_rate": 0.0001,
      "loss": 0.3641,
      "step": 1348
    },
    {
      "epoch": 0.21584,
      "grad_norm": 0.14401748776435852,
      "learning_rate": 0.0001,
      "loss": 0.3653,
      "step": 1349
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.13014888763427734,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 1350
    },
    {
      "epoch": 0.21616,
      "grad_norm": 0.16342602670192719,
      "learning_rate": 0.0001,
      "loss": 0.3899,
      "step": 1351
    },
    {
      "epoch": 0.21632,
      "grad_norm": 0.18659192323684692,
      "learning_rate": 0.0001,
      "loss": 0.3717,
      "step": 1352
    },
    {
      "epoch": 0.21648,
      "grad_norm": 0.20024128258228302,
      "learning_rate": 0.0001,
      "loss": 0.3722,
      "step": 1353
    },
    {
      "epoch": 0.21664,
      "grad_norm": 0.12422670423984528,
      "learning_rate": 0.0001,
      "loss": 0.3619,
      "step": 1354
    },
    {
      "epoch": 0.2168,
      "grad_norm": 0.14825975894927979,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1355
    },
    {
      "epoch": 0.21696,
      "grad_norm": 0.15469451248645782,
      "learning_rate": 0.0001,
      "loss": 0.3855,
      "step": 1356
    },
    {
      "epoch": 0.21712,
      "grad_norm": 0.13756388425827026,
      "learning_rate": 0.0001,
      "loss": 0.3629,
      "step": 1357
    },
    {
      "epoch": 0.21728,
      "grad_norm": 0.1353481113910675,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 1358
    },
    {
      "epoch": 0.21744,
      "grad_norm": 0.1531083732843399,
      "learning_rate": 0.0001,
      "loss": 0.3579,
      "step": 1359
    },
    {
      "epoch": 0.2176,
      "grad_norm": 0.1464185267686844,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 1360
    },
    {
      "epoch": 0.21776,
      "grad_norm": 0.13612772524356842,
      "learning_rate": 0.0001,
      "loss": 0.3746,
      "step": 1361
    },
    {
      "epoch": 0.21792,
      "grad_norm": 0.17260770499706268,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 1362
    },
    {
      "epoch": 0.21808,
      "grad_norm": 0.13056668639183044,
      "learning_rate": 0.0001,
      "loss": 0.3709,
      "step": 1363
    },
    {
      "epoch": 0.21824,
      "grad_norm": 0.18387995660305023,
      "learning_rate": 0.0001,
      "loss": 0.3658,
      "step": 1364
    },
    {
      "epoch": 0.2184,
      "grad_norm": 0.12148669362068176,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 1365
    },
    {
      "epoch": 0.21856,
      "grad_norm": 0.1707446277141571,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1366
    },
    {
      "epoch": 0.21872,
      "grad_norm": 0.18620307743549347,
      "learning_rate": 0.0001,
      "loss": 0.3675,
      "step": 1367
    },
    {
      "epoch": 0.21888,
      "grad_norm": 0.14252708852291107,
      "learning_rate": 0.0001,
      "loss": 0.362,
      "step": 1368
    },
    {
      "epoch": 0.21904,
      "grad_norm": 0.16619740426540375,
      "learning_rate": 0.0001,
      "loss": 0.3787,
      "step": 1369
    },
    {
      "epoch": 0.2192,
      "grad_norm": 0.1389429122209549,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 1370
    },
    {
      "epoch": 0.21936,
      "grad_norm": 0.15242260694503784,
      "learning_rate": 0.0001,
      "loss": 0.3586,
      "step": 1371
    },
    {
      "epoch": 0.21952,
      "grad_norm": 0.1439414918422699,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 1372
    },
    {
      "epoch": 0.21968,
      "grad_norm": 0.11929316073656082,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 1373
    },
    {
      "epoch": 0.21984,
      "grad_norm": 0.1641293317079544,
      "learning_rate": 0.0001,
      "loss": 0.3766,
      "step": 1374
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.1373591274023056,
      "learning_rate": 0.0001,
      "loss": 0.3596,
      "step": 1375
    },
    {
      "epoch": 0.22016,
      "grad_norm": 0.1881674975156784,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 1376
    },
    {
      "epoch": 0.22032,
      "grad_norm": 0.15011227130889893,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 1377
    },
    {
      "epoch": 0.22048,
      "grad_norm": 0.15696829557418823,
      "learning_rate": 0.0001,
      "loss": 0.366,
      "step": 1378
    },
    {
      "epoch": 0.22064,
      "grad_norm": 0.13821686804294586,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 1379
    },
    {
      "epoch": 0.2208,
      "grad_norm": 0.17262987792491913,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1380
    },
    {
      "epoch": 0.22096,
      "grad_norm": 0.1772352159023285,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 1381
    },
    {
      "epoch": 0.22112,
      "grad_norm": 0.16496731340885162,
      "learning_rate": 0.0001,
      "loss": 0.3678,
      "step": 1382
    },
    {
      "epoch": 0.22128,
      "grad_norm": 0.14004288613796234,
      "learning_rate": 0.0001,
      "loss": 0.3596,
      "step": 1383
    },
    {
      "epoch": 0.22144,
      "grad_norm": 0.19798468053340912,
      "learning_rate": 0.0001,
      "loss": 0.3629,
      "step": 1384
    },
    {
      "epoch": 0.2216,
      "grad_norm": 0.22668714821338654,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 1385
    },
    {
      "epoch": 0.22176,
      "grad_norm": 0.15370391309261322,
      "learning_rate": 0.0001,
      "loss": 0.3759,
      "step": 1386
    },
    {
      "epoch": 0.22192,
      "grad_norm": 0.13811908662319183,
      "learning_rate": 0.0001,
      "loss": 0.3635,
      "step": 1387
    },
    {
      "epoch": 0.22208,
      "grad_norm": 0.15498283505439758,
      "learning_rate": 0.0001,
      "loss": 0.3657,
      "step": 1388
    },
    {
      "epoch": 0.22224,
      "grad_norm": 0.24069516360759735,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1389
    },
    {
      "epoch": 0.2224,
      "grad_norm": 0.1325298547744751,
      "learning_rate": 0.0001,
      "loss": 0.3723,
      "step": 1390
    },
    {
      "epoch": 0.22256,
      "grad_norm": 0.17479589581489563,
      "learning_rate": 0.0001,
      "loss": 0.3626,
      "step": 1391
    },
    {
      "epoch": 0.22272,
      "grad_norm": 0.24027544260025024,
      "learning_rate": 0.0001,
      "loss": 0.3768,
      "step": 1392
    },
    {
      "epoch": 0.22288,
      "grad_norm": 0.14505362510681152,
      "learning_rate": 0.0001,
      "loss": 0.355,
      "step": 1393
    },
    {
      "epoch": 0.22304,
      "grad_norm": 0.21495597064495087,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1394
    },
    {
      "epoch": 0.2232,
      "grad_norm": 0.17681989073753357,
      "learning_rate": 0.0001,
      "loss": 0.3697,
      "step": 1395
    },
    {
      "epoch": 0.22336,
      "grad_norm": 0.17812350392341614,
      "learning_rate": 0.0001,
      "loss": 0.3623,
      "step": 1396
    },
    {
      "epoch": 0.22352,
      "grad_norm": 0.14695307612419128,
      "learning_rate": 0.0001,
      "loss": 0.3647,
      "step": 1397
    },
    {
      "epoch": 0.22368,
      "grad_norm": 0.14071491360664368,
      "learning_rate": 0.0001,
      "loss": 0.3678,
      "step": 1398
    },
    {
      "epoch": 0.22384,
      "grad_norm": 0.1331825703382492,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 1399
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.14372389018535614,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 1400
    },
    {
      "epoch": 0.224,
      "eval_train_accuracy": 0.499,
      "eval_train_loss": 0.3627099096775055,
      "eval_train_runtime": 4.813,
      "eval_train_samples_per_second": 1038.856,
      "eval_train_steps_per_second": 13.09,
      "step": 1400
    },
    {
      "epoch": 0.224,
      "eval_test_accuracy": 0.4948,
      "eval_test_loss": 0.3613765239715576,
      "eval_test_runtime": 4.2462,
      "eval_test_samples_per_second": 1177.523,
      "eval_test_steps_per_second": 14.837,
      "step": 1400
    },
    {
      "epoch": 0.22416,
      "grad_norm": 0.17299741506576538,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 1401
    },
    {
      "epoch": 0.22432,
      "grad_norm": 0.12635721266269684,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 1402
    },
    {
      "epoch": 0.22448,
      "grad_norm": 0.12310237437486649,
      "learning_rate": 0.0001,
      "loss": 0.3609,
      "step": 1403
    },
    {
      "epoch": 0.22464,
      "grad_norm": 0.11938150227069855,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1404
    },
    {
      "epoch": 0.2248,
      "grad_norm": 0.1462882161140442,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 1405
    },
    {
      "epoch": 0.22496,
      "grad_norm": 0.1463804394006729,
      "learning_rate": 0.0001,
      "loss": 0.3669,
      "step": 1406
    },
    {
      "epoch": 0.22512,
      "grad_norm": 0.12544041872024536,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 1407
    },
    {
      "epoch": 0.22528,
      "grad_norm": 0.14355941116809845,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1408
    },
    {
      "epoch": 0.22544,
      "grad_norm": 0.16421957314014435,
      "learning_rate": 0.0001,
      "loss": 0.3698,
      "step": 1409
    },
    {
      "epoch": 0.2256,
      "grad_norm": 0.12315895408391953,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 1410
    },
    {
      "epoch": 0.22576,
      "grad_norm": 0.14623279869556427,
      "learning_rate": 0.0001,
      "loss": 0.362,
      "step": 1411
    },
    {
      "epoch": 0.22592,
      "grad_norm": 0.14150607585906982,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 1412
    },
    {
      "epoch": 0.22608,
      "grad_norm": 0.15324616432189941,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 1413
    },
    {
      "epoch": 0.22624,
      "grad_norm": 0.14124321937561035,
      "learning_rate": 0.0001,
      "loss": 0.371,
      "step": 1414
    },
    {
      "epoch": 0.2264,
      "grad_norm": 0.12481765449047089,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 1415
    },
    {
      "epoch": 0.22656,
      "grad_norm": 0.1325225830078125,
      "learning_rate": 0.0001,
      "loss": 0.355,
      "step": 1416
    },
    {
      "epoch": 0.22672,
      "grad_norm": 0.1285783052444458,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 1417
    },
    {
      "epoch": 0.22688,
      "grad_norm": 0.13923490047454834,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1418
    },
    {
      "epoch": 0.22704,
      "grad_norm": 0.12448624521493912,
      "learning_rate": 0.0001,
      "loss": 0.3618,
      "step": 1419
    },
    {
      "epoch": 0.2272,
      "grad_norm": 0.13965225219726562,
      "learning_rate": 0.0001,
      "loss": 0.3613,
      "step": 1420
    },
    {
      "epoch": 0.22736,
      "grad_norm": 0.13544294238090515,
      "learning_rate": 0.0001,
      "loss": 0.3623,
      "step": 1421
    },
    {
      "epoch": 0.22752,
      "grad_norm": 0.12751901149749756,
      "learning_rate": 0.0001,
      "loss": 0.3739,
      "step": 1422
    },
    {
      "epoch": 0.22768,
      "grad_norm": 0.14817839860916138,
      "learning_rate": 0.0001,
      "loss": 0.3542,
      "step": 1423
    },
    {
      "epoch": 0.22784,
      "grad_norm": 0.12783411145210266,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 1424
    },
    {
      "epoch": 0.228,
      "grad_norm": 0.1524166762828827,
      "learning_rate": 0.0001,
      "loss": 0.3612,
      "step": 1425
    },
    {
      "epoch": 0.22816,
      "grad_norm": 0.1257297247648239,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1426
    },
    {
      "epoch": 0.22832,
      "grad_norm": 0.13606825470924377,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1427
    },
    {
      "epoch": 0.22848,
      "grad_norm": 0.13315102458000183,
      "learning_rate": 0.0001,
      "loss": 0.3636,
      "step": 1428
    },
    {
      "epoch": 0.22864,
      "grad_norm": 0.12056092172861099,
      "learning_rate": 0.0001,
      "loss": 0.3652,
      "step": 1429
    },
    {
      "epoch": 0.2288,
      "grad_norm": 0.12713392078876495,
      "learning_rate": 0.0001,
      "loss": 0.3688,
      "step": 1430
    },
    {
      "epoch": 0.22896,
      "grad_norm": 0.14033712446689606,
      "learning_rate": 0.0001,
      "loss": 0.3777,
      "step": 1431
    },
    {
      "epoch": 0.22912,
      "grad_norm": 0.16572999954223633,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 1432
    },
    {
      "epoch": 0.22928,
      "grad_norm": 0.13814619183540344,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1433
    },
    {
      "epoch": 0.22944,
      "grad_norm": 0.12732873857021332,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 1434
    },
    {
      "epoch": 0.2296,
      "grad_norm": 0.12496016174554825,
      "learning_rate": 0.0001,
      "loss": 0.3576,
      "step": 1435
    },
    {
      "epoch": 0.22976,
      "grad_norm": 0.15960626304149628,
      "learning_rate": 0.0001,
      "loss": 0.3779,
      "step": 1436
    },
    {
      "epoch": 0.22992,
      "grad_norm": 0.15290617942810059,
      "learning_rate": 0.0001,
      "loss": 0.3697,
      "step": 1437
    },
    {
      "epoch": 0.23008,
      "grad_norm": 0.13182538747787476,
      "learning_rate": 0.0001,
      "loss": 0.3538,
      "step": 1438
    },
    {
      "epoch": 0.23024,
      "grad_norm": 0.13618668913841248,
      "learning_rate": 0.0001,
      "loss": 0.3627,
      "step": 1439
    },
    {
      "epoch": 0.2304,
      "grad_norm": 0.1389371007680893,
      "learning_rate": 0.0001,
      "loss": 0.3548,
      "step": 1440
    },
    {
      "epoch": 0.23056,
      "grad_norm": 0.1218278631567955,
      "learning_rate": 0.0001,
      "loss": 0.3616,
      "step": 1441
    },
    {
      "epoch": 0.23072,
      "grad_norm": 0.16205540299415588,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1442
    },
    {
      "epoch": 0.23088,
      "grad_norm": 0.1620342880487442,
      "learning_rate": 0.0001,
      "loss": 0.359,
      "step": 1443
    },
    {
      "epoch": 0.23104,
      "grad_norm": 0.1571299433708191,
      "learning_rate": 0.0001,
      "loss": 0.3558,
      "step": 1444
    },
    {
      "epoch": 0.2312,
      "grad_norm": 0.15537714958190918,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1445
    },
    {
      "epoch": 0.23136,
      "grad_norm": 0.18079134821891785,
      "learning_rate": 0.0001,
      "loss": 0.3697,
      "step": 1446
    },
    {
      "epoch": 0.23152,
      "grad_norm": 0.16337156295776367,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 1447
    },
    {
      "epoch": 0.23168,
      "grad_norm": 0.13784046471118927,
      "learning_rate": 0.0001,
      "loss": 0.362,
      "step": 1448
    },
    {
      "epoch": 0.23184,
      "grad_norm": 0.13895267248153687,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 1449
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.17020352184772491,
      "learning_rate": 0.0001,
      "loss": 0.3509,
      "step": 1450
    },
    {
      "epoch": 0.23216,
      "grad_norm": 0.1531115025281906,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 1451
    },
    {
      "epoch": 0.23232,
      "grad_norm": 0.2070939689874649,
      "learning_rate": 0.0001,
      "loss": 0.3731,
      "step": 1452
    },
    {
      "epoch": 0.23248,
      "grad_norm": 0.1478748917579651,
      "learning_rate": 0.0001,
      "loss": 0.3611,
      "step": 1453
    },
    {
      "epoch": 0.23264,
      "grad_norm": 0.16676755249500275,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 1454
    },
    {
      "epoch": 0.2328,
      "grad_norm": 0.13827833533287048,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1455
    },
    {
      "epoch": 0.23296,
      "grad_norm": 0.19073069095611572,
      "learning_rate": 0.0001,
      "loss": 0.3726,
      "step": 1456
    },
    {
      "epoch": 0.23312,
      "grad_norm": 0.12506774067878723,
      "learning_rate": 0.0001,
      "loss": 0.3789,
      "step": 1457
    },
    {
      "epoch": 0.23328,
      "grad_norm": 0.3058977723121643,
      "learning_rate": 0.0001,
      "loss": 0.358,
      "step": 1458
    },
    {
      "epoch": 0.23344,
      "grad_norm": 0.16071097552776337,
      "learning_rate": 0.0001,
      "loss": 0.3519,
      "step": 1459
    },
    {
      "epoch": 0.2336,
      "grad_norm": 0.19800785183906555,
      "learning_rate": 0.0001,
      "loss": 0.3795,
      "step": 1460
    },
    {
      "epoch": 0.23376,
      "grad_norm": 0.1827060878276825,
      "learning_rate": 0.0001,
      "loss": 0.369,
      "step": 1461
    },
    {
      "epoch": 0.23392,
      "grad_norm": 0.14324574172496796,
      "learning_rate": 0.0001,
      "loss": 0.376,
      "step": 1462
    },
    {
      "epoch": 0.23408,
      "grad_norm": 0.25566622614860535,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 1463
    },
    {
      "epoch": 0.23424,
      "grad_norm": 0.14150786399841309,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 1464
    },
    {
      "epoch": 0.2344,
      "grad_norm": 0.15510331094264984,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 1465
    },
    {
      "epoch": 0.23456,
      "grad_norm": 0.25308138132095337,
      "learning_rate": 0.0001,
      "loss": 0.3583,
      "step": 1466
    },
    {
      "epoch": 0.23472,
      "grad_norm": 0.1656583845615387,
      "learning_rate": 0.0001,
      "loss": 0.3639,
      "step": 1467
    },
    {
      "epoch": 0.23488,
      "grad_norm": 0.1554332822561264,
      "learning_rate": 0.0001,
      "loss": 0.3643,
      "step": 1468
    },
    {
      "epoch": 0.23504,
      "grad_norm": 0.2565877437591553,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 1469
    },
    {
      "epoch": 0.2352,
      "grad_norm": 0.14884765446186066,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 1470
    },
    {
      "epoch": 0.23536,
      "grad_norm": 0.18989241123199463,
      "learning_rate": 0.0001,
      "loss": 0.342,
      "step": 1471
    },
    {
      "epoch": 0.23552,
      "grad_norm": 0.21696841716766357,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 1472
    },
    {
      "epoch": 0.23568,
      "grad_norm": 0.13840967416763306,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1473
    },
    {
      "epoch": 0.23584,
      "grad_norm": 0.12431041896343231,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1474
    },
    {
      "epoch": 0.236,
      "grad_norm": 0.19709038734436035,
      "learning_rate": 0.0001,
      "loss": 0.3686,
      "step": 1475
    },
    {
      "epoch": 0.23616,
      "grad_norm": 0.15886016190052032,
      "learning_rate": 0.0001,
      "loss": 0.362,
      "step": 1476
    },
    {
      "epoch": 0.23632,
      "grad_norm": 0.1982441395521164,
      "learning_rate": 0.0001,
      "loss": 0.3725,
      "step": 1477
    },
    {
      "epoch": 0.23648,
      "grad_norm": 0.12618665397167206,
      "learning_rate": 0.0001,
      "loss": 0.3576,
      "step": 1478
    },
    {
      "epoch": 0.23664,
      "grad_norm": 0.16012181341648102,
      "learning_rate": 0.0001,
      "loss": 0.3619,
      "step": 1479
    },
    {
      "epoch": 0.2368,
      "grad_norm": 0.2202206701040268,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1480
    },
    {
      "epoch": 0.23696,
      "grad_norm": 0.14114969968795776,
      "learning_rate": 0.0001,
      "loss": 0.3624,
      "step": 1481
    },
    {
      "epoch": 0.23712,
      "grad_norm": 0.1495479792356491,
      "learning_rate": 0.0001,
      "loss": 0.3658,
      "step": 1482
    },
    {
      "epoch": 0.23728,
      "grad_norm": 0.18390683829784393,
      "learning_rate": 0.0001,
      "loss": 0.3726,
      "step": 1483
    },
    {
      "epoch": 0.23744,
      "grad_norm": 0.15800747275352478,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 1484
    },
    {
      "epoch": 0.2376,
      "grad_norm": 0.17098407447338104,
      "learning_rate": 0.0001,
      "loss": 0.3751,
      "step": 1485
    },
    {
      "epoch": 0.23776,
      "grad_norm": 0.14098313450813293,
      "learning_rate": 0.0001,
      "loss": 0.3658,
      "step": 1486
    },
    {
      "epoch": 0.23792,
      "grad_norm": 0.22132372856140137,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1487
    },
    {
      "epoch": 0.23808,
      "grad_norm": 0.29176726937294006,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "step": 1488
    },
    {
      "epoch": 0.23824,
      "grad_norm": 0.15305863320827484,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1489
    },
    {
      "epoch": 0.2384,
      "grad_norm": 0.16927438974380493,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1490
    },
    {
      "epoch": 0.23856,
      "grad_norm": 0.1910427361726761,
      "learning_rate": 0.0001,
      "loss": 0.3702,
      "step": 1491
    },
    {
      "epoch": 0.23872,
      "grad_norm": 0.15637058019638062,
      "learning_rate": 0.0001,
      "loss": 0.3569,
      "step": 1492
    },
    {
      "epoch": 0.23888,
      "grad_norm": 0.16942651569843292,
      "learning_rate": 0.0001,
      "loss": 0.3605,
      "step": 1493
    },
    {
      "epoch": 0.23904,
      "grad_norm": 0.16898782551288605,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 1494
    },
    {
      "epoch": 0.2392,
      "grad_norm": 0.13839854300022125,
      "learning_rate": 0.0001,
      "loss": 0.3571,
      "step": 1495
    },
    {
      "epoch": 0.23936,
      "grad_norm": 0.1460108458995819,
      "learning_rate": 0.0001,
      "loss": 0.3789,
      "step": 1496
    },
    {
      "epoch": 0.23952,
      "grad_norm": 0.21858535706996918,
      "learning_rate": 0.0001,
      "loss": 0.372,
      "step": 1497
    },
    {
      "epoch": 0.23968,
      "grad_norm": 0.1667192131280899,
      "learning_rate": 0.0001,
      "loss": 0.3625,
      "step": 1498
    },
    {
      "epoch": 0.23984,
      "grad_norm": 0.13859112560749054,
      "learning_rate": 0.0001,
      "loss": 0.3622,
      "step": 1499
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.19941192865371704,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "eval_train_accuracy": 0.5352,
      "eval_train_loss": 0.36144810914993286,
      "eval_train_runtime": 4.3959,
      "eval_train_samples_per_second": 1137.428,
      "eval_train_steps_per_second": 14.332,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "eval_test_accuracy": 0.5314,
      "eval_test_loss": 0.35983216762542725,
      "eval_test_runtime": 4.4499,
      "eval_test_samples_per_second": 1123.614,
      "eval_test_steps_per_second": 14.158,
      "step": 1500
    },
    {
      "epoch": 0.24016,
      "grad_norm": 0.16657431423664093,
      "learning_rate": 0.0001,
      "loss": 0.3689,
      "step": 1501
    },
    {
      "epoch": 0.24032,
      "grad_norm": 0.19702140986919403,
      "learning_rate": 0.0001,
      "loss": 0.3764,
      "step": 1502
    },
    {
      "epoch": 0.24048,
      "grad_norm": 0.16588479280471802,
      "learning_rate": 0.0001,
      "loss": 0.3684,
      "step": 1503
    },
    {
      "epoch": 0.24064,
      "grad_norm": 0.1312105506658554,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1504
    },
    {
      "epoch": 0.2408,
      "grad_norm": 0.1469290852546692,
      "learning_rate": 0.0001,
      "loss": 0.363,
      "step": 1505
    },
    {
      "epoch": 0.24096,
      "grad_norm": 0.150352343916893,
      "learning_rate": 0.0001,
      "loss": 0.3631,
      "step": 1506
    },
    {
      "epoch": 0.24112,
      "grad_norm": 0.11893483251333237,
      "learning_rate": 0.0001,
      "loss": 0.3553,
      "step": 1507
    },
    {
      "epoch": 0.24128,
      "grad_norm": 0.15406867861747742,
      "learning_rate": 0.0001,
      "loss": 0.3565,
      "step": 1508
    },
    {
      "epoch": 0.24144,
      "grad_norm": 0.2266235202550888,
      "learning_rate": 0.0001,
      "loss": 0.3884,
      "step": 1509
    },
    {
      "epoch": 0.2416,
      "grad_norm": 0.13668489456176758,
      "learning_rate": 0.0001,
      "loss": 0.3667,
      "step": 1510
    },
    {
      "epoch": 0.24176,
      "grad_norm": 0.14513899385929108,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 1511
    },
    {
      "epoch": 0.24192,
      "grad_norm": 0.1439831405878067,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 1512
    },
    {
      "epoch": 0.24208,
      "grad_norm": 0.13585668802261353,
      "learning_rate": 0.0001,
      "loss": 0.3578,
      "step": 1513
    },
    {
      "epoch": 0.24224,
      "grad_norm": 0.16605645418167114,
      "learning_rate": 0.0001,
      "loss": 0.3752,
      "step": 1514
    },
    {
      "epoch": 0.2424,
      "grad_norm": 0.14791616797447205,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 1515
    },
    {
      "epoch": 0.24256,
      "grad_norm": 0.1488034874200821,
      "learning_rate": 0.0001,
      "loss": 0.3661,
      "step": 1516
    },
    {
      "epoch": 0.24272,
      "grad_norm": 0.1556890904903412,
      "learning_rate": 0.0001,
      "loss": 0.3625,
      "step": 1517
    },
    {
      "epoch": 0.24288,
      "grad_norm": 0.13808324933052063,
      "learning_rate": 0.0001,
      "loss": 0.3626,
      "step": 1518
    },
    {
      "epoch": 0.24304,
      "grad_norm": 0.13352181017398834,
      "learning_rate": 0.0001,
      "loss": 0.36,
      "step": 1519
    },
    {
      "epoch": 0.2432,
      "grad_norm": 0.15137013792991638,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1520
    },
    {
      "epoch": 0.24336,
      "grad_norm": 0.13753481209278107,
      "learning_rate": 0.0001,
      "loss": 0.3622,
      "step": 1521
    },
    {
      "epoch": 0.24352,
      "grad_norm": 0.1917145699262619,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 1522
    },
    {
      "epoch": 0.24368,
      "grad_norm": 0.1520133763551712,
      "learning_rate": 0.0001,
      "loss": 0.3582,
      "step": 1523
    },
    {
      "epoch": 0.24384,
      "grad_norm": 0.12496163696050644,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 1524
    },
    {
      "epoch": 0.244,
      "grad_norm": 0.16171154379844666,
      "learning_rate": 0.0001,
      "loss": 0.3742,
      "step": 1525
    },
    {
      "epoch": 0.24416,
      "grad_norm": 0.15579842031002045,
      "learning_rate": 0.0001,
      "loss": 0.3472,
      "step": 1526
    },
    {
      "epoch": 0.24432,
      "grad_norm": 0.1601712703704834,
      "learning_rate": 0.0001,
      "loss": 0.3617,
      "step": 1527
    },
    {
      "epoch": 0.24448,
      "grad_norm": 0.1675935834646225,
      "learning_rate": 0.0001,
      "loss": 0.3729,
      "step": 1528
    },
    {
      "epoch": 0.24464,
      "grad_norm": 0.14876067638397217,
      "learning_rate": 0.0001,
      "loss": 0.3631,
      "step": 1529
    },
    {
      "epoch": 0.2448,
      "grad_norm": 0.1729859709739685,
      "learning_rate": 0.0001,
      "loss": 0.3553,
      "step": 1530
    },
    {
      "epoch": 0.24496,
      "grad_norm": 0.1863575279712677,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 1531
    },
    {
      "epoch": 0.24512,
      "grad_norm": 0.15445205569267273,
      "learning_rate": 0.0001,
      "loss": 0.3561,
      "step": 1532
    },
    {
      "epoch": 0.24528,
      "grad_norm": 0.1537490338087082,
      "learning_rate": 0.0001,
      "loss": 0.3703,
      "step": 1533
    },
    {
      "epoch": 0.24544,
      "grad_norm": 0.16664214432239532,
      "learning_rate": 0.0001,
      "loss": 0.3627,
      "step": 1534
    },
    {
      "epoch": 0.2456,
      "grad_norm": 0.13872624933719635,
      "learning_rate": 0.0001,
      "loss": 0.3626,
      "step": 1535
    },
    {
      "epoch": 0.24576,
      "grad_norm": 0.18374444544315338,
      "learning_rate": 0.0001,
      "loss": 0.3756,
      "step": 1536
    },
    {
      "epoch": 0.24592,
      "grad_norm": 0.12749913334846497,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1537
    },
    {
      "epoch": 0.24608,
      "grad_norm": 0.13057629764080048,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 1538
    },
    {
      "epoch": 0.24624,
      "grad_norm": 0.19154669344425201,
      "learning_rate": 0.0001,
      "loss": 0.3798,
      "step": 1539
    },
    {
      "epoch": 0.2464,
      "grad_norm": 0.17298924922943115,
      "learning_rate": 0.0001,
      "loss": 0.3705,
      "step": 1540
    },
    {
      "epoch": 0.24656,
      "grad_norm": 0.1278064101934433,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1541
    },
    {
      "epoch": 0.24672,
      "grad_norm": 0.14425696432590485,
      "learning_rate": 0.0001,
      "loss": 0.3708,
      "step": 1542
    },
    {
      "epoch": 0.24688,
      "grad_norm": 0.14215104281902313,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 1543
    },
    {
      "epoch": 0.24704,
      "grad_norm": 0.15314637124538422,
      "learning_rate": 0.0001,
      "loss": 0.3506,
      "step": 1544
    },
    {
      "epoch": 0.2472,
      "grad_norm": 0.12044911831617355,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 1545
    },
    {
      "epoch": 0.24736,
      "grad_norm": 0.12169352918863297,
      "learning_rate": 0.0001,
      "loss": 0.3665,
      "step": 1546
    },
    {
      "epoch": 0.24752,
      "grad_norm": 0.14973503351211548,
      "learning_rate": 0.0001,
      "loss": 0.3769,
      "step": 1547
    },
    {
      "epoch": 0.24768,
      "grad_norm": 0.17748983204364777,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 1548
    },
    {
      "epoch": 0.24784,
      "grad_norm": 0.13318376243114471,
      "learning_rate": 0.0001,
      "loss": 0.3625,
      "step": 1549
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.12953706085681915,
      "learning_rate": 0.0001,
      "loss": 0.3634,
      "step": 1550
    },
    {
      "epoch": 0.24816,
      "grad_norm": 0.136166051030159,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 1551
    },
    {
      "epoch": 0.24832,
      "grad_norm": 0.15915049612522125,
      "learning_rate": 0.0001,
      "loss": 0.3643,
      "step": 1552
    },
    {
      "epoch": 0.24848,
      "grad_norm": 0.12839604914188385,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 1553
    },
    {
      "epoch": 0.24864,
      "grad_norm": 0.15972410142421722,
      "learning_rate": 0.0001,
      "loss": 0.3739,
      "step": 1554
    },
    {
      "epoch": 0.2488,
      "grad_norm": 0.14723321795463562,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1555
    },
    {
      "epoch": 0.24896,
      "grad_norm": 0.14821116626262665,
      "learning_rate": 0.0001,
      "loss": 0.3599,
      "step": 1556
    },
    {
      "epoch": 0.24912,
      "grad_norm": 0.13306504487991333,
      "learning_rate": 0.0001,
      "loss": 0.343,
      "step": 1557
    },
    {
      "epoch": 0.24928,
      "grad_norm": 0.1607833206653595,
      "learning_rate": 0.0001,
      "loss": 0.3671,
      "step": 1558
    },
    {
      "epoch": 0.24944,
      "grad_norm": 0.11672617495059967,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 1559
    },
    {
      "epoch": 0.2496,
      "grad_norm": 0.1651950478553772,
      "learning_rate": 0.0001,
      "loss": 0.37,
      "step": 1560
    },
    {
      "epoch": 0.24976,
      "grad_norm": 0.14031925797462463,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 1561
    },
    {
      "epoch": 0.24992,
      "grad_norm": 0.139689102768898,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 1562
    },
    {
      "epoch": 0.25008,
      "grad_norm": 0.11549444496631622,
      "learning_rate": 0.0001,
      "loss": 0.3521,
      "step": 1563
    },
    {
      "epoch": 0.25024,
      "grad_norm": 0.13682787120342255,
      "learning_rate": 0.0001,
      "loss": 0.3482,
      "step": 1564
    },
    {
      "epoch": 0.2504,
      "grad_norm": 0.17297741770744324,
      "learning_rate": 0.0001,
      "loss": 0.354,
      "step": 1565
    },
    {
      "epoch": 0.25056,
      "grad_norm": 0.1222650557756424,
      "learning_rate": 0.0001,
      "loss": 0.3537,
      "step": 1566
    },
    {
      "epoch": 0.25072,
      "grad_norm": 0.1473587155342102,
      "learning_rate": 0.0001,
      "loss": 0.3694,
      "step": 1567
    },
    {
      "epoch": 0.25088,
      "grad_norm": 0.15018092095851898,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1568
    },
    {
      "epoch": 0.25104,
      "grad_norm": 0.11865460127592087,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 1569
    },
    {
      "epoch": 0.2512,
      "grad_norm": 0.140169158577919,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1570
    },
    {
      "epoch": 0.25136,
      "grad_norm": 0.13801580667495728,
      "learning_rate": 0.0001,
      "loss": 0.3544,
      "step": 1571
    },
    {
      "epoch": 0.25152,
      "grad_norm": 0.1506703794002533,
      "learning_rate": 0.0001,
      "loss": 0.3569,
      "step": 1572
    },
    {
      "epoch": 0.25168,
      "grad_norm": 0.13200680911540985,
      "learning_rate": 0.0001,
      "loss": 0.3592,
      "step": 1573
    },
    {
      "epoch": 0.25184,
      "grad_norm": 0.13253606855869293,
      "learning_rate": 0.0001,
      "loss": 0.3657,
      "step": 1574
    },
    {
      "epoch": 0.252,
      "grad_norm": 0.14550651609897614,
      "learning_rate": 0.0001,
      "loss": 0.3522,
      "step": 1575
    },
    {
      "epoch": 0.25216,
      "grad_norm": 0.1681850254535675,
      "learning_rate": 0.0001,
      "loss": 0.3649,
      "step": 1576
    },
    {
      "epoch": 0.25232,
      "grad_norm": 0.1401584893465042,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1577
    },
    {
      "epoch": 0.25248,
      "grad_norm": 0.12466565519571304,
      "learning_rate": 0.0001,
      "loss": 0.3605,
      "step": 1578
    },
    {
      "epoch": 0.25264,
      "grad_norm": 0.16189046204090118,
      "learning_rate": 0.0001,
      "loss": 0.3687,
      "step": 1579
    },
    {
      "epoch": 0.2528,
      "grad_norm": 0.13250280916690826,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 1580
    },
    {
      "epoch": 0.25296,
      "grad_norm": 0.14171355962753296,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 1581
    },
    {
      "epoch": 0.25312,
      "grad_norm": 0.1846919059753418,
      "learning_rate": 0.0001,
      "loss": 0.3601,
      "step": 1582
    },
    {
      "epoch": 0.25328,
      "grad_norm": 0.15974831581115723,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 1583
    },
    {
      "epoch": 0.25344,
      "grad_norm": 0.15767334401607513,
      "learning_rate": 0.0001,
      "loss": 0.3541,
      "step": 1584
    },
    {
      "epoch": 0.2536,
      "grad_norm": 0.14397476613521576,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1585
    },
    {
      "epoch": 0.25376,
      "grad_norm": 0.1613955944776535,
      "learning_rate": 0.0001,
      "loss": 0.3633,
      "step": 1586
    },
    {
      "epoch": 0.25392,
      "grad_norm": 0.20160460472106934,
      "learning_rate": 0.0001,
      "loss": 0.3568,
      "step": 1587
    },
    {
      "epoch": 0.25408,
      "grad_norm": 0.17413011193275452,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 1588
    },
    {
      "epoch": 0.25424,
      "grad_norm": 0.1782766431570053,
      "learning_rate": 0.0001,
      "loss": 0.3465,
      "step": 1589
    },
    {
      "epoch": 0.2544,
      "grad_norm": 0.1801181584596634,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1590
    },
    {
      "epoch": 0.25456,
      "grad_norm": 0.14943750202655792,
      "learning_rate": 0.0001,
      "loss": 0.3674,
      "step": 1591
    },
    {
      "epoch": 0.25472,
      "grad_norm": 0.18049374222755432,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1592
    },
    {
      "epoch": 0.25488,
      "grad_norm": 0.1560518741607666,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1593
    },
    {
      "epoch": 0.25504,
      "grad_norm": 0.1517602503299713,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 1594
    },
    {
      "epoch": 0.2552,
      "grad_norm": 0.18439067900180817,
      "learning_rate": 0.0001,
      "loss": 0.3494,
      "step": 1595
    },
    {
      "epoch": 0.25536,
      "grad_norm": 0.17502667009830475,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 1596
    },
    {
      "epoch": 0.25552,
      "grad_norm": 0.15766611695289612,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1597
    },
    {
      "epoch": 0.25568,
      "grad_norm": 0.20401982963085175,
      "learning_rate": 0.0001,
      "loss": 0.3632,
      "step": 1598
    },
    {
      "epoch": 0.25584,
      "grad_norm": 0.164297416806221,
      "learning_rate": 0.0001,
      "loss": 0.3542,
      "step": 1599
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.17975322902202606,
      "learning_rate": 0.0001,
      "loss": 0.3664,
      "step": 1600
    },
    {
      "epoch": 0.256,
      "eval_train_accuracy": 0.5404,
      "eval_train_loss": 0.3531106412410736,
      "eval_train_runtime": 4.4713,
      "eval_train_samples_per_second": 1118.254,
      "eval_train_steps_per_second": 14.09,
      "step": 1600
    },
    {
      "epoch": 0.256,
      "eval_test_accuracy": 0.5424,
      "eval_test_loss": 0.35180941224098206,
      "eval_test_runtime": 4.7784,
      "eval_test_samples_per_second": 1046.381,
      "eval_test_steps_per_second": 13.184,
      "step": 1600
    },
    {
      "epoch": 0.25616,
      "grad_norm": 0.18101295828819275,
      "learning_rate": 0.0001,
      "loss": 0.3603,
      "step": 1601
    },
    {
      "epoch": 0.25632,
      "grad_norm": 0.16878867149353027,
      "learning_rate": 0.0001,
      "loss": 0.3638,
      "step": 1602
    },
    {
      "epoch": 0.25648,
      "grad_norm": 0.20598703622817993,
      "learning_rate": 0.0001,
      "loss": 0.3639,
      "step": 1603
    },
    {
      "epoch": 0.25664,
      "grad_norm": 0.2246871143579483,
      "learning_rate": 0.0001,
      "loss": 0.374,
      "step": 1604
    },
    {
      "epoch": 0.2568,
      "grad_norm": 0.17708231508731842,
      "learning_rate": 0.0001,
      "loss": 0.3667,
      "step": 1605
    },
    {
      "epoch": 0.25696,
      "grad_norm": 0.16012585163116455,
      "learning_rate": 0.0001,
      "loss": 0.3629,
      "step": 1606
    },
    {
      "epoch": 0.25712,
      "grad_norm": 0.19252370297908783,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1607
    },
    {
      "epoch": 0.25728,
      "grad_norm": 0.1745760589838028,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 1608
    },
    {
      "epoch": 0.25744,
      "grad_norm": 0.16930784285068512,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 1609
    },
    {
      "epoch": 0.2576,
      "grad_norm": 0.1784280240535736,
      "learning_rate": 0.0001,
      "loss": 0.3612,
      "step": 1610
    },
    {
      "epoch": 0.25776,
      "grad_norm": 0.1493600308895111,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1611
    },
    {
      "epoch": 0.25792,
      "grad_norm": 0.1616717427968979,
      "learning_rate": 0.0001,
      "loss": 0.3622,
      "step": 1612
    },
    {
      "epoch": 0.25808,
      "grad_norm": 0.17624568939208984,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 1613
    },
    {
      "epoch": 0.25824,
      "grad_norm": 0.16562466323375702,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1614
    },
    {
      "epoch": 0.2584,
      "grad_norm": 0.18512435257434845,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1615
    },
    {
      "epoch": 0.25856,
      "grad_norm": 0.16590699553489685,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 1616
    },
    {
      "epoch": 0.25872,
      "grad_norm": 0.15025487542152405,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 1617
    },
    {
      "epoch": 0.25888,
      "grad_norm": 0.16685891151428223,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 1618
    },
    {
      "epoch": 0.25904,
      "grad_norm": 0.16174252331256866,
      "learning_rate": 0.0001,
      "loss": 0.3632,
      "step": 1619
    },
    {
      "epoch": 0.2592,
      "grad_norm": 0.15613138675689697,
      "learning_rate": 0.0001,
      "loss": 0.3662,
      "step": 1620
    },
    {
      "epoch": 0.25936,
      "grad_norm": 0.17787638306617737,
      "learning_rate": 0.0001,
      "loss": 0.3743,
      "step": 1621
    },
    {
      "epoch": 0.25952,
      "grad_norm": 0.19168423116207123,
      "learning_rate": 0.0001,
      "loss": 0.3517,
      "step": 1622
    },
    {
      "epoch": 0.25968,
      "grad_norm": 0.17324575781822205,
      "learning_rate": 0.0001,
      "loss": 0.3626,
      "step": 1623
    },
    {
      "epoch": 0.25984,
      "grad_norm": 0.1531016230583191,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 1624
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.17913049459457397,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 1625
    },
    {
      "epoch": 0.26016,
      "grad_norm": 0.15626947581768036,
      "learning_rate": 0.0001,
      "loss": 0.3614,
      "step": 1626
    },
    {
      "epoch": 0.26032,
      "grad_norm": 0.1694750040769577,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1627
    },
    {
      "epoch": 0.26048,
      "grad_norm": 0.18295292556285858,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 1628
    },
    {
      "epoch": 0.26064,
      "grad_norm": 0.1947043538093567,
      "learning_rate": 0.0001,
      "loss": 0.3644,
      "step": 1629
    },
    {
      "epoch": 0.2608,
      "grad_norm": 0.1645675003528595,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1630
    },
    {
      "epoch": 0.26096,
      "grad_norm": 0.17074894905090332,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1631
    },
    {
      "epoch": 0.26112,
      "grad_norm": 0.16084620356559753,
      "learning_rate": 0.0001,
      "loss": 0.347,
      "step": 1632
    },
    {
      "epoch": 0.26128,
      "grad_norm": 0.16269004344940186,
      "learning_rate": 0.0001,
      "loss": 0.357,
      "step": 1633
    },
    {
      "epoch": 0.26144,
      "grad_norm": 0.1639706790447235,
      "learning_rate": 0.0001,
      "loss": 0.3391,
      "step": 1634
    },
    {
      "epoch": 0.2616,
      "grad_norm": 0.13925497233867645,
      "learning_rate": 0.0001,
      "loss": 0.3518,
      "step": 1635
    },
    {
      "epoch": 0.26176,
      "grad_norm": 0.1444394886493683,
      "learning_rate": 0.0001,
      "loss": 0.3472,
      "step": 1636
    },
    {
      "epoch": 0.26192,
      "grad_norm": 0.18336264789104462,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 1637
    },
    {
      "epoch": 0.26208,
      "grad_norm": 0.19625557959079742,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1638
    },
    {
      "epoch": 0.26224,
      "grad_norm": 0.139002725481987,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1639
    },
    {
      "epoch": 0.2624,
      "grad_norm": 0.17038199305534363,
      "learning_rate": 0.0001,
      "loss": 0.3613,
      "step": 1640
    },
    {
      "epoch": 0.26256,
      "grad_norm": 0.16842077672481537,
      "learning_rate": 0.0001,
      "loss": 0.3615,
      "step": 1641
    },
    {
      "epoch": 0.26272,
      "grad_norm": 0.14963755011558533,
      "learning_rate": 0.0001,
      "loss": 0.3567,
      "step": 1642
    },
    {
      "epoch": 0.26288,
      "grad_norm": 0.1540813148021698,
      "learning_rate": 0.0001,
      "loss": 0.3576,
      "step": 1643
    },
    {
      "epoch": 0.26304,
      "grad_norm": 0.16026388108730316,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 1644
    },
    {
      "epoch": 0.2632,
      "grad_norm": 0.1411466896533966,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1645
    },
    {
      "epoch": 0.26336,
      "grad_norm": 0.15458686649799347,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1646
    },
    {
      "epoch": 0.26352,
      "grad_norm": 0.1518757939338684,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1647
    },
    {
      "epoch": 0.26368,
      "grad_norm": 0.18960490822792053,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 1648
    },
    {
      "epoch": 0.26384,
      "grad_norm": 0.13126970827579498,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 1649
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.1339682638645172,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1650
    },
    {
      "epoch": 0.26416,
      "grad_norm": 0.16494633257389069,
      "learning_rate": 0.0001,
      "loss": 0.3608,
      "step": 1651
    },
    {
      "epoch": 0.26432,
      "grad_norm": 0.1789139211177826,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 1652
    },
    {
      "epoch": 0.26448,
      "grad_norm": 0.15587592124938965,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 1653
    },
    {
      "epoch": 0.26464,
      "grad_norm": 0.15501035749912262,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 1654
    },
    {
      "epoch": 0.2648,
      "grad_norm": 0.15824759006500244,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1655
    },
    {
      "epoch": 0.26496,
      "grad_norm": 0.20673386752605438,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 1656
    },
    {
      "epoch": 0.26512,
      "grad_norm": 0.13331001996994019,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1657
    },
    {
      "epoch": 0.26528,
      "grad_norm": 0.14646145701408386,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 1658
    },
    {
      "epoch": 0.26544,
      "grad_norm": 0.17173552513122559,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1659
    },
    {
      "epoch": 0.2656,
      "grad_norm": 0.1565180867910385,
      "learning_rate": 0.0001,
      "loss": 0.3518,
      "step": 1660
    },
    {
      "epoch": 0.26576,
      "grad_norm": 0.15829351544380188,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 1661
    },
    {
      "epoch": 0.26592,
      "grad_norm": 0.16113777458667755,
      "learning_rate": 0.0001,
      "loss": 0.3581,
      "step": 1662
    },
    {
      "epoch": 0.26608,
      "grad_norm": 0.16566848754882812,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 1663
    },
    {
      "epoch": 0.26624,
      "grad_norm": 0.16332904994487762,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 1664
    },
    {
      "epoch": 0.2664,
      "grad_norm": 0.16530583798885345,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 1665
    },
    {
      "epoch": 0.26656,
      "grad_norm": 0.14816999435424805,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1666
    },
    {
      "epoch": 0.26672,
      "grad_norm": 0.2013755589723587,
      "learning_rate": 0.0001,
      "loss": 0.3566,
      "step": 1667
    },
    {
      "epoch": 0.26688,
      "grad_norm": 0.182124063372612,
      "learning_rate": 0.0001,
      "loss": 0.3666,
      "step": 1668
    },
    {
      "epoch": 0.26704,
      "grad_norm": 0.15787455439567566,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 1669
    },
    {
      "epoch": 0.2672,
      "grad_norm": 0.19166214764118195,
      "learning_rate": 0.0001,
      "loss": 0.3535,
      "step": 1670
    },
    {
      "epoch": 0.26736,
      "grad_norm": 0.1644478142261505,
      "learning_rate": 0.0001,
      "loss": 0.3564,
      "step": 1671
    },
    {
      "epoch": 0.26752,
      "grad_norm": 0.18082313239574432,
      "learning_rate": 0.0001,
      "loss": 0.3548,
      "step": 1672
    },
    {
      "epoch": 0.26768,
      "grad_norm": 0.16649657487869263,
      "learning_rate": 0.0001,
      "loss": 0.3567,
      "step": 1673
    },
    {
      "epoch": 0.26784,
      "grad_norm": 0.18206383287906647,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1674
    },
    {
      "epoch": 0.268,
      "grad_norm": 0.17337705194950104,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 1675
    },
    {
      "epoch": 0.26816,
      "grad_norm": 0.15002384781837463,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 1676
    },
    {
      "epoch": 0.26832,
      "grad_norm": 0.14129358530044556,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 1677
    },
    {
      "epoch": 0.26848,
      "grad_norm": 0.17469868063926697,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 1678
    },
    {
      "epoch": 0.26864,
      "grad_norm": 0.16064539551734924,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1679
    },
    {
      "epoch": 0.2688,
      "grad_norm": 0.153654545545578,
      "learning_rate": 0.0001,
      "loss": 0.3491,
      "step": 1680
    },
    {
      "epoch": 0.26896,
      "grad_norm": 0.17716437578201294,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1681
    },
    {
      "epoch": 0.26912,
      "grad_norm": 0.20295128226280212,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1682
    },
    {
      "epoch": 0.26928,
      "grad_norm": 0.22280006110668182,
      "learning_rate": 0.0001,
      "loss": 0.3539,
      "step": 1683
    },
    {
      "epoch": 0.26944,
      "grad_norm": 0.1579551249742508,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1684
    },
    {
      "epoch": 0.2696,
      "grad_norm": 0.20230911672115326,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 1685
    },
    {
      "epoch": 0.26976,
      "grad_norm": 0.22380389273166656,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1686
    },
    {
      "epoch": 0.26992,
      "grad_norm": 0.2316524237394333,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 1687
    },
    {
      "epoch": 0.27008,
      "grad_norm": 0.21764427423477173,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 1688
    },
    {
      "epoch": 0.27024,
      "grad_norm": 0.2354562133550644,
      "learning_rate": 0.0001,
      "loss": 0.3471,
      "step": 1689
    },
    {
      "epoch": 0.2704,
      "grad_norm": 0.24831363558769226,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1690
    },
    {
      "epoch": 0.27056,
      "grad_norm": 0.23493176698684692,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1691
    },
    {
      "epoch": 0.27072,
      "grad_norm": 0.16041527688503265,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 1692
    },
    {
      "epoch": 0.27088,
      "grad_norm": 0.18664489686489105,
      "learning_rate": 0.0001,
      "loss": 0.358,
      "step": 1693
    },
    {
      "epoch": 0.27104,
      "grad_norm": 0.21034939587116241,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 1694
    },
    {
      "epoch": 0.2712,
      "grad_norm": 0.16563768684864044,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 1695
    },
    {
      "epoch": 0.27136,
      "grad_norm": 0.18949581682682037,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 1696
    },
    {
      "epoch": 0.27152,
      "grad_norm": 0.16275089979171753,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 1697
    },
    {
      "epoch": 0.27168,
      "grad_norm": 0.1894388645887375,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 1698
    },
    {
      "epoch": 0.27184,
      "grad_norm": 0.19361016154289246,
      "learning_rate": 0.0001,
      "loss": 0.3543,
      "step": 1699
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.19207213819026947,
      "learning_rate": 0.0001,
      "loss": 0.3487,
      "step": 1700
    },
    {
      "epoch": 0.272,
      "eval_train_accuracy": 0.6966,
      "eval_train_loss": 0.34296315908432007,
      "eval_train_runtime": 4.5793,
      "eval_train_samples_per_second": 1091.86,
      "eval_train_steps_per_second": 13.757,
      "step": 1700
    },
    {
      "epoch": 0.272,
      "eval_test_accuracy": 0.694,
      "eval_test_loss": 0.3414507508277893,
      "eval_test_runtime": 4.5594,
      "eval_test_samples_per_second": 1096.634,
      "eval_test_steps_per_second": 13.818,
      "step": 1700
    },
    {
      "epoch": 0.27216,
      "grad_norm": 0.1632964313030243,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 1701
    },
    {
      "epoch": 0.27232,
      "grad_norm": 0.189178928732872,
      "learning_rate": 0.0001,
      "loss": 0.3525,
      "step": 1702
    },
    {
      "epoch": 0.27248,
      "grad_norm": 0.24454432725906372,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 1703
    },
    {
      "epoch": 0.27264,
      "grad_norm": 0.1633763313293457,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 1704
    },
    {
      "epoch": 0.2728,
      "grad_norm": 0.18274009227752686,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 1705
    },
    {
      "epoch": 0.27296,
      "grad_norm": 0.19296897947788239,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 1706
    },
    {
      "epoch": 0.27312,
      "grad_norm": 0.1745138168334961,
      "learning_rate": 0.0001,
      "loss": 0.3602,
      "step": 1707
    },
    {
      "epoch": 0.27328,
      "grad_norm": 0.19613400101661682,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1708
    },
    {
      "epoch": 0.27344,
      "grad_norm": 0.16799254715442657,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 1709
    },
    {
      "epoch": 0.2736,
      "grad_norm": 0.14886420965194702,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 1710
    },
    {
      "epoch": 0.27376,
      "grad_norm": 0.16532890498638153,
      "learning_rate": 0.0001,
      "loss": 0.348,
      "step": 1711
    },
    {
      "epoch": 0.27392,
      "grad_norm": 0.16102074086666107,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1712
    },
    {
      "epoch": 0.27408,
      "grad_norm": 0.14464373886585236,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 1713
    },
    {
      "epoch": 0.27424,
      "grad_norm": 0.16243354976177216,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 1714
    },
    {
      "epoch": 0.2744,
      "grad_norm": 0.15799672901630402,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 1715
    },
    {
      "epoch": 0.27456,
      "grad_norm": 0.18154165148735046,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 1716
    },
    {
      "epoch": 0.27472,
      "grad_norm": 0.14636746048927307,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1717
    },
    {
      "epoch": 0.27488,
      "grad_norm": 0.1963759958744049,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 1718
    },
    {
      "epoch": 0.27504,
      "grad_norm": 0.21650443971157074,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 1719
    },
    {
      "epoch": 0.2752,
      "grad_norm": 0.18174491822719574,
      "learning_rate": 0.0001,
      "loss": 0.3514,
      "step": 1720
    },
    {
      "epoch": 0.27536,
      "grad_norm": 0.4651813507080078,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 1721
    },
    {
      "epoch": 0.27552,
      "grad_norm": 0.26291969418525696,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1722
    },
    {
      "epoch": 0.27568,
      "grad_norm": 0.30212706327438354,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1723
    },
    {
      "epoch": 0.27584,
      "grad_norm": 0.17458923161029816,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1724
    },
    {
      "epoch": 0.276,
      "grad_norm": 0.18932496011257172,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 1725
    },
    {
      "epoch": 0.27616,
      "grad_norm": 0.20996540784835815,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 1726
    },
    {
      "epoch": 0.27632,
      "grad_norm": 0.21078020334243774,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 1727
    },
    {
      "epoch": 0.27648,
      "grad_norm": 0.15762080252170563,
      "learning_rate": 0.0001,
      "loss": 0.3646,
      "step": 1728
    },
    {
      "epoch": 0.27664,
      "grad_norm": 0.21615347266197205,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1729
    },
    {
      "epoch": 0.2768,
      "grad_norm": 0.22153638303279877,
      "learning_rate": 0.0001,
      "loss": 0.3512,
      "step": 1730
    },
    {
      "epoch": 0.27696,
      "grad_norm": 0.16210509836673737,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 1731
    },
    {
      "epoch": 0.27712,
      "grad_norm": 0.1489216685295105,
      "learning_rate": 0.0001,
      "loss": 0.3485,
      "step": 1732
    },
    {
      "epoch": 0.27728,
      "grad_norm": 0.16954851150512695,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 1733
    },
    {
      "epoch": 0.27744,
      "grad_norm": 0.14947688579559326,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 1734
    },
    {
      "epoch": 0.2776,
      "grad_norm": 0.1712241917848587,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 1735
    },
    {
      "epoch": 0.27776,
      "grad_norm": 0.20308330655097961,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 1736
    },
    {
      "epoch": 0.27792,
      "grad_norm": 0.17427921295166016,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 1737
    },
    {
      "epoch": 0.27808,
      "grad_norm": 0.22052687406539917,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 1738
    },
    {
      "epoch": 0.27824,
      "grad_norm": 0.15904022753238678,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1739
    },
    {
      "epoch": 0.2784,
      "grad_norm": 0.18230977654457092,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1740
    },
    {
      "epoch": 0.27856,
      "grad_norm": 0.1780400276184082,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1741
    },
    {
      "epoch": 0.27872,
      "grad_norm": 0.18245644867420197,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1742
    },
    {
      "epoch": 0.27888,
      "grad_norm": 0.15424342453479767,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 1743
    },
    {
      "epoch": 0.27904,
      "grad_norm": 0.16920751333236694,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1744
    },
    {
      "epoch": 0.2792,
      "grad_norm": 0.162009596824646,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1745
    },
    {
      "epoch": 0.27936,
      "grad_norm": 0.1578560173511505,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 1746
    },
    {
      "epoch": 0.27952,
      "grad_norm": 0.13606519997119904,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 1747
    },
    {
      "epoch": 0.27968,
      "grad_norm": 0.13960865139961243,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1748
    },
    {
      "epoch": 0.27984,
      "grad_norm": 0.1705155074596405,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 1749
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.15104377269744873,
      "learning_rate": 0.0001,
      "loss": 0.3389,
      "step": 1750
    },
    {
      "epoch": 0.28016,
      "grad_norm": 0.14202472567558289,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1751
    },
    {
      "epoch": 0.28032,
      "grad_norm": 0.17785970866680145,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1752
    },
    {
      "epoch": 0.28048,
      "grad_norm": 0.187936469912529,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 1753
    },
    {
      "epoch": 0.28064,
      "grad_norm": 0.13731598854064941,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 1754
    },
    {
      "epoch": 0.2808,
      "grad_norm": 0.13122272491455078,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 1755
    },
    {
      "epoch": 0.28096,
      "grad_norm": 0.15188203752040863,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1756
    },
    {
      "epoch": 0.28112,
      "grad_norm": 0.17910556495189667,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 1757
    },
    {
      "epoch": 0.28128,
      "grad_norm": 0.20646247267723083,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 1758
    },
    {
      "epoch": 0.28144,
      "grad_norm": 0.16944929957389832,
      "learning_rate": 0.0001,
      "loss": 0.3465,
      "step": 1759
    },
    {
      "epoch": 0.2816,
      "grad_norm": 0.1589338481426239,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 1760
    },
    {
      "epoch": 0.28176,
      "grad_norm": 0.14035025238990784,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1761
    },
    {
      "epoch": 0.28192,
      "grad_norm": 0.1494527906179428,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1762
    },
    {
      "epoch": 0.28208,
      "grad_norm": 0.19674667716026306,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 1763
    },
    {
      "epoch": 0.28224,
      "grad_norm": 0.1749802976846695,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 1764
    },
    {
      "epoch": 0.2824,
      "grad_norm": 0.1876346617937088,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1765
    },
    {
      "epoch": 0.28256,
      "grad_norm": 0.15768548846244812,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1766
    },
    {
      "epoch": 0.28272,
      "grad_norm": 0.14151331782341003,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 1767
    },
    {
      "epoch": 0.28288,
      "grad_norm": 0.17766200006008148,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 1768
    },
    {
      "epoch": 0.28304,
      "grad_norm": 0.15581117570400238,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 1769
    },
    {
      "epoch": 0.2832,
      "grad_norm": 0.16638077795505524,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1770
    },
    {
      "epoch": 0.28336,
      "grad_norm": 0.20707207918167114,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 1771
    },
    {
      "epoch": 0.28352,
      "grad_norm": 0.18255403637886047,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 1772
    },
    {
      "epoch": 0.28368,
      "grad_norm": 0.20492340624332428,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 1773
    },
    {
      "epoch": 0.28384,
      "grad_norm": 0.18612144887447357,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1774
    },
    {
      "epoch": 0.284,
      "grad_norm": 0.14476235210895538,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 1775
    },
    {
      "epoch": 0.28416,
      "grad_norm": 0.1710672825574875,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 1776
    },
    {
      "epoch": 0.28432,
      "grad_norm": 0.18019647896289825,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 1777
    },
    {
      "epoch": 0.28448,
      "grad_norm": 0.1595280021429062,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 1778
    },
    {
      "epoch": 0.28464,
      "grad_norm": 0.16585205495357513,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 1779
    },
    {
      "epoch": 0.2848,
      "grad_norm": 0.1621883511543274,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 1780
    },
    {
      "epoch": 0.28496,
      "grad_norm": 0.14904218912124634,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 1781
    },
    {
      "epoch": 0.28512,
      "grad_norm": 0.19407914578914642,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 1782
    },
    {
      "epoch": 0.28528,
      "grad_norm": 0.13262015581130981,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 1783
    },
    {
      "epoch": 0.28544,
      "grad_norm": 0.17345328629016876,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 1784
    },
    {
      "epoch": 0.2856,
      "grad_norm": 0.16583772003650665,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 1785
    },
    {
      "epoch": 0.28576,
      "grad_norm": 0.15629881620407104,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1786
    },
    {
      "epoch": 0.28592,
      "grad_norm": 0.1781477928161621,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 1787
    },
    {
      "epoch": 0.28608,
      "grad_norm": 0.14976753294467926,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 1788
    },
    {
      "epoch": 0.28624,
      "grad_norm": 0.14336884021759033,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1789
    },
    {
      "epoch": 0.2864,
      "grad_norm": 0.13539260625839233,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 1790
    },
    {
      "epoch": 0.28656,
      "grad_norm": 0.1563113033771515,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1791
    },
    {
      "epoch": 0.28672,
      "grad_norm": 0.16632422804832458,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 1792
    },
    {
      "epoch": 0.28688,
      "grad_norm": 0.22350186109542847,
      "learning_rate": 0.0001,
      "loss": 0.3594,
      "step": 1793
    },
    {
      "epoch": 0.28704,
      "grad_norm": 0.14431458711624146,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 1794
    },
    {
      "epoch": 0.2872,
      "grad_norm": 0.3749196529388428,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 1795
    },
    {
      "epoch": 0.28736,
      "grad_norm": 0.16446417570114136,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1796
    },
    {
      "epoch": 0.28752,
      "grad_norm": 0.4039355218410492,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 1797
    },
    {
      "epoch": 0.28768,
      "grad_norm": 0.17373283207416534,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 1798
    },
    {
      "epoch": 0.28784,
      "grad_norm": 0.28464654088020325,
      "learning_rate": 0.0001,
      "loss": 0.3474,
      "step": 1799
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.19490619003772736,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1800
    },
    {
      "epoch": 0.288,
      "eval_train_accuracy": 0.8164,
      "eval_train_loss": 0.3394676446914673,
      "eval_train_runtime": 4.5108,
      "eval_train_samples_per_second": 1108.444,
      "eval_train_steps_per_second": 13.966,
      "step": 1800
    },
    {
      "epoch": 0.288,
      "eval_test_accuracy": 0.815,
      "eval_test_loss": 0.33784618973731995,
      "eval_test_runtime": 4.7506,
      "eval_test_samples_per_second": 1052.501,
      "eval_test_steps_per_second": 13.262,
      "step": 1800
    },
    {
      "epoch": 0.28816,
      "grad_norm": 0.19879353046417236,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 1801
    },
    {
      "epoch": 0.28832,
      "grad_norm": 0.14371992647647858,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 1802
    },
    {
      "epoch": 0.28848,
      "grad_norm": 0.18179430067539215,
      "learning_rate": 0.0001,
      "loss": 0.3504,
      "step": 1803
    },
    {
      "epoch": 0.28864,
      "grad_norm": 0.1413845717906952,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 1804
    },
    {
      "epoch": 0.2888,
      "grad_norm": 0.18833886086940765,
      "learning_rate": 0.0001,
      "loss": 0.3481,
      "step": 1805
    },
    {
      "epoch": 0.28896,
      "grad_norm": 0.14708535373210907,
      "learning_rate": 0.0001,
      "loss": 0.3514,
      "step": 1806
    },
    {
      "epoch": 0.28912,
      "grad_norm": 0.16988222301006317,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1807
    },
    {
      "epoch": 0.28928,
      "grad_norm": 0.17044387757778168,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 1808
    },
    {
      "epoch": 0.28944,
      "grad_norm": 0.1260722577571869,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 1809
    },
    {
      "epoch": 0.2896,
      "grad_norm": 0.15545949339866638,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 1810
    },
    {
      "epoch": 0.28976,
      "grad_norm": 0.14901818335056305,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1811
    },
    {
      "epoch": 0.28992,
      "grad_norm": 0.2119348645210266,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 1812
    },
    {
      "epoch": 0.29008,
      "grad_norm": 0.19570395350456238,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 1813
    },
    {
      "epoch": 0.29024,
      "grad_norm": 0.12746445834636688,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1814
    },
    {
      "epoch": 0.2904,
      "grad_norm": 0.1893116682767868,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 1815
    },
    {
      "epoch": 0.29056,
      "grad_norm": 0.15613190829753876,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1816
    },
    {
      "epoch": 0.29072,
      "grad_norm": 0.15974873304367065,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 1817
    },
    {
      "epoch": 0.29088,
      "grad_norm": 0.3102336823940277,
      "learning_rate": 0.0001,
      "loss": 0.3521,
      "step": 1818
    },
    {
      "epoch": 0.29104,
      "grad_norm": 0.18003155291080475,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1819
    },
    {
      "epoch": 0.2912,
      "grad_norm": 0.35327181220054626,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 1820
    },
    {
      "epoch": 0.29136,
      "grad_norm": 0.14826473593711853,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 1821
    },
    {
      "epoch": 0.29152,
      "grad_norm": 0.17364034056663513,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 1822
    },
    {
      "epoch": 0.29168,
      "grad_norm": 0.219793438911438,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 1823
    },
    {
      "epoch": 0.29184,
      "grad_norm": 0.17137321829795837,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1824
    },
    {
      "epoch": 0.292,
      "grad_norm": 0.15326464176177979,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 1825
    },
    {
      "epoch": 0.29216,
      "grad_norm": 0.18236330151557922,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 1826
    },
    {
      "epoch": 0.29232,
      "grad_norm": 0.4080602824687958,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 1827
    },
    {
      "epoch": 0.29248,
      "grad_norm": 0.13963188230991364,
      "learning_rate": 0.0001,
      "loss": 0.3523,
      "step": 1828
    },
    {
      "epoch": 0.29264,
      "grad_norm": 0.16348350048065186,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 1829
    },
    {
      "epoch": 0.2928,
      "grad_norm": 0.2222730964422226,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 1830
    },
    {
      "epoch": 0.29296,
      "grad_norm": 0.1825064718723297,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1831
    },
    {
      "epoch": 0.29312,
      "grad_norm": 0.14736276865005493,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1832
    },
    {
      "epoch": 0.29328,
      "grad_norm": 0.15811830759048462,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 1833
    },
    {
      "epoch": 0.29344,
      "grad_norm": 0.16092155873775482,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 1834
    },
    {
      "epoch": 0.2936,
      "grad_norm": 0.15184345841407776,
      "learning_rate": 0.0001,
      "loss": 0.3515,
      "step": 1835
    },
    {
      "epoch": 0.29376,
      "grad_norm": 0.16194282472133636,
      "learning_rate": 0.0001,
      "loss": 0.355,
      "step": 1836
    },
    {
      "epoch": 0.29392,
      "grad_norm": 0.24742907285690308,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 1837
    },
    {
      "epoch": 0.29408,
      "grad_norm": 0.14219307899475098,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 1838
    },
    {
      "epoch": 0.29424,
      "grad_norm": 0.1731870472431183,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 1839
    },
    {
      "epoch": 0.2944,
      "grad_norm": 0.17462001740932465,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 1840
    },
    {
      "epoch": 0.29456,
      "grad_norm": 0.16005069017410278,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1841
    },
    {
      "epoch": 0.29472,
      "grad_norm": 0.1475525051355362,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 1842
    },
    {
      "epoch": 0.29488,
      "grad_norm": 0.13828584551811218,
      "learning_rate": 0.0001,
      "loss": 0.3547,
      "step": 1843
    },
    {
      "epoch": 0.29504,
      "grad_norm": 0.1863085776567459,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 1844
    },
    {
      "epoch": 0.2952,
      "grad_norm": 0.14414145052433014,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 1845
    },
    {
      "epoch": 0.29536,
      "grad_norm": 0.1600216031074524,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 1846
    },
    {
      "epoch": 0.29552,
      "grad_norm": 0.16172164678573608,
      "learning_rate": 0.0001,
      "loss": 0.3506,
      "step": 1847
    },
    {
      "epoch": 0.29568,
      "grad_norm": 0.1872839331626892,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 1848
    },
    {
      "epoch": 0.29584,
      "grad_norm": 0.1663896143436432,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 1849
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.14522941410541534,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1850
    },
    {
      "epoch": 0.29616,
      "grad_norm": 0.16770309209823608,
      "learning_rate": 0.0001,
      "loss": 0.3555,
      "step": 1851
    },
    {
      "epoch": 0.29632,
      "grad_norm": 0.13187357783317566,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 1852
    },
    {
      "epoch": 0.29648,
      "grad_norm": 0.14287321269512177,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 1853
    },
    {
      "epoch": 0.29664,
      "grad_norm": 0.14022250473499298,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 1854
    },
    {
      "epoch": 0.2968,
      "grad_norm": 0.16702327132225037,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 1855
    },
    {
      "epoch": 0.29696,
      "grad_norm": 0.14403405785560608,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 1856
    },
    {
      "epoch": 0.29712,
      "grad_norm": 0.1674373894929886,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 1857
    },
    {
      "epoch": 0.29728,
      "grad_norm": 0.13365668058395386,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 1858
    },
    {
      "epoch": 0.29744,
      "grad_norm": 0.1826944351196289,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 1859
    },
    {
      "epoch": 0.2976,
      "grad_norm": 0.13832996785640717,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 1860
    },
    {
      "epoch": 0.29776,
      "grad_norm": 0.14826028048992157,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 1861
    },
    {
      "epoch": 0.29792,
      "grad_norm": 0.17729789018630981,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 1862
    },
    {
      "epoch": 0.29808,
      "grad_norm": 0.18475595116615295,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1863
    },
    {
      "epoch": 0.29824,
      "grad_norm": 0.15777352452278137,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 1864
    },
    {
      "epoch": 0.2984,
      "grad_norm": 0.15828467905521393,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1865
    },
    {
      "epoch": 0.29856,
      "grad_norm": 0.2014952450990677,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 1866
    },
    {
      "epoch": 0.29872,
      "grad_norm": 0.16779930889606476,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 1867
    },
    {
      "epoch": 0.29888,
      "grad_norm": 0.1432652622461319,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1868
    },
    {
      "epoch": 0.29904,
      "grad_norm": 0.1564362496137619,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 1869
    },
    {
      "epoch": 0.2992,
      "grad_norm": 0.17575319111347198,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1870
    },
    {
      "epoch": 0.29936,
      "grad_norm": 0.12390152364969254,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 1871
    },
    {
      "epoch": 0.29952,
      "grad_norm": 0.15960437059402466,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 1872
    },
    {
      "epoch": 0.29968,
      "grad_norm": 0.15476514399051666,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 1873
    },
    {
      "epoch": 0.29984,
      "grad_norm": 0.17301344871520996,
      "learning_rate": 0.0001,
      "loss": 0.3478,
      "step": 1874
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.15817879140377045,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 1875
    },
    {
      "epoch": 0.30016,
      "grad_norm": 0.15694767236709595,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 1876
    },
    {
      "epoch": 0.30032,
      "grad_norm": 0.14694255590438843,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1877
    },
    {
      "epoch": 0.30048,
      "grad_norm": 0.21951080858707428,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 1878
    },
    {
      "epoch": 0.30064,
      "grad_norm": 0.15685079991817474,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 1879
    },
    {
      "epoch": 0.3008,
      "grad_norm": 0.16740640997886658,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 1880
    },
    {
      "epoch": 0.30096,
      "grad_norm": 0.16498494148254395,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 1881
    },
    {
      "epoch": 0.30112,
      "grad_norm": 0.1794658750295639,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 1882
    },
    {
      "epoch": 0.30128,
      "grad_norm": 0.13969632983207703,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 1883
    },
    {
      "epoch": 0.30144,
      "grad_norm": 0.14960704743862152,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1884
    },
    {
      "epoch": 0.3016,
      "grad_norm": 0.17347091436386108,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "step": 1885
    },
    {
      "epoch": 0.30176,
      "grad_norm": 0.18035787343978882,
      "learning_rate": 0.0001,
      "loss": 0.3549,
      "step": 1886
    },
    {
      "epoch": 0.30192,
      "grad_norm": 0.16255931556224823,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1887
    },
    {
      "epoch": 0.30208,
      "grad_norm": 0.1580977886915207,
      "learning_rate": 0.0001,
      "loss": 0.3473,
      "step": 1888
    },
    {
      "epoch": 0.30224,
      "grad_norm": 0.13177219033241272,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1889
    },
    {
      "epoch": 0.3024,
      "grad_norm": 0.20129166543483734,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 1890
    },
    {
      "epoch": 0.30256,
      "grad_norm": 0.1461314707994461,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 1891
    },
    {
      "epoch": 0.30272,
      "grad_norm": 0.14834676682949066,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1892
    },
    {
      "epoch": 0.30288,
      "grad_norm": 0.18007290363311768,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 1893
    },
    {
      "epoch": 0.30304,
      "grad_norm": 0.1464102417230606,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1894
    },
    {
      "epoch": 0.3032,
      "grad_norm": 0.16258031129837036,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 1895
    },
    {
      "epoch": 0.30336,
      "grad_norm": 0.14805728197097778,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 1896
    },
    {
      "epoch": 0.30352,
      "grad_norm": 0.16233503818511963,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 1897
    },
    {
      "epoch": 0.30368,
      "grad_norm": 0.22522184252738953,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 1898
    },
    {
      "epoch": 0.30384,
      "grad_norm": 0.14775247871875763,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 1899
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.13630586862564087,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 1900
    },
    {
      "epoch": 0.304,
      "eval_train_accuracy": 0.904,
      "eval_train_loss": 0.33796826004981995,
      "eval_train_runtime": 4.4091,
      "eval_train_samples_per_second": 1134.028,
      "eval_train_steps_per_second": 14.289,
      "step": 1900
    },
    {
      "epoch": 0.304,
      "eval_test_accuracy": 0.8954,
      "eval_test_loss": 0.3368259072303772,
      "eval_test_runtime": 5.0228,
      "eval_test_samples_per_second": 995.456,
      "eval_test_steps_per_second": 12.543,
      "step": 1900
    },
    {
      "epoch": 0.30416,
      "grad_norm": 0.28149735927581787,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 1901
    },
    {
      "epoch": 0.30432,
      "grad_norm": 0.1615094691514969,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 1902
    },
    {
      "epoch": 0.30448,
      "grad_norm": 0.21008367836475372,
      "learning_rate": 0.0001,
      "loss": 0.3496,
      "step": 1903
    },
    {
      "epoch": 0.30464,
      "grad_norm": 0.18814995884895325,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1904
    },
    {
      "epoch": 0.3048,
      "grad_norm": 0.20540356636047363,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1905
    },
    {
      "epoch": 0.30496,
      "grad_norm": 0.14734883606433868,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 1906
    },
    {
      "epoch": 0.30512,
      "grad_norm": 0.13659638166427612,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 1907
    },
    {
      "epoch": 0.30528,
      "grad_norm": 0.16549979150295258,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 1908
    },
    {
      "epoch": 0.30544,
      "grad_norm": 0.16135165095329285,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1909
    },
    {
      "epoch": 0.3056,
      "grad_norm": 0.26480478048324585,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 1910
    },
    {
      "epoch": 0.30576,
      "grad_norm": 0.18876224756240845,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 1911
    },
    {
      "epoch": 0.30592,
      "grad_norm": 0.146495521068573,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 1912
    },
    {
      "epoch": 0.30608,
      "grad_norm": 0.233910471200943,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 1913
    },
    {
      "epoch": 0.30624,
      "grad_norm": 0.17579379677772522,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 1914
    },
    {
      "epoch": 0.3064,
      "grad_norm": 0.14086273312568665,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 1915
    },
    {
      "epoch": 0.30656,
      "grad_norm": 0.1467439979314804,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 1916
    },
    {
      "epoch": 0.30672,
      "grad_norm": 0.16310162842273712,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 1917
    },
    {
      "epoch": 0.30688,
      "grad_norm": 0.20750784873962402,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 1918
    },
    {
      "epoch": 0.30704,
      "grad_norm": 0.18381932377815247,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 1919
    },
    {
      "epoch": 0.3072,
      "grad_norm": 0.1511673778295517,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 1920
    },
    {
      "epoch": 0.30736,
      "grad_norm": 0.27568209171295166,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 1921
    },
    {
      "epoch": 0.30752,
      "grad_norm": 0.1736854910850525,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 1922
    },
    {
      "epoch": 0.30768,
      "grad_norm": 0.1710733324289322,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 1923
    },
    {
      "epoch": 0.30784,
      "grad_norm": 0.315233051776886,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1924
    },
    {
      "epoch": 0.308,
      "grad_norm": 0.1347796618938446,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 1925
    },
    {
      "epoch": 0.30816,
      "grad_norm": 0.147870272397995,
      "learning_rate": 0.0001,
      "loss": 0.3514,
      "step": 1926
    },
    {
      "epoch": 0.30832,
      "grad_norm": 0.24625450372695923,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 1927
    },
    {
      "epoch": 0.30848,
      "grad_norm": 0.1847020834684372,
      "learning_rate": 0.0001,
      "loss": 0.3536,
      "step": 1928
    },
    {
      "epoch": 0.30864,
      "grad_norm": 0.12744919955730438,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 1929
    },
    {
      "epoch": 0.3088,
      "grad_norm": 0.18854527175426483,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 1930
    },
    {
      "epoch": 0.30896,
      "grad_norm": 0.15179388225078583,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 1931
    },
    {
      "epoch": 0.30912,
      "grad_norm": 0.20911262929439545,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 1932
    },
    {
      "epoch": 0.30928,
      "grad_norm": 0.16345389187335968,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1933
    },
    {
      "epoch": 0.30944,
      "grad_norm": 0.1622992902994156,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 1934
    },
    {
      "epoch": 0.3096,
      "grad_norm": 0.15968111157417297,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 1935
    },
    {
      "epoch": 0.30976,
      "grad_norm": 0.1570403128862381,
      "learning_rate": 0.0001,
      "loss": 0.3488,
      "step": 1936
    },
    {
      "epoch": 0.30992,
      "grad_norm": 0.15106318891048431,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 1937
    },
    {
      "epoch": 0.31008,
      "grad_norm": 0.15396498143672943,
      "learning_rate": 0.0001,
      "loss": 0.3478,
      "step": 1938
    },
    {
      "epoch": 0.31024,
      "grad_norm": 0.1301126331090927,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 1939
    },
    {
      "epoch": 0.3104,
      "grad_norm": 0.14787845313549042,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 1940
    },
    {
      "epoch": 0.31056,
      "grad_norm": 0.1297646015882492,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 1941
    },
    {
      "epoch": 0.31072,
      "grad_norm": 0.1602688431739807,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 1942
    },
    {
      "epoch": 0.31088,
      "grad_norm": 0.14919057488441467,
      "learning_rate": 0.0001,
      "loss": 0.3539,
      "step": 1943
    },
    {
      "epoch": 0.31104,
      "grad_norm": 0.1328352987766266,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 1944
    },
    {
      "epoch": 0.3112,
      "grad_norm": 0.15390263497829437,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 1945
    },
    {
      "epoch": 0.31136,
      "grad_norm": 0.14806567132472992,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 1946
    },
    {
      "epoch": 0.31152,
      "grad_norm": 0.15237195789813995,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1947
    },
    {
      "epoch": 0.31168,
      "grad_norm": 0.14733102917671204,
      "learning_rate": 0.0001,
      "loss": 0.3525,
      "step": 1948
    },
    {
      "epoch": 0.31184,
      "grad_norm": 0.14521557092666626,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1949
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.11488381028175354,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 1950
    },
    {
      "epoch": 0.31216,
      "grad_norm": 0.1641419529914856,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 1951
    },
    {
      "epoch": 0.31232,
      "grad_norm": 0.16461753845214844,
      "learning_rate": 0.0001,
      "loss": 0.3479,
      "step": 1952
    },
    {
      "epoch": 0.31248,
      "grad_norm": 0.11967292428016663,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 1953
    },
    {
      "epoch": 0.31264,
      "grad_norm": 0.1293514519929886,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 1954
    },
    {
      "epoch": 0.3128,
      "grad_norm": 0.15823550522327423,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1955
    },
    {
      "epoch": 0.31296,
      "grad_norm": 0.13247311115264893,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 1956
    },
    {
      "epoch": 0.31312,
      "grad_norm": 0.11801761388778687,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 1957
    },
    {
      "epoch": 0.31328,
      "grad_norm": 0.12409716099500656,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 1958
    },
    {
      "epoch": 0.31344,
      "grad_norm": 0.11972589790821075,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 1959
    },
    {
      "epoch": 0.3136,
      "grad_norm": 0.23110666871070862,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 1960
    },
    {
      "epoch": 0.31376,
      "grad_norm": 0.12995080649852753,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 1961
    },
    {
      "epoch": 0.31392,
      "grad_norm": 0.11840826272964478,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 1962
    },
    {
      "epoch": 0.31408,
      "grad_norm": 0.1521545946598053,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 1963
    },
    {
      "epoch": 0.31424,
      "grad_norm": 0.18324162065982819,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 1964
    },
    {
      "epoch": 0.3144,
      "grad_norm": 0.2926582098007202,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 1965
    },
    {
      "epoch": 0.31456,
      "grad_norm": 0.12158892303705215,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 1966
    },
    {
      "epoch": 0.31472,
      "grad_norm": 0.18023501336574554,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 1967
    },
    {
      "epoch": 0.31488,
      "grad_norm": 0.363037645816803,
      "learning_rate": 0.0001,
      "loss": 0.3493,
      "step": 1968
    },
    {
      "epoch": 0.31504,
      "grad_norm": 0.16137054562568665,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 1969
    },
    {
      "epoch": 0.3152,
      "grad_norm": 0.23878902196884155,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 1970
    },
    {
      "epoch": 0.31536,
      "grad_norm": 0.2792661488056183,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 1971
    },
    {
      "epoch": 0.31552,
      "grad_norm": 0.19086770713329315,
      "learning_rate": 0.0001,
      "loss": 0.3506,
      "step": 1972
    },
    {
      "epoch": 0.31568,
      "grad_norm": 0.14430609345436096,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 1973
    },
    {
      "epoch": 0.31584,
      "grad_norm": 0.2523614168167114,
      "learning_rate": 0.0001,
      "loss": 0.352,
      "step": 1974
    },
    {
      "epoch": 0.316,
      "grad_norm": 0.26380372047424316,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 1975
    },
    {
      "epoch": 0.31616,
      "grad_norm": 0.14385998249053955,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 1976
    },
    {
      "epoch": 0.31632,
      "grad_norm": 0.14705437421798706,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 1977
    },
    {
      "epoch": 0.31648,
      "grad_norm": 0.3126715421676636,
      "learning_rate": 0.0001,
      "loss": 0.3679,
      "step": 1978
    },
    {
      "epoch": 0.31664,
      "grad_norm": 0.198294997215271,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 1979
    },
    {
      "epoch": 0.3168,
      "grad_norm": 0.20611244440078735,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 1980
    },
    {
      "epoch": 0.31696,
      "grad_norm": 0.2093082070350647,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 1981
    },
    {
      "epoch": 0.31712,
      "grad_norm": 0.18941590189933777,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 1982
    },
    {
      "epoch": 0.31728,
      "grad_norm": 0.16468265652656555,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 1983
    },
    {
      "epoch": 0.31744,
      "grad_norm": 0.17012548446655273,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 1984
    },
    {
      "epoch": 0.3176,
      "grad_norm": 0.1578339785337448,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 1985
    },
    {
      "epoch": 0.31776,
      "grad_norm": 0.16033680737018585,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 1986
    },
    {
      "epoch": 0.31792,
      "grad_norm": 0.13359513878822327,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 1987
    },
    {
      "epoch": 0.31808,
      "grad_norm": 0.17174032330513,
      "learning_rate": 0.0001,
      "loss": 0.353,
      "step": 1988
    },
    {
      "epoch": 0.31824,
      "grad_norm": 0.14965596795082092,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 1989
    },
    {
      "epoch": 0.3184,
      "grad_norm": 0.13623961806297302,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 1990
    },
    {
      "epoch": 0.31856,
      "grad_norm": 0.1306723952293396,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 1991
    },
    {
      "epoch": 0.31872,
      "grad_norm": 0.11080466210842133,
      "learning_rate": 0.0001,
      "loss": 0.353,
      "step": 1992
    },
    {
      "epoch": 0.31888,
      "grad_norm": 0.14562608301639557,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 1993
    },
    {
      "epoch": 0.31904,
      "grad_norm": 0.1838517040014267,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 1994
    },
    {
      "epoch": 0.3192,
      "grad_norm": 0.1364324986934662,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 1995
    },
    {
      "epoch": 0.31936,
      "grad_norm": 0.15167029201984406,
      "learning_rate": 0.0001,
      "loss": 0.3451,
      "step": 1996
    },
    {
      "epoch": 0.31952,
      "grad_norm": 0.11794228106737137,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 1997
    },
    {
      "epoch": 0.31968,
      "grad_norm": 0.1349457949399948,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 1998
    },
    {
      "epoch": 0.31984,
      "grad_norm": 0.14547085762023926,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 1999
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.1752459853887558,
      "learning_rate": 0.0001,
      "loss": 0.3554,
      "step": 2000
    },
    {
      "epoch": 0.32,
      "eval_train_accuracy": 0.9094,
      "eval_train_loss": 0.3365224003791809,
      "eval_train_runtime": 4.7643,
      "eval_train_samples_per_second": 1049.466,
      "eval_train_steps_per_second": 13.223,
      "step": 2000
    },
    {
      "epoch": 0.32,
      "eval_test_accuracy": 0.907,
      "eval_test_loss": 0.3350398540496826,
      "eval_test_runtime": 4.3774,
      "eval_test_samples_per_second": 1142.225,
      "eval_test_steps_per_second": 14.392,
      "step": 2000
    },
    {
      "epoch": 0.32016,
      "grad_norm": 0.1515035778284073,
      "learning_rate": 0.0001,
      "loss": 0.3382,
      "step": 2001
    },
    {
      "epoch": 0.32032,
      "grad_norm": 0.16249513626098633,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 2002
    },
    {
      "epoch": 0.32048,
      "grad_norm": 0.11571888625621796,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2003
    },
    {
      "epoch": 0.32064,
      "grad_norm": 0.1478504240512848,
      "learning_rate": 0.0001,
      "loss": 0.3502,
      "step": 2004
    },
    {
      "epoch": 0.3208,
      "grad_norm": 0.1295798122882843,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 2005
    },
    {
      "epoch": 0.32096,
      "grad_norm": 0.11879619210958481,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 2006
    },
    {
      "epoch": 0.32112,
      "grad_norm": 0.15846753120422363,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2007
    },
    {
      "epoch": 0.32128,
      "grad_norm": 0.1552589386701584,
      "learning_rate": 0.0001,
      "loss": 0.3476,
      "step": 2008
    },
    {
      "epoch": 0.32144,
      "grad_norm": 0.1459657996892929,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2009
    },
    {
      "epoch": 0.3216,
      "grad_norm": 0.12455055862665176,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2010
    },
    {
      "epoch": 0.32176,
      "grad_norm": 0.18192075192928314,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 2011
    },
    {
      "epoch": 0.32192,
      "grad_norm": 0.14689567685127258,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2012
    },
    {
      "epoch": 0.32208,
      "grad_norm": 0.11968302726745605,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2013
    },
    {
      "epoch": 0.32224,
      "grad_norm": 0.14310069382190704,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2014
    },
    {
      "epoch": 0.3224,
      "grad_norm": 0.1365279108285904,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2015
    },
    {
      "epoch": 0.32256,
      "grad_norm": 0.14341634511947632,
      "learning_rate": 0.0001,
      "loss": 0.3464,
      "step": 2016
    },
    {
      "epoch": 0.32272,
      "grad_norm": 0.12833474576473236,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2017
    },
    {
      "epoch": 0.32288,
      "grad_norm": 0.13040444254875183,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2018
    },
    {
      "epoch": 0.32304,
      "grad_norm": 0.14652569591999054,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 2019
    },
    {
      "epoch": 0.3232,
      "grad_norm": 0.13746605813503265,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2020
    },
    {
      "epoch": 0.32336,
      "grad_norm": 0.10779692977666855,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2021
    },
    {
      "epoch": 0.32352,
      "grad_norm": 0.11008882522583008,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2022
    },
    {
      "epoch": 0.32368,
      "grad_norm": 0.11581708490848541,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 2023
    },
    {
      "epoch": 0.32384,
      "grad_norm": 0.15518437325954437,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2024
    },
    {
      "epoch": 0.324,
      "grad_norm": 0.14434398710727692,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 2025
    },
    {
      "epoch": 0.32416,
      "grad_norm": 0.1242397129535675,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2026
    },
    {
      "epoch": 0.32432,
      "grad_norm": 0.13565485179424286,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 2027
    },
    {
      "epoch": 0.32448,
      "grad_norm": 0.15023988485336304,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 2028
    },
    {
      "epoch": 0.32464,
      "grad_norm": 0.15056736767292023,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2029
    },
    {
      "epoch": 0.3248,
      "grad_norm": 0.16038332879543304,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 2030
    },
    {
      "epoch": 0.32496,
      "grad_norm": 0.16563722491264343,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 2031
    },
    {
      "epoch": 0.32512,
      "grad_norm": 0.11108148843050003,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2032
    },
    {
      "epoch": 0.32528,
      "grad_norm": 0.16986936330795288,
      "learning_rate": 0.0001,
      "loss": 0.346,
      "step": 2033
    },
    {
      "epoch": 0.32544,
      "grad_norm": 0.1311723291873932,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 2034
    },
    {
      "epoch": 0.3256,
      "grad_norm": 0.1350652575492859,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 2035
    },
    {
      "epoch": 0.32576,
      "grad_norm": 0.14887943863868713,
      "learning_rate": 0.0001,
      "loss": 0.3513,
      "step": 2036
    },
    {
      "epoch": 0.32592,
      "grad_norm": 0.13140954077243805,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 2037
    },
    {
      "epoch": 0.32608,
      "grad_norm": 0.17408981919288635,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2038
    },
    {
      "epoch": 0.32624,
      "grad_norm": 0.11831106245517731,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2039
    },
    {
      "epoch": 0.3264,
      "grad_norm": 0.25713980197906494,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 2040
    },
    {
      "epoch": 0.32656,
      "grad_norm": 0.12455353140830994,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 2041
    },
    {
      "epoch": 0.32672,
      "grad_norm": 0.14036259055137634,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2042
    },
    {
      "epoch": 0.32688,
      "grad_norm": 0.1316937506198883,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2043
    },
    {
      "epoch": 0.32704,
      "grad_norm": 0.22190017998218536,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 2044
    },
    {
      "epoch": 0.3272,
      "grad_norm": 0.14121413230895996,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 2045
    },
    {
      "epoch": 0.32736,
      "grad_norm": 0.2598241865634918,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2046
    },
    {
      "epoch": 0.32752,
      "grad_norm": 0.15444494783878326,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 2047
    },
    {
      "epoch": 0.32768,
      "grad_norm": 0.16239769756793976,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2048
    },
    {
      "epoch": 0.32784,
      "grad_norm": 0.1563197523355484,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2049
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.15579135715961456,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 2050
    },
    {
      "epoch": 0.32816,
      "grad_norm": 0.13976603746414185,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 2051
    },
    {
      "epoch": 0.32832,
      "grad_norm": 0.16126833856105804,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2052
    },
    {
      "epoch": 0.32848,
      "grad_norm": 0.1978561282157898,
      "learning_rate": 0.0001,
      "loss": 0.3517,
      "step": 2053
    },
    {
      "epoch": 0.32864,
      "grad_norm": 0.14485414326190948,
      "learning_rate": 0.0001,
      "loss": 0.3487,
      "step": 2054
    },
    {
      "epoch": 0.3288,
      "grad_norm": 0.14209972321987152,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2055
    },
    {
      "epoch": 0.32896,
      "grad_norm": 0.17549721896648407,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2056
    },
    {
      "epoch": 0.32912,
      "grad_norm": 0.15604008734226227,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2057
    },
    {
      "epoch": 0.32928,
      "grad_norm": 0.14271250367164612,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2058
    },
    {
      "epoch": 0.32944,
      "grad_norm": 0.16497546434402466,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2059
    },
    {
      "epoch": 0.3296,
      "grad_norm": 0.10523578524589539,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2060
    },
    {
      "epoch": 0.32976,
      "grad_norm": 0.1794167309999466,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2061
    },
    {
      "epoch": 0.32992,
      "grad_norm": 0.1651395857334137,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 2062
    },
    {
      "epoch": 0.33008,
      "grad_norm": 0.1364685297012329,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 2063
    },
    {
      "epoch": 0.33024,
      "grad_norm": 0.21755656599998474,
      "learning_rate": 0.0001,
      "loss": 0.3445,
      "step": 2064
    },
    {
      "epoch": 0.3304,
      "grad_norm": 0.13481080532073975,
      "learning_rate": 0.0001,
      "loss": 0.3526,
      "step": 2065
    },
    {
      "epoch": 0.33056,
      "grad_norm": 0.15991123020648956,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2066
    },
    {
      "epoch": 0.33072,
      "grad_norm": 0.1643144190311432,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2067
    },
    {
      "epoch": 0.33088,
      "grad_norm": 0.14273646473884583,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2068
    },
    {
      "epoch": 0.33104,
      "grad_norm": 0.13984788954257965,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 2069
    },
    {
      "epoch": 0.3312,
      "grad_norm": 0.15663887560367584,
      "learning_rate": 0.0001,
      "loss": 0.3495,
      "step": 2070
    },
    {
      "epoch": 0.33136,
      "grad_norm": 0.13121268153190613,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2071
    },
    {
      "epoch": 0.33152,
      "grad_norm": 0.14454397559165955,
      "learning_rate": 0.0001,
      "loss": 0.3493,
      "step": 2072
    },
    {
      "epoch": 0.33168,
      "grad_norm": 0.1636456698179245,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 2073
    },
    {
      "epoch": 0.33184,
      "grad_norm": 0.1710740178823471,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 2074
    },
    {
      "epoch": 0.332,
      "grad_norm": 0.1388443410396576,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2075
    },
    {
      "epoch": 0.33216,
      "grad_norm": 0.1240362823009491,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 2076
    },
    {
      "epoch": 0.33232,
      "grad_norm": 0.179491326212883,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2077
    },
    {
      "epoch": 0.33248,
      "grad_norm": 0.16107277572155,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2078
    },
    {
      "epoch": 0.33264,
      "grad_norm": 0.14369158446788788,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2079
    },
    {
      "epoch": 0.3328,
      "grad_norm": 0.15765643119812012,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2080
    },
    {
      "epoch": 0.33296,
      "grad_norm": 0.22739626467227936,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 2081
    },
    {
      "epoch": 0.33312,
      "grad_norm": 0.12264184653759003,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2082
    },
    {
      "epoch": 0.33328,
      "grad_norm": 0.1923135221004486,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 2083
    },
    {
      "epoch": 0.33344,
      "grad_norm": 0.1165337860584259,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2084
    },
    {
      "epoch": 0.3336,
      "grad_norm": 0.12699489295482635,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2085
    },
    {
      "epoch": 0.33376,
      "grad_norm": 0.12349903583526611,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2086
    },
    {
      "epoch": 0.33392,
      "grad_norm": 0.14218196272850037,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 2087
    },
    {
      "epoch": 0.33408,
      "grad_norm": 0.11438590288162231,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2088
    },
    {
      "epoch": 0.33424,
      "grad_norm": 0.20096455514431,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2089
    },
    {
      "epoch": 0.3344,
      "grad_norm": 0.12109260261058807,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 2090
    },
    {
      "epoch": 0.33456,
      "grad_norm": 0.11501003056764603,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 2091
    },
    {
      "epoch": 0.33472,
      "grad_norm": 0.17206397652626038,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2092
    },
    {
      "epoch": 0.33488,
      "grad_norm": 0.13904397189617157,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2093
    },
    {
      "epoch": 0.33504,
      "grad_norm": 0.11012644320726395,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2094
    },
    {
      "epoch": 0.3352,
      "grad_norm": 0.11551656574010849,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2095
    },
    {
      "epoch": 0.33536,
      "grad_norm": 0.1322774887084961,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 2096
    },
    {
      "epoch": 0.33552,
      "grad_norm": 0.15912674367427826,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 2097
    },
    {
      "epoch": 0.33568,
      "grad_norm": 0.1120598167181015,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2098
    },
    {
      "epoch": 0.33584,
      "grad_norm": 0.11317389458417892,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2099
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.11966896057128906,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2100
    },
    {
      "epoch": 0.336,
      "eval_train_accuracy": 0.9718,
      "eval_train_loss": 0.33440065383911133,
      "eval_train_runtime": 4.4768,
      "eval_train_samples_per_second": 1116.881,
      "eval_train_steps_per_second": 14.073,
      "step": 2100
    },
    {
      "epoch": 0.336,
      "eval_test_accuracy": 0.973,
      "eval_test_loss": 0.33280932903289795,
      "eval_test_runtime": 4.6477,
      "eval_test_samples_per_second": 1075.805,
      "eval_test_steps_per_second": 13.555,
      "step": 2100
    },
    {
      "epoch": 0.33616,
      "grad_norm": 0.13122838735580444,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2101
    },
    {
      "epoch": 0.33632,
      "grad_norm": 0.1352715790271759,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 2102
    },
    {
      "epoch": 0.33648,
      "grad_norm": 0.12918443977832794,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 2103
    },
    {
      "epoch": 0.33664,
      "grad_norm": 0.14152444899082184,
      "learning_rate": 0.0001,
      "loss": 0.3449,
      "step": 2104
    },
    {
      "epoch": 0.3368,
      "grad_norm": 0.13926617801189423,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 2105
    },
    {
      "epoch": 0.33696,
      "grad_norm": 0.1091560423374176,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2106
    },
    {
      "epoch": 0.33712,
      "grad_norm": 0.12698011100292206,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2107
    },
    {
      "epoch": 0.33728,
      "grad_norm": 0.14866146445274353,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2108
    },
    {
      "epoch": 0.33744,
      "grad_norm": 0.12920504808425903,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2109
    },
    {
      "epoch": 0.3376,
      "grad_norm": 0.13200895488262177,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 2110
    },
    {
      "epoch": 0.33776,
      "grad_norm": 0.11517700552940369,
      "learning_rate": 0.0001,
      "loss": 0.3483,
      "step": 2111
    },
    {
      "epoch": 0.33792,
      "grad_norm": 0.1050090342760086,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 2112
    },
    {
      "epoch": 0.33808,
      "grad_norm": 0.16749440133571625,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2113
    },
    {
      "epoch": 0.33824,
      "grad_norm": 0.1038680225610733,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 2114
    },
    {
      "epoch": 0.3384,
      "grad_norm": 0.10468362271785736,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2115
    },
    {
      "epoch": 0.33856,
      "grad_norm": 0.1340859830379486,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2116
    },
    {
      "epoch": 0.33872,
      "grad_norm": 0.14604529738426208,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2117
    },
    {
      "epoch": 0.33888,
      "grad_norm": 0.12674380838871002,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 2118
    },
    {
      "epoch": 0.33904,
      "grad_norm": 0.13237974047660828,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 2119
    },
    {
      "epoch": 0.3392,
      "grad_norm": 0.111565962433815,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2120
    },
    {
      "epoch": 0.33936,
      "grad_norm": 0.10886779427528381,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 2121
    },
    {
      "epoch": 0.33952,
      "grad_norm": 0.11970680952072144,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 2122
    },
    {
      "epoch": 0.33968,
      "grad_norm": 0.16848550736904144,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2123
    },
    {
      "epoch": 0.33984,
      "grad_norm": 0.10393907129764557,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2124
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.10789991170167923,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2125
    },
    {
      "epoch": 0.34016,
      "grad_norm": 0.120583675801754,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2126
    },
    {
      "epoch": 0.34032,
      "grad_norm": 0.11947911232709885,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 2127
    },
    {
      "epoch": 0.34048,
      "grad_norm": 0.1485174596309662,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 2128
    },
    {
      "epoch": 0.34064,
      "grad_norm": 0.10695909708738327,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2129
    },
    {
      "epoch": 0.3408,
      "grad_norm": 0.1106855720281601,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2130
    },
    {
      "epoch": 0.34096,
      "grad_norm": 0.13835960626602173,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 2131
    },
    {
      "epoch": 0.34112,
      "grad_norm": 0.16339385509490967,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 2132
    },
    {
      "epoch": 0.34128,
      "grad_norm": 0.14275522530078888,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 2133
    },
    {
      "epoch": 0.34144,
      "grad_norm": 0.12276598066091537,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2134
    },
    {
      "epoch": 0.3416,
      "grad_norm": 0.11281758546829224,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2135
    },
    {
      "epoch": 0.34176,
      "grad_norm": 0.13303272426128387,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 2136
    },
    {
      "epoch": 0.34192,
      "grad_norm": 0.18321023881435394,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 2137
    },
    {
      "epoch": 0.34208,
      "grad_norm": 0.11641106754541397,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2138
    },
    {
      "epoch": 0.34224,
      "grad_norm": 0.1020195409655571,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 2139
    },
    {
      "epoch": 0.3424,
      "grad_norm": 0.13080653548240662,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2140
    },
    {
      "epoch": 0.34256,
      "grad_norm": 0.11846966296434402,
      "learning_rate": 0.0001,
      "loss": 0.3563,
      "step": 2141
    },
    {
      "epoch": 0.34272,
      "grad_norm": 0.11261099576950073,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2142
    },
    {
      "epoch": 0.34288,
      "grad_norm": 0.13301126658916473,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2143
    },
    {
      "epoch": 0.34304,
      "grad_norm": 0.15358275175094604,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 2144
    },
    {
      "epoch": 0.3432,
      "grad_norm": 0.11578414589166641,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2145
    },
    {
      "epoch": 0.34336,
      "grad_norm": 0.1128377914428711,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2146
    },
    {
      "epoch": 0.34352,
      "grad_norm": 0.15102402865886688,
      "learning_rate": 0.0001,
      "loss": 0.3498,
      "step": 2147
    },
    {
      "epoch": 0.34368,
      "grad_norm": 0.1271449327468872,
      "learning_rate": 0.0001,
      "loss": 0.3395,
      "step": 2148
    },
    {
      "epoch": 0.34384,
      "grad_norm": 0.1214861050248146,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2149
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.1779819130897522,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2150
    },
    {
      "epoch": 0.34416,
      "grad_norm": 0.1509215086698532,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 2151
    },
    {
      "epoch": 0.34432,
      "grad_norm": 0.15565086901187897,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 2152
    },
    {
      "epoch": 0.34448,
      "grad_norm": 0.1487254500389099,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 2153
    },
    {
      "epoch": 0.34464,
      "grad_norm": 0.12302403151988983,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2154
    },
    {
      "epoch": 0.3448,
      "grad_norm": 0.12224006652832031,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2155
    },
    {
      "epoch": 0.34496,
      "grad_norm": 0.12369617074728012,
      "learning_rate": 0.0001,
      "loss": 0.3529,
      "step": 2156
    },
    {
      "epoch": 0.34512,
      "grad_norm": 0.11315473169088364,
      "learning_rate": 0.0001,
      "loss": 0.3419,
      "step": 2157
    },
    {
      "epoch": 0.34528,
      "grad_norm": 0.10507918149232864,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2158
    },
    {
      "epoch": 0.34544,
      "grad_norm": 0.13831695914268494,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 2159
    },
    {
      "epoch": 0.3456,
      "grad_norm": 0.1636006385087967,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2160
    },
    {
      "epoch": 0.34576,
      "grad_norm": 0.11748452484607697,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2161
    },
    {
      "epoch": 0.34592,
      "grad_norm": 0.11302652209997177,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2162
    },
    {
      "epoch": 0.34608,
      "grad_norm": 0.15463979542255402,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2163
    },
    {
      "epoch": 0.34624,
      "grad_norm": 0.11788638681173325,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 2164
    },
    {
      "epoch": 0.3464,
      "grad_norm": 0.11179991066455841,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 2165
    },
    {
      "epoch": 0.34656,
      "grad_norm": 0.11935967206954956,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2166
    },
    {
      "epoch": 0.34672,
      "grad_norm": 0.13040074706077576,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2167
    },
    {
      "epoch": 0.34688,
      "grad_norm": 0.136619433760643,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 2168
    },
    {
      "epoch": 0.34704,
      "grad_norm": 0.16382940113544464,
      "learning_rate": 0.0001,
      "loss": 0.3537,
      "step": 2169
    },
    {
      "epoch": 0.3472,
      "grad_norm": 0.1242077425122261,
      "learning_rate": 0.0001,
      "loss": 0.3506,
      "step": 2170
    },
    {
      "epoch": 0.34736,
      "grad_norm": 0.14422234892845154,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 2171
    },
    {
      "epoch": 0.34752,
      "grad_norm": 0.13775022327899933,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 2172
    },
    {
      "epoch": 0.34768,
      "grad_norm": 0.16705065965652466,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2173
    },
    {
      "epoch": 0.34784,
      "grad_norm": 0.12422212958335876,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 2174
    },
    {
      "epoch": 0.348,
      "grad_norm": 0.2732580602169037,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2175
    },
    {
      "epoch": 0.34816,
      "grad_norm": 0.18051818013191223,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2176
    },
    {
      "epoch": 0.34832,
      "grad_norm": 0.2617289125919342,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 2177
    },
    {
      "epoch": 0.34848,
      "grad_norm": 0.12946313619613647,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 2178
    },
    {
      "epoch": 0.34864,
      "grad_norm": 0.2534182667732239,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 2179
    },
    {
      "epoch": 0.3488,
      "grad_norm": 0.15543456375598907,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2180
    },
    {
      "epoch": 0.34896,
      "grad_norm": 0.13968637585639954,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2181
    },
    {
      "epoch": 0.34912,
      "grad_norm": 0.12872105836868286,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2182
    },
    {
      "epoch": 0.34928,
      "grad_norm": 0.14310532808303833,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2183
    },
    {
      "epoch": 0.34944,
      "grad_norm": 0.17509996891021729,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2184
    },
    {
      "epoch": 0.3496,
      "grad_norm": 0.25451603531837463,
      "learning_rate": 0.0001,
      "loss": 0.3403,
      "step": 2185
    },
    {
      "epoch": 0.34976,
      "grad_norm": 0.15366049110889435,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2186
    },
    {
      "epoch": 0.34992,
      "grad_norm": 0.18533529341220856,
      "learning_rate": 0.0001,
      "loss": 0.3465,
      "step": 2187
    },
    {
      "epoch": 0.35008,
      "grad_norm": 0.1677788347005844,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2188
    },
    {
      "epoch": 0.35024,
      "grad_norm": 0.175576314330101,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 2189
    },
    {
      "epoch": 0.3504,
      "grad_norm": 0.17196577787399292,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 2190
    },
    {
      "epoch": 0.35056,
      "grad_norm": 0.13490688800811768,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 2191
    },
    {
      "epoch": 0.35072,
      "grad_norm": 0.14712952077388763,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 2192
    },
    {
      "epoch": 0.35088,
      "grad_norm": 0.23867890238761902,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2193
    },
    {
      "epoch": 0.35104,
      "grad_norm": 0.18601414561271667,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2194
    },
    {
      "epoch": 0.3512,
      "grad_norm": 0.1782335638999939,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2195
    },
    {
      "epoch": 0.35136,
      "grad_norm": 0.12770159542560577,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2196
    },
    {
      "epoch": 0.35152,
      "grad_norm": 0.29139652848243713,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2197
    },
    {
      "epoch": 0.35168,
      "grad_norm": 0.1204923540353775,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 2198
    },
    {
      "epoch": 0.35184,
      "grad_norm": 0.1621198058128357,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2199
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.13689103722572327,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 2200
    },
    {
      "epoch": 0.352,
      "eval_train_accuracy": 0.9888,
      "eval_train_loss": 0.33421599864959717,
      "eval_train_runtime": 4.6599,
      "eval_train_samples_per_second": 1072.988,
      "eval_train_steps_per_second": 13.52,
      "step": 2200
    },
    {
      "epoch": 0.352,
      "eval_test_accuracy": 0.988,
      "eval_test_loss": 0.33261990547180176,
      "eval_test_runtime": 4.5071,
      "eval_test_samples_per_second": 1109.366,
      "eval_test_steps_per_second": 13.978,
      "step": 2200
    },
    {
      "epoch": 0.35216,
      "grad_norm": 0.14863941073417664,
      "learning_rate": 0.0001,
      "loss": 0.3469,
      "step": 2201
    },
    {
      "epoch": 0.35232,
      "grad_norm": 0.1431974470615387,
      "learning_rate": 0.0001,
      "loss": 0.3531,
      "step": 2202
    },
    {
      "epoch": 0.35248,
      "grad_norm": 0.13162314891815186,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 2203
    },
    {
      "epoch": 0.35264,
      "grad_norm": 0.1274530589580536,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 2204
    },
    {
      "epoch": 0.3528,
      "grad_norm": 0.1677848845720291,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2205
    },
    {
      "epoch": 0.35296,
      "grad_norm": 0.18399262428283691,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 2206
    },
    {
      "epoch": 0.35312,
      "grad_norm": 0.14095045626163483,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 2207
    },
    {
      "epoch": 0.35328,
      "grad_norm": 0.15605007112026215,
      "learning_rate": 0.0001,
      "loss": 0.3472,
      "step": 2208
    },
    {
      "epoch": 0.35344,
      "grad_norm": 0.13476458191871643,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2209
    },
    {
      "epoch": 0.3536,
      "grad_norm": 0.13794955611228943,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 2210
    },
    {
      "epoch": 0.35376,
      "grad_norm": 0.19383195042610168,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2211
    },
    {
      "epoch": 0.35392,
      "grad_norm": 0.13070610165596008,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 2212
    },
    {
      "epoch": 0.35408,
      "grad_norm": 0.143936887383461,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 2213
    },
    {
      "epoch": 0.35424,
      "grad_norm": 0.12562738358974457,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2214
    },
    {
      "epoch": 0.3544,
      "grad_norm": 0.14067424833774567,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2215
    },
    {
      "epoch": 0.35456,
      "grad_norm": 0.14934055507183075,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2216
    },
    {
      "epoch": 0.35472,
      "grad_norm": 0.137632355093956,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 2217
    },
    {
      "epoch": 0.35488,
      "grad_norm": 0.17933914065361023,
      "learning_rate": 0.0001,
      "loss": 0.3537,
      "step": 2218
    },
    {
      "epoch": 0.35504,
      "grad_norm": 0.16221193969249725,
      "learning_rate": 0.0001,
      "loss": 0.3507,
      "step": 2219
    },
    {
      "epoch": 0.3552,
      "grad_norm": 0.11655370891094208,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2220
    },
    {
      "epoch": 0.35536,
      "grad_norm": 0.18098559975624084,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 2221
    },
    {
      "epoch": 0.35552,
      "grad_norm": 0.1446317583322525,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2222
    },
    {
      "epoch": 0.35568,
      "grad_norm": 0.1409229189157486,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 2223
    },
    {
      "epoch": 0.35584,
      "grad_norm": 0.1410222053527832,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 2224
    },
    {
      "epoch": 0.356,
      "grad_norm": 0.13709081709384918,
      "learning_rate": 0.0001,
      "loss": 0.3534,
      "step": 2225
    },
    {
      "epoch": 0.35616,
      "grad_norm": 0.11434084177017212,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 2226
    },
    {
      "epoch": 0.35632,
      "grad_norm": 0.12813688814640045,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2227
    },
    {
      "epoch": 0.35648,
      "grad_norm": 0.14445848762989044,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 2228
    },
    {
      "epoch": 0.35664,
      "grad_norm": 0.11433245986700058,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 2229
    },
    {
      "epoch": 0.3568,
      "grad_norm": 0.1169535294175148,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2230
    },
    {
      "epoch": 0.35696,
      "grad_norm": 0.10979533940553665,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2231
    },
    {
      "epoch": 0.35712,
      "grad_norm": 0.18176934123039246,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 2232
    },
    {
      "epoch": 0.35728,
      "grad_norm": 0.14468230307102203,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2233
    },
    {
      "epoch": 0.35744,
      "grad_norm": 0.11509398370981216,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2234
    },
    {
      "epoch": 0.3576,
      "grad_norm": 0.1801479011774063,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2235
    },
    {
      "epoch": 0.35776,
      "grad_norm": 0.11336083710193634,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2236
    },
    {
      "epoch": 0.35792,
      "grad_norm": 0.13498489558696747,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 2237
    },
    {
      "epoch": 0.35808,
      "grad_norm": 0.1527697592973709,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2238
    },
    {
      "epoch": 0.35824,
      "grad_norm": 0.1440800577402115,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2239
    },
    {
      "epoch": 0.3584,
      "grad_norm": 0.14776112139225006,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2240
    },
    {
      "epoch": 0.35856,
      "grad_norm": 0.11326829344034195,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2241
    },
    {
      "epoch": 0.35872,
      "grad_norm": 0.1350383162498474,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 2242
    },
    {
      "epoch": 0.35888,
      "grad_norm": 0.12452146410942078,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 2243
    },
    {
      "epoch": 0.35904,
      "grad_norm": 0.11348546296358109,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2244
    },
    {
      "epoch": 0.3592,
      "grad_norm": 0.11008264124393463,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2245
    },
    {
      "epoch": 0.35936,
      "grad_norm": 0.1674875169992447,
      "learning_rate": 0.0001,
      "loss": 0.3454,
      "step": 2246
    },
    {
      "epoch": 0.35952,
      "grad_norm": 0.13677746057510376,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2247
    },
    {
      "epoch": 0.35968,
      "grad_norm": 0.11834321916103363,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2248
    },
    {
      "epoch": 0.35984,
      "grad_norm": 0.1165311187505722,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 2249
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.1604948192834854,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2250
    },
    {
      "epoch": 0.36016,
      "grad_norm": 0.1291571408510208,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2251
    },
    {
      "epoch": 0.36032,
      "grad_norm": 0.12675392627716064,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2252
    },
    {
      "epoch": 0.36048,
      "grad_norm": 0.11967787146568298,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2253
    },
    {
      "epoch": 0.36064,
      "grad_norm": 0.1530894637107849,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 2254
    },
    {
      "epoch": 0.3608,
      "grad_norm": 0.10324247926473618,
      "learning_rate": 0.0001,
      "loss": 0.3462,
      "step": 2255
    },
    {
      "epoch": 0.36096,
      "grad_norm": 0.1186964213848114,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2256
    },
    {
      "epoch": 0.36112,
      "grad_norm": 0.12769439816474915,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2257
    },
    {
      "epoch": 0.36128,
      "grad_norm": 0.13195328414440155,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2258
    },
    {
      "epoch": 0.36144,
      "grad_norm": 0.12370158731937408,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2259
    },
    {
      "epoch": 0.3616,
      "grad_norm": 0.13347966969013214,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 2260
    },
    {
      "epoch": 0.36176,
      "grad_norm": 0.12280620634555817,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2261
    },
    {
      "epoch": 0.36192,
      "grad_norm": 0.11921671777963638,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 2262
    },
    {
      "epoch": 0.36208,
      "grad_norm": 0.11048448085784912,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2263
    },
    {
      "epoch": 0.36224,
      "grad_norm": 0.13318657875061035,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 2264
    },
    {
      "epoch": 0.3624,
      "grad_norm": 0.11183520406484604,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 2265
    },
    {
      "epoch": 0.36256,
      "grad_norm": 0.11176886409521103,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2266
    },
    {
      "epoch": 0.36272,
      "grad_norm": 0.12288287281990051,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2267
    },
    {
      "epoch": 0.36288,
      "grad_norm": 0.11320103704929352,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2268
    },
    {
      "epoch": 0.36304,
      "grad_norm": 0.10298972576856613,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2269
    },
    {
      "epoch": 0.3632,
      "grad_norm": 0.12729354202747345,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2270
    },
    {
      "epoch": 0.36336,
      "grad_norm": 0.11902300268411636,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2271
    },
    {
      "epoch": 0.36352,
      "grad_norm": 0.11119633167982101,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2272
    },
    {
      "epoch": 0.36368,
      "grad_norm": 0.15880416333675385,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 2273
    },
    {
      "epoch": 0.36384,
      "grad_norm": 0.17441120743751526,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 2274
    },
    {
      "epoch": 0.364,
      "grad_norm": 0.11062929034233093,
      "learning_rate": 0.0001,
      "loss": 0.3467,
      "step": 2275
    },
    {
      "epoch": 0.36416,
      "grad_norm": 0.11443078517913818,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 2276
    },
    {
      "epoch": 0.36432,
      "grad_norm": 0.11267227679491043,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2277
    },
    {
      "epoch": 0.36448,
      "grad_norm": 0.1464885026216507,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 2278
    },
    {
      "epoch": 0.36464,
      "grad_norm": 0.11490672081708908,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2279
    },
    {
      "epoch": 0.3648,
      "grad_norm": 0.14002656936645508,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 2280
    },
    {
      "epoch": 0.36496,
      "grad_norm": 0.12059474736452103,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 2281
    },
    {
      "epoch": 0.36512,
      "grad_norm": 0.12109995633363724,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 2282
    },
    {
      "epoch": 0.36528,
      "grad_norm": 0.12038526684045792,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2283
    },
    {
      "epoch": 0.36544,
      "grad_norm": 0.09909594058990479,
      "learning_rate": 0.0001,
      "loss": 0.3389,
      "step": 2284
    },
    {
      "epoch": 0.3656,
      "grad_norm": 0.11061403155326843,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2285
    },
    {
      "epoch": 0.36576,
      "grad_norm": 0.11685793101787567,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 2286
    },
    {
      "epoch": 0.36592,
      "grad_norm": 0.11411169916391373,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 2287
    },
    {
      "epoch": 0.36608,
      "grad_norm": 0.12202207744121552,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 2288
    },
    {
      "epoch": 0.36624,
      "grad_norm": 0.12961387634277344,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2289
    },
    {
      "epoch": 0.3664,
      "grad_norm": 0.11599557846784592,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2290
    },
    {
      "epoch": 0.36656,
      "grad_norm": 0.12044933438301086,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 2291
    },
    {
      "epoch": 0.36672,
      "grad_norm": 0.10902368277311325,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2292
    },
    {
      "epoch": 0.36688,
      "grad_norm": 0.11093795299530029,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 2293
    },
    {
      "epoch": 0.36704,
      "grad_norm": 0.12294412404298782,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2294
    },
    {
      "epoch": 0.3672,
      "grad_norm": 0.10703908652067184,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2295
    },
    {
      "epoch": 0.36736,
      "grad_norm": 0.11816782504320145,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 2296
    },
    {
      "epoch": 0.36752,
      "grad_norm": 0.11249488592147827,
      "learning_rate": 0.0001,
      "loss": 0.3509,
      "step": 2297
    },
    {
      "epoch": 0.36768,
      "grad_norm": 0.11808251589536667,
      "learning_rate": 0.0001,
      "loss": 0.3433,
      "step": 2298
    },
    {
      "epoch": 0.36784,
      "grad_norm": 0.11558102071285248,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2299
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.12654529511928558,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 2300
    },
    {
      "epoch": 0.368,
      "eval_train_accuracy": 0.987,
      "eval_train_loss": 0.333007276058197,
      "eval_train_runtime": 4.6536,
      "eval_train_samples_per_second": 1074.428,
      "eval_train_steps_per_second": 13.538,
      "step": 2300
    },
    {
      "epoch": 0.368,
      "eval_test_accuracy": 0.9858,
      "eval_test_loss": 0.3316853642463684,
      "eval_test_runtime": 4.8565,
      "eval_test_samples_per_second": 1029.543,
      "eval_test_steps_per_second": 12.972,
      "step": 2300
    },
    {
      "epoch": 0.36816,
      "grad_norm": 0.1407136172056198,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2301
    },
    {
      "epoch": 0.36832,
      "grad_norm": 0.15359343588352203,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 2302
    },
    {
      "epoch": 0.36848,
      "grad_norm": 0.11838746070861816,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2303
    },
    {
      "epoch": 0.36864,
      "grad_norm": 0.10921959578990936,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2304
    },
    {
      "epoch": 0.3688,
      "grad_norm": 0.12704245746135712,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2305
    },
    {
      "epoch": 0.36896,
      "grad_norm": 0.1265593022108078,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2306
    },
    {
      "epoch": 0.36912,
      "grad_norm": 0.10095056891441345,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2307
    },
    {
      "epoch": 0.36928,
      "grad_norm": 0.12758868932724,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2308
    },
    {
      "epoch": 0.36944,
      "grad_norm": 0.19494584202766418,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2309
    },
    {
      "epoch": 0.3696,
      "grad_norm": 0.14610767364501953,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 2310
    },
    {
      "epoch": 0.36976,
      "grad_norm": 0.11971699446439743,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2311
    },
    {
      "epoch": 0.36992,
      "grad_norm": 0.14597094058990479,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2312
    },
    {
      "epoch": 0.37008,
      "grad_norm": 0.12948089838027954,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 2313
    },
    {
      "epoch": 0.37024,
      "grad_norm": 0.10194586217403412,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 2314
    },
    {
      "epoch": 0.3704,
      "grad_norm": 0.11630246043205261,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2315
    },
    {
      "epoch": 0.37056,
      "grad_norm": 0.10944101959466934,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 2316
    },
    {
      "epoch": 0.37072,
      "grad_norm": 0.1209668442606926,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2317
    },
    {
      "epoch": 0.37088,
      "grad_norm": 0.1073279082775116,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2318
    },
    {
      "epoch": 0.37104,
      "grad_norm": 0.14581097662448883,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 2319
    },
    {
      "epoch": 0.3712,
      "grad_norm": 0.11286572366952896,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2320
    },
    {
      "epoch": 0.37136,
      "grad_norm": 0.10353601723909378,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2321
    },
    {
      "epoch": 0.37152,
      "grad_norm": 0.12725652754306793,
      "learning_rate": 0.0001,
      "loss": 0.3399,
      "step": 2322
    },
    {
      "epoch": 0.37168,
      "grad_norm": 0.11308583617210388,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 2323
    },
    {
      "epoch": 0.37184,
      "grad_norm": 0.11825995147228241,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 2324
    },
    {
      "epoch": 0.372,
      "grad_norm": 0.13307806849479675,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 2325
    },
    {
      "epoch": 0.37216,
      "grad_norm": 0.17007721960544586,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2326
    },
    {
      "epoch": 0.37232,
      "grad_norm": 0.13152916729450226,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 2327
    },
    {
      "epoch": 0.37248,
      "grad_norm": 0.17180871963500977,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2328
    },
    {
      "epoch": 0.37264,
      "grad_norm": 0.11337634176015854,
      "learning_rate": 0.0001,
      "loss": 0.3586,
      "step": 2329
    },
    {
      "epoch": 0.3728,
      "grad_norm": 0.14273545145988464,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 2330
    },
    {
      "epoch": 0.37296,
      "grad_norm": 0.11169497668743134,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2331
    },
    {
      "epoch": 0.37312,
      "grad_norm": 0.1207062229514122,
      "learning_rate": 0.0001,
      "loss": 0.3545,
      "step": 2332
    },
    {
      "epoch": 0.37328,
      "grad_norm": 0.1354561299085617,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2333
    },
    {
      "epoch": 0.37344,
      "grad_norm": 0.1466568410396576,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2334
    },
    {
      "epoch": 0.3736,
      "grad_norm": 0.1684255599975586,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2335
    },
    {
      "epoch": 0.37376,
      "grad_norm": 0.15825970470905304,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2336
    },
    {
      "epoch": 0.37392,
      "grad_norm": 0.2207617312669754,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2337
    },
    {
      "epoch": 0.37408,
      "grad_norm": 0.25222235918045044,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2338
    },
    {
      "epoch": 0.37424,
      "grad_norm": 0.16075904667377472,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2339
    },
    {
      "epoch": 0.3744,
      "grad_norm": 0.13822823762893677,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2340
    },
    {
      "epoch": 0.37456,
      "grad_norm": 0.1332424134016037,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2341
    },
    {
      "epoch": 0.37472,
      "grad_norm": 0.1293170005083084,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2342
    },
    {
      "epoch": 0.37488,
      "grad_norm": 0.14636234939098358,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 2343
    },
    {
      "epoch": 0.37504,
      "grad_norm": 0.15917545557022095,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 2344
    },
    {
      "epoch": 0.3752,
      "grad_norm": 0.12526988983154297,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 2345
    },
    {
      "epoch": 0.37536,
      "grad_norm": 0.12352876365184784,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2346
    },
    {
      "epoch": 0.37552,
      "grad_norm": 0.10270701348781586,
      "learning_rate": 0.0001,
      "loss": 0.3457,
      "step": 2347
    },
    {
      "epoch": 0.37568,
      "grad_norm": 0.13445943593978882,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2348
    },
    {
      "epoch": 0.37584,
      "grad_norm": 0.12985028326511383,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 2349
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.14453370869159698,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2350
    },
    {
      "epoch": 0.37616,
      "grad_norm": 0.14378327131271362,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2351
    },
    {
      "epoch": 0.37632,
      "grad_norm": 0.18186718225479126,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2352
    },
    {
      "epoch": 0.37648,
      "grad_norm": 0.14823274314403534,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2353
    },
    {
      "epoch": 0.37664,
      "grad_norm": 0.18735486268997192,
      "learning_rate": 0.0001,
      "loss": 0.3453,
      "step": 2354
    },
    {
      "epoch": 0.3768,
      "grad_norm": 0.10908825695514679,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2355
    },
    {
      "epoch": 0.37696,
      "grad_norm": 0.12045900523662567,
      "learning_rate": 0.0001,
      "loss": 0.3511,
      "step": 2356
    },
    {
      "epoch": 0.37712,
      "grad_norm": 0.13485941290855408,
      "learning_rate": 0.0001,
      "loss": 0.3547,
      "step": 2357
    },
    {
      "epoch": 0.37728,
      "grad_norm": 0.12492062151432037,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 2358
    },
    {
      "epoch": 0.37744,
      "grad_norm": 0.12294743955135345,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 2359
    },
    {
      "epoch": 0.3776,
      "grad_norm": 0.17761777341365814,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2360
    },
    {
      "epoch": 0.37776,
      "grad_norm": 0.12077882885932922,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2361
    },
    {
      "epoch": 0.37792,
      "grad_norm": 0.12762387096881866,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 2362
    },
    {
      "epoch": 0.37808,
      "grad_norm": 0.15218022465705872,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2363
    },
    {
      "epoch": 0.37824,
      "grad_norm": 0.15957607328891754,
      "learning_rate": 0.0001,
      "loss": 0.3392,
      "step": 2364
    },
    {
      "epoch": 0.3784,
      "grad_norm": 0.1344740092754364,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2365
    },
    {
      "epoch": 0.37856,
      "grad_norm": 0.1179647147655487,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 2366
    },
    {
      "epoch": 0.37872,
      "grad_norm": 0.12647223472595215,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 2367
    },
    {
      "epoch": 0.37888,
      "grad_norm": 0.14646761119365692,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2368
    },
    {
      "epoch": 0.37904,
      "grad_norm": 0.21361786127090454,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 2369
    },
    {
      "epoch": 0.3792,
      "grad_norm": 0.11921711266040802,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2370
    },
    {
      "epoch": 0.37936,
      "grad_norm": 0.13744372129440308,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2371
    },
    {
      "epoch": 0.37952,
      "grad_norm": 0.19521084427833557,
      "learning_rate": 0.0001,
      "loss": 0.3508,
      "step": 2372
    },
    {
      "epoch": 0.37968,
      "grad_norm": 0.16737471520900726,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2373
    },
    {
      "epoch": 0.37984,
      "grad_norm": 0.13749544322490692,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2374
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.13378116488456726,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2375
    },
    {
      "epoch": 0.38016,
      "grad_norm": 0.14515072107315063,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2376
    },
    {
      "epoch": 0.38032,
      "grad_norm": 0.10696486383676529,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2377
    },
    {
      "epoch": 0.38048,
      "grad_norm": 0.1540810614824295,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2378
    },
    {
      "epoch": 0.38064,
      "grad_norm": 0.1145508661866188,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2379
    },
    {
      "epoch": 0.3808,
      "grad_norm": 0.11614459753036499,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2380
    },
    {
      "epoch": 0.38096,
      "grad_norm": 0.12143896520137787,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2381
    },
    {
      "epoch": 0.38112,
      "grad_norm": 0.17089009284973145,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 2382
    },
    {
      "epoch": 0.38128,
      "grad_norm": 0.11915769428014755,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 2383
    },
    {
      "epoch": 0.38144,
      "grad_norm": 0.13723613321781158,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 2384
    },
    {
      "epoch": 0.3816,
      "grad_norm": 0.12631914019584656,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 2385
    },
    {
      "epoch": 0.38176,
      "grad_norm": 0.12327449023723602,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 2386
    },
    {
      "epoch": 0.38192,
      "grad_norm": 0.1355346292257309,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2387
    },
    {
      "epoch": 0.38208,
      "grad_norm": 0.1300690919160843,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2388
    },
    {
      "epoch": 0.38224,
      "grad_norm": 0.12749922275543213,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 2389
    },
    {
      "epoch": 0.3824,
      "grad_norm": 0.11726424098014832,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 2390
    },
    {
      "epoch": 0.38256,
      "grad_norm": 0.13428270816802979,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2391
    },
    {
      "epoch": 0.38272,
      "grad_norm": 0.18666554987430573,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 2392
    },
    {
      "epoch": 0.38288,
      "grad_norm": 0.1035359725356102,
      "learning_rate": 0.0001,
      "loss": 0.3432,
      "step": 2393
    },
    {
      "epoch": 0.38304,
      "grad_norm": 0.10927893966436386,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2394
    },
    {
      "epoch": 0.3832,
      "grad_norm": 0.174956813454628,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2395
    },
    {
      "epoch": 0.38336,
      "grad_norm": 0.13123053312301636,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2396
    },
    {
      "epoch": 0.38352,
      "grad_norm": 0.12042754143476486,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 2397
    },
    {
      "epoch": 0.38368,
      "grad_norm": 0.11362358182668686,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2398
    },
    {
      "epoch": 0.38384,
      "grad_norm": 0.12338445335626602,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2399
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.11220209300518036,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 2400
    },
    {
      "epoch": 0.384,
      "eval_train_accuracy": 0.9886,
      "eval_train_loss": 0.3322828710079193,
      "eval_train_runtime": 4.6546,
      "eval_train_samples_per_second": 1074.199,
      "eval_train_steps_per_second": 13.535,
      "step": 2400
    },
    {
      "epoch": 0.384,
      "eval_test_accuracy": 0.9872,
      "eval_test_loss": 0.33065247535705566,
      "eval_test_runtime": 4.4842,
      "eval_test_samples_per_second": 1115.022,
      "eval_test_steps_per_second": 14.049,
      "step": 2400
    },
    {
      "epoch": 0.38416,
      "grad_norm": 0.14744779467582703,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2401
    },
    {
      "epoch": 0.38432,
      "grad_norm": 0.14422276616096497,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 2402
    },
    {
      "epoch": 0.38448,
      "grad_norm": 0.10983643680810928,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 2403
    },
    {
      "epoch": 0.38464,
      "grad_norm": 0.13403290510177612,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2404
    },
    {
      "epoch": 0.3848,
      "grad_norm": 0.17378968000411987,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2405
    },
    {
      "epoch": 0.38496,
      "grad_norm": 0.16850152611732483,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2406
    },
    {
      "epoch": 0.38512,
      "grad_norm": 0.11192064732313156,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2407
    },
    {
      "epoch": 0.38528,
      "grad_norm": 0.14271889626979828,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2408
    },
    {
      "epoch": 0.38544,
      "grad_norm": 0.1295255720615387,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2409
    },
    {
      "epoch": 0.3856,
      "grad_norm": 0.1430850625038147,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2410
    },
    {
      "epoch": 0.38576,
      "grad_norm": 0.16691942512989044,
      "learning_rate": 0.0001,
      "loss": 0.3375,
      "step": 2411
    },
    {
      "epoch": 0.38592,
      "grad_norm": 0.12821927666664124,
      "learning_rate": 0.0001,
      "loss": 0.3547,
      "step": 2412
    },
    {
      "epoch": 0.38608,
      "grad_norm": 0.1297873854637146,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 2413
    },
    {
      "epoch": 0.38624,
      "grad_norm": 0.1691918969154358,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 2414
    },
    {
      "epoch": 0.3864,
      "grad_norm": 0.16898834705352783,
      "learning_rate": 0.0001,
      "loss": 0.3558,
      "step": 2415
    },
    {
      "epoch": 0.38656,
      "grad_norm": 0.14273573458194733,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2416
    },
    {
      "epoch": 0.38672,
      "grad_norm": 0.10734619200229645,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 2417
    },
    {
      "epoch": 0.38688,
      "grad_norm": 0.13137297332286835,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2418
    },
    {
      "epoch": 0.38704,
      "grad_norm": 0.11246422678232193,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2419
    },
    {
      "epoch": 0.3872,
      "grad_norm": 0.11782930046319962,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 2420
    },
    {
      "epoch": 0.38736,
      "grad_norm": 0.1396922916173935,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 2421
    },
    {
      "epoch": 0.38752,
      "grad_norm": 0.11972294002771378,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2422
    },
    {
      "epoch": 0.38768,
      "grad_norm": 0.1421908438205719,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2423
    },
    {
      "epoch": 0.38784,
      "grad_norm": 0.125197172164917,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2424
    },
    {
      "epoch": 0.388,
      "grad_norm": 0.1495179533958435,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2425
    },
    {
      "epoch": 0.38816,
      "grad_norm": 0.16201773285865784,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2426
    },
    {
      "epoch": 0.38832,
      "grad_norm": 0.15751349925994873,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 2427
    },
    {
      "epoch": 0.38848,
      "grad_norm": 0.11603415757417679,
      "learning_rate": 0.0001,
      "loss": 0.3384,
      "step": 2428
    },
    {
      "epoch": 0.38864,
      "grad_norm": 0.1122913733124733,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2429
    },
    {
      "epoch": 0.3888,
      "grad_norm": 0.14626559615135193,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2430
    },
    {
      "epoch": 0.38896,
      "grad_norm": 0.13613665103912354,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2431
    },
    {
      "epoch": 0.38912,
      "grad_norm": 0.20032374560832977,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2432
    },
    {
      "epoch": 0.38928,
      "grad_norm": 0.10803280770778656,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 2433
    },
    {
      "epoch": 0.38944,
      "grad_norm": 0.1643204241991043,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2434
    },
    {
      "epoch": 0.3896,
      "grad_norm": 0.1412106156349182,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2435
    },
    {
      "epoch": 0.38976,
      "grad_norm": 0.13901393115520477,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2436
    },
    {
      "epoch": 0.38992,
      "grad_norm": 0.12145905941724777,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 2437
    },
    {
      "epoch": 0.39008,
      "grad_norm": 0.13698557019233704,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 2438
    },
    {
      "epoch": 0.39024,
      "grad_norm": 0.14141273498535156,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2439
    },
    {
      "epoch": 0.3904,
      "grad_norm": 0.1548732966184616,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2440
    },
    {
      "epoch": 0.39056,
      "grad_norm": 0.13906548917293549,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2441
    },
    {
      "epoch": 0.39072,
      "grad_norm": 0.15619628131389618,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 2442
    },
    {
      "epoch": 0.39088,
      "grad_norm": 0.13027805089950562,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2443
    },
    {
      "epoch": 0.39104,
      "grad_norm": 0.13013112545013428,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 2444
    },
    {
      "epoch": 0.3912,
      "grad_norm": 0.1519942730665207,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2445
    },
    {
      "epoch": 0.39136,
      "grad_norm": 0.11953707039356232,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2446
    },
    {
      "epoch": 0.39152,
      "grad_norm": 0.11008085310459137,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2447
    },
    {
      "epoch": 0.39168,
      "grad_norm": 0.11422401666641235,
      "learning_rate": 0.0001,
      "loss": 0.3484,
      "step": 2448
    },
    {
      "epoch": 0.39184,
      "grad_norm": 0.12077146023511887,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 2449
    },
    {
      "epoch": 0.392,
      "grad_norm": 0.1263134926557541,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2450
    },
    {
      "epoch": 0.39216,
      "grad_norm": 0.16046084463596344,
      "learning_rate": 0.0001,
      "loss": 0.3459,
      "step": 2451
    },
    {
      "epoch": 0.39232,
      "grad_norm": 0.14450637996196747,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 2452
    },
    {
      "epoch": 0.39248,
      "grad_norm": 0.11931217461824417,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2453
    },
    {
      "epoch": 0.39264,
      "grad_norm": 0.14329637587070465,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 2454
    },
    {
      "epoch": 0.3928,
      "grad_norm": 0.14433979988098145,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 2455
    },
    {
      "epoch": 0.39296,
      "grad_norm": 0.11715363711118698,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2456
    },
    {
      "epoch": 0.39312,
      "grad_norm": 0.11519450694322586,
      "learning_rate": 0.0001,
      "loss": 0.341,
      "step": 2457
    },
    {
      "epoch": 0.39328,
      "grad_norm": 0.10800262540578842,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2458
    },
    {
      "epoch": 0.39344,
      "grad_norm": 0.11842767149209976,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2459
    },
    {
      "epoch": 0.3936,
      "grad_norm": 0.12543338537216187,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2460
    },
    {
      "epoch": 0.39376,
      "grad_norm": 0.1354420930147171,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 2461
    },
    {
      "epoch": 0.39392,
      "grad_norm": 0.1379251331090927,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2462
    },
    {
      "epoch": 0.39408,
      "grad_norm": 0.12495768070220947,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 2463
    },
    {
      "epoch": 0.39424,
      "grad_norm": 0.20785871148109436,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2464
    },
    {
      "epoch": 0.3944,
      "grad_norm": 0.15117323398590088,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2465
    },
    {
      "epoch": 0.39456,
      "grad_norm": 0.12387955188751221,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 2466
    },
    {
      "epoch": 0.39472,
      "grad_norm": 0.15864023566246033,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 2467
    },
    {
      "epoch": 0.39488,
      "grad_norm": 0.15213295817375183,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2468
    },
    {
      "epoch": 0.39504,
      "grad_norm": 0.13219167292118073,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2469
    },
    {
      "epoch": 0.3952,
      "grad_norm": 0.14348942041397095,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 2470
    },
    {
      "epoch": 0.39536,
      "grad_norm": 0.12973102927207947,
      "learning_rate": 0.0001,
      "loss": 0.3532,
      "step": 2471
    },
    {
      "epoch": 0.39552,
      "grad_norm": 0.11771857738494873,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2472
    },
    {
      "epoch": 0.39568,
      "grad_norm": 0.12933358550071716,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 2473
    },
    {
      "epoch": 0.39584,
      "grad_norm": 0.11953598260879517,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2474
    },
    {
      "epoch": 0.396,
      "grad_norm": 0.13399623334407806,
      "learning_rate": 0.0001,
      "loss": 0.3467,
      "step": 2475
    },
    {
      "epoch": 0.39616,
      "grad_norm": 0.20765168964862823,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2476
    },
    {
      "epoch": 0.39632,
      "grad_norm": 0.1327570676803589,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2477
    },
    {
      "epoch": 0.39648,
      "grad_norm": 0.1245938166975975,
      "learning_rate": 0.0001,
      "loss": 0.3421,
      "step": 2478
    },
    {
      "epoch": 0.39664,
      "grad_norm": 0.131271094083786,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 2479
    },
    {
      "epoch": 0.3968,
      "grad_norm": 0.145147442817688,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 2480
    },
    {
      "epoch": 0.39696,
      "grad_norm": 0.13808272778987885,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2481
    },
    {
      "epoch": 0.39712,
      "grad_norm": 0.11407746374607086,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2482
    },
    {
      "epoch": 0.39728,
      "grad_norm": 0.1263829618692398,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2483
    },
    {
      "epoch": 0.39744,
      "grad_norm": 0.15280641615390778,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 2484
    },
    {
      "epoch": 0.3976,
      "grad_norm": 0.14505594968795776,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2485
    },
    {
      "epoch": 0.39776,
      "grad_norm": 0.24907207489013672,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2486
    },
    {
      "epoch": 0.39792,
      "grad_norm": 0.12879328429698944,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2487
    },
    {
      "epoch": 0.39808,
      "grad_norm": 0.11581971496343613,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2488
    },
    {
      "epoch": 0.39824,
      "grad_norm": 0.10834292322397232,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2489
    },
    {
      "epoch": 0.3984,
      "grad_norm": 0.38530874252319336,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2490
    },
    {
      "epoch": 0.39856,
      "grad_norm": 0.14020664989948273,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 2491
    },
    {
      "epoch": 0.39872,
      "grad_norm": 0.14434275031089783,
      "learning_rate": 0.0001,
      "loss": 0.3439,
      "step": 2492
    },
    {
      "epoch": 0.39888,
      "grad_norm": 0.18618476390838623,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2493
    },
    {
      "epoch": 0.39904,
      "grad_norm": 0.48596134781837463,
      "learning_rate": 0.0001,
      "loss": 0.3442,
      "step": 2494
    },
    {
      "epoch": 0.3992,
      "grad_norm": 0.2893610894680023,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 2495
    },
    {
      "epoch": 0.39936,
      "grad_norm": 0.3512455224990845,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 2496
    },
    {
      "epoch": 0.39952,
      "grad_norm": 0.5084971189498901,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 2497
    },
    {
      "epoch": 0.39968,
      "grad_norm": 0.17238594591617584,
      "learning_rate": 0.0001,
      "loss": 0.3412,
      "step": 2498
    },
    {
      "epoch": 0.39984,
      "grad_norm": 0.30822739005088806,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 2499
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.28402912616729736,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2500
    },
    {
      "epoch": 0.4,
      "eval_train_accuracy": 0.9916,
      "eval_train_loss": 0.33132174611091614,
      "eval_train_runtime": 4.8738,
      "eval_train_samples_per_second": 1025.885,
      "eval_train_steps_per_second": 12.926,
      "step": 2500
    },
    {
      "epoch": 0.4,
      "eval_test_accuracy": 0.994,
      "eval_test_loss": 0.3299676179885864,
      "eval_test_runtime": 4.5948,
      "eval_test_samples_per_second": 1088.177,
      "eval_test_steps_per_second": 13.711,
      "step": 2500
    },
    {
      "epoch": 0.40016,
      "grad_norm": 0.26852577924728394,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2501
    },
    {
      "epoch": 0.40032,
      "grad_norm": 0.28414615988731384,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2502
    },
    {
      "epoch": 0.40048,
      "grad_norm": 0.1827961504459381,
      "learning_rate": 0.0001,
      "loss": 0.3398,
      "step": 2503
    },
    {
      "epoch": 0.40064,
      "grad_norm": 0.17578257620334625,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2504
    },
    {
      "epoch": 0.4008,
      "grad_norm": 0.22047947347164154,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2505
    },
    {
      "epoch": 0.40096,
      "grad_norm": 0.2122969925403595,
      "learning_rate": 0.0001,
      "loss": 0.3443,
      "step": 2506
    },
    {
      "epoch": 0.40112,
      "grad_norm": 0.17733074724674225,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 2507
    },
    {
      "epoch": 0.40128,
      "grad_norm": 0.15052415430545807,
      "learning_rate": 0.0001,
      "loss": 0.3413,
      "step": 2508
    },
    {
      "epoch": 0.40144,
      "grad_norm": 0.15044479072093964,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2509
    },
    {
      "epoch": 0.4016,
      "grad_norm": 0.19669634103775024,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 2510
    },
    {
      "epoch": 0.40176,
      "grad_norm": 0.19213885068893433,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2511
    },
    {
      "epoch": 0.40192,
      "grad_norm": 0.14965292811393738,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2512
    },
    {
      "epoch": 0.40208,
      "grad_norm": 0.16962477564811707,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2513
    },
    {
      "epoch": 0.40224,
      "grad_norm": 0.13224361836910248,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 2514
    },
    {
      "epoch": 0.4024,
      "grad_norm": 0.1810697764158249,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 2515
    },
    {
      "epoch": 0.40256,
      "grad_norm": 0.2306060492992401,
      "learning_rate": 0.0001,
      "loss": 0.3444,
      "step": 2516
    },
    {
      "epoch": 0.40272,
      "grad_norm": 0.12926284968852997,
      "learning_rate": 0.0001,
      "loss": 0.3493,
      "step": 2517
    },
    {
      "epoch": 0.40288,
      "grad_norm": 0.1702280193567276,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 2518
    },
    {
      "epoch": 0.40304,
      "grad_norm": 0.23128385841846466,
      "learning_rate": 0.0001,
      "loss": 0.3521,
      "step": 2519
    },
    {
      "epoch": 0.4032,
      "grad_norm": 0.1741044521331787,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 2520
    },
    {
      "epoch": 0.40336,
      "grad_norm": 0.16720086336135864,
      "learning_rate": 0.0001,
      "loss": 0.3491,
      "step": 2521
    },
    {
      "epoch": 0.40352,
      "grad_norm": 0.15601994097232819,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2522
    },
    {
      "epoch": 0.40368,
      "grad_norm": 0.1740248203277588,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2523
    },
    {
      "epoch": 0.40384,
      "grad_norm": 0.13305246829986572,
      "learning_rate": 0.0001,
      "loss": 0.3458,
      "step": 2524
    },
    {
      "epoch": 0.404,
      "grad_norm": 0.15763291716575623,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 2525
    },
    {
      "epoch": 0.40416,
      "grad_norm": 0.18812087178230286,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 2526
    },
    {
      "epoch": 0.40432,
      "grad_norm": 0.16344372928142548,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2527
    },
    {
      "epoch": 0.40448,
      "grad_norm": 0.12774758040905,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 2528
    },
    {
      "epoch": 0.40464,
      "grad_norm": 0.14317041635513306,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2529
    },
    {
      "epoch": 0.4048,
      "grad_norm": 0.15652839839458466,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2530
    },
    {
      "epoch": 0.40496,
      "grad_norm": 0.16187894344329834,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2531
    },
    {
      "epoch": 0.40512,
      "grad_norm": 0.1325867474079132,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 2532
    },
    {
      "epoch": 0.40528,
      "grad_norm": 0.14870168268680573,
      "learning_rate": 0.0001,
      "loss": 0.3517,
      "step": 2533
    },
    {
      "epoch": 0.40544,
      "grad_norm": 0.1379881054162979,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2534
    },
    {
      "epoch": 0.4056,
      "grad_norm": 0.1387915015220642,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2535
    },
    {
      "epoch": 0.40576,
      "grad_norm": 0.1765810251235962,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2536
    },
    {
      "epoch": 0.40592,
      "grad_norm": 0.15951165556907654,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 2537
    },
    {
      "epoch": 0.40608,
      "grad_norm": 0.13315147161483765,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2538
    },
    {
      "epoch": 0.40624,
      "grad_norm": 0.14573752880096436,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2539
    },
    {
      "epoch": 0.4064,
      "grad_norm": 0.13842113316059113,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 2540
    },
    {
      "epoch": 0.40656,
      "grad_norm": 0.2067074179649353,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2541
    },
    {
      "epoch": 0.40672,
      "grad_norm": 0.1478700041770935,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 2542
    },
    {
      "epoch": 0.40688,
      "grad_norm": 0.11647752672433853,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2543
    },
    {
      "epoch": 0.40704,
      "grad_norm": 0.2552080750465393,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2544
    },
    {
      "epoch": 0.4072,
      "grad_norm": 0.1555006355047226,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 2545
    },
    {
      "epoch": 0.40736,
      "grad_norm": 0.11336883157491684,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2546
    },
    {
      "epoch": 0.40752,
      "grad_norm": 0.13776795566082,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2547
    },
    {
      "epoch": 0.40768,
      "grad_norm": 0.1553598791360855,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 2548
    },
    {
      "epoch": 0.40784,
      "grad_norm": 0.16956067085266113,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2549
    },
    {
      "epoch": 0.408,
      "grad_norm": 0.13903428614139557,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2550
    },
    {
      "epoch": 0.40816,
      "grad_norm": 0.16218088567256927,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2551
    },
    {
      "epoch": 0.40832,
      "grad_norm": 0.1361602544784546,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2552
    },
    {
      "epoch": 0.40848,
      "grad_norm": 0.162087544798851,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 2553
    },
    {
      "epoch": 0.40864,
      "grad_norm": 0.14367087185382843,
      "learning_rate": 0.0001,
      "loss": 0.3562,
      "step": 2554
    },
    {
      "epoch": 0.4088,
      "grad_norm": 0.1305360347032547,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 2555
    },
    {
      "epoch": 0.40896,
      "grad_norm": 0.15219713747501373,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2556
    },
    {
      "epoch": 0.40912,
      "grad_norm": 0.11832715570926666,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 2557
    },
    {
      "epoch": 0.40928,
      "grad_norm": 0.13474442064762115,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 2558
    },
    {
      "epoch": 0.40944,
      "grad_norm": 0.11400045454502106,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 2559
    },
    {
      "epoch": 0.4096,
      "grad_norm": 0.12525063753128052,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 2560
    },
    {
      "epoch": 0.40976,
      "grad_norm": 0.11733149737119675,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2561
    },
    {
      "epoch": 0.40992,
      "grad_norm": 0.10474713146686554,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2562
    },
    {
      "epoch": 0.41008,
      "grad_norm": 0.11584130674600601,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2563
    },
    {
      "epoch": 0.41024,
      "grad_norm": 0.12199755012989044,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2564
    },
    {
      "epoch": 0.4104,
      "grad_norm": 0.13433586061000824,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2565
    },
    {
      "epoch": 0.41056,
      "grad_norm": 0.12804608047008514,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2566
    },
    {
      "epoch": 0.41072,
      "grad_norm": 0.11843842267990112,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2567
    },
    {
      "epoch": 0.41088,
      "grad_norm": 0.12216608971357346,
      "learning_rate": 0.0001,
      "loss": 0.3391,
      "step": 2568
    },
    {
      "epoch": 0.41104,
      "grad_norm": 0.12721912562847137,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 2569
    },
    {
      "epoch": 0.4112,
      "grad_norm": 0.10424968600273132,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2570
    },
    {
      "epoch": 0.41136,
      "grad_norm": 0.12885288894176483,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 2571
    },
    {
      "epoch": 0.41152,
      "grad_norm": 0.10874490439891815,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 2572
    },
    {
      "epoch": 0.41168,
      "grad_norm": 0.12289052456617355,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2573
    },
    {
      "epoch": 0.41184,
      "grad_norm": 0.11080817133188248,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2574
    },
    {
      "epoch": 0.412,
      "grad_norm": 0.12084402143955231,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2575
    },
    {
      "epoch": 0.41216,
      "grad_norm": 0.1052107885479927,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 2576
    },
    {
      "epoch": 0.41232,
      "grad_norm": 0.11202425509691238,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 2577
    },
    {
      "epoch": 0.41248,
      "grad_norm": 0.11981618404388428,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2578
    },
    {
      "epoch": 0.41264,
      "grad_norm": 0.10534123331308365,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 2579
    },
    {
      "epoch": 0.4128,
      "grad_norm": 0.11388937383890152,
      "learning_rate": 0.0001,
      "loss": 0.3492,
      "step": 2580
    },
    {
      "epoch": 0.41296,
      "grad_norm": 0.12641602754592896,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2581
    },
    {
      "epoch": 0.41312,
      "grad_norm": 0.11076124012470245,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 2582
    },
    {
      "epoch": 0.41328,
      "grad_norm": 0.12415217608213425,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2583
    },
    {
      "epoch": 0.41344,
      "grad_norm": 0.1330147534608841,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2584
    },
    {
      "epoch": 0.4136,
      "grad_norm": 0.11432752013206482,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2585
    },
    {
      "epoch": 0.41376,
      "grad_norm": 0.1120903342962265,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 2586
    },
    {
      "epoch": 0.41392,
      "grad_norm": 0.10580241680145264,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 2587
    },
    {
      "epoch": 0.41408,
      "grad_norm": 0.12502671778202057,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 2588
    },
    {
      "epoch": 0.41424,
      "grad_norm": 0.10247888416051865,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2589
    },
    {
      "epoch": 0.4144,
      "grad_norm": 0.10231222957372665,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 2590
    },
    {
      "epoch": 0.41456,
      "grad_norm": 0.11035741120576859,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2591
    },
    {
      "epoch": 0.41472,
      "grad_norm": 0.11327888816595078,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 2592
    },
    {
      "epoch": 0.41488,
      "grad_norm": 0.10399483144283295,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 2593
    },
    {
      "epoch": 0.41504,
      "grad_norm": 0.10763990879058838,
      "learning_rate": 0.0001,
      "loss": 0.3556,
      "step": 2594
    },
    {
      "epoch": 0.4152,
      "grad_norm": 0.11330506950616837,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2595
    },
    {
      "epoch": 0.41536,
      "grad_norm": 0.11096648126840591,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2596
    },
    {
      "epoch": 0.41552,
      "grad_norm": 0.10909251868724823,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2597
    },
    {
      "epoch": 0.41568,
      "grad_norm": 0.11377264559268951,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2598
    },
    {
      "epoch": 0.41584,
      "grad_norm": 0.10156972706317902,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2599
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.11423616111278534,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 2600
    },
    {
      "epoch": 0.416,
      "eval_train_accuracy": 0.991,
      "eval_train_loss": 0.3290443420410156,
      "eval_train_runtime": 4.6807,
      "eval_train_samples_per_second": 1068.218,
      "eval_train_steps_per_second": 13.46,
      "step": 2600
    },
    {
      "epoch": 0.416,
      "eval_test_accuracy": 0.9938,
      "eval_test_loss": 0.3275260627269745,
      "eval_test_runtime": 4.95,
      "eval_test_samples_per_second": 1010.094,
      "eval_test_steps_per_second": 12.727,
      "step": 2600
    },
    {
      "epoch": 0.41616,
      "grad_norm": 0.10762060433626175,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 2601
    },
    {
      "epoch": 0.41632,
      "grad_norm": 0.11639310419559479,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 2602
    },
    {
      "epoch": 0.41648,
      "grad_norm": 0.10081961750984192,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2603
    },
    {
      "epoch": 0.41664,
      "grad_norm": 0.12055879831314087,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2604
    },
    {
      "epoch": 0.4168,
      "grad_norm": 0.10806696116924286,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2605
    },
    {
      "epoch": 0.41696,
      "grad_norm": 0.1126120388507843,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2606
    },
    {
      "epoch": 0.41712,
      "grad_norm": 0.11396432667970657,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2607
    },
    {
      "epoch": 0.41728,
      "grad_norm": 0.1040635034441948,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2608
    },
    {
      "epoch": 0.41744,
      "grad_norm": 0.11433528363704681,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 2609
    },
    {
      "epoch": 0.4176,
      "grad_norm": 0.10325722396373749,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2610
    },
    {
      "epoch": 0.41776,
      "grad_norm": 0.11242669075727463,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 2611
    },
    {
      "epoch": 0.41792,
      "grad_norm": 0.1009744182229042,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2612
    },
    {
      "epoch": 0.41808,
      "grad_norm": 0.09419877082109451,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 2613
    },
    {
      "epoch": 0.41824,
      "grad_norm": 0.10432840883731842,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2614
    },
    {
      "epoch": 0.4184,
      "grad_norm": 0.13262464106082916,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2615
    },
    {
      "epoch": 0.41856,
      "grad_norm": 0.10278335958719254,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 2616
    },
    {
      "epoch": 0.41872,
      "grad_norm": 0.1081194058060646,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 2617
    },
    {
      "epoch": 0.41888,
      "grad_norm": 0.10114585608243942,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2618
    },
    {
      "epoch": 0.41904,
      "grad_norm": 0.12116856873035431,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2619
    },
    {
      "epoch": 0.4192,
      "grad_norm": 0.10103695094585419,
      "learning_rate": 0.0001,
      "loss": 0.344,
      "step": 2620
    },
    {
      "epoch": 0.41936,
      "grad_norm": 0.11775227636098862,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2621
    },
    {
      "epoch": 0.41952,
      "grad_norm": 0.11512846499681473,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2622
    },
    {
      "epoch": 0.41968,
      "grad_norm": 0.10328418761491776,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 2623
    },
    {
      "epoch": 0.41984,
      "grad_norm": 0.11516224592924118,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2624
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.13147667050361633,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2625
    },
    {
      "epoch": 0.42016,
      "grad_norm": 0.1335706114768982,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 2626
    },
    {
      "epoch": 0.42032,
      "grad_norm": 0.11658565700054169,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2627
    },
    {
      "epoch": 0.42048,
      "grad_norm": 0.13591338694095612,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 2628
    },
    {
      "epoch": 0.42064,
      "grad_norm": 0.14241309463977814,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2629
    },
    {
      "epoch": 0.4208,
      "grad_norm": 0.1312198042869568,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2630
    },
    {
      "epoch": 0.42096,
      "grad_norm": 0.10160918533802032,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 2631
    },
    {
      "epoch": 0.42112,
      "grad_norm": 0.13282538950443268,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2632
    },
    {
      "epoch": 0.42128,
      "grad_norm": 0.13402999937534332,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 2633
    },
    {
      "epoch": 0.42144,
      "grad_norm": 0.16255150735378265,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2634
    },
    {
      "epoch": 0.4216,
      "grad_norm": 0.12156479060649872,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2635
    },
    {
      "epoch": 0.42176,
      "grad_norm": 0.1875932812690735,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2636
    },
    {
      "epoch": 0.42192,
      "grad_norm": 0.13627029955387115,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 2637
    },
    {
      "epoch": 0.42208,
      "grad_norm": 0.12309788912534714,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 2638
    },
    {
      "epoch": 0.42224,
      "grad_norm": 0.11753693222999573,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 2639
    },
    {
      "epoch": 0.4224,
      "grad_norm": 0.1223178580403328,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2640
    },
    {
      "epoch": 0.42256,
      "grad_norm": 0.10553653538227081,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 2641
    },
    {
      "epoch": 0.42272,
      "grad_norm": 0.12990307807922363,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2642
    },
    {
      "epoch": 0.42288,
      "grad_norm": 0.12138189375400543,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2643
    },
    {
      "epoch": 0.42304,
      "grad_norm": 0.11429343372583389,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2644
    },
    {
      "epoch": 0.4232,
      "grad_norm": 0.11817152798175812,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2645
    },
    {
      "epoch": 0.42336,
      "grad_norm": 0.13666367530822754,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 2646
    },
    {
      "epoch": 0.42352,
      "grad_norm": 0.11675296723842621,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2647
    },
    {
      "epoch": 0.42368,
      "grad_norm": 0.15416304767131805,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 2648
    },
    {
      "epoch": 0.42384,
      "grad_norm": 0.11390499025583267,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2649
    },
    {
      "epoch": 0.424,
      "grad_norm": 0.11249708384275436,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2650
    },
    {
      "epoch": 0.42416,
      "grad_norm": 0.20342127978801727,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 2651
    },
    {
      "epoch": 0.42432,
      "grad_norm": 0.10425562411546707,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 2652
    },
    {
      "epoch": 0.42448,
      "grad_norm": 0.129064679145813,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2653
    },
    {
      "epoch": 0.42464,
      "grad_norm": 0.6534366607666016,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2654
    },
    {
      "epoch": 0.4248,
      "grad_norm": 0.5882387161254883,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2655
    },
    {
      "epoch": 0.42496,
      "grad_norm": 0.23166677355766296,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 2656
    },
    {
      "epoch": 0.42512,
      "grad_norm": 0.12066160142421722,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2657
    },
    {
      "epoch": 0.42528,
      "grad_norm": 0.3060864508152008,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2658
    },
    {
      "epoch": 0.42544,
      "grad_norm": 0.18750104308128357,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 2659
    },
    {
      "epoch": 0.4256,
      "grad_norm": 0.12526297569274902,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2660
    },
    {
      "epoch": 0.42576,
      "grad_norm": 0.19863402843475342,
      "learning_rate": 0.0001,
      "loss": 0.3461,
      "step": 2661
    },
    {
      "epoch": 0.42592,
      "grad_norm": 0.1284654289484024,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 2662
    },
    {
      "epoch": 0.42608,
      "grad_norm": 0.14913055300712585,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2663
    },
    {
      "epoch": 0.42624,
      "grad_norm": 0.14477767050266266,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 2664
    },
    {
      "epoch": 0.4264,
      "grad_norm": 0.13720402121543884,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2665
    },
    {
      "epoch": 0.42656,
      "grad_norm": 0.13686060905456543,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 2666
    },
    {
      "epoch": 0.42672,
      "grad_norm": 0.15018153190612793,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 2667
    },
    {
      "epoch": 0.42688,
      "grad_norm": 0.23019880056381226,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2668
    },
    {
      "epoch": 0.42704,
      "grad_norm": 0.13653410971164703,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2669
    },
    {
      "epoch": 0.4272,
      "grad_norm": 0.16471533477306366,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2670
    },
    {
      "epoch": 0.42736,
      "grad_norm": 0.1468632072210312,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 2671
    },
    {
      "epoch": 0.42752,
      "grad_norm": 0.13820752501487732,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 2672
    },
    {
      "epoch": 0.42768,
      "grad_norm": 0.11429576575756073,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2673
    },
    {
      "epoch": 0.42784,
      "grad_norm": 0.20836594700813293,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 2674
    },
    {
      "epoch": 0.428,
      "grad_norm": 0.13131718337535858,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2675
    },
    {
      "epoch": 0.42816,
      "grad_norm": 0.11862285435199738,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2676
    },
    {
      "epoch": 0.42832,
      "grad_norm": 0.17209210991859436,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 2677
    },
    {
      "epoch": 0.42848,
      "grad_norm": 0.12019920349121094,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 2678
    },
    {
      "epoch": 0.42864,
      "grad_norm": 0.10746373981237411,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2679
    },
    {
      "epoch": 0.4288,
      "grad_norm": 0.13815103471279144,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2680
    },
    {
      "epoch": 0.42896,
      "grad_norm": 0.1229957789182663,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 2681
    },
    {
      "epoch": 0.42912,
      "grad_norm": 0.16365152597427368,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 2682
    },
    {
      "epoch": 0.42928,
      "grad_norm": 0.13731536269187927,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2683
    },
    {
      "epoch": 0.42944,
      "grad_norm": 0.18406182527542114,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2684
    },
    {
      "epoch": 0.4296,
      "grad_norm": 0.12435786426067352,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2685
    },
    {
      "epoch": 0.42976,
      "grad_norm": 0.1383284628391266,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2686
    },
    {
      "epoch": 0.42992,
      "grad_norm": 0.15268062055110931,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 2687
    },
    {
      "epoch": 0.43008,
      "grad_norm": 0.12120731174945831,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2688
    },
    {
      "epoch": 0.43024,
      "grad_norm": 0.10577846318483353,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 2689
    },
    {
      "epoch": 0.4304,
      "grad_norm": 0.10892470926046371,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 2690
    },
    {
      "epoch": 0.43056,
      "grad_norm": 0.3376009464263916,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 2691
    },
    {
      "epoch": 0.43072,
      "grad_norm": 0.126976877450943,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2692
    },
    {
      "epoch": 0.43088,
      "grad_norm": 0.16773180663585663,
      "learning_rate": 0.0001,
      "loss": 0.3491,
      "step": 2693
    },
    {
      "epoch": 0.43104,
      "grad_norm": 0.1461976319551468,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2694
    },
    {
      "epoch": 0.4312,
      "grad_norm": 0.18830718100070953,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2695
    },
    {
      "epoch": 0.43136,
      "grad_norm": 0.13729426264762878,
      "learning_rate": 0.0001,
      "loss": 0.3423,
      "step": 2696
    },
    {
      "epoch": 0.43152,
      "grad_norm": 0.12973694503307343,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 2697
    },
    {
      "epoch": 0.43168,
      "grad_norm": 0.12342237681150436,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 2698
    },
    {
      "epoch": 0.43184,
      "grad_norm": 0.15469500422477722,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2699
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.1267920285463333,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 2700
    },
    {
      "epoch": 0.432,
      "eval_train_accuracy": 0.9902,
      "eval_train_loss": 0.3288991153240204,
      "eval_train_runtime": 4.5996,
      "eval_train_samples_per_second": 1087.062,
      "eval_train_steps_per_second": 13.697,
      "step": 2700
    },
    {
      "epoch": 0.432,
      "eval_test_accuracy": 0.991,
      "eval_test_loss": 0.3274761438369751,
      "eval_test_runtime": 4.5403,
      "eval_test_samples_per_second": 1101.238,
      "eval_test_steps_per_second": 13.876,
      "step": 2700
    },
    {
      "epoch": 0.43216,
      "grad_norm": 0.12772288918495178,
      "learning_rate": 0.0001,
      "loss": 0.3533,
      "step": 2701
    },
    {
      "epoch": 0.43232,
      "grad_norm": 0.12382282316684723,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 2702
    },
    {
      "epoch": 0.43248,
      "grad_norm": 0.11788945645093918,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2703
    },
    {
      "epoch": 0.43264,
      "grad_norm": 0.1089351549744606,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2704
    },
    {
      "epoch": 0.4328,
      "grad_norm": 0.12026897072792053,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2705
    },
    {
      "epoch": 0.43296,
      "grad_norm": 0.14544299244880676,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 2706
    },
    {
      "epoch": 0.43312,
      "grad_norm": 0.15275821089744568,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2707
    },
    {
      "epoch": 0.43328,
      "grad_norm": 0.18903665244579315,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 2708
    },
    {
      "epoch": 0.43344,
      "grad_norm": 0.11092968285083771,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 2709
    },
    {
      "epoch": 0.4336,
      "grad_norm": 0.18813292682170868,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 2710
    },
    {
      "epoch": 0.43376,
      "grad_norm": 0.13257569074630737,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 2711
    },
    {
      "epoch": 0.43392,
      "grad_norm": 0.1284361332654953,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 2712
    },
    {
      "epoch": 0.43408,
      "grad_norm": 0.12026448547840118,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2713
    },
    {
      "epoch": 0.43424,
      "grad_norm": 0.10216866433620453,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 2714
    },
    {
      "epoch": 0.4344,
      "grad_norm": 0.22627456486225128,
      "learning_rate": 0.0001,
      "loss": 0.3437,
      "step": 2715
    },
    {
      "epoch": 0.43456,
      "grad_norm": 0.12676768004894257,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2716
    },
    {
      "epoch": 0.43472,
      "grad_norm": 0.10570228099822998,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 2717
    },
    {
      "epoch": 0.43488,
      "grad_norm": 0.12319208681583405,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2718
    },
    {
      "epoch": 0.43504,
      "grad_norm": 0.10763607174158096,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2719
    },
    {
      "epoch": 0.4352,
      "grad_norm": 0.22686007618904114,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 2720
    },
    {
      "epoch": 0.43536,
      "grad_norm": 0.12865717709064484,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2721
    },
    {
      "epoch": 0.43552,
      "grad_norm": 0.18231259286403656,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 2722
    },
    {
      "epoch": 0.43568,
      "grad_norm": 0.11507859826087952,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2723
    },
    {
      "epoch": 0.43584,
      "grad_norm": 0.135407954454422,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 2724
    },
    {
      "epoch": 0.436,
      "grad_norm": 0.11074703186750412,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 2725
    },
    {
      "epoch": 0.43616,
      "grad_norm": 0.12997572124004364,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2726
    },
    {
      "epoch": 0.43632,
      "grad_norm": 0.11923151463270187,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2727
    },
    {
      "epoch": 0.43648,
      "grad_norm": 0.14650987088680267,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2728
    },
    {
      "epoch": 0.43664,
      "grad_norm": 0.1358165591955185,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 2729
    },
    {
      "epoch": 0.4368,
      "grad_norm": 0.12036759406328201,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 2730
    },
    {
      "epoch": 0.43696,
      "grad_norm": 0.14316704869270325,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 2731
    },
    {
      "epoch": 0.43712,
      "grad_norm": 0.12216447293758392,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 2732
    },
    {
      "epoch": 0.43728,
      "grad_norm": 0.12211017310619354,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 2733
    },
    {
      "epoch": 0.43744,
      "grad_norm": 0.14937925338745117,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2734
    },
    {
      "epoch": 0.4376,
      "grad_norm": 0.11326772719621658,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2735
    },
    {
      "epoch": 0.43776,
      "grad_norm": 0.1317741870880127,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 2736
    },
    {
      "epoch": 0.43792,
      "grad_norm": 0.10526532679796219,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2737
    },
    {
      "epoch": 0.43808,
      "grad_norm": 0.1287289708852768,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 2738
    },
    {
      "epoch": 0.43824,
      "grad_norm": 0.12003084272146225,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 2739
    },
    {
      "epoch": 0.4384,
      "grad_norm": 0.10767919570207596,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2740
    },
    {
      "epoch": 0.43856,
      "grad_norm": 0.10534394532442093,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 2741
    },
    {
      "epoch": 0.43872,
      "grad_norm": 0.1313111037015915,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 2742
    },
    {
      "epoch": 0.43888,
      "grad_norm": 0.10670424997806549,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 2743
    },
    {
      "epoch": 0.43904,
      "grad_norm": 0.10016459971666336,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 2744
    },
    {
      "epoch": 0.4392,
      "grad_norm": 0.11532644182443619,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2745
    },
    {
      "epoch": 0.43936,
      "grad_norm": 0.13409674167633057,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 2746
    },
    {
      "epoch": 0.43952,
      "grad_norm": 0.12320675700902939,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2747
    },
    {
      "epoch": 0.43968,
      "grad_norm": 0.09318824857473373,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 2748
    },
    {
      "epoch": 0.43984,
      "grad_norm": 0.10304464399814606,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 2749
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.12896938621997833,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2750
    },
    {
      "epoch": 0.44016,
      "grad_norm": 0.12810011208057404,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 2751
    },
    {
      "epoch": 0.44032,
      "grad_norm": 0.1225908100605011,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2752
    },
    {
      "epoch": 0.44048,
      "grad_norm": 0.11250899732112885,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2753
    },
    {
      "epoch": 0.44064,
      "grad_norm": 0.13730181753635406,
      "learning_rate": 0.0001,
      "loss": 0.3524,
      "step": 2754
    },
    {
      "epoch": 0.4408,
      "grad_norm": 0.10996439307928085,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 2755
    },
    {
      "epoch": 0.44096,
      "grad_norm": 0.1255318820476532,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2756
    },
    {
      "epoch": 0.44112,
      "grad_norm": 0.13744789361953735,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2757
    },
    {
      "epoch": 0.44128,
      "grad_norm": 0.1082133948802948,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 2758
    },
    {
      "epoch": 0.44144,
      "grad_norm": 0.10520032793283463,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 2759
    },
    {
      "epoch": 0.4416,
      "grad_norm": 0.12822754681110382,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 2760
    },
    {
      "epoch": 0.44176,
      "grad_norm": 0.1345062255859375,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2761
    },
    {
      "epoch": 0.44192,
      "grad_norm": 0.13761132955551147,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 2762
    },
    {
      "epoch": 0.44208,
      "grad_norm": 0.11988413333892822,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 2763
    },
    {
      "epoch": 0.44224,
      "grad_norm": 0.11323978006839752,
      "learning_rate": 0.0001,
      "loss": 0.3517,
      "step": 2764
    },
    {
      "epoch": 0.4424,
      "grad_norm": 0.11709977686405182,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2765
    },
    {
      "epoch": 0.44256,
      "grad_norm": 0.11265888810157776,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2766
    },
    {
      "epoch": 0.44272,
      "grad_norm": 0.1273760348558426,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 2767
    },
    {
      "epoch": 0.44288,
      "grad_norm": 0.11068569868803024,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 2768
    },
    {
      "epoch": 0.44304,
      "grad_norm": 0.14229020476341248,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2769
    },
    {
      "epoch": 0.4432,
      "grad_norm": 0.10978977382183075,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 2770
    },
    {
      "epoch": 0.44336,
      "grad_norm": 0.1142006367444992,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2771
    },
    {
      "epoch": 0.44352,
      "grad_norm": 0.0963340699672699,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2772
    },
    {
      "epoch": 0.44368,
      "grad_norm": 0.23106268048286438,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 2773
    },
    {
      "epoch": 0.44384,
      "grad_norm": 0.1266949623823166,
      "learning_rate": 0.0001,
      "loss": 0.3386,
      "step": 2774
    },
    {
      "epoch": 0.444,
      "grad_norm": 0.12775076925754547,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2775
    },
    {
      "epoch": 0.44416,
      "grad_norm": 0.19295215606689453,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2776
    },
    {
      "epoch": 0.44432,
      "grad_norm": 0.10984784364700317,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2777
    },
    {
      "epoch": 0.44448,
      "grad_norm": 0.12139701098203659,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 2778
    },
    {
      "epoch": 0.44464,
      "grad_norm": 0.13125008344650269,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 2779
    },
    {
      "epoch": 0.4448,
      "grad_norm": 0.13856664299964905,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2780
    },
    {
      "epoch": 0.44496,
      "grad_norm": 0.11088060587644577,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 2781
    },
    {
      "epoch": 0.44512,
      "grad_norm": 0.10029517114162445,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 2782
    },
    {
      "epoch": 0.44528,
      "grad_norm": 0.14051903784275055,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 2783
    },
    {
      "epoch": 0.44544,
      "grad_norm": 0.11385010182857513,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2784
    },
    {
      "epoch": 0.4456,
      "grad_norm": 0.1375742256641388,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 2785
    },
    {
      "epoch": 0.44576,
      "grad_norm": 0.11043562740087509,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2786
    },
    {
      "epoch": 0.44592,
      "grad_norm": 0.11500782519578934,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 2787
    },
    {
      "epoch": 0.44608,
      "grad_norm": 0.1592491865158081,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2788
    },
    {
      "epoch": 0.44624,
      "grad_norm": 0.13673314452171326,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 2789
    },
    {
      "epoch": 0.4464,
      "grad_norm": 0.12154430896043777,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 2790
    },
    {
      "epoch": 0.44656,
      "grad_norm": 0.10368286818265915,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 2791
    },
    {
      "epoch": 0.44672,
      "grad_norm": 0.09815006703138351,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 2792
    },
    {
      "epoch": 0.44688,
      "grad_norm": 0.16631007194519043,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2793
    },
    {
      "epoch": 0.44704,
      "grad_norm": 0.12611202895641327,
      "learning_rate": 0.0001,
      "loss": 0.3337,
      "step": 2794
    },
    {
      "epoch": 0.4472,
      "grad_norm": 0.14157557487487793,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 2795
    },
    {
      "epoch": 0.44736,
      "grad_norm": 0.10857260227203369,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2796
    },
    {
      "epoch": 0.44752,
      "grad_norm": 0.11299353837966919,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2797
    },
    {
      "epoch": 0.44768,
      "grad_norm": 0.1086123064160347,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2798
    },
    {
      "epoch": 0.44784,
      "grad_norm": 0.13888315856456757,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 2799
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.15797832608222961,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 2800
    },
    {
      "epoch": 0.448,
      "eval_train_accuracy": 0.9908,
      "eval_train_loss": 0.3275779187679291,
      "eval_train_runtime": 4.6015,
      "eval_train_samples_per_second": 1086.596,
      "eval_train_steps_per_second": 13.691,
      "step": 2800
    },
    {
      "epoch": 0.448,
      "eval_test_accuracy": 0.9922,
      "eval_test_loss": 0.3262002766132355,
      "eval_test_runtime": 4.6175,
      "eval_test_samples_per_second": 1082.835,
      "eval_test_steps_per_second": 13.644,
      "step": 2800
    },
    {
      "epoch": 0.44816,
      "grad_norm": 0.1147218719124794,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 2801
    },
    {
      "epoch": 0.44832,
      "grad_norm": 0.10509246587753296,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 2802
    },
    {
      "epoch": 0.44848,
      "grad_norm": 0.1284331977367401,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 2803
    },
    {
      "epoch": 0.44864,
      "grad_norm": 0.1273719221353531,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 2804
    },
    {
      "epoch": 0.4488,
      "grad_norm": 0.13837416470050812,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 2805
    },
    {
      "epoch": 0.44896,
      "grad_norm": 0.11638715118169785,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2806
    },
    {
      "epoch": 0.44912,
      "grad_norm": 0.14839129149913788,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 2807
    },
    {
      "epoch": 0.44928,
      "grad_norm": 0.10812277346849442,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2808
    },
    {
      "epoch": 0.44944,
      "grad_norm": 0.13203009963035583,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2809
    },
    {
      "epoch": 0.4496,
      "grad_norm": 0.13157254457473755,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2810
    },
    {
      "epoch": 0.44976,
      "grad_norm": 0.1566230207681656,
      "learning_rate": 0.0001,
      "loss": 0.3396,
      "step": 2811
    },
    {
      "epoch": 0.44992,
      "grad_norm": 0.12061800062656403,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2812
    },
    {
      "epoch": 0.45008,
      "grad_norm": 0.11744377762079239,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2813
    },
    {
      "epoch": 0.45024,
      "grad_norm": 0.10705417394638062,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 2814
    },
    {
      "epoch": 0.4504,
      "grad_norm": 0.1298302412033081,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2815
    },
    {
      "epoch": 0.45056,
      "grad_norm": 0.14268286526203156,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2816
    },
    {
      "epoch": 0.45072,
      "grad_norm": 0.10948969423770905,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2817
    },
    {
      "epoch": 0.45088,
      "grad_norm": 0.12159745395183563,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2818
    },
    {
      "epoch": 0.45104,
      "grad_norm": 0.11752137541770935,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 2819
    },
    {
      "epoch": 0.4512,
      "grad_norm": 0.13519027829170227,
      "learning_rate": 0.0001,
      "loss": 0.3446,
      "step": 2820
    },
    {
      "epoch": 0.45136,
      "grad_norm": 0.10356775671243668,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 2821
    },
    {
      "epoch": 0.45152,
      "grad_norm": 0.10914254188537598,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 2822
    },
    {
      "epoch": 0.45168,
      "grad_norm": 0.12184200435876846,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2823
    },
    {
      "epoch": 0.45184,
      "grad_norm": 0.13585126399993896,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2824
    },
    {
      "epoch": 0.452,
      "grad_norm": 0.1495065838098526,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 2825
    },
    {
      "epoch": 0.45216,
      "grad_norm": 0.11429327726364136,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 2826
    },
    {
      "epoch": 0.45232,
      "grad_norm": 0.1499863713979721,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2827
    },
    {
      "epoch": 0.45248,
      "grad_norm": 0.10633966326713562,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 2828
    },
    {
      "epoch": 0.45264,
      "grad_norm": 0.09906011074781418,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2829
    },
    {
      "epoch": 0.4528,
      "grad_norm": 0.15453140437602997,
      "learning_rate": 0.0001,
      "loss": 0.3441,
      "step": 2830
    },
    {
      "epoch": 0.45296,
      "grad_norm": 0.1506127268075943,
      "learning_rate": 0.0001,
      "loss": 0.3436,
      "step": 2831
    },
    {
      "epoch": 0.45312,
      "grad_norm": 0.12356273084878922,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2832
    },
    {
      "epoch": 0.45328,
      "grad_norm": 0.12902338802814484,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 2833
    },
    {
      "epoch": 0.45344,
      "grad_norm": 0.1324908286333084,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2834
    },
    {
      "epoch": 0.4536,
      "grad_norm": 0.13429392874240875,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 2835
    },
    {
      "epoch": 0.45376,
      "grad_norm": 0.11393215507268906,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 2836
    },
    {
      "epoch": 0.45392,
      "grad_norm": 0.1294458955526352,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2837
    },
    {
      "epoch": 0.45408,
      "grad_norm": 0.1329103261232376,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 2838
    },
    {
      "epoch": 0.45424,
      "grad_norm": 0.12173573672771454,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2839
    },
    {
      "epoch": 0.4544,
      "grad_norm": 0.10330786556005478,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2840
    },
    {
      "epoch": 0.45456,
      "grad_norm": 0.12171930819749832,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2841
    },
    {
      "epoch": 0.45472,
      "grad_norm": 0.10995745658874512,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 2842
    },
    {
      "epoch": 0.45488,
      "grad_norm": 0.11552929878234863,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2843
    },
    {
      "epoch": 0.45504,
      "grad_norm": 0.13545817136764526,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 2844
    },
    {
      "epoch": 0.4552,
      "grad_norm": 0.10345413535833359,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 2845
    },
    {
      "epoch": 0.45536,
      "grad_norm": 0.1030944362282753,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 2846
    },
    {
      "epoch": 0.45552,
      "grad_norm": 0.10840994119644165,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 2847
    },
    {
      "epoch": 0.45568,
      "grad_norm": 0.13051575422286987,
      "learning_rate": 0.0001,
      "loss": 0.3387,
      "step": 2848
    },
    {
      "epoch": 0.45584,
      "grad_norm": 0.10450810939073563,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 2849
    },
    {
      "epoch": 0.456,
      "grad_norm": 0.09879542887210846,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 2850
    },
    {
      "epoch": 0.45616,
      "grad_norm": 0.10478071123361588,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 2851
    },
    {
      "epoch": 0.45632,
      "grad_norm": 0.11988716572523117,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2852
    },
    {
      "epoch": 0.45648,
      "grad_norm": 0.13646532595157623,
      "learning_rate": 0.0001,
      "loss": 0.3456,
      "step": 2853
    },
    {
      "epoch": 0.45664,
      "grad_norm": 0.12469169497489929,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 2854
    },
    {
      "epoch": 0.4568,
      "grad_norm": 0.1133786290884018,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 2855
    },
    {
      "epoch": 0.45696,
      "grad_norm": 0.10439658164978027,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 2856
    },
    {
      "epoch": 0.45712,
      "grad_norm": 0.11119192838668823,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 2857
    },
    {
      "epoch": 0.45728,
      "grad_norm": 0.11766644567251205,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2858
    },
    {
      "epoch": 0.45744,
      "grad_norm": 0.1128944531083107,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 2859
    },
    {
      "epoch": 0.4576,
      "grad_norm": 0.11662803590297699,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2860
    },
    {
      "epoch": 0.45776,
      "grad_norm": 0.10702548921108246,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 2861
    },
    {
      "epoch": 0.45792,
      "grad_norm": 0.10321981459856033,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 2862
    },
    {
      "epoch": 0.45808,
      "grad_norm": 0.10079174488782883,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 2863
    },
    {
      "epoch": 0.45824,
      "grad_norm": 0.1253025084733963,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 2864
    },
    {
      "epoch": 0.4584,
      "grad_norm": 0.11115312576293945,
      "learning_rate": 0.0001,
      "loss": 0.2999,
      "step": 2865
    },
    {
      "epoch": 0.45856,
      "grad_norm": 0.1003887802362442,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 2866
    },
    {
      "epoch": 0.45872,
      "grad_norm": 0.13767403364181519,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 2867
    },
    {
      "epoch": 0.45888,
      "grad_norm": 0.1139942929148674,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2868
    },
    {
      "epoch": 0.45904,
      "grad_norm": 0.11258725076913834,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2869
    },
    {
      "epoch": 0.4592,
      "grad_norm": 0.12408872693777084,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2870
    },
    {
      "epoch": 0.45936,
      "grad_norm": 0.097988061606884,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 2871
    },
    {
      "epoch": 0.45952,
      "grad_norm": 0.11836589127779007,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 2872
    },
    {
      "epoch": 0.45968,
      "grad_norm": 0.11073781549930573,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 2873
    },
    {
      "epoch": 0.45984,
      "grad_norm": 0.12337949126958847,
      "learning_rate": 0.0001,
      "loss": 0.3468,
      "step": 2874
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.1298624873161316,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2875
    },
    {
      "epoch": 0.46016,
      "grad_norm": 0.10909812152385712,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 2876
    },
    {
      "epoch": 0.46032,
      "grad_norm": 0.142334446310997,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2877
    },
    {
      "epoch": 0.46048,
      "grad_norm": 0.10259514302015305,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2878
    },
    {
      "epoch": 0.46064,
      "grad_norm": 0.10495869815349579,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2879
    },
    {
      "epoch": 0.4608,
      "grad_norm": 0.10494238138198853,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2880
    },
    {
      "epoch": 0.46096,
      "grad_norm": 0.1690281629562378,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 2881
    },
    {
      "epoch": 0.46112,
      "grad_norm": 0.10660237073898315,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 2882
    },
    {
      "epoch": 0.46128,
      "grad_norm": 0.13206946849822998,
      "learning_rate": 0.0001,
      "loss": 0.3477,
      "step": 2883
    },
    {
      "epoch": 0.46144,
      "grad_norm": 0.12277550995349884,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 2884
    },
    {
      "epoch": 0.4616,
      "grad_norm": 0.10216709226369858,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2885
    },
    {
      "epoch": 0.46176,
      "grad_norm": 0.10532382130622864,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 2886
    },
    {
      "epoch": 0.46192,
      "grad_norm": 0.11018990725278854,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2887
    },
    {
      "epoch": 0.46208,
      "grad_norm": 0.09443113207817078,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 2888
    },
    {
      "epoch": 0.46224,
      "grad_norm": 0.10409852117300034,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 2889
    },
    {
      "epoch": 0.4624,
      "grad_norm": 0.0981329083442688,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 2890
    },
    {
      "epoch": 0.46256,
      "grad_norm": 0.15002180635929108,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 2891
    },
    {
      "epoch": 0.46272,
      "grad_norm": 0.11529695987701416,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2892
    },
    {
      "epoch": 0.46288,
      "grad_norm": 0.12182626128196716,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2893
    },
    {
      "epoch": 0.46304,
      "grad_norm": 0.12502676248550415,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 2894
    },
    {
      "epoch": 0.4632,
      "grad_norm": 0.10275061428546906,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2895
    },
    {
      "epoch": 0.46336,
      "grad_norm": 0.12555092573165894,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 2896
    },
    {
      "epoch": 0.46352,
      "grad_norm": 0.11660374701023102,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 2897
    },
    {
      "epoch": 0.46368,
      "grad_norm": 0.10830827057361603,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 2898
    },
    {
      "epoch": 0.46384,
      "grad_norm": 0.10722307860851288,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 2899
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.12020251154899597,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 2900
    },
    {
      "epoch": 0.464,
      "eval_train_accuracy": 0.9934,
      "eval_train_loss": 0.3258586823940277,
      "eval_train_runtime": 4.7199,
      "eval_train_samples_per_second": 1059.339,
      "eval_train_steps_per_second": 13.348,
      "step": 2900
    },
    {
      "epoch": 0.464,
      "eval_test_accuracy": 0.9936,
      "eval_test_loss": 0.32471734285354614,
      "eval_test_runtime": 4.4044,
      "eval_test_samples_per_second": 1135.225,
      "eval_test_steps_per_second": 14.304,
      "step": 2900
    },
    {
      "epoch": 0.46416,
      "grad_norm": 0.10512752830982208,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 2901
    },
    {
      "epoch": 0.46432,
      "grad_norm": 0.17547467350959778,
      "learning_rate": 0.0001,
      "loss": 0.3363,
      "step": 2902
    },
    {
      "epoch": 0.46448,
      "grad_norm": 0.13787275552749634,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 2903
    },
    {
      "epoch": 0.46464,
      "grad_norm": 0.15858794748783112,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 2904
    },
    {
      "epoch": 0.4648,
      "grad_norm": 0.10172603279352188,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 2905
    },
    {
      "epoch": 0.46496,
      "grad_norm": 0.1038406491279602,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 2906
    },
    {
      "epoch": 0.46512,
      "grad_norm": 0.2081514447927475,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 2907
    },
    {
      "epoch": 0.46528,
      "grad_norm": 0.16977442800998688,
      "learning_rate": 0.0001,
      "loss": 0.3434,
      "step": 2908
    },
    {
      "epoch": 0.46544,
      "grad_norm": 0.11809884756803513,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 2909
    },
    {
      "epoch": 0.4656,
      "grad_norm": 0.11667927354574203,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 2910
    },
    {
      "epoch": 0.46576,
      "grad_norm": 0.11963491141796112,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 2911
    },
    {
      "epoch": 0.46592,
      "grad_norm": 0.11152078956365585,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 2912
    },
    {
      "epoch": 0.46608,
      "grad_norm": 0.11420933902263641,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 2913
    },
    {
      "epoch": 0.46624,
      "grad_norm": 0.12749820947647095,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 2914
    },
    {
      "epoch": 0.4664,
      "grad_norm": 0.10780640691518784,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 2915
    },
    {
      "epoch": 0.46656,
      "grad_norm": 0.1805485486984253,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 2916
    },
    {
      "epoch": 0.46672,
      "grad_norm": 0.12110871821641922,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 2917
    },
    {
      "epoch": 0.46688,
      "grad_norm": 0.12421857565641403,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 2918
    },
    {
      "epoch": 0.46704,
      "grad_norm": 0.09756714105606079,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 2919
    },
    {
      "epoch": 0.4672,
      "grad_norm": 0.10461839288473129,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 2920
    },
    {
      "epoch": 0.46736,
      "grad_norm": 0.12159497290849686,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 2921
    },
    {
      "epoch": 0.46752,
      "grad_norm": 0.12475068122148514,
      "learning_rate": 0.0001,
      "loss": 0.3438,
      "step": 2922
    },
    {
      "epoch": 0.46768,
      "grad_norm": 0.11430410295724869,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 2923
    },
    {
      "epoch": 0.46784,
      "grad_norm": 0.10105516761541367,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 2924
    },
    {
      "epoch": 0.468,
      "grad_norm": 0.11517690122127533,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 2925
    },
    {
      "epoch": 0.46816,
      "grad_norm": 0.1009167730808258,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2926
    },
    {
      "epoch": 0.46832,
      "grad_norm": 0.12267827242612839,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 2927
    },
    {
      "epoch": 0.46848,
      "grad_norm": 0.11393678933382034,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 2928
    },
    {
      "epoch": 0.46864,
      "grad_norm": 0.10864526778459549,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 2929
    },
    {
      "epoch": 0.4688,
      "grad_norm": 0.11147677898406982,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 2930
    },
    {
      "epoch": 0.46896,
      "grad_norm": 0.11173892021179199,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 2931
    },
    {
      "epoch": 0.46912,
      "grad_norm": 0.13507284224033356,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 2932
    },
    {
      "epoch": 0.46928,
      "grad_norm": 0.11247255653142929,
      "learning_rate": 0.0001,
      "loss": 0.3397,
      "step": 2933
    },
    {
      "epoch": 0.46944,
      "grad_norm": 0.10144396126270294,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 2934
    },
    {
      "epoch": 0.4696,
      "grad_norm": 0.09896518290042877,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 2935
    },
    {
      "epoch": 0.46976,
      "grad_norm": 0.10478612780570984,
      "learning_rate": 0.0001,
      "loss": 0.3426,
      "step": 2936
    },
    {
      "epoch": 0.46992,
      "grad_norm": 0.11030488461256027,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 2937
    },
    {
      "epoch": 0.47008,
      "grad_norm": 0.11056061089038849,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 2938
    },
    {
      "epoch": 0.47024,
      "grad_norm": 0.09415362030267715,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 2939
    },
    {
      "epoch": 0.4704,
      "grad_norm": 0.09897328168153763,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 2940
    },
    {
      "epoch": 0.47056,
      "grad_norm": 0.09265165030956268,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 2941
    },
    {
      "epoch": 0.47072,
      "grad_norm": 0.09820123016834259,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 2942
    },
    {
      "epoch": 0.47088,
      "grad_norm": 0.10225988179445267,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 2943
    },
    {
      "epoch": 0.47104,
      "grad_norm": 0.1095835417509079,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 2944
    },
    {
      "epoch": 0.4712,
      "grad_norm": 0.08999381214380264,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 2945
    },
    {
      "epoch": 0.47136,
      "grad_norm": 0.09842503070831299,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2946
    },
    {
      "epoch": 0.47152,
      "grad_norm": 0.09666598588228226,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 2947
    },
    {
      "epoch": 0.47168,
      "grad_norm": 0.10192428529262543,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 2948
    },
    {
      "epoch": 0.47184,
      "grad_norm": 0.09795122593641281,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 2949
    },
    {
      "epoch": 0.472,
      "grad_norm": 0.11365120112895966,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2950
    },
    {
      "epoch": 0.47216,
      "grad_norm": 0.12572595477104187,
      "learning_rate": 0.0001,
      "loss": 0.3404,
      "step": 2951
    },
    {
      "epoch": 0.47232,
      "grad_norm": 0.1151641383767128,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 2952
    },
    {
      "epoch": 0.47248,
      "grad_norm": 0.10057542473077774,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 2953
    },
    {
      "epoch": 0.47264,
      "grad_norm": 0.10140236467123032,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 2954
    },
    {
      "epoch": 0.4728,
      "grad_norm": 0.09198654443025589,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 2955
    },
    {
      "epoch": 0.47296,
      "grad_norm": 0.0986914411187172,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 2956
    },
    {
      "epoch": 0.47312,
      "grad_norm": 0.10948500037193298,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 2957
    },
    {
      "epoch": 0.47328,
      "grad_norm": 0.10475271940231323,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 2958
    },
    {
      "epoch": 0.47344,
      "grad_norm": 0.11832843720912933,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2959
    },
    {
      "epoch": 0.4736,
      "grad_norm": 0.10286016017198563,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 2960
    },
    {
      "epoch": 0.47376,
      "grad_norm": 0.10648523271083832,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2961
    },
    {
      "epoch": 0.47392,
      "grad_norm": 0.1010262593626976,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2962
    },
    {
      "epoch": 0.47408,
      "grad_norm": 0.09536707401275635,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 2963
    },
    {
      "epoch": 0.47424,
      "grad_norm": 0.11048278212547302,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 2964
    },
    {
      "epoch": 0.4744,
      "grad_norm": 0.10038620233535767,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 2965
    },
    {
      "epoch": 0.47456,
      "grad_norm": 0.10936342179775238,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 2966
    },
    {
      "epoch": 0.47472,
      "grad_norm": 0.11923902481794357,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2967
    },
    {
      "epoch": 0.47488,
      "grad_norm": 0.13106107711791992,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 2968
    },
    {
      "epoch": 0.47504,
      "grad_norm": 0.09988278895616531,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 2969
    },
    {
      "epoch": 0.4752,
      "grad_norm": 0.135947585105896,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 2970
    },
    {
      "epoch": 0.47536,
      "grad_norm": 0.10764972865581512,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 2971
    },
    {
      "epoch": 0.47552,
      "grad_norm": 0.10200800001621246,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 2972
    },
    {
      "epoch": 0.47568,
      "grad_norm": 0.1510416865348816,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 2973
    },
    {
      "epoch": 0.47584,
      "grad_norm": 0.1416272670030594,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 2974
    },
    {
      "epoch": 0.476,
      "grad_norm": 0.09787111729383469,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 2975
    },
    {
      "epoch": 0.47616,
      "grad_norm": 0.11313717812299728,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 2976
    },
    {
      "epoch": 0.47632,
      "grad_norm": 0.09748981893062592,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 2977
    },
    {
      "epoch": 0.47648,
      "grad_norm": 0.13176752626895905,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 2978
    },
    {
      "epoch": 0.47664,
      "grad_norm": 0.11890634894371033,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 2979
    },
    {
      "epoch": 0.4768,
      "grad_norm": 0.11108672618865967,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 2980
    },
    {
      "epoch": 0.47696,
      "grad_norm": 0.10343647748231888,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 2981
    },
    {
      "epoch": 0.47712,
      "grad_norm": 0.13074500858783722,
      "learning_rate": 0.0001,
      "loss": 0.3448,
      "step": 2982
    },
    {
      "epoch": 0.47728,
      "grad_norm": 0.11938660591840744,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 2983
    },
    {
      "epoch": 0.47744,
      "grad_norm": 0.1406780630350113,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 2984
    },
    {
      "epoch": 0.4776,
      "grad_norm": 0.11793270707130432,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 2985
    },
    {
      "epoch": 0.47776,
      "grad_norm": 0.1000920757651329,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 2986
    },
    {
      "epoch": 0.47792,
      "grad_norm": 0.12740792334079742,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 2987
    },
    {
      "epoch": 0.47808,
      "grad_norm": 0.12633173167705536,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 2988
    },
    {
      "epoch": 0.47824,
      "grad_norm": 0.11313153803348541,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 2989
    },
    {
      "epoch": 0.4784,
      "grad_norm": 0.12474681437015533,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 2990
    },
    {
      "epoch": 0.47856,
      "grad_norm": 0.13579261302947998,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2991
    },
    {
      "epoch": 0.47872,
      "grad_norm": 0.12833403050899506,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 2992
    },
    {
      "epoch": 0.47888,
      "grad_norm": 0.10228610038757324,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 2993
    },
    {
      "epoch": 0.47904,
      "grad_norm": 0.09432937204837799,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 2994
    },
    {
      "epoch": 0.4792,
      "grad_norm": 0.10365726053714752,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 2995
    },
    {
      "epoch": 0.47936,
      "grad_norm": 0.11924637109041214,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 2996
    },
    {
      "epoch": 0.47952,
      "grad_norm": 0.11201045662164688,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 2997
    },
    {
      "epoch": 0.47968,
      "grad_norm": 0.09217539429664612,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 2998
    },
    {
      "epoch": 0.47984,
      "grad_norm": 0.1215168759226799,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 2999
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.1185780018568039,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3000
    },
    {
      "epoch": 0.48,
      "eval_train_accuracy": 0.9904,
      "eval_train_loss": 0.3247845768928528,
      "eval_train_runtime": 4.331,
      "eval_train_samples_per_second": 1154.461,
      "eval_train_steps_per_second": 14.546,
      "step": 3000
    },
    {
      "epoch": 0.48,
      "eval_test_accuracy": 0.9918,
      "eval_test_loss": 0.3233404755592346,
      "eval_test_runtime": 4.7822,
      "eval_test_samples_per_second": 1045.551,
      "eval_test_steps_per_second": 13.174,
      "step": 3000
    },
    {
      "epoch": 0.48016,
      "grad_norm": 0.11984997987747192,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3001
    },
    {
      "epoch": 0.48032,
      "grad_norm": 0.10148447751998901,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3002
    },
    {
      "epoch": 0.48048,
      "grad_norm": 0.10515467077493668,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3003
    },
    {
      "epoch": 0.48064,
      "grad_norm": 0.1681361198425293,
      "learning_rate": 0.0001,
      "loss": 0.3505,
      "step": 3004
    },
    {
      "epoch": 0.4808,
      "grad_norm": 0.11626634746789932,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 3005
    },
    {
      "epoch": 0.48096,
      "grad_norm": 0.10194158554077148,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3006
    },
    {
      "epoch": 0.48112,
      "grad_norm": 0.09901265054941177,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3007
    },
    {
      "epoch": 0.48128,
      "grad_norm": 0.11352667957544327,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 3008
    },
    {
      "epoch": 0.48144,
      "grad_norm": 0.09107254445552826,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3009
    },
    {
      "epoch": 0.4816,
      "grad_norm": 0.13752946257591248,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 3010
    },
    {
      "epoch": 0.48176,
      "grad_norm": 0.15578323602676392,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3011
    },
    {
      "epoch": 0.48192,
      "grad_norm": 0.13368825614452362,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 3012
    },
    {
      "epoch": 0.48208,
      "grad_norm": 0.10988260060548782,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 3013
    },
    {
      "epoch": 0.48224,
      "grad_norm": 0.11150844395160675,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 3014
    },
    {
      "epoch": 0.4824,
      "grad_norm": 0.12544135749340057,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3015
    },
    {
      "epoch": 0.48256,
      "grad_norm": 0.141653373837471,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3016
    },
    {
      "epoch": 0.48272,
      "grad_norm": 0.10749393701553345,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 3017
    },
    {
      "epoch": 0.48288,
      "grad_norm": 0.1229093074798584,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3018
    },
    {
      "epoch": 0.48304,
      "grad_norm": 0.12267342209815979,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 3019
    },
    {
      "epoch": 0.4832,
      "grad_norm": 0.10280812531709671,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 3020
    },
    {
      "epoch": 0.48336,
      "grad_norm": 0.12112385779619217,
      "learning_rate": 0.0001,
      "loss": 0.3393,
      "step": 3021
    },
    {
      "epoch": 0.48352,
      "grad_norm": 0.12268458306789398,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3022
    },
    {
      "epoch": 0.48368,
      "grad_norm": 0.12321396172046661,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3023
    },
    {
      "epoch": 0.48384,
      "grad_norm": 0.0975601077079773,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 3024
    },
    {
      "epoch": 0.484,
      "grad_norm": 0.11950130760669708,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 3025
    },
    {
      "epoch": 0.48416,
      "grad_norm": 0.11936654150485992,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 3026
    },
    {
      "epoch": 0.48432,
      "grad_norm": 0.11376287043094635,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3027
    },
    {
      "epoch": 0.48448,
      "grad_norm": 0.11437046527862549,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3028
    },
    {
      "epoch": 0.48464,
      "grad_norm": 0.11033447831869125,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3029
    },
    {
      "epoch": 0.4848,
      "grad_norm": 0.12156599760055542,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3030
    },
    {
      "epoch": 0.48496,
      "grad_norm": 0.09176189452409744,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3031
    },
    {
      "epoch": 0.48512,
      "grad_norm": 0.10051481425762177,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 3032
    },
    {
      "epoch": 0.48528,
      "grad_norm": 0.09480655938386917,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3033
    },
    {
      "epoch": 0.48544,
      "grad_norm": 0.10078898817300797,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 3034
    },
    {
      "epoch": 0.4856,
      "grad_norm": 0.09807208180427551,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3035
    },
    {
      "epoch": 0.48576,
      "grad_norm": 0.0964941754937172,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 3036
    },
    {
      "epoch": 0.48592,
      "grad_norm": 0.11067492514848709,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3037
    },
    {
      "epoch": 0.48608,
      "grad_norm": 0.11501959711313248,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 3038
    },
    {
      "epoch": 0.48624,
      "grad_norm": 0.12810003757476807,
      "learning_rate": 0.0001,
      "loss": 0.3428,
      "step": 3039
    },
    {
      "epoch": 0.4864,
      "grad_norm": 0.16819846630096436,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3040
    },
    {
      "epoch": 0.48656,
      "grad_norm": 0.1312192678451538,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3041
    },
    {
      "epoch": 0.48672,
      "grad_norm": 0.09339393675327301,
      "learning_rate": 0.0001,
      "loss": 0.3383,
      "step": 3042
    },
    {
      "epoch": 0.48688,
      "grad_norm": 0.10040643811225891,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3043
    },
    {
      "epoch": 0.48704,
      "grad_norm": 0.09807208180427551,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3044
    },
    {
      "epoch": 0.4872,
      "grad_norm": 0.17780889570713043,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3045
    },
    {
      "epoch": 0.48736,
      "grad_norm": 0.09910736232995987,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3046
    },
    {
      "epoch": 0.48752,
      "grad_norm": 0.10237430781126022,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 3047
    },
    {
      "epoch": 0.48768,
      "grad_norm": 0.10632187873125076,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 3048
    },
    {
      "epoch": 0.48784,
      "grad_norm": 0.12692949175834656,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3049
    },
    {
      "epoch": 0.488,
      "grad_norm": 0.10539087653160095,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 3050
    },
    {
      "epoch": 0.48816,
      "grad_norm": 0.12734144926071167,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3051
    },
    {
      "epoch": 0.48832,
      "grad_norm": 0.10677365958690643,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 3052
    },
    {
      "epoch": 0.48848,
      "grad_norm": 0.10032069683074951,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3053
    },
    {
      "epoch": 0.48864,
      "grad_norm": 0.10578638315200806,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3054
    },
    {
      "epoch": 0.4888,
      "grad_norm": 0.11247880756855011,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 3055
    },
    {
      "epoch": 0.48896,
      "grad_norm": 0.10877703130245209,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 3056
    },
    {
      "epoch": 0.48912,
      "grad_norm": 0.10804969072341919,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3057
    },
    {
      "epoch": 0.48928,
      "grad_norm": 0.12303707003593445,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3058
    },
    {
      "epoch": 0.48944,
      "grad_norm": 0.12514957785606384,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3059
    },
    {
      "epoch": 0.4896,
      "grad_norm": 0.11539047211408615,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 3060
    },
    {
      "epoch": 0.48976,
      "grad_norm": 0.11952805519104004,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3061
    },
    {
      "epoch": 0.48992,
      "grad_norm": 0.10496044158935547,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3062
    },
    {
      "epoch": 0.49008,
      "grad_norm": 0.12437232583761215,
      "learning_rate": 0.0001,
      "loss": 0.3389,
      "step": 3063
    },
    {
      "epoch": 0.49024,
      "grad_norm": 0.10441292077302933,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3064
    },
    {
      "epoch": 0.4904,
      "grad_norm": 0.10750622302293777,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3065
    },
    {
      "epoch": 0.49056,
      "grad_norm": 0.10326378792524338,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3066
    },
    {
      "epoch": 0.49072,
      "grad_norm": 0.10586339235305786,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3067
    },
    {
      "epoch": 0.49088,
      "grad_norm": 0.09957023710012436,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 3068
    },
    {
      "epoch": 0.49104,
      "grad_norm": 0.12022611498832703,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3069
    },
    {
      "epoch": 0.4912,
      "grad_norm": 0.1222718358039856,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3070
    },
    {
      "epoch": 0.49136,
      "grad_norm": 0.10445655882358551,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 3071
    },
    {
      "epoch": 0.49152,
      "grad_norm": 0.11549998819828033,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3072
    },
    {
      "epoch": 0.49168,
      "grad_norm": 0.11371771991252899,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3073
    },
    {
      "epoch": 0.49184,
      "grad_norm": 0.1728764921426773,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 3074
    },
    {
      "epoch": 0.492,
      "grad_norm": 0.10968206077814102,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3075
    },
    {
      "epoch": 0.49216,
      "grad_norm": 0.17802365124225616,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3076
    },
    {
      "epoch": 0.49232,
      "grad_norm": 0.10995830595493317,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3077
    },
    {
      "epoch": 0.49248,
      "grad_norm": 0.11452923715114594,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3078
    },
    {
      "epoch": 0.49264,
      "grad_norm": 0.11946189403533936,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3079
    },
    {
      "epoch": 0.4928,
      "grad_norm": 0.12151750177145004,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3080
    },
    {
      "epoch": 0.49296,
      "grad_norm": 0.10134196281433105,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3081
    },
    {
      "epoch": 0.49312,
      "grad_norm": 0.11749497801065445,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3082
    },
    {
      "epoch": 0.49328,
      "grad_norm": 0.10882803052663803,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3083
    },
    {
      "epoch": 0.49344,
      "grad_norm": 0.14836542308330536,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 3084
    },
    {
      "epoch": 0.4936,
      "grad_norm": 0.09979504346847534,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3085
    },
    {
      "epoch": 0.49376,
      "grad_norm": 0.1287863850593567,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3086
    },
    {
      "epoch": 0.49392,
      "grad_norm": 0.1366826593875885,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3087
    },
    {
      "epoch": 0.49408,
      "grad_norm": 0.11411821097135544,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3088
    },
    {
      "epoch": 0.49424,
      "grad_norm": 0.11634742468595505,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3089
    },
    {
      "epoch": 0.4944,
      "grad_norm": 0.10281180590391159,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3090
    },
    {
      "epoch": 0.49456,
      "grad_norm": 0.10338713228702545,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3091
    },
    {
      "epoch": 0.49472,
      "grad_norm": 0.11235467344522476,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 3092
    },
    {
      "epoch": 0.49488,
      "grad_norm": 0.13524477183818817,
      "learning_rate": 0.0001,
      "loss": 0.3452,
      "step": 3093
    },
    {
      "epoch": 0.49504,
      "grad_norm": 0.10032115876674652,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3094
    },
    {
      "epoch": 0.4952,
      "grad_norm": 0.16122780740261078,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3095
    },
    {
      "epoch": 0.49536,
      "grad_norm": 0.11415902525186539,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3096
    },
    {
      "epoch": 0.49552,
      "grad_norm": 0.15161915123462677,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3097
    },
    {
      "epoch": 0.49568,
      "grad_norm": 0.11121430993080139,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3098
    },
    {
      "epoch": 0.49584,
      "grad_norm": 0.1365005522966385,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3099
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.10481560975313187,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 3100
    },
    {
      "epoch": 0.496,
      "eval_train_accuracy": 0.9906,
      "eval_train_loss": 0.3236474096775055,
      "eval_train_runtime": 4.423,
      "eval_train_samples_per_second": 1130.465,
      "eval_train_steps_per_second": 14.244,
      "step": 3100
    },
    {
      "epoch": 0.496,
      "eval_test_accuracy": 0.9922,
      "eval_test_loss": 0.3221598267555237,
      "eval_test_runtime": 4.6047,
      "eval_test_samples_per_second": 1085.851,
      "eval_test_steps_per_second": 13.682,
      "step": 3100
    },
    {
      "epoch": 0.49616,
      "grad_norm": 0.1110154464840889,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3101
    },
    {
      "epoch": 0.49632,
      "grad_norm": 0.11103428900241852,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3102
    },
    {
      "epoch": 0.49648,
      "grad_norm": 0.12441744655370712,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3103
    },
    {
      "epoch": 0.49664,
      "grad_norm": 0.10740256309509277,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3104
    },
    {
      "epoch": 0.4968,
      "grad_norm": 0.11208593845367432,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3105
    },
    {
      "epoch": 0.49696,
      "grad_norm": 0.09783950448036194,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3106
    },
    {
      "epoch": 0.49712,
      "grad_norm": 0.24243055284023285,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 3107
    },
    {
      "epoch": 0.49728,
      "grad_norm": 0.15465310215950012,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3108
    },
    {
      "epoch": 0.49744,
      "grad_norm": 0.11780447512865067,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3109
    },
    {
      "epoch": 0.4976,
      "grad_norm": 0.1813289374113083,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3110
    },
    {
      "epoch": 0.49776,
      "grad_norm": 0.1314001828432083,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3111
    },
    {
      "epoch": 0.49792,
      "grad_norm": 0.1104607954621315,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3112
    },
    {
      "epoch": 0.49808,
      "grad_norm": 0.14035724103450775,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3113
    },
    {
      "epoch": 0.49824,
      "grad_norm": 0.19067911803722382,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3114
    },
    {
      "epoch": 0.4984,
      "grad_norm": 0.13471512496471405,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 3115
    },
    {
      "epoch": 0.49856,
      "grad_norm": 0.10965317487716675,
      "learning_rate": 0.0001,
      "loss": 0.3466,
      "step": 3116
    },
    {
      "epoch": 0.49872,
      "grad_norm": 0.20455245673656464,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 3117
    },
    {
      "epoch": 0.49888,
      "grad_norm": 0.136977419257164,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 3118
    },
    {
      "epoch": 0.49904,
      "grad_norm": 0.10759945958852768,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3119
    },
    {
      "epoch": 0.4992,
      "grad_norm": 0.1553785502910614,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 3120
    },
    {
      "epoch": 0.49936,
      "grad_norm": 0.2007591724395752,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3121
    },
    {
      "epoch": 0.49952,
      "grad_norm": 0.11109253019094467,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 3122
    },
    {
      "epoch": 0.49968,
      "grad_norm": 0.14599855244159698,
      "learning_rate": 0.0001,
      "loss": 0.3455,
      "step": 3123
    },
    {
      "epoch": 0.49984,
      "grad_norm": 0.1274535059928894,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 3124
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.12026014178991318,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 3125
    },
    {
      "epoch": 0.50016,
      "grad_norm": 0.11247417330741882,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3126
    },
    {
      "epoch": 0.50032,
      "grad_norm": 0.1201564148068428,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 3127
    },
    {
      "epoch": 0.50048,
      "grad_norm": 0.11665637791156769,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3128
    },
    {
      "epoch": 0.50064,
      "grad_norm": 0.1686500459909439,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 3129
    },
    {
      "epoch": 0.5008,
      "grad_norm": 0.13172343373298645,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3130
    },
    {
      "epoch": 0.50096,
      "grad_norm": 0.09914746135473251,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3131
    },
    {
      "epoch": 0.50112,
      "grad_norm": 0.11700408905744553,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3132
    },
    {
      "epoch": 0.50128,
      "grad_norm": 0.11912897974252701,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3133
    },
    {
      "epoch": 0.50144,
      "grad_norm": 0.11557406932115555,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 3134
    },
    {
      "epoch": 0.5016,
      "grad_norm": 0.10563184320926666,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "step": 3135
    },
    {
      "epoch": 0.50176,
      "grad_norm": 0.10984131693840027,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3136
    },
    {
      "epoch": 0.50192,
      "grad_norm": 0.10913202911615372,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 3137
    },
    {
      "epoch": 0.50208,
      "grad_norm": 0.10946062952280045,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3138
    },
    {
      "epoch": 0.50224,
      "grad_norm": 0.10867252945899963,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3139
    },
    {
      "epoch": 0.5024,
      "grad_norm": 0.11774078011512756,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3140
    },
    {
      "epoch": 0.50256,
      "grad_norm": 0.11929820477962494,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3141
    },
    {
      "epoch": 0.50272,
      "grad_norm": 0.146384134888649,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3142
    },
    {
      "epoch": 0.50288,
      "grad_norm": 0.13991127908229828,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3143
    },
    {
      "epoch": 0.50304,
      "grad_norm": 0.11006318032741547,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3144
    },
    {
      "epoch": 0.5032,
      "grad_norm": 0.13397061824798584,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 3145
    },
    {
      "epoch": 0.50336,
      "grad_norm": 0.11274923384189606,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 3146
    },
    {
      "epoch": 0.50352,
      "grad_norm": 0.10543930530548096,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3147
    },
    {
      "epoch": 0.50368,
      "grad_norm": 0.12787668406963348,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3148
    },
    {
      "epoch": 0.50384,
      "grad_norm": 0.11203228682279587,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3149
    },
    {
      "epoch": 0.504,
      "grad_norm": 0.12304031103849411,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3150
    },
    {
      "epoch": 0.50416,
      "grad_norm": 0.10505741089582443,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 3151
    },
    {
      "epoch": 0.50432,
      "grad_norm": 0.10005555301904678,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3152
    },
    {
      "epoch": 0.50448,
      "grad_norm": 0.22113002836704254,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3153
    },
    {
      "epoch": 0.50464,
      "grad_norm": 0.13937707245349884,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3154
    },
    {
      "epoch": 0.5048,
      "grad_norm": 0.12559035420417786,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3155
    },
    {
      "epoch": 0.50496,
      "grad_norm": 0.17396068572998047,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3156
    },
    {
      "epoch": 0.50512,
      "grad_norm": 0.13842901587486267,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3157
    },
    {
      "epoch": 0.50528,
      "grad_norm": 0.1305771917104721,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3158
    },
    {
      "epoch": 0.50544,
      "grad_norm": 0.12125656008720398,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3159
    },
    {
      "epoch": 0.5056,
      "grad_norm": 0.1249857097864151,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3160
    },
    {
      "epoch": 0.50576,
      "grad_norm": 0.14862117171287537,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3161
    },
    {
      "epoch": 0.50592,
      "grad_norm": 0.15419672429561615,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 3162
    },
    {
      "epoch": 0.50608,
      "grad_norm": 0.11070364713668823,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 3163
    },
    {
      "epoch": 0.50624,
      "grad_norm": 0.11475678533315659,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 3164
    },
    {
      "epoch": 0.5064,
      "grad_norm": 0.17176194489002228,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3165
    },
    {
      "epoch": 0.50656,
      "grad_norm": 0.11478105187416077,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3166
    },
    {
      "epoch": 0.50672,
      "grad_norm": 0.14290665090084076,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 3167
    },
    {
      "epoch": 0.50688,
      "grad_norm": 0.12369609624147415,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3168
    },
    {
      "epoch": 0.50704,
      "grad_norm": 0.17147782444953918,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3169
    },
    {
      "epoch": 0.5072,
      "grad_norm": 0.11520551145076752,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 3170
    },
    {
      "epoch": 0.50736,
      "grad_norm": 0.10517513751983643,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3171
    },
    {
      "epoch": 0.50752,
      "grad_norm": 0.12366634607315063,
      "learning_rate": 0.0001,
      "loss": 0.3043,
      "step": 3172
    },
    {
      "epoch": 0.50768,
      "grad_norm": 0.119888536632061,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3173
    },
    {
      "epoch": 0.50784,
      "grad_norm": 0.1431901901960373,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3174
    },
    {
      "epoch": 0.508,
      "grad_norm": 0.16533872485160828,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3175
    },
    {
      "epoch": 0.50816,
      "grad_norm": 0.1250460147857666,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3176
    },
    {
      "epoch": 0.50832,
      "grad_norm": 0.11071528494358063,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3177
    },
    {
      "epoch": 0.50848,
      "grad_norm": 0.14003732800483704,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3178
    },
    {
      "epoch": 0.50864,
      "grad_norm": 0.13919426500797272,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3179
    },
    {
      "epoch": 0.5088,
      "grad_norm": 0.11073769629001617,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3180
    },
    {
      "epoch": 0.50896,
      "grad_norm": 0.15473563969135284,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3181
    },
    {
      "epoch": 0.50912,
      "grad_norm": 0.11767056584358215,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3182
    },
    {
      "epoch": 0.50928,
      "grad_norm": 0.12409806996583939,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3183
    },
    {
      "epoch": 0.50944,
      "grad_norm": 0.1312621384859085,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3184
    },
    {
      "epoch": 0.5096,
      "grad_norm": 0.11278726905584335,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3185
    },
    {
      "epoch": 0.50976,
      "grad_norm": 0.11237869411706924,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3186
    },
    {
      "epoch": 0.50992,
      "grad_norm": 0.11404655128717422,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3187
    },
    {
      "epoch": 0.51008,
      "grad_norm": 0.12647372484207153,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3188
    },
    {
      "epoch": 0.51024,
      "grad_norm": 0.13250403106212616,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3189
    },
    {
      "epoch": 0.5104,
      "grad_norm": 0.11072143167257309,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 3190
    },
    {
      "epoch": 0.51056,
      "grad_norm": 0.10186366736888885,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3191
    },
    {
      "epoch": 0.51072,
      "grad_norm": 0.10381663590669632,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 3192
    },
    {
      "epoch": 0.51088,
      "grad_norm": 0.13456645607948303,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3193
    },
    {
      "epoch": 0.51104,
      "grad_norm": 0.10852105915546417,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3194
    },
    {
      "epoch": 0.5112,
      "grad_norm": 0.14053010940551758,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 3195
    },
    {
      "epoch": 0.51136,
      "grad_norm": 0.1205325499176979,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3196
    },
    {
      "epoch": 0.51152,
      "grad_norm": 0.10577300935983658,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3197
    },
    {
      "epoch": 0.51168,
      "grad_norm": 0.17558400332927704,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3198
    },
    {
      "epoch": 0.51184,
      "grad_norm": 0.12131737172603607,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3199
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.11214633285999298,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3200
    },
    {
      "epoch": 0.512,
      "eval_train_accuracy": 0.994,
      "eval_train_loss": 0.3225138187408447,
      "eval_train_runtime": 4.4089,
      "eval_train_samples_per_second": 1134.062,
      "eval_train_steps_per_second": 14.289,
      "step": 3200
    },
    {
      "epoch": 0.512,
      "eval_test_accuracy": 0.9958,
      "eval_test_loss": 0.32116934657096863,
      "eval_test_runtime": 4.4896,
      "eval_test_samples_per_second": 1113.692,
      "eval_test_steps_per_second": 14.033,
      "step": 3200
    },
    {
      "epoch": 0.51216,
      "grad_norm": 0.11199034005403519,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 3201
    },
    {
      "epoch": 0.51232,
      "grad_norm": 0.1868835836648941,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3202
    },
    {
      "epoch": 0.51248,
      "grad_norm": 0.10842429101467133,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3203
    },
    {
      "epoch": 0.51264,
      "grad_norm": 0.10941307246685028,
      "learning_rate": 0.0001,
      "loss": 0.3447,
      "step": 3204
    },
    {
      "epoch": 0.5128,
      "grad_norm": 0.0989757850766182,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 3205
    },
    {
      "epoch": 0.51296,
      "grad_norm": 0.09982535988092422,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 3206
    },
    {
      "epoch": 0.51312,
      "grad_norm": 0.16551624238491058,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3207
    },
    {
      "epoch": 0.51328,
      "grad_norm": 0.1207759901881218,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3208
    },
    {
      "epoch": 0.51344,
      "grad_norm": 0.11473560333251953,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3209
    },
    {
      "epoch": 0.5136,
      "grad_norm": 0.1048588901758194,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3210
    },
    {
      "epoch": 0.51376,
      "grad_norm": 0.10193534195423126,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3211
    },
    {
      "epoch": 0.51392,
      "grad_norm": 0.12313657253980637,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3212
    },
    {
      "epoch": 0.51408,
      "grad_norm": 0.11201439052820206,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3213
    },
    {
      "epoch": 0.51424,
      "grad_norm": 0.1170986071228981,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3214
    },
    {
      "epoch": 0.5144,
      "grad_norm": 0.16397710144519806,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 3215
    },
    {
      "epoch": 0.51456,
      "grad_norm": 0.13618981838226318,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3216
    },
    {
      "epoch": 0.51472,
      "grad_norm": 0.11263848841190338,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3217
    },
    {
      "epoch": 0.51488,
      "grad_norm": 0.10228285938501358,
      "learning_rate": 0.0001,
      "loss": 0.302,
      "step": 3218
    },
    {
      "epoch": 0.51504,
      "grad_norm": 0.12311182171106339,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3219
    },
    {
      "epoch": 0.5152,
      "grad_norm": 0.09498414397239685,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3220
    },
    {
      "epoch": 0.51536,
      "grad_norm": 0.1269213706254959,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3221
    },
    {
      "epoch": 0.51552,
      "grad_norm": 0.12305398285388947,
      "learning_rate": 0.0001,
      "loss": 0.3385,
      "step": 3222
    },
    {
      "epoch": 0.51568,
      "grad_norm": 0.12431788444519043,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3223
    },
    {
      "epoch": 0.51584,
      "grad_norm": 0.12582017481327057,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 3224
    },
    {
      "epoch": 0.516,
      "grad_norm": 0.2068127989768982,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 3225
    },
    {
      "epoch": 0.51616,
      "grad_norm": 0.1388305127620697,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 3226
    },
    {
      "epoch": 0.51632,
      "grad_norm": 0.09826044738292694,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3227
    },
    {
      "epoch": 0.51648,
      "grad_norm": 0.12039658427238464,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3228
    },
    {
      "epoch": 0.51664,
      "grad_norm": 0.20625905692577362,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3229
    },
    {
      "epoch": 0.5168,
      "grad_norm": 0.15018807351589203,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3230
    },
    {
      "epoch": 0.51696,
      "grad_norm": 0.12307478487491608,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3231
    },
    {
      "epoch": 0.51712,
      "grad_norm": 0.10662898421287537,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3232
    },
    {
      "epoch": 0.51728,
      "grad_norm": 0.13051630556583405,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3233
    },
    {
      "epoch": 0.51744,
      "grad_norm": 0.1333032250404358,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3234
    },
    {
      "epoch": 0.5176,
      "grad_norm": 0.12540683150291443,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 3235
    },
    {
      "epoch": 0.51776,
      "grad_norm": 0.12272196263074875,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3236
    },
    {
      "epoch": 0.51792,
      "grad_norm": 0.13727687299251556,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3237
    },
    {
      "epoch": 0.51808,
      "grad_norm": 0.10102945566177368,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3238
    },
    {
      "epoch": 0.51824,
      "grad_norm": 0.1282949447631836,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 3239
    },
    {
      "epoch": 0.5184,
      "grad_norm": 0.12033303081989288,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3240
    },
    {
      "epoch": 0.51856,
      "grad_norm": 0.11941943317651749,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 3241
    },
    {
      "epoch": 0.51872,
      "grad_norm": 0.14568181335926056,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 3242
    },
    {
      "epoch": 0.51888,
      "grad_norm": 0.12722985446453094,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3243
    },
    {
      "epoch": 0.51904,
      "grad_norm": 0.10358405858278275,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3244
    },
    {
      "epoch": 0.5192,
      "grad_norm": 0.12486962229013443,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3245
    },
    {
      "epoch": 0.51936,
      "grad_norm": 0.102357417345047,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3246
    },
    {
      "epoch": 0.51952,
      "grad_norm": 0.1271105408668518,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3247
    },
    {
      "epoch": 0.51968,
      "grad_norm": 0.12994766235351562,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3248
    },
    {
      "epoch": 0.51984,
      "grad_norm": 0.09917158633470535,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 3249
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.12288065254688263,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3250
    },
    {
      "epoch": 0.52016,
      "grad_norm": 0.11773868650197983,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3251
    },
    {
      "epoch": 0.52032,
      "grad_norm": 0.10453998297452927,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 3252
    },
    {
      "epoch": 0.52048,
      "grad_norm": 0.15823768079280853,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 3253
    },
    {
      "epoch": 0.52064,
      "grad_norm": 0.13272641599178314,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 3254
    },
    {
      "epoch": 0.5208,
      "grad_norm": 0.09304653108119965,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3255
    },
    {
      "epoch": 0.52096,
      "grad_norm": 0.13392630219459534,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 3256
    },
    {
      "epoch": 0.52112,
      "grad_norm": 0.12527069449424744,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3257
    },
    {
      "epoch": 0.52128,
      "grad_norm": 0.09635692834854126,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3258
    },
    {
      "epoch": 0.52144,
      "grad_norm": 0.13204903900623322,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3259
    },
    {
      "epoch": 0.5216,
      "grad_norm": 0.17519818246364594,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3260
    },
    {
      "epoch": 0.52176,
      "grad_norm": 0.12932512164115906,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 3261
    },
    {
      "epoch": 0.52192,
      "grad_norm": 0.17597681283950806,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3262
    },
    {
      "epoch": 0.52208,
      "grad_norm": 0.21381133794784546,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 3263
    },
    {
      "epoch": 0.52224,
      "grad_norm": 0.10424633324146271,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3264
    },
    {
      "epoch": 0.5224,
      "grad_norm": 0.6045640110969543,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 3265
    },
    {
      "epoch": 0.52256,
      "grad_norm": 0.2958890497684479,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3266
    },
    {
      "epoch": 0.52272,
      "grad_norm": 0.5100464224815369,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 3267
    },
    {
      "epoch": 0.52288,
      "grad_norm": 0.22533248364925385,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3268
    },
    {
      "epoch": 0.52304,
      "grad_norm": 0.48249197006225586,
      "learning_rate": 0.0001,
      "loss": 0.3554,
      "step": 3269
    },
    {
      "epoch": 0.5232,
      "grad_norm": 0.134427011013031,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3270
    },
    {
      "epoch": 0.52336,
      "grad_norm": 0.45646175742149353,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3271
    },
    {
      "epoch": 0.52352,
      "grad_norm": 0.24038000404834747,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3272
    },
    {
      "epoch": 0.52368,
      "grad_norm": 0.16002234816551208,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3273
    },
    {
      "epoch": 0.52384,
      "grad_norm": 0.2317427396774292,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 3274
    },
    {
      "epoch": 0.524,
      "grad_norm": 0.2164924591779709,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3275
    },
    {
      "epoch": 0.52416,
      "grad_norm": 0.12898197770118713,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3276
    },
    {
      "epoch": 0.52432,
      "grad_norm": 0.28056180477142334,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3277
    },
    {
      "epoch": 0.52448,
      "grad_norm": 0.19778725504875183,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3278
    },
    {
      "epoch": 0.52464,
      "grad_norm": 0.13615964353084564,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3279
    },
    {
      "epoch": 0.5248,
      "grad_norm": 0.12175090610980988,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 3280
    },
    {
      "epoch": 0.52496,
      "grad_norm": 0.12798629701137543,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 3281
    },
    {
      "epoch": 0.52512,
      "grad_norm": 0.11381132155656815,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3282
    },
    {
      "epoch": 0.52528,
      "grad_norm": 0.1666938066482544,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3283
    },
    {
      "epoch": 0.52544,
      "grad_norm": 0.12140753865242004,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3284
    },
    {
      "epoch": 0.5256,
      "grad_norm": 0.24723999202251434,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3285
    },
    {
      "epoch": 0.52576,
      "grad_norm": 0.10530443489551544,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3286
    },
    {
      "epoch": 0.52592,
      "grad_norm": 0.10948272049427032,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3287
    },
    {
      "epoch": 0.52608,
      "grad_norm": 0.12683501839637756,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3288
    },
    {
      "epoch": 0.52624,
      "grad_norm": 0.14592725038528442,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 3289
    },
    {
      "epoch": 0.5264,
      "grad_norm": 0.11512651294469833,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 3290
    },
    {
      "epoch": 0.52656,
      "grad_norm": 0.14012886583805084,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 3291
    },
    {
      "epoch": 0.52672,
      "grad_norm": 0.10911144316196442,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3292
    },
    {
      "epoch": 0.52688,
      "grad_norm": 0.10628100484609604,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3293
    },
    {
      "epoch": 0.52704,
      "grad_norm": 0.15094350278377533,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3294
    },
    {
      "epoch": 0.5272,
      "grad_norm": 0.14332866668701172,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3295
    },
    {
      "epoch": 0.52736,
      "grad_norm": 0.12169528752565384,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3296
    },
    {
      "epoch": 0.52752,
      "grad_norm": 0.1254601925611496,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 3297
    },
    {
      "epoch": 0.52768,
      "grad_norm": 0.1311156302690506,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 3298
    },
    {
      "epoch": 0.52784,
      "grad_norm": 0.11353980004787445,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3299
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.13279975950717926,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3300
    },
    {
      "epoch": 0.528,
      "eval_train_accuracy": 0.9942,
      "eval_train_loss": 0.32184916734695435,
      "eval_train_runtime": 4.4414,
      "eval_train_samples_per_second": 1125.781,
      "eval_train_steps_per_second": 14.185,
      "step": 3300
    },
    {
      "epoch": 0.528,
      "eval_test_accuracy": 0.9954,
      "eval_test_loss": 0.32059037685394287,
      "eval_test_runtime": 4.4916,
      "eval_test_samples_per_second": 1113.191,
      "eval_test_steps_per_second": 14.026,
      "step": 3300
    },
    {
      "epoch": 0.52816,
      "grad_norm": 0.09983499348163605,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3301
    },
    {
      "epoch": 0.52832,
      "grad_norm": 0.10402382165193558,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 3302
    },
    {
      "epoch": 0.52848,
      "grad_norm": 0.13372911512851715,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3303
    },
    {
      "epoch": 0.52864,
      "grad_norm": 0.1328478902578354,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 3304
    },
    {
      "epoch": 0.5288,
      "grad_norm": 0.14863257110118866,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3305
    },
    {
      "epoch": 0.52896,
      "grad_norm": 0.12388218939304352,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3306
    },
    {
      "epoch": 0.52912,
      "grad_norm": 0.15645048022270203,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 3307
    },
    {
      "epoch": 0.52928,
      "grad_norm": 0.14168162643909454,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 3308
    },
    {
      "epoch": 0.52944,
      "grad_norm": 0.16197094321250916,
      "learning_rate": 0.0001,
      "loss": 0.3381,
      "step": 3309
    },
    {
      "epoch": 0.5296,
      "grad_norm": 0.14302149415016174,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3310
    },
    {
      "epoch": 0.52976,
      "grad_norm": 0.10332335531711578,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3311
    },
    {
      "epoch": 0.52992,
      "grad_norm": 0.13262157142162323,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 3312
    },
    {
      "epoch": 0.53008,
      "grad_norm": 0.10510759055614471,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3313
    },
    {
      "epoch": 0.53024,
      "grad_norm": 0.1523006558418274,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3314
    },
    {
      "epoch": 0.5304,
      "grad_norm": 0.11708078533411026,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 3315
    },
    {
      "epoch": 0.53056,
      "grad_norm": 0.10304312407970428,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 3316
    },
    {
      "epoch": 0.53072,
      "grad_norm": 0.4431145489215851,
      "learning_rate": 0.0001,
      "loss": 0.3353,
      "step": 3317
    },
    {
      "epoch": 0.53088,
      "grad_norm": 0.13339032232761383,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3318
    },
    {
      "epoch": 0.53104,
      "grad_norm": 0.13372278213500977,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3319
    },
    {
      "epoch": 0.5312,
      "grad_norm": 0.16231390833854675,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3320
    },
    {
      "epoch": 0.53136,
      "grad_norm": 0.11061186343431473,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3321
    },
    {
      "epoch": 0.53152,
      "grad_norm": 0.13839533925056458,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3322
    },
    {
      "epoch": 0.53168,
      "grad_norm": 0.1543205827474594,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3323
    },
    {
      "epoch": 0.53184,
      "grad_norm": 0.10236640274524689,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3324
    },
    {
      "epoch": 0.532,
      "grad_norm": 0.11965212970972061,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3325
    },
    {
      "epoch": 0.53216,
      "grad_norm": 0.13518941402435303,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3326
    },
    {
      "epoch": 0.53232,
      "grad_norm": 0.09646806120872498,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3327
    },
    {
      "epoch": 0.53248,
      "grad_norm": 0.1224742978811264,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3328
    },
    {
      "epoch": 0.53264,
      "grad_norm": 0.11769872158765793,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 3329
    },
    {
      "epoch": 0.5328,
      "grad_norm": 0.12027814239263535,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3330
    },
    {
      "epoch": 0.53296,
      "grad_norm": 0.14680609107017517,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3331
    },
    {
      "epoch": 0.53312,
      "grad_norm": 0.31621357798576355,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3332
    },
    {
      "epoch": 0.53328,
      "grad_norm": 0.10265908390283585,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3333
    },
    {
      "epoch": 0.53344,
      "grad_norm": 0.1070021241903305,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3334
    },
    {
      "epoch": 0.5336,
      "grad_norm": 0.1195620521903038,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 3335
    },
    {
      "epoch": 0.53376,
      "grad_norm": 0.11848396807909012,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3336
    },
    {
      "epoch": 0.53392,
      "grad_norm": 0.11077278107404709,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3337
    },
    {
      "epoch": 0.53408,
      "grad_norm": 0.2569296956062317,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 3338
    },
    {
      "epoch": 0.53424,
      "grad_norm": 0.10901834070682526,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3339
    },
    {
      "epoch": 0.5344,
      "grad_norm": 0.10312525182962418,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 3340
    },
    {
      "epoch": 0.53456,
      "grad_norm": 0.1376200020313263,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3341
    },
    {
      "epoch": 0.53472,
      "grad_norm": 0.10371465235948563,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 3342
    },
    {
      "epoch": 0.53488,
      "grad_norm": 0.12606726586818695,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3343
    },
    {
      "epoch": 0.53504,
      "grad_norm": 0.13250412046909332,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 3344
    },
    {
      "epoch": 0.5352,
      "grad_norm": 0.27276140451431274,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3345
    },
    {
      "epoch": 0.53536,
      "grad_norm": 0.10606639832258224,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 3346
    },
    {
      "epoch": 0.53552,
      "grad_norm": 0.12536239624023438,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 3347
    },
    {
      "epoch": 0.53568,
      "grad_norm": 0.17604874074459076,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3348
    },
    {
      "epoch": 0.53584,
      "grad_norm": 0.16277769207954407,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3349
    },
    {
      "epoch": 0.536,
      "grad_norm": 0.15112829208374023,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3350
    },
    {
      "epoch": 0.53616,
      "grad_norm": 0.15553785860538483,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3351
    },
    {
      "epoch": 0.53632,
      "grad_norm": 0.11703945696353912,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3352
    },
    {
      "epoch": 0.53648,
      "grad_norm": 0.10253722220659256,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3353
    },
    {
      "epoch": 0.53664,
      "grad_norm": 0.14066976308822632,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3354
    },
    {
      "epoch": 0.5368,
      "grad_norm": 0.12780222296714783,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3355
    },
    {
      "epoch": 0.53696,
      "grad_norm": 0.1160770058631897,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3356
    },
    {
      "epoch": 0.53712,
      "grad_norm": 0.12319086492061615,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3357
    },
    {
      "epoch": 0.53728,
      "grad_norm": 0.11031685024499893,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3358
    },
    {
      "epoch": 0.53744,
      "grad_norm": 0.29807838797569275,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 3359
    },
    {
      "epoch": 0.5376,
      "grad_norm": 0.11838623881340027,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 3360
    },
    {
      "epoch": 0.53776,
      "grad_norm": 0.10374359041452408,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 3361
    },
    {
      "epoch": 0.53792,
      "grad_norm": 0.10938864201307297,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3362
    },
    {
      "epoch": 0.53808,
      "grad_norm": 0.13538677990436554,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 3363
    },
    {
      "epoch": 0.53824,
      "grad_norm": 0.14071284234523773,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3364
    },
    {
      "epoch": 0.5384,
      "grad_norm": 0.1231134682893753,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 3365
    },
    {
      "epoch": 0.53856,
      "grad_norm": 0.1147746592760086,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 3366
    },
    {
      "epoch": 0.53872,
      "grad_norm": 0.11367620527744293,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3367
    },
    {
      "epoch": 0.53888,
      "grad_norm": 0.10024218261241913,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3368
    },
    {
      "epoch": 0.53904,
      "grad_norm": 0.11934606730937958,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 3369
    },
    {
      "epoch": 0.5392,
      "grad_norm": 0.10305175930261612,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3370
    },
    {
      "epoch": 0.53936,
      "grad_norm": 0.11116115003824234,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3371
    },
    {
      "epoch": 0.53952,
      "grad_norm": 0.11924327164888382,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 3372
    },
    {
      "epoch": 0.53968,
      "grad_norm": 0.10125439614057541,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3373
    },
    {
      "epoch": 0.53984,
      "grad_norm": 0.11567377299070358,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3374
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.11080124974250793,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3375
    },
    {
      "epoch": 0.54016,
      "grad_norm": 0.10938297212123871,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3376
    },
    {
      "epoch": 0.54032,
      "grad_norm": 0.124397873878479,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3377
    },
    {
      "epoch": 0.54048,
      "grad_norm": 0.09462939202785492,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3378
    },
    {
      "epoch": 0.54064,
      "grad_norm": 0.1256120353937149,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3379
    },
    {
      "epoch": 0.5408,
      "grad_norm": 0.10603483766317368,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3380
    },
    {
      "epoch": 0.54096,
      "grad_norm": 0.11058928817510605,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3381
    },
    {
      "epoch": 0.54112,
      "grad_norm": 0.11513842642307281,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3382
    },
    {
      "epoch": 0.54128,
      "grad_norm": 0.10238088667392731,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3383
    },
    {
      "epoch": 0.54144,
      "grad_norm": 0.1000451147556305,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 3384
    },
    {
      "epoch": 0.5416,
      "grad_norm": 0.12510548532009125,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3385
    },
    {
      "epoch": 0.54176,
      "grad_norm": 0.11941451579332352,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 3386
    },
    {
      "epoch": 0.54192,
      "grad_norm": 0.09603124856948853,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3387
    },
    {
      "epoch": 0.54208,
      "grad_norm": 0.1697254180908203,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 3388
    },
    {
      "epoch": 0.54224,
      "grad_norm": 0.1074945256114006,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3389
    },
    {
      "epoch": 0.5424,
      "grad_norm": 0.12778501212596893,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 3390
    },
    {
      "epoch": 0.54256,
      "grad_norm": 0.10294251143932343,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3391
    },
    {
      "epoch": 0.54272,
      "grad_norm": 0.11290641874074936,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3392
    },
    {
      "epoch": 0.54288,
      "grad_norm": 0.12425239384174347,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3393
    },
    {
      "epoch": 0.54304,
      "grad_norm": 0.1712619811296463,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3394
    },
    {
      "epoch": 0.5432,
      "grad_norm": 0.11145386099815369,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 3395
    },
    {
      "epoch": 0.54336,
      "grad_norm": 0.10625237226486206,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3396
    },
    {
      "epoch": 0.54352,
      "grad_norm": 0.0997222512960434,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3397
    },
    {
      "epoch": 0.54368,
      "grad_norm": 0.13126929104328156,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3398
    },
    {
      "epoch": 0.54384,
      "grad_norm": 0.13351815938949585,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 3399
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.09793714433908463,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 3400
    },
    {
      "epoch": 0.544,
      "eval_train_accuracy": 0.9936,
      "eval_train_loss": 0.32104745507240295,
      "eval_train_runtime": 4.6724,
      "eval_train_samples_per_second": 1070.108,
      "eval_train_steps_per_second": 13.483,
      "step": 3400
    },
    {
      "epoch": 0.544,
      "eval_test_accuracy": 0.9952,
      "eval_test_loss": 0.31972289085388184,
      "eval_test_runtime": 4.9031,
      "eval_test_samples_per_second": 1019.771,
      "eval_test_steps_per_second": 12.849,
      "step": 3400
    },
    {
      "epoch": 0.54416,
      "grad_norm": 0.10924097150564194,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3401
    },
    {
      "epoch": 0.54432,
      "grad_norm": 0.15895740687847137,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3402
    },
    {
      "epoch": 0.54448,
      "grad_norm": 0.09975942969322205,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3403
    },
    {
      "epoch": 0.54464,
      "grad_norm": 0.1525646448135376,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 3404
    },
    {
      "epoch": 0.5448,
      "grad_norm": 0.10806919634342194,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3405
    },
    {
      "epoch": 0.54496,
      "grad_norm": 0.10280162841081619,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3406
    },
    {
      "epoch": 0.54512,
      "grad_norm": 0.09387979656457901,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3407
    },
    {
      "epoch": 0.54528,
      "grad_norm": 0.11153452098369598,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3408
    },
    {
      "epoch": 0.54544,
      "grad_norm": 0.18471862375736237,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3409
    },
    {
      "epoch": 0.5456,
      "grad_norm": 0.16845738887786865,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3410
    },
    {
      "epoch": 0.54576,
      "grad_norm": 0.12963494658470154,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3411
    },
    {
      "epoch": 0.54592,
      "grad_norm": 0.1141129732131958,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3412
    },
    {
      "epoch": 0.54608,
      "grad_norm": 0.10549502819776535,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3413
    },
    {
      "epoch": 0.54624,
      "grad_norm": 0.146093487739563,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 3414
    },
    {
      "epoch": 0.5464,
      "grad_norm": 0.12198259681463242,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3415
    },
    {
      "epoch": 0.54656,
      "grad_norm": 0.09684810042381287,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 3416
    },
    {
      "epoch": 0.54672,
      "grad_norm": 0.15105095505714417,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3417
    },
    {
      "epoch": 0.54688,
      "grad_norm": 0.13875260949134827,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3418
    },
    {
      "epoch": 0.54704,
      "grad_norm": 0.11035555601119995,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3419
    },
    {
      "epoch": 0.5472,
      "grad_norm": 0.10501129925251007,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 3420
    },
    {
      "epoch": 0.54736,
      "grad_norm": 0.10687614232301712,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3421
    },
    {
      "epoch": 0.54752,
      "grad_norm": 0.2109660655260086,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3422
    },
    {
      "epoch": 0.54768,
      "grad_norm": 0.10058624297380447,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3423
    },
    {
      "epoch": 0.54784,
      "grad_norm": 0.12723736464977264,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3424
    },
    {
      "epoch": 0.548,
      "grad_norm": 0.13820543885231018,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3425
    },
    {
      "epoch": 0.54816,
      "grad_norm": 0.11668255925178528,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3426
    },
    {
      "epoch": 0.54832,
      "grad_norm": 0.12907859683036804,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3427
    },
    {
      "epoch": 0.54848,
      "grad_norm": 0.09924620389938354,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3428
    },
    {
      "epoch": 0.54864,
      "grad_norm": 0.1145976334810257,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3429
    },
    {
      "epoch": 0.5488,
      "grad_norm": 0.1261119246482849,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3430
    },
    {
      "epoch": 0.54896,
      "grad_norm": 0.12796691060066223,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3431
    },
    {
      "epoch": 0.54912,
      "grad_norm": 0.13702671229839325,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3432
    },
    {
      "epoch": 0.54928,
      "grad_norm": 0.10977369546890259,
      "learning_rate": 0.0001,
      "loss": 0.3344,
      "step": 3433
    },
    {
      "epoch": 0.54944,
      "grad_norm": 0.10869734734296799,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 3434
    },
    {
      "epoch": 0.5496,
      "grad_norm": 0.11897968500852585,
      "learning_rate": 0.0001,
      "loss": 0.3045,
      "step": 3435
    },
    {
      "epoch": 0.54976,
      "grad_norm": 0.10311219841241837,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 3436
    },
    {
      "epoch": 0.54992,
      "grad_norm": 0.1097361370921135,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3437
    },
    {
      "epoch": 0.55008,
      "grad_norm": 0.1374036967754364,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3438
    },
    {
      "epoch": 0.55024,
      "grad_norm": 0.12057413160800934,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 3439
    },
    {
      "epoch": 0.5504,
      "grad_norm": 0.11489194631576538,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 3440
    },
    {
      "epoch": 0.55056,
      "grad_norm": 0.13506074249744415,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3441
    },
    {
      "epoch": 0.55072,
      "grad_norm": 0.1225275918841362,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3442
    },
    {
      "epoch": 0.55088,
      "grad_norm": 0.1059909388422966,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3443
    },
    {
      "epoch": 0.55104,
      "grad_norm": 0.1300797462463379,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3444
    },
    {
      "epoch": 0.5512,
      "grad_norm": 0.12226969748735428,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 3445
    },
    {
      "epoch": 0.55136,
      "grad_norm": 0.10569080710411072,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 3446
    },
    {
      "epoch": 0.55152,
      "grad_norm": 0.11120094358921051,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3447
    },
    {
      "epoch": 0.55168,
      "grad_norm": 0.09042587876319885,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 3448
    },
    {
      "epoch": 0.55184,
      "grad_norm": 0.09997932612895966,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3449
    },
    {
      "epoch": 0.552,
      "grad_norm": 0.10192584991455078,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 3450
    },
    {
      "epoch": 0.55216,
      "grad_norm": 0.1080118864774704,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3451
    },
    {
      "epoch": 0.55232,
      "grad_norm": 0.1131044253706932,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3452
    },
    {
      "epoch": 0.55248,
      "grad_norm": 0.10586563497781754,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3453
    },
    {
      "epoch": 0.55264,
      "grad_norm": 0.11346157640218735,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3454
    },
    {
      "epoch": 0.5528,
      "grad_norm": 0.11505526304244995,
      "learning_rate": 0.0001,
      "loss": 0.3401,
      "step": 3455
    },
    {
      "epoch": 0.55296,
      "grad_norm": 0.08983592689037323,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3456
    },
    {
      "epoch": 0.55312,
      "grad_norm": 0.10430463403463364,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 3457
    },
    {
      "epoch": 0.55328,
      "grad_norm": 0.11075310409069061,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3458
    },
    {
      "epoch": 0.55344,
      "grad_norm": 0.1101887971162796,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3459
    },
    {
      "epoch": 0.5536,
      "grad_norm": 0.09510841965675354,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3460
    },
    {
      "epoch": 0.55376,
      "grad_norm": 0.10010597109794617,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3461
    },
    {
      "epoch": 0.55392,
      "grad_norm": 0.10978097468614578,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3462
    },
    {
      "epoch": 0.55408,
      "grad_norm": 0.08457747846841812,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3463
    },
    {
      "epoch": 0.55424,
      "grad_norm": 0.11703258752822876,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3464
    },
    {
      "epoch": 0.5544,
      "grad_norm": 0.12028249353170395,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 3465
    },
    {
      "epoch": 0.55456,
      "grad_norm": 0.1296500563621521,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3466
    },
    {
      "epoch": 0.55472,
      "grad_norm": 0.10541913658380508,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 3467
    },
    {
      "epoch": 0.55488,
      "grad_norm": 0.11885276436805725,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3468
    },
    {
      "epoch": 0.55504,
      "grad_norm": 0.11153462529182434,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 3469
    },
    {
      "epoch": 0.5552,
      "grad_norm": 0.10790779441595078,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 3470
    },
    {
      "epoch": 0.55536,
      "grad_norm": 0.10693948715925217,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3471
    },
    {
      "epoch": 0.55552,
      "grad_norm": 0.12186730653047562,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 3472
    },
    {
      "epoch": 0.55568,
      "grad_norm": 0.11071159690618515,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3473
    },
    {
      "epoch": 0.55584,
      "grad_norm": 0.12367641925811768,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 3474
    },
    {
      "epoch": 0.556,
      "grad_norm": 0.10847777128219604,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 3475
    },
    {
      "epoch": 0.55616,
      "grad_norm": 0.11245457828044891,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3476
    },
    {
      "epoch": 0.55632,
      "grad_norm": 0.11003883928060532,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 3477
    },
    {
      "epoch": 0.55648,
      "grad_norm": 0.13897818326950073,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 3478
    },
    {
      "epoch": 0.55664,
      "grad_norm": 0.1106574758887291,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3479
    },
    {
      "epoch": 0.5568,
      "grad_norm": 0.08847270905971527,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3480
    },
    {
      "epoch": 0.55696,
      "grad_norm": 0.10950170457363129,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3481
    },
    {
      "epoch": 0.55712,
      "grad_norm": 0.10627320408821106,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3482
    },
    {
      "epoch": 0.55728,
      "grad_norm": 0.09920326620340347,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3483
    },
    {
      "epoch": 0.55744,
      "grad_norm": 0.1006392315030098,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3484
    },
    {
      "epoch": 0.5576,
      "grad_norm": 0.09639281034469604,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3485
    },
    {
      "epoch": 0.55776,
      "grad_norm": 0.1327439695596695,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3486
    },
    {
      "epoch": 0.55792,
      "grad_norm": 0.10016641020774841,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3487
    },
    {
      "epoch": 0.55808,
      "grad_norm": 0.1155954971909523,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 3488
    },
    {
      "epoch": 0.55824,
      "grad_norm": 0.17430661618709564,
      "learning_rate": 0.0001,
      "loss": 0.3489,
      "step": 3489
    },
    {
      "epoch": 0.5584,
      "grad_norm": 0.08641885221004486,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3490
    },
    {
      "epoch": 0.55856,
      "grad_norm": 0.09678923338651657,
      "learning_rate": 0.0001,
      "loss": 0.3028,
      "step": 3491
    },
    {
      "epoch": 0.55872,
      "grad_norm": 0.09900479018688202,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3492
    },
    {
      "epoch": 0.55888,
      "grad_norm": 0.10550267994403839,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3493
    },
    {
      "epoch": 0.55904,
      "grad_norm": 0.11261904239654541,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3494
    },
    {
      "epoch": 0.5592,
      "grad_norm": 0.09302817285060883,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3495
    },
    {
      "epoch": 0.55936,
      "grad_norm": 0.09757620096206665,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 3496
    },
    {
      "epoch": 0.55952,
      "grad_norm": 0.10677370429039001,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3497
    },
    {
      "epoch": 0.55968,
      "grad_norm": 0.10824484378099442,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3498
    },
    {
      "epoch": 0.55984,
      "grad_norm": 0.09352319687604904,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3499
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.12685005366802216,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3500
    },
    {
      "epoch": 0.56,
      "eval_train_accuracy": 0.9808,
      "eval_train_loss": 0.32035189867019653,
      "eval_train_runtime": 4.6957,
      "eval_train_samples_per_second": 1064.808,
      "eval_train_steps_per_second": 13.417,
      "step": 3500
    },
    {
      "epoch": 0.56,
      "eval_test_accuracy": 0.9812,
      "eval_test_loss": 0.3192008435726166,
      "eval_test_runtime": 4.4737,
      "eval_test_samples_per_second": 1117.655,
      "eval_test_steps_per_second": 14.082,
      "step": 3500
    },
    {
      "epoch": 0.56016,
      "grad_norm": 0.09088105708360672,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3501
    },
    {
      "epoch": 0.56032,
      "grad_norm": 0.10780257731676102,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 3502
    },
    {
      "epoch": 0.56048,
      "grad_norm": 0.11909598112106323,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 3503
    },
    {
      "epoch": 0.56064,
      "grad_norm": 0.09137464314699173,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3504
    },
    {
      "epoch": 0.5608,
      "grad_norm": 0.11319809406995773,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 3505
    },
    {
      "epoch": 0.56096,
      "grad_norm": 0.2086656391620636,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3506
    },
    {
      "epoch": 0.56112,
      "grad_norm": 0.12769007682800293,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3507
    },
    {
      "epoch": 0.56128,
      "grad_norm": 0.09784310311079025,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3508
    },
    {
      "epoch": 0.56144,
      "grad_norm": 0.15094515681266785,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3509
    },
    {
      "epoch": 0.5616,
      "grad_norm": 0.09285475313663483,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3510
    },
    {
      "epoch": 0.56176,
      "grad_norm": 0.11153250187635422,
      "learning_rate": 0.0001,
      "loss": 0.3486,
      "step": 3511
    },
    {
      "epoch": 0.56192,
      "grad_norm": 0.14510908722877502,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3512
    },
    {
      "epoch": 0.56208,
      "grad_norm": 0.10344868153333664,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 3513
    },
    {
      "epoch": 0.56224,
      "grad_norm": 0.12040388584136963,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3514
    },
    {
      "epoch": 0.5624,
      "grad_norm": 0.11726613342761993,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3515
    },
    {
      "epoch": 0.56256,
      "grad_norm": 0.11394276469945908,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 3516
    },
    {
      "epoch": 0.56272,
      "grad_norm": 0.10824811458587646,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 3517
    },
    {
      "epoch": 0.56288,
      "grad_norm": 0.09431903064250946,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3518
    },
    {
      "epoch": 0.56304,
      "grad_norm": 0.17667169868946075,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 3519
    },
    {
      "epoch": 0.5632,
      "grad_norm": 0.10368742048740387,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 3520
    },
    {
      "epoch": 0.56336,
      "grad_norm": 0.12886548042297363,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 3521
    },
    {
      "epoch": 0.56352,
      "grad_norm": 0.1026017814874649,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3522
    },
    {
      "epoch": 0.56368,
      "grad_norm": 0.14058353006839752,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3523
    },
    {
      "epoch": 0.56384,
      "grad_norm": 0.205018550157547,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 3524
    },
    {
      "epoch": 0.564,
      "grad_norm": 0.09166205674409866,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 3525
    },
    {
      "epoch": 0.56416,
      "grad_norm": 0.10935142636299133,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3526
    },
    {
      "epoch": 0.56432,
      "grad_norm": 0.1700488030910492,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3527
    },
    {
      "epoch": 0.56448,
      "grad_norm": 0.3851645886898041,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 3528
    },
    {
      "epoch": 0.56464,
      "grad_norm": 0.10330285131931305,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3529
    },
    {
      "epoch": 0.5648,
      "grad_norm": 0.11077465116977692,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 3530
    },
    {
      "epoch": 0.56496,
      "grad_norm": 0.10690633952617645,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3531
    },
    {
      "epoch": 0.56512,
      "grad_norm": 0.12253295630216599,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3532
    },
    {
      "epoch": 0.56528,
      "grad_norm": 0.1461106687784195,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3533
    },
    {
      "epoch": 0.56544,
      "grad_norm": 0.5354445576667786,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3534
    },
    {
      "epoch": 0.5656,
      "grad_norm": 0.15029457211494446,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3535
    },
    {
      "epoch": 0.56576,
      "grad_norm": 0.25758421421051025,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3536
    },
    {
      "epoch": 0.56592,
      "grad_norm": 0.1818198710680008,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3537
    },
    {
      "epoch": 0.56608,
      "grad_norm": 0.14396199584007263,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3538
    },
    {
      "epoch": 0.56624,
      "grad_norm": 0.14654438197612762,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3539
    },
    {
      "epoch": 0.5664,
      "grad_norm": 0.23494933545589447,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3540
    },
    {
      "epoch": 0.56656,
      "grad_norm": 0.11342983692884445,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3541
    },
    {
      "epoch": 0.56672,
      "grad_norm": 0.14391259849071503,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3542
    },
    {
      "epoch": 0.56688,
      "grad_norm": 0.11495383083820343,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3543
    },
    {
      "epoch": 0.56704,
      "grad_norm": 0.16192001104354858,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3544
    },
    {
      "epoch": 0.5672,
      "grad_norm": 0.47263064980506897,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3545
    },
    {
      "epoch": 0.56736,
      "grad_norm": 0.13433170318603516,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 3546
    },
    {
      "epoch": 0.56752,
      "grad_norm": 0.1417415738105774,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3547
    },
    {
      "epoch": 0.56768,
      "grad_norm": 0.20016247034072876,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3548
    },
    {
      "epoch": 0.56784,
      "grad_norm": 0.22471782565116882,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 3549
    },
    {
      "epoch": 0.568,
      "grad_norm": 0.11997127532958984,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3550
    },
    {
      "epoch": 0.56816,
      "grad_norm": 0.1153515949845314,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 3551
    },
    {
      "epoch": 0.56832,
      "grad_norm": 0.24689345061779022,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3552
    },
    {
      "epoch": 0.56848,
      "grad_norm": 0.13768334686756134,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 3553
    },
    {
      "epoch": 0.56864,
      "grad_norm": 0.1661415696144104,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 3554
    },
    {
      "epoch": 0.5688,
      "grad_norm": 0.1899060159921646,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3555
    },
    {
      "epoch": 0.56896,
      "grad_norm": 0.19623127579689026,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3556
    },
    {
      "epoch": 0.56912,
      "grad_norm": 0.17277061939239502,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3557
    },
    {
      "epoch": 0.56928,
      "grad_norm": 0.1956336945295334,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3558
    },
    {
      "epoch": 0.56944,
      "grad_norm": 0.14100582897663116,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3559
    },
    {
      "epoch": 0.5696,
      "grad_norm": 0.14863896369934082,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 3560
    },
    {
      "epoch": 0.56976,
      "grad_norm": 0.14253968000411987,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3561
    },
    {
      "epoch": 0.56992,
      "grad_norm": 0.1981573849916458,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3562
    },
    {
      "epoch": 0.57008,
      "grad_norm": 0.11673352867364883,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3563
    },
    {
      "epoch": 0.57024,
      "grad_norm": 0.14499586820602417,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3564
    },
    {
      "epoch": 0.5704,
      "grad_norm": 0.11286143958568573,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 3565
    },
    {
      "epoch": 0.57056,
      "grad_norm": 0.1746598482131958,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3566
    },
    {
      "epoch": 0.57072,
      "grad_norm": 0.15546011924743652,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3567
    },
    {
      "epoch": 0.57088,
      "grad_norm": 0.32473063468933105,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3568
    },
    {
      "epoch": 0.57104,
      "grad_norm": 0.1798049360513687,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3569
    },
    {
      "epoch": 0.5712,
      "grad_norm": 0.14958451688289642,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3570
    },
    {
      "epoch": 0.57136,
      "grad_norm": 0.1347249299287796,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3571
    },
    {
      "epoch": 0.57152,
      "grad_norm": 0.22535616159439087,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 3572
    },
    {
      "epoch": 0.57168,
      "grad_norm": 0.1423913687467575,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3573
    },
    {
      "epoch": 0.57184,
      "grad_norm": 0.1498815268278122,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3574
    },
    {
      "epoch": 0.572,
      "grad_norm": 0.24334010481834412,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 3575
    },
    {
      "epoch": 0.57216,
      "grad_norm": 0.1633129119873047,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3576
    },
    {
      "epoch": 0.57232,
      "grad_norm": 0.13227437436580658,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 3577
    },
    {
      "epoch": 0.57248,
      "grad_norm": 0.10995525866746902,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3578
    },
    {
      "epoch": 0.57264,
      "grad_norm": 0.11994649469852448,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3579
    },
    {
      "epoch": 0.5728,
      "grad_norm": 0.3098287582397461,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 3580
    },
    {
      "epoch": 0.57296,
      "grad_norm": 0.1816769391298294,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 3581
    },
    {
      "epoch": 0.57312,
      "grad_norm": 0.12604890763759613,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3582
    },
    {
      "epoch": 0.57328,
      "grad_norm": 0.14511536061763763,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 3583
    },
    {
      "epoch": 0.57344,
      "grad_norm": 0.12159618735313416,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3584
    },
    {
      "epoch": 0.5736,
      "grad_norm": 0.12521815299987793,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 3585
    },
    {
      "epoch": 0.57376,
      "grad_norm": 0.20618081092834473,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 3586
    },
    {
      "epoch": 0.57392,
      "grad_norm": 0.18647168576717377,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 3587
    },
    {
      "epoch": 0.57408,
      "grad_norm": 0.1570577770471573,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 3588
    },
    {
      "epoch": 0.57424,
      "grad_norm": 0.13780878484249115,
      "learning_rate": 0.0001,
      "loss": 0.3371,
      "step": 3589
    },
    {
      "epoch": 0.5744,
      "grad_norm": 0.17969007790088654,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3590
    },
    {
      "epoch": 0.57456,
      "grad_norm": 0.16767676174640656,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 3591
    },
    {
      "epoch": 0.57472,
      "grad_norm": 0.1261642575263977,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3592
    },
    {
      "epoch": 0.57488,
      "grad_norm": 0.226274311542511,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 3593
    },
    {
      "epoch": 0.57504,
      "grad_norm": 0.1654633730649948,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3594
    },
    {
      "epoch": 0.5752,
      "grad_norm": 0.1508372277021408,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3595
    },
    {
      "epoch": 0.57536,
      "grad_norm": 0.11087631434202194,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 3596
    },
    {
      "epoch": 0.57552,
      "grad_norm": 0.1330929845571518,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3597
    },
    {
      "epoch": 0.57568,
      "grad_norm": 0.1140270009636879,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 3598
    },
    {
      "epoch": 0.57584,
      "grad_norm": 0.1256982982158661,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3599
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.10382703691720963,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3600
    },
    {
      "epoch": 0.576,
      "eval_train_accuracy": 0.9942,
      "eval_train_loss": 0.32076695561408997,
      "eval_train_runtime": 4.6838,
      "eval_train_samples_per_second": 1067.513,
      "eval_train_steps_per_second": 13.451,
      "step": 3600
    },
    {
      "epoch": 0.576,
      "eval_test_accuracy": 0.9926,
      "eval_test_loss": 0.3194642961025238,
      "eval_test_runtime": 4.2033,
      "eval_test_samples_per_second": 1189.53,
      "eval_test_steps_per_second": 14.988,
      "step": 3600
    },
    {
      "epoch": 0.57616,
      "grad_norm": 0.10310487449169159,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3601
    },
    {
      "epoch": 0.57632,
      "grad_norm": 0.12942475080490112,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 3602
    },
    {
      "epoch": 0.57648,
      "grad_norm": 0.12176155298948288,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 3603
    },
    {
      "epoch": 0.57664,
      "grad_norm": 0.11938564479351044,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3604
    },
    {
      "epoch": 0.5768,
      "grad_norm": 0.09818296134471893,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 3605
    },
    {
      "epoch": 0.57696,
      "grad_norm": 0.10521263629198074,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 3606
    },
    {
      "epoch": 0.57712,
      "grad_norm": 0.09766477346420288,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3607
    },
    {
      "epoch": 0.57728,
      "grad_norm": 0.10921674966812134,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3608
    },
    {
      "epoch": 0.57744,
      "grad_norm": 0.12458769232034683,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 3609
    },
    {
      "epoch": 0.5776,
      "grad_norm": 0.12653043866157532,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3610
    },
    {
      "epoch": 0.57776,
      "grad_norm": 0.10429597645998001,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3611
    },
    {
      "epoch": 0.57792,
      "grad_norm": 0.10414746403694153,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3612
    },
    {
      "epoch": 0.57808,
      "grad_norm": 0.10402143746614456,
      "learning_rate": 0.0001,
      "loss": 0.3435,
      "step": 3613
    },
    {
      "epoch": 0.57824,
      "grad_norm": 0.12574313580989838,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 3614
    },
    {
      "epoch": 0.5784,
      "grad_norm": 0.09917706996202469,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3615
    },
    {
      "epoch": 0.57856,
      "grad_norm": 0.14272430539131165,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 3616
    },
    {
      "epoch": 0.57872,
      "grad_norm": 0.11187168210744858,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 3617
    },
    {
      "epoch": 0.57888,
      "grad_norm": 0.09377580881118774,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3618
    },
    {
      "epoch": 0.57904,
      "grad_norm": 0.1305542290210724,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3619
    },
    {
      "epoch": 0.5792,
      "grad_norm": 0.09851668030023575,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 3620
    },
    {
      "epoch": 0.57936,
      "grad_norm": 0.10150238871574402,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3621
    },
    {
      "epoch": 0.57952,
      "grad_norm": 0.11443358659744263,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 3622
    },
    {
      "epoch": 0.57968,
      "grad_norm": 0.1113511249423027,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 3623
    },
    {
      "epoch": 0.57984,
      "grad_norm": 0.10776878148317337,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3624
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.09976481646299362,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 3625
    },
    {
      "epoch": 0.58016,
      "grad_norm": 0.09467817097902298,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 3626
    },
    {
      "epoch": 0.58032,
      "grad_norm": 0.09017080068588257,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3627
    },
    {
      "epoch": 0.58048,
      "grad_norm": 0.09862018376588821,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 3628
    },
    {
      "epoch": 0.58064,
      "grad_norm": 0.09873858839273453,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3629
    },
    {
      "epoch": 0.5808,
      "grad_norm": 0.093115895986557,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3630
    },
    {
      "epoch": 0.58096,
      "grad_norm": 0.0872054472565651,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3631
    },
    {
      "epoch": 0.58112,
      "grad_norm": 0.10189773887395859,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 3632
    },
    {
      "epoch": 0.58128,
      "grad_norm": 0.10911465436220169,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 3633
    },
    {
      "epoch": 0.58144,
      "grad_norm": 0.09496378153562546,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3634
    },
    {
      "epoch": 0.5816,
      "grad_norm": 0.11645875126123428,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3635
    },
    {
      "epoch": 0.58176,
      "grad_norm": 0.09834659844636917,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3636
    },
    {
      "epoch": 0.58192,
      "grad_norm": 0.09632380306720734,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3637
    },
    {
      "epoch": 0.58208,
      "grad_norm": 0.11111092567443848,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3638
    },
    {
      "epoch": 0.58224,
      "grad_norm": 0.08733054995536804,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3639
    },
    {
      "epoch": 0.5824,
      "grad_norm": 0.09988769143819809,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3640
    },
    {
      "epoch": 0.58256,
      "grad_norm": 0.13587911427021027,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3641
    },
    {
      "epoch": 0.58272,
      "grad_norm": 0.10175056010484695,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3642
    },
    {
      "epoch": 0.58288,
      "grad_norm": 0.09567547589540482,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 3643
    },
    {
      "epoch": 0.58304,
      "grad_norm": 0.10082933306694031,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3644
    },
    {
      "epoch": 0.5832,
      "grad_norm": 0.10647853463888168,
      "learning_rate": 0.0001,
      "loss": 0.3059,
      "step": 3645
    },
    {
      "epoch": 0.58336,
      "grad_norm": 0.12155038863420486,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3646
    },
    {
      "epoch": 0.58352,
      "grad_norm": 0.0927983820438385,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 3647
    },
    {
      "epoch": 0.58368,
      "grad_norm": 0.09393010288476944,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3648
    },
    {
      "epoch": 0.58384,
      "grad_norm": 0.10113009065389633,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 3649
    },
    {
      "epoch": 0.584,
      "grad_norm": 0.09565837681293488,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3650
    },
    {
      "epoch": 0.58416,
      "grad_norm": 0.10587552934885025,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3651
    },
    {
      "epoch": 0.58432,
      "grad_norm": 0.11239799857139587,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 3652
    },
    {
      "epoch": 0.58448,
      "grad_norm": 0.11414915323257446,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3653
    },
    {
      "epoch": 0.58464,
      "grad_norm": 0.10906150192022324,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3654
    },
    {
      "epoch": 0.5848,
      "grad_norm": 0.10959958285093307,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 3655
    },
    {
      "epoch": 0.58496,
      "grad_norm": 0.09912875294685364,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 3656
    },
    {
      "epoch": 0.58512,
      "grad_norm": 0.10318008065223694,
      "learning_rate": 0.0001,
      "loss": 0.3405,
      "step": 3657
    },
    {
      "epoch": 0.58528,
      "grad_norm": 0.11536470055580139,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 3658
    },
    {
      "epoch": 0.58544,
      "grad_norm": 0.0890033096075058,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 3659
    },
    {
      "epoch": 0.5856,
      "grad_norm": 0.09154464304447174,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3660
    },
    {
      "epoch": 0.58576,
      "grad_norm": 0.08862010389566422,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3661
    },
    {
      "epoch": 0.58592,
      "grad_norm": 0.09590636193752289,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3662
    },
    {
      "epoch": 0.58608,
      "grad_norm": 0.10796934366226196,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 3663
    },
    {
      "epoch": 0.58624,
      "grad_norm": 0.09276895225048065,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 3664
    },
    {
      "epoch": 0.5864,
      "grad_norm": 0.12682019174098969,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3665
    },
    {
      "epoch": 0.58656,
      "grad_norm": 0.11404275894165039,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 3666
    },
    {
      "epoch": 0.58672,
      "grad_norm": 0.10204087197780609,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 3667
    },
    {
      "epoch": 0.58688,
      "grad_norm": 0.10624860227108002,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3668
    },
    {
      "epoch": 0.58704,
      "grad_norm": 0.09691351652145386,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3669
    },
    {
      "epoch": 0.5872,
      "grad_norm": 0.08929383009672165,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 3670
    },
    {
      "epoch": 0.58736,
      "grad_norm": 0.09406892955303192,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3671
    },
    {
      "epoch": 0.58752,
      "grad_norm": 0.1034046933054924,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 3672
    },
    {
      "epoch": 0.58768,
      "grad_norm": 0.09639741480350494,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3673
    },
    {
      "epoch": 0.58784,
      "grad_norm": 0.08395345509052277,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 3674
    },
    {
      "epoch": 0.588,
      "grad_norm": 0.12142759561538696,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3675
    },
    {
      "epoch": 0.58816,
      "grad_norm": 0.11628487706184387,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 3676
    },
    {
      "epoch": 0.58832,
      "grad_norm": 0.1357242614030838,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3677
    },
    {
      "epoch": 0.58848,
      "grad_norm": 0.35736408829689026,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3678
    },
    {
      "epoch": 0.58864,
      "grad_norm": 0.0983985885977745,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3679
    },
    {
      "epoch": 0.5888,
      "grad_norm": 0.37050339579582214,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3680
    },
    {
      "epoch": 0.58896,
      "grad_norm": 0.12229820340871811,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3681
    },
    {
      "epoch": 0.58912,
      "grad_norm": 0.10450147092342377,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 3682
    },
    {
      "epoch": 0.58928,
      "grad_norm": 0.19803179800510406,
      "learning_rate": 0.0001,
      "loss": 0.301,
      "step": 3683
    },
    {
      "epoch": 0.58944,
      "grad_norm": 0.2669371962547302,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 3684
    },
    {
      "epoch": 0.5896,
      "grad_norm": 0.20727136731147766,
      "learning_rate": 0.0001,
      "loss": 0.3367,
      "step": 3685
    },
    {
      "epoch": 0.58976,
      "grad_norm": 0.25322186946868896,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3686
    },
    {
      "epoch": 0.58992,
      "grad_norm": 0.14057166874408722,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3687
    },
    {
      "epoch": 0.59008,
      "grad_norm": 0.27186182141304016,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3688
    },
    {
      "epoch": 0.59024,
      "grad_norm": 0.2220521718263626,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 3689
    },
    {
      "epoch": 0.5904,
      "grad_norm": 0.18485453724861145,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3690
    },
    {
      "epoch": 0.59056,
      "grad_norm": 0.2064974009990692,
      "learning_rate": 0.0001,
      "loss": 0.3415,
      "step": 3691
    },
    {
      "epoch": 0.59072,
      "grad_norm": 0.11005602777004242,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3692
    },
    {
      "epoch": 0.59088,
      "grad_norm": 0.1690271943807602,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3693
    },
    {
      "epoch": 0.59104,
      "grad_norm": 0.1590833067893982,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3694
    },
    {
      "epoch": 0.5912,
      "grad_norm": 0.19686748087406158,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3695
    },
    {
      "epoch": 0.59136,
      "grad_norm": 0.15176063776016235,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3696
    },
    {
      "epoch": 0.59152,
      "grad_norm": 0.13298143446445465,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 3697
    },
    {
      "epoch": 0.59168,
      "grad_norm": 0.12408793717622757,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 3698
    },
    {
      "epoch": 0.59184,
      "grad_norm": 0.11430728435516357,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3699
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.11796030402183533,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 3700
    },
    {
      "epoch": 0.592,
      "eval_train_accuracy": 0.9874,
      "eval_train_loss": 0.3208380341529846,
      "eval_train_runtime": 4.6604,
      "eval_train_samples_per_second": 1072.861,
      "eval_train_steps_per_second": 13.518,
      "step": 3700
    },
    {
      "epoch": 0.592,
      "eval_test_accuracy": 0.9854,
      "eval_test_loss": 0.3196730315685272,
      "eval_test_runtime": 4.5342,
      "eval_test_samples_per_second": 1102.729,
      "eval_test_steps_per_second": 13.894,
      "step": 3700
    },
    {
      "epoch": 0.59216,
      "grad_norm": 0.13728715479373932,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3701
    },
    {
      "epoch": 0.59232,
      "grad_norm": 0.12561193108558655,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 3702
    },
    {
      "epoch": 0.59248,
      "grad_norm": 0.12608620524406433,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 3703
    },
    {
      "epoch": 0.59264,
      "grad_norm": 0.1277317851781845,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3704
    },
    {
      "epoch": 0.5928,
      "grad_norm": 0.10955699533224106,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 3705
    },
    {
      "epoch": 0.59296,
      "grad_norm": 0.10218308120965958,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3706
    },
    {
      "epoch": 0.59312,
      "grad_norm": 0.15175384283065796,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 3707
    },
    {
      "epoch": 0.59328,
      "grad_norm": 0.1396070122718811,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3708
    },
    {
      "epoch": 0.59344,
      "grad_norm": 0.12912070751190186,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 3709
    },
    {
      "epoch": 0.5936,
      "grad_norm": 0.11922861635684967,
      "learning_rate": 0.0001,
      "loss": 0.3503,
      "step": 3710
    },
    {
      "epoch": 0.59376,
      "grad_norm": 0.12248475849628448,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 3711
    },
    {
      "epoch": 0.59392,
      "grad_norm": 0.10007670521736145,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3712
    },
    {
      "epoch": 0.59408,
      "grad_norm": 0.11662131547927856,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3713
    },
    {
      "epoch": 0.59424,
      "grad_norm": 0.12829230725765228,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3714
    },
    {
      "epoch": 0.5944,
      "grad_norm": 0.15467867255210876,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3715
    },
    {
      "epoch": 0.59456,
      "grad_norm": 0.1329890787601471,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 3716
    },
    {
      "epoch": 0.59472,
      "grad_norm": 0.125621035695076,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3717
    },
    {
      "epoch": 0.59488,
      "grad_norm": 0.13358521461486816,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 3718
    },
    {
      "epoch": 0.59504,
      "grad_norm": 0.09324648976325989,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3719
    },
    {
      "epoch": 0.5952,
      "grad_norm": 0.1152474582195282,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3720
    },
    {
      "epoch": 0.59536,
      "grad_norm": 0.11298613250255585,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 3721
    },
    {
      "epoch": 0.59552,
      "grad_norm": 0.13079319894313812,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3722
    },
    {
      "epoch": 0.59568,
      "grad_norm": 0.10417012125253677,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3723
    },
    {
      "epoch": 0.59584,
      "grad_norm": 0.1079728826880455,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3724
    },
    {
      "epoch": 0.596,
      "grad_norm": 0.09807316213846207,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3725
    },
    {
      "epoch": 0.59616,
      "grad_norm": 0.12142127752304077,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 3726
    },
    {
      "epoch": 0.59632,
      "grad_norm": 0.09484245628118515,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3727
    },
    {
      "epoch": 0.59648,
      "grad_norm": 0.12348118424415588,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3728
    },
    {
      "epoch": 0.59664,
      "grad_norm": 0.11923864483833313,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 3729
    },
    {
      "epoch": 0.5968,
      "grad_norm": 0.12388767302036285,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 3730
    },
    {
      "epoch": 0.59696,
      "grad_norm": 0.16946275532245636,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3731
    },
    {
      "epoch": 0.59712,
      "grad_norm": 0.09655407071113586,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 3732
    },
    {
      "epoch": 0.59728,
      "grad_norm": 0.11644146591424942,
      "learning_rate": 0.0001,
      "loss": 0.299,
      "step": 3733
    },
    {
      "epoch": 0.59744,
      "grad_norm": 0.14400272071361542,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 3734
    },
    {
      "epoch": 0.5976,
      "grad_norm": 0.1831541657447815,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 3735
    },
    {
      "epoch": 0.59776,
      "grad_norm": 0.10838011652231216,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 3736
    },
    {
      "epoch": 0.59792,
      "grad_norm": 0.12363675236701965,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3737
    },
    {
      "epoch": 0.59808,
      "grad_norm": 0.18759475648403168,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3738
    },
    {
      "epoch": 0.59824,
      "grad_norm": 0.15794453024864197,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3739
    },
    {
      "epoch": 0.5984,
      "grad_norm": 0.21937747299671173,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3740
    },
    {
      "epoch": 0.59856,
      "grad_norm": 0.1681492030620575,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3741
    },
    {
      "epoch": 0.59872,
      "grad_norm": 0.13183651864528656,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3742
    },
    {
      "epoch": 0.59888,
      "grad_norm": 0.1260921210050583,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3743
    },
    {
      "epoch": 0.59904,
      "grad_norm": 0.15424664318561554,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3744
    },
    {
      "epoch": 0.5992,
      "grad_norm": 0.1301756650209427,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3745
    },
    {
      "epoch": 0.59936,
      "grad_norm": 0.10604923218488693,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3746
    },
    {
      "epoch": 0.59952,
      "grad_norm": 0.09365873783826828,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3747
    },
    {
      "epoch": 0.59968,
      "grad_norm": 0.1103120818734169,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 3748
    },
    {
      "epoch": 0.59984,
      "grad_norm": 0.11452294886112213,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3749
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.09609530121088028,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3750
    },
    {
      "epoch": 0.60016,
      "grad_norm": 0.1082172691822052,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 3751
    },
    {
      "epoch": 0.60032,
      "grad_norm": 0.15173794329166412,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 3752
    },
    {
      "epoch": 0.60048,
      "grad_norm": 0.0981777235865593,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 3753
    },
    {
      "epoch": 0.60064,
      "grad_norm": 0.09808702021837234,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 3754
    },
    {
      "epoch": 0.6008,
      "grad_norm": 0.10159891843795776,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 3755
    },
    {
      "epoch": 0.60096,
      "grad_norm": 0.09431612491607666,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 3756
    },
    {
      "epoch": 0.60112,
      "grad_norm": 0.10009520500898361,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3757
    },
    {
      "epoch": 0.60128,
      "grad_norm": 0.10446644574403763,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3758
    },
    {
      "epoch": 0.60144,
      "grad_norm": 0.11651129275560379,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3759
    },
    {
      "epoch": 0.6016,
      "grad_norm": 0.09317026287317276,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3760
    },
    {
      "epoch": 0.60176,
      "grad_norm": 0.1189950630068779,
      "learning_rate": 0.0001,
      "loss": 0.3463,
      "step": 3761
    },
    {
      "epoch": 0.60192,
      "grad_norm": 0.14248983561992645,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3762
    },
    {
      "epoch": 0.60208,
      "grad_norm": 0.11857868731021881,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 3763
    },
    {
      "epoch": 0.60224,
      "grad_norm": 0.10493504256010056,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 3764
    },
    {
      "epoch": 0.6024,
      "grad_norm": 0.13012118637561798,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 3765
    },
    {
      "epoch": 0.60256,
      "grad_norm": 0.10448714345693588,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3766
    },
    {
      "epoch": 0.60272,
      "grad_norm": 0.10298348218202591,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3767
    },
    {
      "epoch": 0.60288,
      "grad_norm": 0.11487758159637451,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3768
    },
    {
      "epoch": 0.60304,
      "grad_norm": 0.10674118995666504,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 3769
    },
    {
      "epoch": 0.6032,
      "grad_norm": 0.11023779213428497,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 3770
    },
    {
      "epoch": 0.60336,
      "grad_norm": 0.09640335291624069,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3771
    },
    {
      "epoch": 0.60352,
      "grad_norm": 0.09397229552268982,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3772
    },
    {
      "epoch": 0.60368,
      "grad_norm": 0.10103026032447815,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3773
    },
    {
      "epoch": 0.60384,
      "grad_norm": 0.09828139841556549,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3774
    },
    {
      "epoch": 0.604,
      "grad_norm": 0.1090221107006073,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 3775
    },
    {
      "epoch": 0.60416,
      "grad_norm": 0.12085463851690292,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 3776
    },
    {
      "epoch": 0.60432,
      "grad_norm": 0.12214086949825287,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 3777
    },
    {
      "epoch": 0.60448,
      "grad_norm": 0.08580814301967621,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 3778
    },
    {
      "epoch": 0.60464,
      "grad_norm": 0.09814367443323135,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 3779
    },
    {
      "epoch": 0.6048,
      "grad_norm": 0.10252154618501663,
      "learning_rate": 0.0001,
      "loss": 0.3429,
      "step": 3780
    },
    {
      "epoch": 0.60496,
      "grad_norm": 0.10181687027215958,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3781
    },
    {
      "epoch": 0.60512,
      "grad_norm": 0.0972408875823021,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 3782
    },
    {
      "epoch": 0.60528,
      "grad_norm": 0.10445669293403625,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3783
    },
    {
      "epoch": 0.60544,
      "grad_norm": 0.09368135780096054,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3784
    },
    {
      "epoch": 0.6056,
      "grad_norm": 0.08131445944309235,
      "learning_rate": 0.0001,
      "loss": 0.3023,
      "step": 3785
    },
    {
      "epoch": 0.60576,
      "grad_norm": 0.10302282124757767,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3786
    },
    {
      "epoch": 0.60592,
      "grad_norm": 0.10348884761333466,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 3787
    },
    {
      "epoch": 0.60608,
      "grad_norm": 0.10260183364152908,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 3788
    },
    {
      "epoch": 0.60624,
      "grad_norm": 0.10678144544363022,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 3789
    },
    {
      "epoch": 0.6064,
      "grad_norm": 0.09531533718109131,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3790
    },
    {
      "epoch": 0.60656,
      "grad_norm": 0.11905565112829208,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3791
    },
    {
      "epoch": 0.60672,
      "grad_norm": 0.09356797486543655,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3792
    },
    {
      "epoch": 0.60688,
      "grad_norm": 0.0862710252404213,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3793
    },
    {
      "epoch": 0.60704,
      "grad_norm": 0.09500439465045929,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3794
    },
    {
      "epoch": 0.6072,
      "grad_norm": 0.10439421981573105,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 3795
    },
    {
      "epoch": 0.60736,
      "grad_norm": 0.12022345513105392,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3796
    },
    {
      "epoch": 0.60752,
      "grad_norm": 0.10346462577581406,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 3797
    },
    {
      "epoch": 0.60768,
      "grad_norm": 0.112908273935318,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 3798
    },
    {
      "epoch": 0.60784,
      "grad_norm": 0.10390323400497437,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3799
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.09854881465435028,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3800
    },
    {
      "epoch": 0.608,
      "eval_train_accuracy": 0.9936,
      "eval_train_loss": 0.31951048970222473,
      "eval_train_runtime": 4.6914,
      "eval_train_samples_per_second": 1065.781,
      "eval_train_steps_per_second": 13.429,
      "step": 3800
    },
    {
      "epoch": 0.608,
      "eval_test_accuracy": 0.995,
      "eval_test_loss": 0.31823262572288513,
      "eval_test_runtime": 4.1155,
      "eval_test_samples_per_second": 1214.918,
      "eval_test_steps_per_second": 15.308,
      "step": 3800
    },
    {
      "epoch": 0.60816,
      "grad_norm": 0.09245353192090988,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 3801
    },
    {
      "epoch": 0.60832,
      "grad_norm": 0.09197297692298889,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3802
    },
    {
      "epoch": 0.60848,
      "grad_norm": 0.08885738998651505,
      "learning_rate": 0.0001,
      "loss": 0.3357,
      "step": 3803
    },
    {
      "epoch": 0.60864,
      "grad_norm": 0.08645333349704742,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 3804
    },
    {
      "epoch": 0.6088,
      "grad_norm": 0.10697390139102936,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 3805
    },
    {
      "epoch": 0.60896,
      "grad_norm": 0.10444056242704391,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 3806
    },
    {
      "epoch": 0.60912,
      "grad_norm": 0.10012835264205933,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 3807
    },
    {
      "epoch": 0.60928,
      "grad_norm": 0.1036565974354744,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 3808
    },
    {
      "epoch": 0.60944,
      "grad_norm": 0.10404597222805023,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 3809
    },
    {
      "epoch": 0.6096,
      "grad_norm": 0.13707365095615387,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 3810
    },
    {
      "epoch": 0.60976,
      "grad_norm": 0.10677894949913025,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3811
    },
    {
      "epoch": 0.60992,
      "grad_norm": 0.10376457124948502,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 3812
    },
    {
      "epoch": 0.61008,
      "grad_norm": 0.09067589789628983,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 3813
    },
    {
      "epoch": 0.61024,
      "grad_norm": 0.10531215369701385,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 3814
    },
    {
      "epoch": 0.6104,
      "grad_norm": 0.14573606848716736,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3815
    },
    {
      "epoch": 0.61056,
      "grad_norm": 0.14271114766597748,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 3816
    },
    {
      "epoch": 0.61072,
      "grad_norm": 0.11549574881792068,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 3817
    },
    {
      "epoch": 0.61088,
      "grad_norm": 0.09290309250354767,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3818
    },
    {
      "epoch": 0.61104,
      "grad_norm": 0.14003516733646393,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3819
    },
    {
      "epoch": 0.6112,
      "grad_norm": 0.13180707395076752,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 3820
    },
    {
      "epoch": 0.61136,
      "grad_norm": 0.09471499919891357,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3821
    },
    {
      "epoch": 0.61152,
      "grad_norm": 0.10481087863445282,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 3822
    },
    {
      "epoch": 0.61168,
      "grad_norm": 0.08810587227344513,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 3823
    },
    {
      "epoch": 0.61184,
      "grad_norm": 0.15480364859104156,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3824
    },
    {
      "epoch": 0.612,
      "grad_norm": 0.10377652943134308,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3825
    },
    {
      "epoch": 0.61216,
      "grad_norm": 0.08353658765554428,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 3826
    },
    {
      "epoch": 0.61232,
      "grad_norm": 0.0975813940167427,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3827
    },
    {
      "epoch": 0.61248,
      "grad_norm": 0.10166318714618683,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 3828
    },
    {
      "epoch": 0.61264,
      "grad_norm": 0.09277429431676865,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 3829
    },
    {
      "epoch": 0.6128,
      "grad_norm": 0.09776274859905243,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 3830
    },
    {
      "epoch": 0.61296,
      "grad_norm": 0.08359917253255844,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3831
    },
    {
      "epoch": 0.61312,
      "grad_norm": 0.0829518660902977,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 3832
    },
    {
      "epoch": 0.61328,
      "grad_norm": 0.11129773408174515,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 3833
    },
    {
      "epoch": 0.61344,
      "grad_norm": 0.09456511586904526,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3834
    },
    {
      "epoch": 0.6136,
      "grad_norm": 0.09540532529354095,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 3835
    },
    {
      "epoch": 0.61376,
      "grad_norm": 0.11683951318264008,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 3836
    },
    {
      "epoch": 0.61392,
      "grad_norm": 0.09841248393058777,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 3837
    },
    {
      "epoch": 0.61408,
      "grad_norm": 0.09811248630285263,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 3838
    },
    {
      "epoch": 0.61424,
      "grad_norm": 0.10630477219820023,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3839
    },
    {
      "epoch": 0.6144,
      "grad_norm": 0.10228654742240906,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 3840
    },
    {
      "epoch": 0.61456,
      "grad_norm": 0.10514092445373535,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 3841
    },
    {
      "epoch": 0.61472,
      "grad_norm": 0.1003061980009079,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 3842
    },
    {
      "epoch": 0.61488,
      "grad_norm": 0.10323310643434525,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 3843
    },
    {
      "epoch": 0.61504,
      "grad_norm": 0.08875030279159546,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3844
    },
    {
      "epoch": 0.6152,
      "grad_norm": 0.09233938157558441,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 3845
    },
    {
      "epoch": 0.61536,
      "grad_norm": 0.12277338653802872,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 3846
    },
    {
      "epoch": 0.61552,
      "grad_norm": 0.11274148523807526,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 3847
    },
    {
      "epoch": 0.61568,
      "grad_norm": 0.09352438896894455,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3848
    },
    {
      "epoch": 0.61584,
      "grad_norm": 0.09691209346055984,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 3849
    },
    {
      "epoch": 0.616,
      "grad_norm": 0.13245753943920135,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 3850
    },
    {
      "epoch": 0.61616,
      "grad_norm": 0.10803020745515823,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 3851
    },
    {
      "epoch": 0.61632,
      "grad_norm": 0.10055391490459442,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 3852
    },
    {
      "epoch": 0.61648,
      "grad_norm": 0.22931307554244995,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 3853
    },
    {
      "epoch": 0.61664,
      "grad_norm": 0.127223938703537,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 3854
    },
    {
      "epoch": 0.6168,
      "grad_norm": 0.1091143935918808,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 3855
    },
    {
      "epoch": 0.61696,
      "grad_norm": 0.10708482563495636,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 3856
    },
    {
      "epoch": 0.61712,
      "grad_norm": 0.12742449343204498,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3857
    },
    {
      "epoch": 0.61728,
      "grad_norm": 0.09361140429973602,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3858
    },
    {
      "epoch": 0.61744,
      "grad_norm": 0.08873927593231201,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 3859
    },
    {
      "epoch": 0.6176,
      "grad_norm": 0.1068071499466896,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 3860
    },
    {
      "epoch": 0.61776,
      "grad_norm": 0.09525728970766068,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 3861
    },
    {
      "epoch": 0.61792,
      "grad_norm": 0.12452133744955063,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 3862
    },
    {
      "epoch": 0.61808,
      "grad_norm": 0.0973251610994339,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 3863
    },
    {
      "epoch": 0.61824,
      "grad_norm": 0.08951018005609512,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3864
    },
    {
      "epoch": 0.6184,
      "grad_norm": 0.09385531395673752,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 3865
    },
    {
      "epoch": 0.61856,
      "grad_norm": 0.10561966150999069,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3866
    },
    {
      "epoch": 0.61872,
      "grad_norm": 0.16479167342185974,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 3867
    },
    {
      "epoch": 0.61888,
      "grad_norm": 0.16288624703884125,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 3868
    },
    {
      "epoch": 0.61904,
      "grad_norm": 0.09565096348524094,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 3869
    },
    {
      "epoch": 0.6192,
      "grad_norm": 0.15795189142227173,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3870
    },
    {
      "epoch": 0.61936,
      "grad_norm": 0.14655207097530365,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3871
    },
    {
      "epoch": 0.61952,
      "grad_norm": 0.19018356502056122,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3872
    },
    {
      "epoch": 0.61968,
      "grad_norm": 0.10594705492258072,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 3873
    },
    {
      "epoch": 0.61984,
      "grad_norm": 0.205976203083992,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3874
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.12635764479637146,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 3875
    },
    {
      "epoch": 0.62016,
      "grad_norm": 0.11087927222251892,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 3876
    },
    {
      "epoch": 0.62032,
      "grad_norm": 0.16952724754810333,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 3877
    },
    {
      "epoch": 0.62048,
      "grad_norm": 0.11638189107179642,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3878
    },
    {
      "epoch": 0.62064,
      "grad_norm": 0.10548797994852066,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 3879
    },
    {
      "epoch": 0.6208,
      "grad_norm": 0.09919749945402145,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 3880
    },
    {
      "epoch": 0.62096,
      "grad_norm": 0.13541080057621002,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3881
    },
    {
      "epoch": 0.62112,
      "grad_norm": 0.09912481904029846,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 3882
    },
    {
      "epoch": 0.62128,
      "grad_norm": 0.09869694709777832,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3883
    },
    {
      "epoch": 0.62144,
      "grad_norm": 0.10662148147821426,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 3884
    },
    {
      "epoch": 0.6216,
      "grad_norm": 0.10653415322303772,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 3885
    },
    {
      "epoch": 0.62176,
      "grad_norm": 0.11648637056350708,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 3886
    },
    {
      "epoch": 0.62192,
      "grad_norm": 0.15208293497562408,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 3887
    },
    {
      "epoch": 0.62208,
      "grad_norm": 0.119859479367733,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 3888
    },
    {
      "epoch": 0.62224,
      "grad_norm": 0.08786338567733765,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 3889
    },
    {
      "epoch": 0.6224,
      "grad_norm": 0.10391820222139359,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3890
    },
    {
      "epoch": 0.62256,
      "grad_norm": 0.0975959375500679,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3891
    },
    {
      "epoch": 0.62272,
      "grad_norm": 0.10830795764923096,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 3892
    },
    {
      "epoch": 0.62288,
      "grad_norm": 0.10246641933917999,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3893
    },
    {
      "epoch": 0.62304,
      "grad_norm": 0.11364206671714783,
      "learning_rate": 0.0001,
      "loss": 0.3037,
      "step": 3894
    },
    {
      "epoch": 0.6232,
      "grad_norm": 0.11897535622119904,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 3895
    },
    {
      "epoch": 0.62336,
      "grad_norm": 0.1173301562666893,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 3896
    },
    {
      "epoch": 0.62352,
      "grad_norm": 0.10827953368425369,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 3897
    },
    {
      "epoch": 0.62368,
      "grad_norm": 0.1325608640909195,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 3898
    },
    {
      "epoch": 0.62384,
      "grad_norm": 0.09794050455093384,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 3899
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.10742110013961792,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3900
    },
    {
      "epoch": 0.624,
      "eval_train_accuracy": 0.9946,
      "eval_train_loss": 0.3200385272502899,
      "eval_train_runtime": 4.5596,
      "eval_train_samples_per_second": 1096.596,
      "eval_train_steps_per_second": 13.817,
      "step": 3900
    },
    {
      "epoch": 0.624,
      "eval_test_accuracy": 0.9952,
      "eval_test_loss": 0.3188594877719879,
      "eval_test_runtime": 4.3158,
      "eval_test_samples_per_second": 1158.532,
      "eval_test_steps_per_second": 14.598,
      "step": 3900
    },
    {
      "epoch": 0.62416,
      "grad_norm": 0.12660691142082214,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 3901
    },
    {
      "epoch": 0.62432,
      "grad_norm": 0.09872834384441376,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 3902
    },
    {
      "epoch": 0.62448,
      "grad_norm": 0.0961708277463913,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 3903
    },
    {
      "epoch": 0.62464,
      "grad_norm": 0.0971141830086708,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 3904
    },
    {
      "epoch": 0.6248,
      "grad_norm": 0.1123511865735054,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 3905
    },
    {
      "epoch": 0.62496,
      "grad_norm": 0.10329581052064896,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3906
    },
    {
      "epoch": 0.62512,
      "grad_norm": 0.09777606278657913,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 3907
    },
    {
      "epoch": 0.62528,
      "grad_norm": 0.10831059515476227,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 3908
    },
    {
      "epoch": 0.62544,
      "grad_norm": 0.10085205733776093,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3909
    },
    {
      "epoch": 0.6256,
      "grad_norm": 0.09087218344211578,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 3910
    },
    {
      "epoch": 0.62576,
      "grad_norm": 0.08601266145706177,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 3911
    },
    {
      "epoch": 0.62592,
      "grad_norm": 0.10693706572055817,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 3912
    },
    {
      "epoch": 0.62608,
      "grad_norm": 0.10477562993764877,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 3913
    },
    {
      "epoch": 0.62624,
      "grad_norm": 0.09326119720935822,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 3914
    },
    {
      "epoch": 0.6264,
      "grad_norm": 0.09749218821525574,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 3915
    },
    {
      "epoch": 0.62656,
      "grad_norm": 0.11755780130624771,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 3916
    },
    {
      "epoch": 0.62672,
      "grad_norm": 0.09648949652910233,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 3917
    },
    {
      "epoch": 0.62688,
      "grad_norm": 0.10458893328905106,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3918
    },
    {
      "epoch": 0.62704,
      "grad_norm": 0.09264737367630005,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 3919
    },
    {
      "epoch": 0.6272,
      "grad_norm": 0.09951213002204895,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 3920
    },
    {
      "epoch": 0.62736,
      "grad_norm": 0.09289582073688507,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3921
    },
    {
      "epoch": 0.62752,
      "grad_norm": 0.0981459990143776,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3922
    },
    {
      "epoch": 0.62768,
      "grad_norm": 0.08527903258800507,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 3923
    },
    {
      "epoch": 0.62784,
      "grad_norm": 0.0901525691151619,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 3924
    },
    {
      "epoch": 0.628,
      "grad_norm": 0.13335973024368286,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 3925
    },
    {
      "epoch": 0.62816,
      "grad_norm": 0.10225369036197662,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 3926
    },
    {
      "epoch": 0.62832,
      "grad_norm": 0.10841026902198792,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 3927
    },
    {
      "epoch": 0.62848,
      "grad_norm": 0.08494526147842407,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 3928
    },
    {
      "epoch": 0.62864,
      "grad_norm": 0.08612527698278427,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 3929
    },
    {
      "epoch": 0.6288,
      "grad_norm": 0.0849667340517044,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 3930
    },
    {
      "epoch": 0.62896,
      "grad_norm": 0.11138638854026794,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 3931
    },
    {
      "epoch": 0.62912,
      "grad_norm": 0.10900431871414185,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 3932
    },
    {
      "epoch": 0.62928,
      "grad_norm": 0.12169617414474487,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 3933
    },
    {
      "epoch": 0.62944,
      "grad_norm": 0.09887659549713135,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 3934
    },
    {
      "epoch": 0.6296,
      "grad_norm": 0.12280809879302979,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3935
    },
    {
      "epoch": 0.62976,
      "grad_norm": 0.1513480395078659,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 3936
    },
    {
      "epoch": 0.62992,
      "grad_norm": 0.0925886407494545,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3937
    },
    {
      "epoch": 0.63008,
      "grad_norm": 0.09173552691936493,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 3938
    },
    {
      "epoch": 0.63024,
      "grad_norm": 0.0945930927991867,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3939
    },
    {
      "epoch": 0.6304,
      "grad_norm": 0.18979457020759583,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 3940
    },
    {
      "epoch": 0.63056,
      "grad_norm": 0.09471464902162552,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 3941
    },
    {
      "epoch": 0.63072,
      "grad_norm": 0.10469463467597961,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 3942
    },
    {
      "epoch": 0.63088,
      "grad_norm": 0.1028793528676033,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3943
    },
    {
      "epoch": 0.63104,
      "grad_norm": 0.14033269882202148,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 3944
    },
    {
      "epoch": 0.6312,
      "grad_norm": 0.12021170556545258,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 3945
    },
    {
      "epoch": 0.63136,
      "grad_norm": 0.13735800981521606,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 3946
    },
    {
      "epoch": 0.63152,
      "grad_norm": 0.10042949765920639,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3947
    },
    {
      "epoch": 0.63168,
      "grad_norm": 0.0934697762131691,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 3948
    },
    {
      "epoch": 0.63184,
      "grad_norm": 0.09984920918941498,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 3949
    },
    {
      "epoch": 0.632,
      "grad_norm": 0.08965438604354858,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 3950
    },
    {
      "epoch": 0.63216,
      "grad_norm": 0.11723759770393372,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 3951
    },
    {
      "epoch": 0.63232,
      "grad_norm": 0.17757049202919006,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 3952
    },
    {
      "epoch": 0.63248,
      "grad_norm": 0.09444935619831085,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3953
    },
    {
      "epoch": 0.63264,
      "grad_norm": 0.10252851992845535,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3954
    },
    {
      "epoch": 0.6328,
      "grad_norm": 0.10624421387910843,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 3955
    },
    {
      "epoch": 0.63296,
      "grad_norm": 0.0973200649023056,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 3956
    },
    {
      "epoch": 0.63312,
      "grad_norm": 0.0962572917342186,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 3957
    },
    {
      "epoch": 0.63328,
      "grad_norm": 0.1017594188451767,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 3958
    },
    {
      "epoch": 0.63344,
      "grad_norm": 0.12783730030059814,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 3959
    },
    {
      "epoch": 0.6336,
      "grad_norm": 0.12143595516681671,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 3960
    },
    {
      "epoch": 0.63376,
      "grad_norm": 0.12847661972045898,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 3961
    },
    {
      "epoch": 0.63392,
      "grad_norm": 0.13509231805801392,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 3962
    },
    {
      "epoch": 0.63408,
      "grad_norm": 0.09300820529460907,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 3963
    },
    {
      "epoch": 0.63424,
      "grad_norm": 0.0996752381324768,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 3964
    },
    {
      "epoch": 0.6344,
      "grad_norm": 0.08589129894971848,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 3965
    },
    {
      "epoch": 0.63456,
      "grad_norm": 0.09663963317871094,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 3966
    },
    {
      "epoch": 0.63472,
      "grad_norm": 0.12950153648853302,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 3967
    },
    {
      "epoch": 0.63488,
      "grad_norm": 0.15486834943294525,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 3968
    },
    {
      "epoch": 0.63504,
      "grad_norm": 0.10003826022148132,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 3969
    },
    {
      "epoch": 0.6352,
      "grad_norm": 0.10733728110790253,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 3970
    },
    {
      "epoch": 0.63536,
      "grad_norm": 0.12488557398319244,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 3971
    },
    {
      "epoch": 0.63552,
      "grad_norm": 0.11015090346336365,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 3972
    },
    {
      "epoch": 0.63568,
      "grad_norm": 0.09593081474304199,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3973
    },
    {
      "epoch": 0.63584,
      "grad_norm": 0.09135407209396362,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 3974
    },
    {
      "epoch": 0.636,
      "grad_norm": 0.09944380074739456,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 3975
    },
    {
      "epoch": 0.63616,
      "grad_norm": 0.1258002668619156,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 3976
    },
    {
      "epoch": 0.63632,
      "grad_norm": 0.1358601450920105,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 3977
    },
    {
      "epoch": 0.63648,
      "grad_norm": 0.08801903575658798,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 3978
    },
    {
      "epoch": 0.63664,
      "grad_norm": 0.09061215817928314,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 3979
    },
    {
      "epoch": 0.6368,
      "grad_norm": 0.12501657009124756,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 3980
    },
    {
      "epoch": 0.63696,
      "grad_norm": 0.10745654255151749,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 3981
    },
    {
      "epoch": 0.63712,
      "grad_norm": 0.09299503266811371,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 3982
    },
    {
      "epoch": 0.63728,
      "grad_norm": 0.1042107418179512,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 3983
    },
    {
      "epoch": 0.63744,
      "grad_norm": 0.10433932393789291,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 3984
    },
    {
      "epoch": 0.6376,
      "grad_norm": 0.0992988646030426,
      "learning_rate": 0.0001,
      "loss": 0.3037,
      "step": 3985
    },
    {
      "epoch": 0.63776,
      "grad_norm": 0.10846032947301865,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 3986
    },
    {
      "epoch": 0.63792,
      "grad_norm": 0.08561517298221588,
      "learning_rate": 0.0001,
      "loss": 0.3017,
      "step": 3987
    },
    {
      "epoch": 0.63808,
      "grad_norm": 0.08847526460886002,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 3988
    },
    {
      "epoch": 0.63824,
      "grad_norm": 0.11089396476745605,
      "learning_rate": 0.0001,
      "loss": 0.292,
      "step": 3989
    },
    {
      "epoch": 0.6384,
      "grad_norm": 0.1068926453590393,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 3990
    },
    {
      "epoch": 0.63856,
      "grad_norm": 0.11474917083978653,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 3991
    },
    {
      "epoch": 0.63872,
      "grad_norm": 0.10708106309175491,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 3992
    },
    {
      "epoch": 0.63888,
      "grad_norm": 0.09452135115861893,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 3993
    },
    {
      "epoch": 0.63904,
      "grad_norm": 0.08725685626268387,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 3994
    },
    {
      "epoch": 0.6392,
      "grad_norm": 0.08059772849082947,
      "learning_rate": 0.0001,
      "loss": 0.2941,
      "step": 3995
    },
    {
      "epoch": 0.63936,
      "grad_norm": 0.10148796439170837,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 3996
    },
    {
      "epoch": 0.63952,
      "grad_norm": 0.11697614938020706,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 3997
    },
    {
      "epoch": 0.63968,
      "grad_norm": 0.09872376918792725,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 3998
    },
    {
      "epoch": 0.63984,
      "grad_norm": 0.11206533014774323,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 3999
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.10559502989053726,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4000
    },
    {
      "epoch": 0.64,
      "eval_train_accuracy": 0.9928,
      "eval_train_loss": 0.31974369287490845,
      "eval_train_runtime": 4.3637,
      "eval_train_samples_per_second": 1145.821,
      "eval_train_steps_per_second": 14.437,
      "step": 4000
    },
    {
      "epoch": 0.64,
      "eval_test_accuracy": 0.9944,
      "eval_test_loss": 0.3186430037021637,
      "eval_test_runtime": 4.9788,
      "eval_test_samples_per_second": 1004.25,
      "eval_test_steps_per_second": 12.654,
      "step": 4000
    },
    {
      "epoch": 0.64016,
      "grad_norm": 0.09779781848192215,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4001
    },
    {
      "epoch": 0.64032,
      "grad_norm": 0.096656434237957,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4002
    },
    {
      "epoch": 0.64048,
      "grad_norm": 0.10347234457731247,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4003
    },
    {
      "epoch": 0.64064,
      "grad_norm": 0.10043402761220932,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4004
    },
    {
      "epoch": 0.6408,
      "grad_norm": 0.09505130350589752,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4005
    },
    {
      "epoch": 0.64096,
      "grad_norm": 0.09701919555664062,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4006
    },
    {
      "epoch": 0.64112,
      "grad_norm": 0.09570031613111496,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 4007
    },
    {
      "epoch": 0.64128,
      "grad_norm": 0.10634750872850418,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4008
    },
    {
      "epoch": 0.64144,
      "grad_norm": 0.20271581411361694,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4009
    },
    {
      "epoch": 0.6416,
      "grad_norm": 0.11049274355173111,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4010
    },
    {
      "epoch": 0.64176,
      "grad_norm": 0.09953869879245758,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4011
    },
    {
      "epoch": 0.64192,
      "grad_norm": 0.10136077553033829,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4012
    },
    {
      "epoch": 0.64208,
      "grad_norm": 0.09975966811180115,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 4013
    },
    {
      "epoch": 0.64224,
      "grad_norm": 0.09057449549436569,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4014
    },
    {
      "epoch": 0.6424,
      "grad_norm": 0.09368617832660675,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4015
    },
    {
      "epoch": 0.64256,
      "grad_norm": 0.10773656517267227,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4016
    },
    {
      "epoch": 0.64272,
      "grad_norm": 0.12336984276771545,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4017
    },
    {
      "epoch": 0.64288,
      "grad_norm": 0.10616309195756912,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4018
    },
    {
      "epoch": 0.64304,
      "grad_norm": 0.10158614069223404,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4019
    },
    {
      "epoch": 0.6432,
      "grad_norm": 0.10712777078151703,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 4020
    },
    {
      "epoch": 0.64336,
      "grad_norm": 0.09453016519546509,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4021
    },
    {
      "epoch": 0.64352,
      "grad_norm": 0.08861852437257767,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 4022
    },
    {
      "epoch": 0.64368,
      "grad_norm": 0.10671845823526382,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4023
    },
    {
      "epoch": 0.64384,
      "grad_norm": 0.086250901222229,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 4024
    },
    {
      "epoch": 0.644,
      "grad_norm": 0.09502767026424408,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4025
    },
    {
      "epoch": 0.64416,
      "grad_norm": 0.09566430747509003,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4026
    },
    {
      "epoch": 0.64432,
      "grad_norm": 0.08997806906700134,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4027
    },
    {
      "epoch": 0.64448,
      "grad_norm": 0.10095904022455215,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 4028
    },
    {
      "epoch": 0.64464,
      "grad_norm": 0.09606537967920303,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4029
    },
    {
      "epoch": 0.6448,
      "grad_norm": 0.10158133506774902,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4030
    },
    {
      "epoch": 0.64496,
      "grad_norm": 0.09189178049564362,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4031
    },
    {
      "epoch": 0.64512,
      "grad_norm": 0.08855962753295898,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4032
    },
    {
      "epoch": 0.64528,
      "grad_norm": 0.08549167960882187,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 4033
    },
    {
      "epoch": 0.64544,
      "grad_norm": 0.13541418313980103,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4034
    },
    {
      "epoch": 0.6456,
      "grad_norm": 0.11132797598838806,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4035
    },
    {
      "epoch": 0.64576,
      "grad_norm": 0.12100101262331009,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 4036
    },
    {
      "epoch": 0.64592,
      "grad_norm": 0.10001201927661896,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4037
    },
    {
      "epoch": 0.64608,
      "grad_norm": 0.08372606337070465,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4038
    },
    {
      "epoch": 0.64624,
      "grad_norm": 0.09875427186489105,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 4039
    },
    {
      "epoch": 0.6464,
      "grad_norm": 0.09422330558300018,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 4040
    },
    {
      "epoch": 0.64656,
      "grad_norm": 0.0904531255364418,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4041
    },
    {
      "epoch": 0.64672,
      "grad_norm": 0.09093885868787766,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4042
    },
    {
      "epoch": 0.64688,
      "grad_norm": 0.10586287081241608,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4043
    },
    {
      "epoch": 0.64704,
      "grad_norm": 0.1022113636136055,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4044
    },
    {
      "epoch": 0.6472,
      "grad_norm": 0.08853443711996078,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4045
    },
    {
      "epoch": 0.64736,
      "grad_norm": 0.0965690091252327,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 4046
    },
    {
      "epoch": 0.64752,
      "grad_norm": 0.10023247450590134,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4047
    },
    {
      "epoch": 0.64768,
      "grad_norm": 0.08188939839601517,
      "learning_rate": 0.0001,
      "loss": 0.3031,
      "step": 4048
    },
    {
      "epoch": 0.64784,
      "grad_norm": 0.09854799509048462,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4049
    },
    {
      "epoch": 0.648,
      "grad_norm": 0.09645635634660721,
      "learning_rate": 0.0001,
      "loss": 0.3391,
      "step": 4050
    },
    {
      "epoch": 0.64816,
      "grad_norm": 0.09388762712478638,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 4051
    },
    {
      "epoch": 0.64832,
      "grad_norm": 0.09323675185441971,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4052
    },
    {
      "epoch": 0.64848,
      "grad_norm": 0.11352573335170746,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4053
    },
    {
      "epoch": 0.64864,
      "grad_norm": 0.08347377181053162,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4054
    },
    {
      "epoch": 0.6488,
      "grad_norm": 0.09879229962825775,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4055
    },
    {
      "epoch": 0.64896,
      "grad_norm": 0.10364167392253876,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4056
    },
    {
      "epoch": 0.64912,
      "grad_norm": 0.11199398338794708,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4057
    },
    {
      "epoch": 0.64928,
      "grad_norm": 0.10788486897945404,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4058
    },
    {
      "epoch": 0.64944,
      "grad_norm": 0.13174982368946075,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 4059
    },
    {
      "epoch": 0.6496,
      "grad_norm": 0.09102349728345871,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4060
    },
    {
      "epoch": 0.64976,
      "grad_norm": 0.09604189544916153,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4061
    },
    {
      "epoch": 0.64992,
      "grad_norm": 0.09507434070110321,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4062
    },
    {
      "epoch": 0.65008,
      "grad_norm": 0.0899488553404808,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4063
    },
    {
      "epoch": 0.65024,
      "grad_norm": 0.10398810356855392,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4064
    },
    {
      "epoch": 0.6504,
      "grad_norm": 0.10576383769512177,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4065
    },
    {
      "epoch": 0.65056,
      "grad_norm": 0.08255608379840851,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4066
    },
    {
      "epoch": 0.65072,
      "grad_norm": 0.09383747726678848,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4067
    },
    {
      "epoch": 0.65088,
      "grad_norm": 0.0900990217924118,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4068
    },
    {
      "epoch": 0.65104,
      "grad_norm": 0.10344083607196808,
      "learning_rate": 0.0001,
      "loss": 0.3002,
      "step": 4069
    },
    {
      "epoch": 0.6512,
      "grad_norm": 0.10702932626008987,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4070
    },
    {
      "epoch": 0.65136,
      "grad_norm": 0.09977763891220093,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 4071
    },
    {
      "epoch": 0.65152,
      "grad_norm": 0.12679748237133026,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4072
    },
    {
      "epoch": 0.65168,
      "grad_norm": 0.0936766266822815,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4073
    },
    {
      "epoch": 0.65184,
      "grad_norm": 0.09478270262479782,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4074
    },
    {
      "epoch": 0.652,
      "grad_norm": 0.10870809853076935,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 4075
    },
    {
      "epoch": 0.65216,
      "grad_norm": 0.09556645154953003,
      "learning_rate": 0.0001,
      "loss": 0.3373,
      "step": 4076
    },
    {
      "epoch": 0.65232,
      "grad_norm": 0.10682903975248337,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4077
    },
    {
      "epoch": 0.65248,
      "grad_norm": 0.10127488523721695,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 4078
    },
    {
      "epoch": 0.65264,
      "grad_norm": 0.11044271290302277,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 4079
    },
    {
      "epoch": 0.6528,
      "grad_norm": 0.0981115773320198,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 4080
    },
    {
      "epoch": 0.65296,
      "grad_norm": 0.0898272916674614,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 4081
    },
    {
      "epoch": 0.65312,
      "grad_norm": 0.09629972279071808,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 4082
    },
    {
      "epoch": 0.65328,
      "grad_norm": 0.09544441848993301,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4083
    },
    {
      "epoch": 0.65344,
      "grad_norm": 0.0995730608701706,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4084
    },
    {
      "epoch": 0.6536,
      "grad_norm": 0.10121353715658188,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 4085
    },
    {
      "epoch": 0.65376,
      "grad_norm": 0.1035882830619812,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4086
    },
    {
      "epoch": 0.65392,
      "grad_norm": 0.10667572915554047,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4087
    },
    {
      "epoch": 0.65408,
      "grad_norm": 0.08272331207990646,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4088
    },
    {
      "epoch": 0.65424,
      "grad_norm": 0.09418240189552307,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4089
    },
    {
      "epoch": 0.6544,
      "grad_norm": 0.10915178060531616,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4090
    },
    {
      "epoch": 0.65456,
      "grad_norm": 0.12576338648796082,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 4091
    },
    {
      "epoch": 0.65472,
      "grad_norm": 0.10663289576768875,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4092
    },
    {
      "epoch": 0.65488,
      "grad_norm": 0.08562646806240082,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4093
    },
    {
      "epoch": 0.65504,
      "grad_norm": 0.09966570883989334,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 4094
    },
    {
      "epoch": 0.6552,
      "grad_norm": 0.13194014132022858,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4095
    },
    {
      "epoch": 0.65536,
      "grad_norm": 0.13118119537830353,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4096
    },
    {
      "epoch": 0.65552,
      "grad_norm": 0.10645687580108643,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4097
    },
    {
      "epoch": 0.65568,
      "grad_norm": 0.09762174636125565,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 4098
    },
    {
      "epoch": 0.65584,
      "grad_norm": 0.09970464557409286,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 4099
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.10798638314008713,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4100
    },
    {
      "epoch": 0.656,
      "eval_train_accuracy": 0.9948,
      "eval_train_loss": 0.319350004196167,
      "eval_train_runtime": 4.663,
      "eval_train_samples_per_second": 1072.264,
      "eval_train_steps_per_second": 13.511,
      "step": 4100
    },
    {
      "epoch": 0.656,
      "eval_test_accuracy": 0.996,
      "eval_test_loss": 0.31796202063560486,
      "eval_test_runtime": 4.7794,
      "eval_test_samples_per_second": 1046.151,
      "eval_test_steps_per_second": 13.182,
      "step": 4100
    },
    {
      "epoch": 0.65616,
      "grad_norm": 0.09264124184846878,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4101
    },
    {
      "epoch": 0.65632,
      "grad_norm": 0.10924818366765976,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4102
    },
    {
      "epoch": 0.65648,
      "grad_norm": 0.09018303453922272,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4103
    },
    {
      "epoch": 0.65664,
      "grad_norm": 0.0955924242734909,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4104
    },
    {
      "epoch": 0.6568,
      "grad_norm": 0.09146248549222946,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 4105
    },
    {
      "epoch": 0.65696,
      "grad_norm": 0.0968254879117012,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4106
    },
    {
      "epoch": 0.65712,
      "grad_norm": 0.0875515416264534,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 4107
    },
    {
      "epoch": 0.65728,
      "grad_norm": 0.09635308384895325,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 4108
    },
    {
      "epoch": 0.65744,
      "grad_norm": 0.09485774487257004,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 4109
    },
    {
      "epoch": 0.6576,
      "grad_norm": 0.0998220443725586,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4110
    },
    {
      "epoch": 0.65776,
      "grad_norm": 0.1036013662815094,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4111
    },
    {
      "epoch": 0.65792,
      "grad_norm": 0.08797693252563477,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 4112
    },
    {
      "epoch": 0.65808,
      "grad_norm": 0.09385403990745544,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 4113
    },
    {
      "epoch": 0.65824,
      "grad_norm": 0.09763537347316742,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4114
    },
    {
      "epoch": 0.6584,
      "grad_norm": 0.10013102740049362,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4115
    },
    {
      "epoch": 0.65856,
      "grad_norm": 0.08909328281879425,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4116
    },
    {
      "epoch": 0.65872,
      "grad_norm": 0.10263608396053314,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4117
    },
    {
      "epoch": 0.65888,
      "grad_norm": 0.11394301056861877,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4118
    },
    {
      "epoch": 0.65904,
      "grad_norm": 0.10633732378482819,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4119
    },
    {
      "epoch": 0.6592,
      "grad_norm": 0.09322040528059006,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4120
    },
    {
      "epoch": 0.65936,
      "grad_norm": 0.09661738574504852,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4121
    },
    {
      "epoch": 0.65952,
      "grad_norm": 0.10394036769866943,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4122
    },
    {
      "epoch": 0.65968,
      "grad_norm": 0.09870783239603043,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4123
    },
    {
      "epoch": 0.65984,
      "grad_norm": 0.12029871344566345,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 4124
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.09982138127088547,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4125
    },
    {
      "epoch": 0.66016,
      "grad_norm": 0.08804290741682053,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4126
    },
    {
      "epoch": 0.66032,
      "grad_norm": 0.08027045428752899,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4127
    },
    {
      "epoch": 0.66048,
      "grad_norm": 0.09034065157175064,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4128
    },
    {
      "epoch": 0.66064,
      "grad_norm": 0.10035119205713272,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4129
    },
    {
      "epoch": 0.6608,
      "grad_norm": 0.10571282356977463,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4130
    },
    {
      "epoch": 0.66096,
      "grad_norm": 0.12626296281814575,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4131
    },
    {
      "epoch": 0.66112,
      "grad_norm": 0.098412424325943,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 4132
    },
    {
      "epoch": 0.66128,
      "grad_norm": 0.09253803640604019,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 4133
    },
    {
      "epoch": 0.66144,
      "grad_norm": 0.07987946271896362,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4134
    },
    {
      "epoch": 0.6616,
      "grad_norm": 0.09103639423847198,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4135
    },
    {
      "epoch": 0.66176,
      "grad_norm": 0.1122768372297287,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4136
    },
    {
      "epoch": 0.66192,
      "grad_norm": 0.11529690027236938,
      "learning_rate": 0.0001,
      "loss": 0.3409,
      "step": 4137
    },
    {
      "epoch": 0.66208,
      "grad_norm": 0.08900178223848343,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4138
    },
    {
      "epoch": 0.66224,
      "grad_norm": 0.1010504737496376,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4139
    },
    {
      "epoch": 0.6624,
      "grad_norm": 0.09442874044179916,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4140
    },
    {
      "epoch": 0.66256,
      "grad_norm": 0.11940822750329971,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4141
    },
    {
      "epoch": 0.66272,
      "grad_norm": 0.09456703811883926,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4142
    },
    {
      "epoch": 0.66288,
      "grad_norm": 0.09025502949953079,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4143
    },
    {
      "epoch": 0.66304,
      "grad_norm": 0.0972205400466919,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 4144
    },
    {
      "epoch": 0.6632,
      "grad_norm": 0.15411405265331268,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4145
    },
    {
      "epoch": 0.66336,
      "grad_norm": 0.11962804943323135,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4146
    },
    {
      "epoch": 0.66352,
      "grad_norm": 0.09803591668605804,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 4147
    },
    {
      "epoch": 0.66368,
      "grad_norm": 0.089775949716568,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4148
    },
    {
      "epoch": 0.66384,
      "grad_norm": 0.07922805845737457,
      "learning_rate": 0.0001,
      "loss": 0.2986,
      "step": 4149
    },
    {
      "epoch": 0.664,
      "grad_norm": 0.08689381927251816,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4150
    },
    {
      "epoch": 0.66416,
      "grad_norm": 0.09483077377080917,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4151
    },
    {
      "epoch": 0.66432,
      "grad_norm": 0.09883742034435272,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4152
    },
    {
      "epoch": 0.66448,
      "grad_norm": 0.08076941967010498,
      "learning_rate": 0.0001,
      "loss": 0.2992,
      "step": 4153
    },
    {
      "epoch": 0.66464,
      "grad_norm": 0.0968158096075058,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4154
    },
    {
      "epoch": 0.6648,
      "grad_norm": 0.09645818918943405,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4155
    },
    {
      "epoch": 0.66496,
      "grad_norm": 0.09019535034894943,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4156
    },
    {
      "epoch": 0.66512,
      "grad_norm": 0.08622687309980392,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4157
    },
    {
      "epoch": 0.66528,
      "grad_norm": 0.09151820838451385,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4158
    },
    {
      "epoch": 0.66544,
      "grad_norm": 0.09736797958612442,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4159
    },
    {
      "epoch": 0.6656,
      "grad_norm": 0.0955139622092247,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 4160
    },
    {
      "epoch": 0.66576,
      "grad_norm": 0.11677297204732895,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4161
    },
    {
      "epoch": 0.66592,
      "grad_norm": 0.09550406783819199,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 4162
    },
    {
      "epoch": 0.66608,
      "grad_norm": 0.09129755198955536,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4163
    },
    {
      "epoch": 0.66624,
      "grad_norm": 0.09587136656045914,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4164
    },
    {
      "epoch": 0.6664,
      "grad_norm": 0.09752288460731506,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4165
    },
    {
      "epoch": 0.66656,
      "grad_norm": 0.08636827766895294,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4166
    },
    {
      "epoch": 0.66672,
      "grad_norm": 0.0934930369257927,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4167
    },
    {
      "epoch": 0.66688,
      "grad_norm": 0.11034379154443741,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4168
    },
    {
      "epoch": 0.66704,
      "grad_norm": 0.1354304999113083,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 4169
    },
    {
      "epoch": 0.6672,
      "grad_norm": 0.10832033306360245,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4170
    },
    {
      "epoch": 0.66736,
      "grad_norm": 0.1166442260146141,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4171
    },
    {
      "epoch": 0.66752,
      "grad_norm": 0.10319602489471436,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 4172
    },
    {
      "epoch": 0.66768,
      "grad_norm": 0.0948905348777771,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4173
    },
    {
      "epoch": 0.66784,
      "grad_norm": 0.10149464011192322,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4174
    },
    {
      "epoch": 0.668,
      "grad_norm": 0.11386251449584961,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 4175
    },
    {
      "epoch": 0.66816,
      "grad_norm": 0.11322008818387985,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4176
    },
    {
      "epoch": 0.66832,
      "grad_norm": 0.10857395082712173,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4177
    },
    {
      "epoch": 0.66848,
      "grad_norm": 0.10028950870037079,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 4178
    },
    {
      "epoch": 0.66864,
      "grad_norm": 0.08750919252634048,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4179
    },
    {
      "epoch": 0.6688,
      "grad_norm": 0.1354779452085495,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 4180
    },
    {
      "epoch": 0.66896,
      "grad_norm": 0.10560059547424316,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4181
    },
    {
      "epoch": 0.66912,
      "grad_norm": 0.09648316353559494,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 4182
    },
    {
      "epoch": 0.66928,
      "grad_norm": 0.09632250666618347,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4183
    },
    {
      "epoch": 0.66944,
      "grad_norm": 0.08474405109882355,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4184
    },
    {
      "epoch": 0.6696,
      "grad_norm": 0.08841552585363388,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4185
    },
    {
      "epoch": 0.66976,
      "grad_norm": 0.098138228058815,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4186
    },
    {
      "epoch": 0.66992,
      "grad_norm": 0.09121985733509064,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4187
    },
    {
      "epoch": 0.67008,
      "grad_norm": 0.10408905148506165,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4188
    },
    {
      "epoch": 0.67024,
      "grad_norm": 0.09567274153232574,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 4189
    },
    {
      "epoch": 0.6704,
      "grad_norm": 0.09679851680994034,
      "learning_rate": 0.0001,
      "loss": 0.3004,
      "step": 4190
    },
    {
      "epoch": 0.67056,
      "grad_norm": 0.09485864639282227,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4191
    },
    {
      "epoch": 0.67072,
      "grad_norm": 0.09774801135063171,
      "learning_rate": 0.0001,
      "loss": 0.3345,
      "step": 4192
    },
    {
      "epoch": 0.67088,
      "grad_norm": 0.0833912268280983,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4193
    },
    {
      "epoch": 0.67104,
      "grad_norm": 0.0963888019323349,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4194
    },
    {
      "epoch": 0.6712,
      "grad_norm": 0.09983395785093307,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 4195
    },
    {
      "epoch": 0.67136,
      "grad_norm": 0.1059785783290863,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4196
    },
    {
      "epoch": 0.67152,
      "grad_norm": 0.08547292649745941,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 4197
    },
    {
      "epoch": 0.67168,
      "grad_norm": 0.08744189143180847,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 4198
    },
    {
      "epoch": 0.67184,
      "grad_norm": 0.09107180684804916,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4199
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.08149833977222443,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 4200
    },
    {
      "epoch": 0.672,
      "eval_train_accuracy": 0.995,
      "eval_train_loss": 0.3190934956073761,
      "eval_train_runtime": 4.7295,
      "eval_train_samples_per_second": 1057.185,
      "eval_train_steps_per_second": 13.321,
      "step": 4200
    },
    {
      "epoch": 0.672,
      "eval_test_accuracy": 0.995,
      "eval_test_loss": 0.31790316104888916,
      "eval_test_runtime": 4.5213,
      "eval_test_samples_per_second": 1105.87,
      "eval_test_steps_per_second": 13.934,
      "step": 4200
    },
    {
      "epoch": 0.67216,
      "grad_norm": 0.09273344278335571,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 4201
    },
    {
      "epoch": 0.67232,
      "grad_norm": 0.08979666233062744,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 4202
    },
    {
      "epoch": 0.67248,
      "grad_norm": 0.09458573907613754,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4203
    },
    {
      "epoch": 0.67264,
      "grad_norm": 0.09122884273529053,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4204
    },
    {
      "epoch": 0.6728,
      "grad_norm": 0.12071822583675385,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4205
    },
    {
      "epoch": 0.67296,
      "grad_norm": 0.08738381415605545,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4206
    },
    {
      "epoch": 0.67312,
      "grad_norm": 0.10991981625556946,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4207
    },
    {
      "epoch": 0.67328,
      "grad_norm": 0.10040149837732315,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4208
    },
    {
      "epoch": 0.67344,
      "grad_norm": 0.09302940219640732,
      "learning_rate": 0.0001,
      "loss": 0.3008,
      "step": 4209
    },
    {
      "epoch": 0.6736,
      "grad_norm": 0.10707702487707138,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 4210
    },
    {
      "epoch": 0.67376,
      "grad_norm": 0.10207395255565643,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4211
    },
    {
      "epoch": 0.67392,
      "grad_norm": 0.08138333261013031,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4212
    },
    {
      "epoch": 0.67408,
      "grad_norm": 0.09061463922262192,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4213
    },
    {
      "epoch": 0.67424,
      "grad_norm": 0.08435530960559845,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4214
    },
    {
      "epoch": 0.6744,
      "grad_norm": 0.0909622460603714,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4215
    },
    {
      "epoch": 0.67456,
      "grad_norm": 0.09409227967262268,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 4216
    },
    {
      "epoch": 0.67472,
      "grad_norm": 0.0868033692240715,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4217
    },
    {
      "epoch": 0.67488,
      "grad_norm": 0.10546007752418518,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4218
    },
    {
      "epoch": 0.67504,
      "grad_norm": 0.0921127051115036,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 4219
    },
    {
      "epoch": 0.6752,
      "grad_norm": 0.10603347420692444,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 4220
    },
    {
      "epoch": 0.67536,
      "grad_norm": 0.11497636884450912,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4221
    },
    {
      "epoch": 0.67552,
      "grad_norm": 0.10509152710437775,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4222
    },
    {
      "epoch": 0.67568,
      "grad_norm": 0.09640779346227646,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 4223
    },
    {
      "epoch": 0.67584,
      "grad_norm": 0.09376557171344757,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4224
    },
    {
      "epoch": 0.676,
      "grad_norm": 0.08125169575214386,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 4225
    },
    {
      "epoch": 0.67616,
      "grad_norm": 0.08916697651147842,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4226
    },
    {
      "epoch": 0.67632,
      "grad_norm": 0.09169077128171921,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4227
    },
    {
      "epoch": 0.67648,
      "grad_norm": 0.11169884353876114,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4228
    },
    {
      "epoch": 0.67664,
      "grad_norm": 0.08870434015989304,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 4229
    },
    {
      "epoch": 0.6768,
      "grad_norm": 0.10148739069700241,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4230
    },
    {
      "epoch": 0.67696,
      "grad_norm": 0.10121401399374008,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4231
    },
    {
      "epoch": 0.67712,
      "grad_norm": 0.08964783698320389,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4232
    },
    {
      "epoch": 0.67728,
      "grad_norm": 0.07771281898021698,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4233
    },
    {
      "epoch": 0.67744,
      "grad_norm": 0.08405424654483795,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4234
    },
    {
      "epoch": 0.6776,
      "grad_norm": 0.08819808065891266,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4235
    },
    {
      "epoch": 0.67776,
      "grad_norm": 0.09141194820404053,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 4236
    },
    {
      "epoch": 0.67792,
      "grad_norm": 0.11105546355247498,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4237
    },
    {
      "epoch": 0.67808,
      "grad_norm": 0.09425246715545654,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4238
    },
    {
      "epoch": 0.67824,
      "grad_norm": 0.08857305347919464,
      "learning_rate": 0.0001,
      "loss": 0.3036,
      "step": 4239
    },
    {
      "epoch": 0.6784,
      "grad_norm": 0.09106268733739853,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4240
    },
    {
      "epoch": 0.67856,
      "grad_norm": 0.09710971266031265,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4241
    },
    {
      "epoch": 0.67872,
      "grad_norm": 0.09424760192632675,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 4242
    },
    {
      "epoch": 0.67888,
      "grad_norm": 0.10000446438789368,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 4243
    },
    {
      "epoch": 0.67904,
      "grad_norm": 0.08845889568328857,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4244
    },
    {
      "epoch": 0.6792,
      "grad_norm": 0.08755622804164886,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 4245
    },
    {
      "epoch": 0.67936,
      "grad_norm": 0.08328771591186523,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4246
    },
    {
      "epoch": 0.67952,
      "grad_norm": 0.0878121554851532,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4247
    },
    {
      "epoch": 0.67968,
      "grad_norm": 0.09803880751132965,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4248
    },
    {
      "epoch": 0.67984,
      "grad_norm": 0.11968185752630234,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 4249
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.11535914242267609,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 4250
    },
    {
      "epoch": 0.68016,
      "grad_norm": 0.08673565834760666,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 4251
    },
    {
      "epoch": 0.68032,
      "grad_norm": 0.09183672815561295,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4252
    },
    {
      "epoch": 0.68048,
      "grad_norm": 0.11267957091331482,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 4253
    },
    {
      "epoch": 0.68064,
      "grad_norm": 0.10402999073266983,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 4254
    },
    {
      "epoch": 0.6808,
      "grad_norm": 0.10063516348600388,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4255
    },
    {
      "epoch": 0.68096,
      "grad_norm": 0.08682186901569366,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4256
    },
    {
      "epoch": 0.68112,
      "grad_norm": 0.12092749774456024,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 4257
    },
    {
      "epoch": 0.68128,
      "grad_norm": 0.10063368082046509,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 4258
    },
    {
      "epoch": 0.68144,
      "grad_norm": 0.0950859785079956,
      "learning_rate": 0.0001,
      "loss": 0.298,
      "step": 4259
    },
    {
      "epoch": 0.6816,
      "grad_norm": 0.12977275252342224,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4260
    },
    {
      "epoch": 0.68176,
      "grad_norm": 0.09645597636699677,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4261
    },
    {
      "epoch": 0.68192,
      "grad_norm": 0.08979438245296478,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4262
    },
    {
      "epoch": 0.68208,
      "grad_norm": 0.11429435014724731,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 4263
    },
    {
      "epoch": 0.68224,
      "grad_norm": 0.10918457061052322,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4264
    },
    {
      "epoch": 0.6824,
      "grad_norm": 0.10436554998159409,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 4265
    },
    {
      "epoch": 0.68256,
      "grad_norm": 0.0882495641708374,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4266
    },
    {
      "epoch": 0.68272,
      "grad_norm": 0.1020970419049263,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4267
    },
    {
      "epoch": 0.68288,
      "grad_norm": 0.08369506150484085,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4268
    },
    {
      "epoch": 0.68304,
      "grad_norm": 0.09962756186723709,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4269
    },
    {
      "epoch": 0.6832,
      "grad_norm": 0.09734677523374557,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 4270
    },
    {
      "epoch": 0.68336,
      "grad_norm": 0.10494963079690933,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4271
    },
    {
      "epoch": 0.68352,
      "grad_norm": 0.17754697799682617,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4272
    },
    {
      "epoch": 0.68368,
      "grad_norm": 0.09640266746282578,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 4273
    },
    {
      "epoch": 0.68384,
      "grad_norm": 0.10358323156833649,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4274
    },
    {
      "epoch": 0.684,
      "grad_norm": 0.0887182205915451,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4275
    },
    {
      "epoch": 0.68416,
      "grad_norm": 0.08493460714817047,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4276
    },
    {
      "epoch": 0.68432,
      "grad_norm": 0.10341302305459976,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4277
    },
    {
      "epoch": 0.68448,
      "grad_norm": 0.12635260820388794,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 4278
    },
    {
      "epoch": 0.68464,
      "grad_norm": 0.10149338096380234,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4279
    },
    {
      "epoch": 0.6848,
      "grad_norm": 0.1180979311466217,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 4280
    },
    {
      "epoch": 0.68496,
      "grad_norm": 0.13080771267414093,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 4281
    },
    {
      "epoch": 0.68512,
      "grad_norm": 0.13858960568904877,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4282
    },
    {
      "epoch": 0.68528,
      "grad_norm": 0.20003902912139893,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4283
    },
    {
      "epoch": 0.68544,
      "grad_norm": 0.10298040509223938,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4284
    },
    {
      "epoch": 0.6856,
      "grad_norm": 0.094057098031044,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4285
    },
    {
      "epoch": 0.68576,
      "grad_norm": 0.09410041570663452,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4286
    },
    {
      "epoch": 0.68592,
      "grad_norm": 0.19642728567123413,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4287
    },
    {
      "epoch": 0.68608,
      "grad_norm": 0.3243136405944824,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4288
    },
    {
      "epoch": 0.68624,
      "grad_norm": 0.10162608325481415,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4289
    },
    {
      "epoch": 0.6864,
      "grad_norm": 0.11942166835069656,
      "learning_rate": 0.0001,
      "loss": 0.3334,
      "step": 4290
    },
    {
      "epoch": 0.68656,
      "grad_norm": 0.10072989761829376,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4291
    },
    {
      "epoch": 0.68672,
      "grad_norm": 0.09965000301599503,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4292
    },
    {
      "epoch": 0.68688,
      "grad_norm": 0.12334343045949936,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 4293
    },
    {
      "epoch": 0.68704,
      "grad_norm": 0.10450372099876404,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 4294
    },
    {
      "epoch": 0.6872,
      "grad_norm": 0.1599881798028946,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 4295
    },
    {
      "epoch": 0.68736,
      "grad_norm": 0.11394718289375305,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4296
    },
    {
      "epoch": 0.68752,
      "grad_norm": 0.19888165593147278,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 4297
    },
    {
      "epoch": 0.68768,
      "grad_norm": 0.09661941975355148,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 4298
    },
    {
      "epoch": 0.68784,
      "grad_norm": 0.10714694112539291,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4299
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.11213921010494232,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 4300
    },
    {
      "epoch": 0.688,
      "eval_train_accuracy": 0.9934,
      "eval_train_loss": 0.3196726143360138,
      "eval_train_runtime": 4.5134,
      "eval_train_samples_per_second": 1107.821,
      "eval_train_steps_per_second": 13.959,
      "step": 4300
    },
    {
      "epoch": 0.688,
      "eval_test_accuracy": 0.9946,
      "eval_test_loss": 0.3182739317417145,
      "eval_test_runtime": 4.7309,
      "eval_test_samples_per_second": 1056.885,
      "eval_test_steps_per_second": 13.317,
      "step": 4300
    },
    {
      "epoch": 0.68816,
      "grad_norm": 0.25962913036346436,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4301
    },
    {
      "epoch": 0.68832,
      "grad_norm": 0.11235399544239044,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 4302
    },
    {
      "epoch": 0.68848,
      "grad_norm": 0.14118024706840515,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4303
    },
    {
      "epoch": 0.68864,
      "grad_norm": 0.09476403146982193,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4304
    },
    {
      "epoch": 0.6888,
      "grad_norm": 0.11184412240982056,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4305
    },
    {
      "epoch": 0.68896,
      "grad_norm": 0.14951622486114502,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4306
    },
    {
      "epoch": 0.68912,
      "grad_norm": 0.09699644893407822,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 4307
    },
    {
      "epoch": 0.68928,
      "grad_norm": 0.18936914205551147,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4308
    },
    {
      "epoch": 0.68944,
      "grad_norm": 0.0947921946644783,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4309
    },
    {
      "epoch": 0.6896,
      "grad_norm": 0.12626928091049194,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4310
    },
    {
      "epoch": 0.68976,
      "grad_norm": 0.11028371006250381,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4311
    },
    {
      "epoch": 0.68992,
      "grad_norm": 0.16170348227024078,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4312
    },
    {
      "epoch": 0.69008,
      "grad_norm": 0.10075155645608902,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4313
    },
    {
      "epoch": 0.69024,
      "grad_norm": 0.16396090388298035,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4314
    },
    {
      "epoch": 0.6904,
      "grad_norm": 0.11540736258029938,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4315
    },
    {
      "epoch": 0.69056,
      "grad_norm": 0.11161909997463226,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4316
    },
    {
      "epoch": 0.69072,
      "grad_norm": 0.11672919243574142,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4317
    },
    {
      "epoch": 0.69088,
      "grad_norm": 0.1683480441570282,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4318
    },
    {
      "epoch": 0.69104,
      "grad_norm": 0.16714167594909668,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 4319
    },
    {
      "epoch": 0.6912,
      "grad_norm": 0.13494451344013214,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 4320
    },
    {
      "epoch": 0.69136,
      "grad_norm": 0.12117282301187515,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 4321
    },
    {
      "epoch": 0.69152,
      "grad_norm": 0.09295584261417389,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 4322
    },
    {
      "epoch": 0.69168,
      "grad_norm": 0.09941712021827698,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4323
    },
    {
      "epoch": 0.69184,
      "grad_norm": 0.09588028490543365,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4324
    },
    {
      "epoch": 0.692,
      "grad_norm": 0.0947648212313652,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4325
    },
    {
      "epoch": 0.69216,
      "grad_norm": 0.11223980784416199,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4326
    },
    {
      "epoch": 0.69232,
      "grad_norm": 0.13638727366924286,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 4327
    },
    {
      "epoch": 0.69248,
      "grad_norm": 0.12281215935945511,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4328
    },
    {
      "epoch": 0.69264,
      "grad_norm": 0.10366955399513245,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4329
    },
    {
      "epoch": 0.6928,
      "grad_norm": 0.14947181940078735,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4330
    },
    {
      "epoch": 0.69296,
      "grad_norm": 0.09996089339256287,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4331
    },
    {
      "epoch": 0.69312,
      "grad_norm": 0.12222819030284882,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4332
    },
    {
      "epoch": 0.69328,
      "grad_norm": 0.10607364773750305,
      "learning_rate": 0.0001,
      "loss": 0.3361,
      "step": 4333
    },
    {
      "epoch": 0.69344,
      "grad_norm": 0.1130189299583435,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4334
    },
    {
      "epoch": 0.6936,
      "grad_norm": 0.11095144599676132,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4335
    },
    {
      "epoch": 0.69376,
      "grad_norm": 0.09142182767391205,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4336
    },
    {
      "epoch": 0.69392,
      "grad_norm": 0.1074264645576477,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4337
    },
    {
      "epoch": 0.69408,
      "grad_norm": 0.10881146788597107,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4338
    },
    {
      "epoch": 0.69424,
      "grad_norm": 0.08868671208620071,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4339
    },
    {
      "epoch": 0.6944,
      "grad_norm": 0.09800151735544205,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 4340
    },
    {
      "epoch": 0.69456,
      "grad_norm": 0.09154080599546432,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 4341
    },
    {
      "epoch": 0.69472,
      "grad_norm": 0.13053791224956512,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 4342
    },
    {
      "epoch": 0.69488,
      "grad_norm": 0.1530482918024063,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4343
    },
    {
      "epoch": 0.69504,
      "grad_norm": 0.0868329331278801,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4344
    },
    {
      "epoch": 0.6952,
      "grad_norm": 0.19717293977737427,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4345
    },
    {
      "epoch": 0.69536,
      "grad_norm": 0.1057221069931984,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4346
    },
    {
      "epoch": 0.69552,
      "grad_norm": 0.09589838981628418,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4347
    },
    {
      "epoch": 0.69568,
      "grad_norm": 0.11081720888614655,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 4348
    },
    {
      "epoch": 0.69584,
      "grad_norm": 0.14027075469493866,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4349
    },
    {
      "epoch": 0.696,
      "grad_norm": 0.26684433221817017,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4350
    },
    {
      "epoch": 0.69616,
      "grad_norm": 0.10496488958597183,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 4351
    },
    {
      "epoch": 0.69632,
      "grad_norm": 0.11913945525884628,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 4352
    },
    {
      "epoch": 0.69648,
      "grad_norm": 0.14154569804668427,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4353
    },
    {
      "epoch": 0.69664,
      "grad_norm": 0.10557076334953308,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4354
    },
    {
      "epoch": 0.6968,
      "grad_norm": 0.09566844999790192,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4355
    },
    {
      "epoch": 0.69696,
      "grad_norm": 0.13680380582809448,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4356
    },
    {
      "epoch": 0.69712,
      "grad_norm": 0.10859876126050949,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4357
    },
    {
      "epoch": 0.69728,
      "grad_norm": 0.10530654340982437,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4358
    },
    {
      "epoch": 0.69744,
      "grad_norm": 0.11467920988798141,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4359
    },
    {
      "epoch": 0.6976,
      "grad_norm": 0.10120169073343277,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 4360
    },
    {
      "epoch": 0.69776,
      "grad_norm": 0.099012590944767,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4361
    },
    {
      "epoch": 0.69792,
      "grad_norm": 0.1693151593208313,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 4362
    },
    {
      "epoch": 0.69808,
      "grad_norm": 0.1063852310180664,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 4363
    },
    {
      "epoch": 0.69824,
      "grad_norm": 0.09908672422170639,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4364
    },
    {
      "epoch": 0.6984,
      "grad_norm": 0.08709406852722168,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4365
    },
    {
      "epoch": 0.69856,
      "grad_norm": 0.12214095890522003,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4366
    },
    {
      "epoch": 0.69872,
      "grad_norm": 0.13784433901309967,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4367
    },
    {
      "epoch": 0.69888,
      "grad_norm": 0.11957907676696777,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4368
    },
    {
      "epoch": 0.69904,
      "grad_norm": 0.12544439733028412,
      "learning_rate": 0.0001,
      "loss": 0.3424,
      "step": 4369
    },
    {
      "epoch": 0.6992,
      "grad_norm": 0.12319174408912659,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 4370
    },
    {
      "epoch": 0.69936,
      "grad_norm": 0.12451808899641037,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4371
    },
    {
      "epoch": 0.69952,
      "grad_norm": 0.15867570042610168,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 4372
    },
    {
      "epoch": 0.69968,
      "grad_norm": 0.13522405922412872,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4373
    },
    {
      "epoch": 0.69984,
      "grad_norm": 0.11865011602640152,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 4374
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.13771355152130127,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4375
    },
    {
      "epoch": 0.70016,
      "grad_norm": 0.1035890206694603,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4376
    },
    {
      "epoch": 0.70032,
      "grad_norm": 0.12689942121505737,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 4377
    },
    {
      "epoch": 0.70048,
      "grad_norm": 0.10765593498945236,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4378
    },
    {
      "epoch": 0.70064,
      "grad_norm": 0.10072951763868332,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4379
    },
    {
      "epoch": 0.7008,
      "grad_norm": 0.12454885244369507,
      "learning_rate": 0.0001,
      "loss": 0.3046,
      "step": 4380
    },
    {
      "epoch": 0.70096,
      "grad_norm": 0.09013447910547256,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4381
    },
    {
      "epoch": 0.70112,
      "grad_norm": 0.09479238092899323,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 4382
    },
    {
      "epoch": 0.70128,
      "grad_norm": 0.09647510200738907,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4383
    },
    {
      "epoch": 0.70144,
      "grad_norm": 0.15805363655090332,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4384
    },
    {
      "epoch": 0.7016,
      "grad_norm": 0.20058976113796234,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4385
    },
    {
      "epoch": 0.70176,
      "grad_norm": 0.09789988398551941,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 4386
    },
    {
      "epoch": 0.70192,
      "grad_norm": 0.12159176170825958,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4387
    },
    {
      "epoch": 0.70208,
      "grad_norm": 0.16169482469558716,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 4388
    },
    {
      "epoch": 0.70224,
      "grad_norm": 0.09935944527387619,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4389
    },
    {
      "epoch": 0.7024,
      "grad_norm": 0.1501472443342209,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 4390
    },
    {
      "epoch": 0.70256,
      "grad_norm": 0.10884764790534973,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4391
    },
    {
      "epoch": 0.70272,
      "grad_norm": 0.10055750608444214,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4392
    },
    {
      "epoch": 0.70288,
      "grad_norm": 0.2027120739221573,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 4393
    },
    {
      "epoch": 0.70304,
      "grad_norm": 0.10014418512582779,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4394
    },
    {
      "epoch": 0.7032,
      "grad_norm": 0.10654716938734055,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 4395
    },
    {
      "epoch": 0.70336,
      "grad_norm": 0.17349104583263397,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4396
    },
    {
      "epoch": 0.70352,
      "grad_norm": 0.2497178018093109,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4397
    },
    {
      "epoch": 0.70368,
      "grad_norm": 0.14322207868099213,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4398
    },
    {
      "epoch": 0.70384,
      "grad_norm": 0.3476777970790863,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 4399
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.19176650047302246,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4400
    },
    {
      "epoch": 0.704,
      "eval_train_accuracy": 0.9934,
      "eval_train_loss": 0.3200904130935669,
      "eval_train_runtime": 4.5632,
      "eval_train_samples_per_second": 1095.729,
      "eval_train_steps_per_second": 13.806,
      "step": 4400
    },
    {
      "epoch": 0.704,
      "eval_test_accuracy": 0.9944,
      "eval_test_loss": 0.31892600655555725,
      "eval_test_runtime": 4.4718,
      "eval_test_samples_per_second": 1118.128,
      "eval_test_steps_per_second": 14.088,
      "step": 4400
    },
    {
      "epoch": 0.70416,
      "grad_norm": 0.39735138416290283,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4401
    },
    {
      "epoch": 0.70432,
      "grad_norm": 0.10201448202133179,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4402
    },
    {
      "epoch": 0.70448,
      "grad_norm": 0.48285409808158875,
      "learning_rate": 0.0001,
      "loss": 0.3358,
      "step": 4403
    },
    {
      "epoch": 0.70464,
      "grad_norm": 0.13880686461925507,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4404
    },
    {
      "epoch": 0.7048,
      "grad_norm": 0.11588158458471298,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4405
    },
    {
      "epoch": 0.70496,
      "grad_norm": 0.3520577847957611,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 4406
    },
    {
      "epoch": 0.70512,
      "grad_norm": 0.19967079162597656,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 4407
    },
    {
      "epoch": 0.70528,
      "grad_norm": 0.20312653481960297,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4408
    },
    {
      "epoch": 0.70544,
      "grad_norm": 0.16782458126544952,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4409
    },
    {
      "epoch": 0.7056,
      "grad_norm": 0.11044910550117493,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 4410
    },
    {
      "epoch": 0.70576,
      "grad_norm": 0.14901000261306763,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4411
    },
    {
      "epoch": 0.70592,
      "grad_norm": 0.12202686071395874,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4412
    },
    {
      "epoch": 0.70608,
      "grad_norm": 0.10171083360910416,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4413
    },
    {
      "epoch": 0.70624,
      "grad_norm": 0.11978667229413986,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4414
    },
    {
      "epoch": 0.7064,
      "grad_norm": 0.1580827534198761,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4415
    },
    {
      "epoch": 0.70656,
      "grad_norm": 0.12293361127376556,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 4416
    },
    {
      "epoch": 0.70672,
      "grad_norm": 0.16456560790538788,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4417
    },
    {
      "epoch": 0.70688,
      "grad_norm": 0.12139426171779633,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4418
    },
    {
      "epoch": 0.70704,
      "grad_norm": 0.17025166749954224,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4419
    },
    {
      "epoch": 0.7072,
      "grad_norm": 0.11629873514175415,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 4420
    },
    {
      "epoch": 0.70736,
      "grad_norm": 0.10308390110731125,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 4421
    },
    {
      "epoch": 0.70752,
      "grad_norm": 0.11215116083621979,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4422
    },
    {
      "epoch": 0.70768,
      "grad_norm": 0.10364726185798645,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4423
    },
    {
      "epoch": 0.70784,
      "grad_norm": 0.13622431457042694,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 4424
    },
    {
      "epoch": 0.708,
      "grad_norm": 0.2467607855796814,
      "learning_rate": 0.0001,
      "loss": 0.3006,
      "step": 4425
    },
    {
      "epoch": 0.70816,
      "grad_norm": 0.13021792471408844,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 4426
    },
    {
      "epoch": 0.70832,
      "grad_norm": 0.1880076825618744,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4427
    },
    {
      "epoch": 0.70848,
      "grad_norm": 0.10142013430595398,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4428
    },
    {
      "epoch": 0.70864,
      "grad_norm": 0.2141207903623581,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 4429
    },
    {
      "epoch": 0.7088,
      "grad_norm": 0.08882734924554825,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4430
    },
    {
      "epoch": 0.70896,
      "grad_norm": 0.11873715370893478,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4431
    },
    {
      "epoch": 0.70912,
      "grad_norm": 0.11674326658248901,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4432
    },
    {
      "epoch": 0.70928,
      "grad_norm": 0.14206984639167786,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 4433
    },
    {
      "epoch": 0.70944,
      "grad_norm": 0.13623139262199402,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 4434
    },
    {
      "epoch": 0.7096,
      "grad_norm": 0.1219252496957779,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 4435
    },
    {
      "epoch": 0.70976,
      "grad_norm": 0.10187257826328278,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4436
    },
    {
      "epoch": 0.70992,
      "grad_norm": 0.11390505731105804,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4437
    },
    {
      "epoch": 0.71008,
      "grad_norm": 0.09362845867872238,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4438
    },
    {
      "epoch": 0.71024,
      "grad_norm": 0.1083495244383812,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4439
    },
    {
      "epoch": 0.7104,
      "grad_norm": 0.10669024288654327,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4440
    },
    {
      "epoch": 0.71056,
      "grad_norm": 0.0954165905714035,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4441
    },
    {
      "epoch": 0.71072,
      "grad_norm": 0.10408394038677216,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 4442
    },
    {
      "epoch": 0.71088,
      "grad_norm": 0.114484041929245,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 4443
    },
    {
      "epoch": 0.71104,
      "grad_norm": 0.10414222627878189,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4444
    },
    {
      "epoch": 0.7112,
      "grad_norm": 0.1323714256286621,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4445
    },
    {
      "epoch": 0.71136,
      "grad_norm": 0.09758789837360382,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 4446
    },
    {
      "epoch": 0.71152,
      "grad_norm": 0.09290307015180588,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4447
    },
    {
      "epoch": 0.71168,
      "grad_norm": 0.09872277081012726,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4448
    },
    {
      "epoch": 0.71184,
      "grad_norm": 0.11792544275522232,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4449
    },
    {
      "epoch": 0.712,
      "grad_norm": 0.086411252617836,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4450
    },
    {
      "epoch": 0.71216,
      "grad_norm": 0.08530228585004807,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4451
    },
    {
      "epoch": 0.71232,
      "grad_norm": 0.07974813133478165,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4452
    },
    {
      "epoch": 0.71248,
      "grad_norm": 0.09406521916389465,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4453
    },
    {
      "epoch": 0.71264,
      "grad_norm": 0.08958321809768677,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4454
    },
    {
      "epoch": 0.7128,
      "grad_norm": 0.09605646133422852,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4455
    },
    {
      "epoch": 0.71296,
      "grad_norm": 0.12372317165136337,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 4456
    },
    {
      "epoch": 0.71312,
      "grad_norm": 0.09852804243564606,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4457
    },
    {
      "epoch": 0.71328,
      "grad_norm": 0.10714974254369736,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4458
    },
    {
      "epoch": 0.71344,
      "grad_norm": 0.13880367577075958,
      "learning_rate": 0.0001,
      "loss": 0.3414,
      "step": 4459
    },
    {
      "epoch": 0.7136,
      "grad_norm": 0.1047687903046608,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 4460
    },
    {
      "epoch": 0.71376,
      "grad_norm": 0.1049928143620491,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 4461
    },
    {
      "epoch": 0.71392,
      "grad_norm": 0.09231214225292206,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4462
    },
    {
      "epoch": 0.71408,
      "grad_norm": 0.09473712742328644,
      "learning_rate": 0.0001,
      "loss": 0.3378,
      "step": 4463
    },
    {
      "epoch": 0.71424,
      "grad_norm": 0.09855658560991287,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4464
    },
    {
      "epoch": 0.7144,
      "grad_norm": 0.10199076682329178,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4465
    },
    {
      "epoch": 0.71456,
      "grad_norm": 0.08897626399993896,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 4466
    },
    {
      "epoch": 0.71472,
      "grad_norm": 0.09848344326019287,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 4467
    },
    {
      "epoch": 0.71488,
      "grad_norm": 0.09275956451892853,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 4468
    },
    {
      "epoch": 0.71504,
      "grad_norm": 0.12218230962753296,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 4469
    },
    {
      "epoch": 0.7152,
      "grad_norm": 0.08609512448310852,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 4470
    },
    {
      "epoch": 0.71536,
      "grad_norm": 0.08943332731723785,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 4471
    },
    {
      "epoch": 0.71552,
      "grad_norm": 0.14402098953723907,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 4472
    },
    {
      "epoch": 0.71568,
      "grad_norm": 0.08838658034801483,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4473
    },
    {
      "epoch": 0.71584,
      "grad_norm": 0.09583982825279236,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 4474
    },
    {
      "epoch": 0.716,
      "grad_norm": 0.07819274067878723,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4475
    },
    {
      "epoch": 0.71616,
      "grad_norm": 0.11700984835624695,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4476
    },
    {
      "epoch": 0.71632,
      "grad_norm": 0.09678514301776886,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 4477
    },
    {
      "epoch": 0.71648,
      "grad_norm": 0.09794172644615173,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4478
    },
    {
      "epoch": 0.71664,
      "grad_norm": 0.11401475965976715,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4479
    },
    {
      "epoch": 0.7168,
      "grad_norm": 0.09400834143161774,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4480
    },
    {
      "epoch": 0.71696,
      "grad_norm": 0.10434866696596146,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 4481
    },
    {
      "epoch": 0.71712,
      "grad_norm": 0.12546466290950775,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 4482
    },
    {
      "epoch": 0.71728,
      "grad_norm": 0.0915616825222969,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4483
    },
    {
      "epoch": 0.71744,
      "grad_norm": 0.10499538481235504,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 4484
    },
    {
      "epoch": 0.7176,
      "grad_norm": 0.09372874349355698,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 4485
    },
    {
      "epoch": 0.71776,
      "grad_norm": 0.10852669179439545,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4486
    },
    {
      "epoch": 0.71792,
      "grad_norm": 0.10223736613988876,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 4487
    },
    {
      "epoch": 0.71808,
      "grad_norm": 0.09033491462469101,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4488
    },
    {
      "epoch": 0.71824,
      "grad_norm": 0.11700116097927094,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4489
    },
    {
      "epoch": 0.7184,
      "grad_norm": 0.09499383717775345,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 4490
    },
    {
      "epoch": 0.71856,
      "grad_norm": 0.10010018944740295,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 4491
    },
    {
      "epoch": 0.71872,
      "grad_norm": 0.09168343245983124,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 4492
    },
    {
      "epoch": 0.71888,
      "grad_norm": 0.15427084267139435,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4493
    },
    {
      "epoch": 0.71904,
      "grad_norm": 0.10337585210800171,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4494
    },
    {
      "epoch": 0.7192,
      "grad_norm": 0.09754770249128342,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4495
    },
    {
      "epoch": 0.71936,
      "grad_norm": 0.09742031991481781,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4496
    },
    {
      "epoch": 0.71952,
      "grad_norm": 0.09134463965892792,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4497
    },
    {
      "epoch": 0.71968,
      "grad_norm": 0.08754140883684158,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4498
    },
    {
      "epoch": 0.71984,
      "grad_norm": 0.0901758000254631,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4499
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.09428790211677551,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4500
    },
    {
      "epoch": 0.72,
      "eval_train_accuracy": 0.9958,
      "eval_train_loss": 0.3190433084964752,
      "eval_train_runtime": 4.8369,
      "eval_train_samples_per_second": 1033.73,
      "eval_train_steps_per_second": 13.025,
      "step": 4500
    },
    {
      "epoch": 0.72,
      "eval_test_accuracy": 0.9968,
      "eval_test_loss": 0.31788501143455505,
      "eval_test_runtime": 4.6447,
      "eval_test_samples_per_second": 1076.507,
      "eval_test_steps_per_second": 13.564,
      "step": 4500
    },
    {
      "epoch": 0.72016,
      "grad_norm": 0.11647436022758484,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4501
    },
    {
      "epoch": 0.72032,
      "grad_norm": 0.09802648425102234,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 4502
    },
    {
      "epoch": 0.72048,
      "grad_norm": 0.08767072856426239,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4503
    },
    {
      "epoch": 0.72064,
      "grad_norm": 0.0933879017829895,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 4504
    },
    {
      "epoch": 0.7208,
      "grad_norm": 0.1090947687625885,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 4505
    },
    {
      "epoch": 0.72096,
      "grad_norm": 0.11900785565376282,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 4506
    },
    {
      "epoch": 0.72112,
      "grad_norm": 0.10160255432128906,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4507
    },
    {
      "epoch": 0.72128,
      "grad_norm": 0.09098134934902191,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 4508
    },
    {
      "epoch": 0.72144,
      "grad_norm": 0.09434345364570618,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4509
    },
    {
      "epoch": 0.7216,
      "grad_norm": 0.09853003174066544,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 4510
    },
    {
      "epoch": 0.72176,
      "grad_norm": 0.11288778483867645,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4511
    },
    {
      "epoch": 0.72192,
      "grad_norm": 0.1112995445728302,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4512
    },
    {
      "epoch": 0.72208,
      "grad_norm": 0.08758176118135452,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4513
    },
    {
      "epoch": 0.72224,
      "grad_norm": 0.10242032259702682,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4514
    },
    {
      "epoch": 0.7224,
      "grad_norm": 0.10131535679101944,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4515
    },
    {
      "epoch": 0.72256,
      "grad_norm": 0.08744532614946365,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 4516
    },
    {
      "epoch": 0.72272,
      "grad_norm": 0.08697381615638733,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4517
    },
    {
      "epoch": 0.72288,
      "grad_norm": 0.10180410742759705,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4518
    },
    {
      "epoch": 0.72304,
      "grad_norm": 0.10400544852018356,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4519
    },
    {
      "epoch": 0.7232,
      "grad_norm": 0.08644077926874161,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4520
    },
    {
      "epoch": 0.72336,
      "grad_norm": 0.08610069006681442,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4521
    },
    {
      "epoch": 0.72352,
      "grad_norm": 0.10972646623849869,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4522
    },
    {
      "epoch": 0.72368,
      "grad_norm": 0.08548294007778168,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 4523
    },
    {
      "epoch": 0.72384,
      "grad_norm": 0.09601465612649918,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4524
    },
    {
      "epoch": 0.724,
      "grad_norm": 0.08821642398834229,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4525
    },
    {
      "epoch": 0.72416,
      "grad_norm": 0.10179071128368378,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4526
    },
    {
      "epoch": 0.72432,
      "grad_norm": 0.0838966816663742,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4527
    },
    {
      "epoch": 0.72448,
      "grad_norm": 0.09757991880178452,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 4528
    },
    {
      "epoch": 0.72464,
      "grad_norm": 0.08737663924694061,
      "learning_rate": 0.0001,
      "loss": 0.3043,
      "step": 4529
    },
    {
      "epoch": 0.7248,
      "grad_norm": 0.09545057266950607,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4530
    },
    {
      "epoch": 0.72496,
      "grad_norm": 0.10448112338781357,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 4531
    },
    {
      "epoch": 0.72512,
      "grad_norm": 0.09239165484905243,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 4532
    },
    {
      "epoch": 0.72528,
      "grad_norm": 0.10752718150615692,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 4533
    },
    {
      "epoch": 0.72544,
      "grad_norm": 0.09051060676574707,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4534
    },
    {
      "epoch": 0.7256,
      "grad_norm": 0.0912671834230423,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 4535
    },
    {
      "epoch": 0.72576,
      "grad_norm": 0.08350471407175064,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4536
    },
    {
      "epoch": 0.72592,
      "grad_norm": 0.22579044103622437,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4537
    },
    {
      "epoch": 0.72608,
      "grad_norm": 0.08338168263435364,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4538
    },
    {
      "epoch": 0.72624,
      "grad_norm": 0.09409993141889572,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4539
    },
    {
      "epoch": 0.7264,
      "grad_norm": 0.10949933528900146,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 4540
    },
    {
      "epoch": 0.72656,
      "grad_norm": 0.09937220066785812,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4541
    },
    {
      "epoch": 0.72672,
      "grad_norm": 0.10707997530698776,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 4542
    },
    {
      "epoch": 0.72688,
      "grad_norm": 0.10695898532867432,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4543
    },
    {
      "epoch": 0.72704,
      "grad_norm": 0.09311237931251526,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4544
    },
    {
      "epoch": 0.7272,
      "grad_norm": 0.07754053175449371,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 4545
    },
    {
      "epoch": 0.72736,
      "grad_norm": 0.09815267473459244,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 4546
    },
    {
      "epoch": 0.72752,
      "grad_norm": 0.1053374782204628,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 4547
    },
    {
      "epoch": 0.72768,
      "grad_norm": 0.10113541036844254,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 4548
    },
    {
      "epoch": 0.72784,
      "grad_norm": 0.09858417510986328,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4549
    },
    {
      "epoch": 0.728,
      "grad_norm": 0.13192938268184662,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4550
    },
    {
      "epoch": 0.72816,
      "grad_norm": 0.09319351613521576,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4551
    },
    {
      "epoch": 0.72832,
      "grad_norm": 0.14746253192424774,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4552
    },
    {
      "epoch": 0.72848,
      "grad_norm": 0.09989586472511292,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4553
    },
    {
      "epoch": 0.72864,
      "grad_norm": 0.12211331725120544,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4554
    },
    {
      "epoch": 0.7288,
      "grad_norm": 0.08412016183137894,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 4555
    },
    {
      "epoch": 0.72896,
      "grad_norm": 0.13078010082244873,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 4556
    },
    {
      "epoch": 0.72912,
      "grad_norm": 0.08852799981832504,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 4557
    },
    {
      "epoch": 0.72928,
      "grad_norm": 0.08791319280862808,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4558
    },
    {
      "epoch": 0.72944,
      "grad_norm": 0.10127446800470352,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4559
    },
    {
      "epoch": 0.7296,
      "grad_norm": 0.12079481035470963,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 4560
    },
    {
      "epoch": 0.72976,
      "grad_norm": 0.10265818238258362,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4561
    },
    {
      "epoch": 0.72992,
      "grad_norm": 0.1314108669757843,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4562
    },
    {
      "epoch": 0.73008,
      "grad_norm": 0.08492495119571686,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 4563
    },
    {
      "epoch": 0.73024,
      "grad_norm": 0.08907342702150345,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 4564
    },
    {
      "epoch": 0.7304,
      "grad_norm": 0.08850359916687012,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 4565
    },
    {
      "epoch": 0.73056,
      "grad_norm": 0.09938383102416992,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4566
    },
    {
      "epoch": 0.73072,
      "grad_norm": 0.08863101154565811,
      "learning_rate": 0.0001,
      "loss": 0.3001,
      "step": 4567
    },
    {
      "epoch": 0.73088,
      "grad_norm": 0.09436018019914627,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4568
    },
    {
      "epoch": 0.73104,
      "grad_norm": 0.10149236023426056,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 4569
    },
    {
      "epoch": 0.7312,
      "grad_norm": 0.09519431740045547,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4570
    },
    {
      "epoch": 0.73136,
      "grad_norm": 0.0817187950015068,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4571
    },
    {
      "epoch": 0.73152,
      "grad_norm": 0.0994081124663353,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 4572
    },
    {
      "epoch": 0.73168,
      "grad_norm": 0.09113871306180954,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 4573
    },
    {
      "epoch": 0.73184,
      "grad_norm": 0.09455030411481857,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 4574
    },
    {
      "epoch": 0.732,
      "grad_norm": 0.09432757645845413,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4575
    },
    {
      "epoch": 0.73216,
      "grad_norm": 0.09905003756284714,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4576
    },
    {
      "epoch": 0.73232,
      "grad_norm": 0.0922507718205452,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 4577
    },
    {
      "epoch": 0.73248,
      "grad_norm": 0.13801322877407074,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 4578
    },
    {
      "epoch": 0.73264,
      "grad_norm": 0.10530760884284973,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4579
    },
    {
      "epoch": 0.7328,
      "grad_norm": 0.094019316136837,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4580
    },
    {
      "epoch": 0.73296,
      "grad_norm": 0.08099206537008286,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4581
    },
    {
      "epoch": 0.73312,
      "grad_norm": 0.10843262076377869,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 4582
    },
    {
      "epoch": 0.73328,
      "grad_norm": 0.08869828283786774,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 4583
    },
    {
      "epoch": 0.73344,
      "grad_norm": 0.10925935953855515,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4584
    },
    {
      "epoch": 0.7336,
      "grad_norm": 0.08306252211332321,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4585
    },
    {
      "epoch": 0.73376,
      "grad_norm": 0.08572924882173538,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4586
    },
    {
      "epoch": 0.73392,
      "grad_norm": 0.09577006101608276,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 4587
    },
    {
      "epoch": 0.73408,
      "grad_norm": 0.10455434769392014,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4588
    },
    {
      "epoch": 0.73424,
      "grad_norm": 0.12961187958717346,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4589
    },
    {
      "epoch": 0.7344,
      "grad_norm": 0.08724425733089447,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4590
    },
    {
      "epoch": 0.73456,
      "grad_norm": 0.09790956228971481,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4591
    },
    {
      "epoch": 0.73472,
      "grad_norm": 0.17874176800251007,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4592
    },
    {
      "epoch": 0.73488,
      "grad_norm": 0.13193240761756897,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4593
    },
    {
      "epoch": 0.73504,
      "grad_norm": 0.11267860978841782,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4594
    },
    {
      "epoch": 0.7352,
      "grad_norm": 0.09504116326570511,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4595
    },
    {
      "epoch": 0.73536,
      "grad_norm": 0.08487685769796371,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4596
    },
    {
      "epoch": 0.73552,
      "grad_norm": 0.09419481456279755,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4597
    },
    {
      "epoch": 0.73568,
      "grad_norm": 0.11988533288240433,
      "learning_rate": 0.0001,
      "loss": 0.3306,
      "step": 4598
    },
    {
      "epoch": 0.73584,
      "grad_norm": 0.17405273020267487,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4599
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.10162734985351562,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4600
    },
    {
      "epoch": 0.736,
      "eval_train_accuracy": 0.9944,
      "eval_train_loss": 0.31908729672431946,
      "eval_train_runtime": 4.6798,
      "eval_train_samples_per_second": 1068.424,
      "eval_train_steps_per_second": 13.462,
      "step": 4600
    },
    {
      "epoch": 0.736,
      "eval_test_accuracy": 0.9946,
      "eval_test_loss": 0.3179720640182495,
      "eval_test_runtime": 4.7552,
      "eval_test_samples_per_second": 1051.491,
      "eval_test_steps_per_second": 13.249,
      "step": 4600
    },
    {
      "epoch": 0.73616,
      "grad_norm": 0.08740265667438507,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 4601
    },
    {
      "epoch": 0.73632,
      "grad_norm": 0.10103660821914673,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4602
    },
    {
      "epoch": 0.73648,
      "grad_norm": 0.09189178049564362,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4603
    },
    {
      "epoch": 0.73664,
      "grad_norm": 0.10209137201309204,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4604
    },
    {
      "epoch": 0.7368,
      "grad_norm": 0.1061415895819664,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4605
    },
    {
      "epoch": 0.73696,
      "grad_norm": 0.08774688839912415,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4606
    },
    {
      "epoch": 0.73712,
      "grad_norm": 0.08709856122732162,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4607
    },
    {
      "epoch": 0.73728,
      "grad_norm": 0.10903389006853104,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 4608
    },
    {
      "epoch": 0.73744,
      "grad_norm": 0.11202143877744675,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 4609
    },
    {
      "epoch": 0.7376,
      "grad_norm": 0.10461678355932236,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 4610
    },
    {
      "epoch": 0.73776,
      "grad_norm": 0.09088502824306488,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4611
    },
    {
      "epoch": 0.73792,
      "grad_norm": 0.08969022333621979,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4612
    },
    {
      "epoch": 0.73808,
      "grad_norm": 0.08102080225944519,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 4613
    },
    {
      "epoch": 0.73824,
      "grad_norm": 0.14566074311733246,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4614
    },
    {
      "epoch": 0.7384,
      "grad_norm": 0.09583462029695511,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4615
    },
    {
      "epoch": 0.73856,
      "grad_norm": 0.0955883264541626,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 4616
    },
    {
      "epoch": 0.73872,
      "grad_norm": 0.09681444615125656,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 4617
    },
    {
      "epoch": 0.73888,
      "grad_norm": 0.08714661747217178,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 4618
    },
    {
      "epoch": 0.73904,
      "grad_norm": 0.09922345727682114,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 4619
    },
    {
      "epoch": 0.7392,
      "grad_norm": 0.10286234319210052,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 4620
    },
    {
      "epoch": 0.73936,
      "grad_norm": 0.0913805142045021,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4621
    },
    {
      "epoch": 0.73952,
      "grad_norm": 0.10230699926614761,
      "learning_rate": 0.0001,
      "loss": 0.3319,
      "step": 4622
    },
    {
      "epoch": 0.73968,
      "grad_norm": 0.09183750301599503,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4623
    },
    {
      "epoch": 0.73984,
      "grad_norm": 0.087887704372406,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 4624
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.08411640673875809,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4625
    },
    {
      "epoch": 0.74016,
      "grad_norm": 0.08160457760095596,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4626
    },
    {
      "epoch": 0.74032,
      "grad_norm": 0.09517115354537964,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 4627
    },
    {
      "epoch": 0.74048,
      "grad_norm": 0.09271764755249023,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 4628
    },
    {
      "epoch": 0.74064,
      "grad_norm": 0.10139477252960205,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4629
    },
    {
      "epoch": 0.7408,
      "grad_norm": 0.11124764382839203,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4630
    },
    {
      "epoch": 0.74096,
      "grad_norm": 0.09953653067350388,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 4631
    },
    {
      "epoch": 0.74112,
      "grad_norm": 0.08487856388092041,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4632
    },
    {
      "epoch": 0.74128,
      "grad_norm": 0.08968520909547806,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4633
    },
    {
      "epoch": 0.74144,
      "grad_norm": 0.09369069337844849,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4634
    },
    {
      "epoch": 0.7416,
      "grad_norm": 0.09298352152109146,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4635
    },
    {
      "epoch": 0.74176,
      "grad_norm": 0.11164575815200806,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 4636
    },
    {
      "epoch": 0.74192,
      "grad_norm": 0.11221770197153091,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4637
    },
    {
      "epoch": 0.74208,
      "grad_norm": 0.08379275351762772,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 4638
    },
    {
      "epoch": 0.74224,
      "grad_norm": 0.08099037408828735,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 4639
    },
    {
      "epoch": 0.7424,
      "grad_norm": 0.10866004228591919,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4640
    },
    {
      "epoch": 0.74256,
      "grad_norm": 0.09348373115062714,
      "learning_rate": 0.0001,
      "loss": 0.3323,
      "step": 4641
    },
    {
      "epoch": 0.74272,
      "grad_norm": 0.1157204657793045,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4642
    },
    {
      "epoch": 0.74288,
      "grad_norm": 0.10225356370210648,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 4643
    },
    {
      "epoch": 0.74304,
      "grad_norm": 0.08753061294555664,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 4644
    },
    {
      "epoch": 0.7432,
      "grad_norm": 0.08547170460224152,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4645
    },
    {
      "epoch": 0.74336,
      "grad_norm": 0.09115193784236908,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4646
    },
    {
      "epoch": 0.74352,
      "grad_norm": 0.09705277532339096,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 4647
    },
    {
      "epoch": 0.74368,
      "grad_norm": 0.09426780045032501,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4648
    },
    {
      "epoch": 0.74384,
      "grad_norm": 0.08378059417009354,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4649
    },
    {
      "epoch": 0.744,
      "grad_norm": 0.1043458804488182,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 4650
    },
    {
      "epoch": 0.74416,
      "grad_norm": 0.08640117943286896,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4651
    },
    {
      "epoch": 0.74432,
      "grad_norm": 0.10534247010946274,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4652
    },
    {
      "epoch": 0.74448,
      "grad_norm": 0.09661270678043365,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4653
    },
    {
      "epoch": 0.74464,
      "grad_norm": 0.09799493849277496,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4654
    },
    {
      "epoch": 0.7448,
      "grad_norm": 0.12947659194469452,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4655
    },
    {
      "epoch": 0.74496,
      "grad_norm": 0.0940956175327301,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4656
    },
    {
      "epoch": 0.74512,
      "grad_norm": 0.11265525221824646,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 4657
    },
    {
      "epoch": 0.74528,
      "grad_norm": 0.09185464680194855,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4658
    },
    {
      "epoch": 0.74544,
      "grad_norm": 0.08761312812566757,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4659
    },
    {
      "epoch": 0.7456,
      "grad_norm": 0.08596163988113403,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 4660
    },
    {
      "epoch": 0.74576,
      "grad_norm": 0.09284862875938416,
      "learning_rate": 0.0001,
      "loss": 0.337,
      "step": 4661
    },
    {
      "epoch": 0.74592,
      "grad_norm": 0.08328630775213242,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4662
    },
    {
      "epoch": 0.74608,
      "grad_norm": 0.07789234817028046,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4663
    },
    {
      "epoch": 0.74624,
      "grad_norm": 0.0848863422870636,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 4664
    },
    {
      "epoch": 0.7464,
      "grad_norm": 0.0864405408501625,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4665
    },
    {
      "epoch": 0.74656,
      "grad_norm": 0.07857751101255417,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4666
    },
    {
      "epoch": 0.74672,
      "grad_norm": 0.08983537554740906,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4667
    },
    {
      "epoch": 0.74688,
      "grad_norm": 0.08493401855230331,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4668
    },
    {
      "epoch": 0.74704,
      "grad_norm": 0.09522931277751923,
      "learning_rate": 0.0001,
      "loss": 0.3376,
      "step": 4669
    },
    {
      "epoch": 0.7472,
      "grad_norm": 0.10311070084571838,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4670
    },
    {
      "epoch": 0.74736,
      "grad_norm": 0.09727238863706589,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4671
    },
    {
      "epoch": 0.74752,
      "grad_norm": 0.0906924158334732,
      "learning_rate": 0.0001,
      "loss": 0.3001,
      "step": 4672
    },
    {
      "epoch": 0.74768,
      "grad_norm": 0.08773338794708252,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4673
    },
    {
      "epoch": 0.74784,
      "grad_norm": 0.10929765552282333,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 4674
    },
    {
      "epoch": 0.748,
      "grad_norm": 0.08483419567346573,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4675
    },
    {
      "epoch": 0.74816,
      "grad_norm": 0.09721150249242783,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 4676
    },
    {
      "epoch": 0.74832,
      "grad_norm": 0.10760818421840668,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 4677
    },
    {
      "epoch": 0.74848,
      "grad_norm": 0.09207267314195633,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 4678
    },
    {
      "epoch": 0.74864,
      "grad_norm": 0.09157732129096985,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4679
    },
    {
      "epoch": 0.7488,
      "grad_norm": 0.08596112579107285,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 4680
    },
    {
      "epoch": 0.74896,
      "grad_norm": 0.08949578553438187,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 4681
    },
    {
      "epoch": 0.74912,
      "grad_norm": 0.08622784912586212,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4682
    },
    {
      "epoch": 0.74928,
      "grad_norm": 0.10143238306045532,
      "learning_rate": 0.0001,
      "loss": 0.3022,
      "step": 4683
    },
    {
      "epoch": 0.74944,
      "grad_norm": 0.10047988593578339,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4684
    },
    {
      "epoch": 0.7496,
      "grad_norm": 0.09275155514478683,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4685
    },
    {
      "epoch": 0.74976,
      "grad_norm": 0.08511120826005936,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 4686
    },
    {
      "epoch": 0.74992,
      "grad_norm": 0.08485717326402664,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 4687
    },
    {
      "epoch": 0.75008,
      "grad_norm": 0.08765453100204468,
      "learning_rate": 0.0001,
      "loss": 0.2955,
      "step": 4688
    },
    {
      "epoch": 0.75024,
      "grad_norm": 0.09129980206489563,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4689
    },
    {
      "epoch": 0.7504,
      "grad_norm": 0.08938176184892654,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4690
    },
    {
      "epoch": 0.75056,
      "grad_norm": 0.09593997150659561,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 4691
    },
    {
      "epoch": 0.75072,
      "grad_norm": 0.08014365285634995,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 4692
    },
    {
      "epoch": 0.75088,
      "grad_norm": 0.11626756936311722,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4693
    },
    {
      "epoch": 0.75104,
      "grad_norm": 0.09041862189769745,
      "learning_rate": 0.0001,
      "loss": 0.3075,
      "step": 4694
    },
    {
      "epoch": 0.7512,
      "grad_norm": 0.09463885426521301,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4695
    },
    {
      "epoch": 0.75136,
      "grad_norm": 0.09142075479030609,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 4696
    },
    {
      "epoch": 0.75152,
      "grad_norm": 0.1138330027461052,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 4697
    },
    {
      "epoch": 0.75168,
      "grad_norm": 0.08496144413948059,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4698
    },
    {
      "epoch": 0.75184,
      "grad_norm": 0.09077592194080353,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 4699
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.1049819141626358,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 4700
    },
    {
      "epoch": 0.752,
      "eval_train_accuracy": 0.9954,
      "eval_train_loss": 0.3190915882587433,
      "eval_train_runtime": 4.2848,
      "eval_train_samples_per_second": 1166.914,
      "eval_train_steps_per_second": 14.703,
      "step": 4700
    },
    {
      "epoch": 0.752,
      "eval_test_accuracy": 0.9976,
      "eval_test_loss": 0.3177819848060608,
      "eval_test_runtime": 4.8858,
      "eval_test_samples_per_second": 1023.374,
      "eval_test_steps_per_second": 12.895,
      "step": 4700
    },
    {
      "epoch": 0.75216,
      "grad_norm": 0.09613167494535446,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 4701
    },
    {
      "epoch": 0.75232,
      "grad_norm": 0.09910055994987488,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4702
    },
    {
      "epoch": 0.75248,
      "grad_norm": 0.11203658580780029,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 4703
    },
    {
      "epoch": 0.75264,
      "grad_norm": 0.08738203346729279,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4704
    },
    {
      "epoch": 0.7528,
      "grad_norm": 0.09365487098693848,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 4705
    },
    {
      "epoch": 0.75296,
      "grad_norm": 0.10604250431060791,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 4706
    },
    {
      "epoch": 0.75312,
      "grad_norm": 0.08652786910533905,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 4707
    },
    {
      "epoch": 0.75328,
      "grad_norm": 0.09472069889307022,
      "learning_rate": 0.0001,
      "loss": 0.2946,
      "step": 4708
    },
    {
      "epoch": 0.75344,
      "grad_norm": 0.09772183746099472,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 4709
    },
    {
      "epoch": 0.7536,
      "grad_norm": 0.09955662488937378,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4710
    },
    {
      "epoch": 0.75376,
      "grad_norm": 0.07974095642566681,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 4711
    },
    {
      "epoch": 0.75392,
      "grad_norm": 0.09774981439113617,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4712
    },
    {
      "epoch": 0.75408,
      "grad_norm": 0.10613851249217987,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 4713
    },
    {
      "epoch": 0.75424,
      "grad_norm": 0.08716420084238052,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4714
    },
    {
      "epoch": 0.7544,
      "grad_norm": 0.08845775574445724,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4715
    },
    {
      "epoch": 0.75456,
      "grad_norm": 0.0884355679154396,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4716
    },
    {
      "epoch": 0.75472,
      "grad_norm": 0.09620565921068192,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 4717
    },
    {
      "epoch": 0.75488,
      "grad_norm": 0.09044339507818222,
      "learning_rate": 0.0001,
      "loss": 0.3365,
      "step": 4718
    },
    {
      "epoch": 0.75504,
      "grad_norm": 0.08545408397912979,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4719
    },
    {
      "epoch": 0.7552,
      "grad_norm": 0.09522361308336258,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4720
    },
    {
      "epoch": 0.75536,
      "grad_norm": 0.10152226686477661,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 4721
    },
    {
      "epoch": 0.75552,
      "grad_norm": 0.08231279253959656,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 4722
    },
    {
      "epoch": 0.75568,
      "grad_norm": 0.08859466016292572,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4723
    },
    {
      "epoch": 0.75584,
      "grad_norm": 0.10374577343463898,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4724
    },
    {
      "epoch": 0.756,
      "grad_norm": 0.10638543218374252,
      "learning_rate": 0.0001,
      "loss": 0.3341,
      "step": 4725
    },
    {
      "epoch": 0.75616,
      "grad_norm": 0.09052299708127975,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4726
    },
    {
      "epoch": 0.75632,
      "grad_norm": 0.09297271072864532,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4727
    },
    {
      "epoch": 0.75648,
      "grad_norm": 0.09576566517353058,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4728
    },
    {
      "epoch": 0.75664,
      "grad_norm": 0.08643261343240738,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 4729
    },
    {
      "epoch": 0.7568,
      "grad_norm": 0.12462013959884644,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4730
    },
    {
      "epoch": 0.75696,
      "grad_norm": 0.08995287120342255,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 4731
    },
    {
      "epoch": 0.75712,
      "grad_norm": 0.11769884079694748,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4732
    },
    {
      "epoch": 0.75728,
      "grad_norm": 0.09247476607561111,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 4733
    },
    {
      "epoch": 0.75744,
      "grad_norm": 0.09671143442392349,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 4734
    },
    {
      "epoch": 0.7576,
      "grad_norm": 0.12686282396316528,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 4735
    },
    {
      "epoch": 0.75776,
      "grad_norm": 0.08804575353860855,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4736
    },
    {
      "epoch": 0.75792,
      "grad_norm": 0.08447720855474472,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4737
    },
    {
      "epoch": 0.75808,
      "grad_norm": 0.11460467427968979,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4738
    },
    {
      "epoch": 0.75824,
      "grad_norm": 0.09967582672834396,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4739
    },
    {
      "epoch": 0.7584,
      "grad_norm": 0.08033429086208344,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 4740
    },
    {
      "epoch": 0.75856,
      "grad_norm": 0.07757673412561417,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 4741
    },
    {
      "epoch": 0.75872,
      "grad_norm": 0.1033654734492302,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 4742
    },
    {
      "epoch": 0.75888,
      "grad_norm": 0.09079764038324356,
      "learning_rate": 0.0001,
      "loss": 0.3048,
      "step": 4743
    },
    {
      "epoch": 0.75904,
      "grad_norm": 0.08961652964353561,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 4744
    },
    {
      "epoch": 0.7592,
      "grad_norm": 0.08330495655536652,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 4745
    },
    {
      "epoch": 0.75936,
      "grad_norm": 0.0895666554570198,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4746
    },
    {
      "epoch": 0.75952,
      "grad_norm": 0.11454985290765762,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4747
    },
    {
      "epoch": 0.75968,
      "grad_norm": 0.10101428627967834,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4748
    },
    {
      "epoch": 0.75984,
      "grad_norm": 0.08432922512292862,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4749
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.0821579173207283,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4750
    },
    {
      "epoch": 0.76016,
      "grad_norm": 0.1354975551366806,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4751
    },
    {
      "epoch": 0.76032,
      "grad_norm": 0.08669261634349823,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4752
    },
    {
      "epoch": 0.76048,
      "grad_norm": 0.07421784102916718,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4753
    },
    {
      "epoch": 0.76064,
      "grad_norm": 0.08302470296621323,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 4754
    },
    {
      "epoch": 0.7608,
      "grad_norm": 0.08399253338575363,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4755
    },
    {
      "epoch": 0.76096,
      "grad_norm": 0.10353972762823105,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 4756
    },
    {
      "epoch": 0.76112,
      "grad_norm": 0.07773739099502563,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4757
    },
    {
      "epoch": 0.76128,
      "grad_norm": 0.08628598600625992,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4758
    },
    {
      "epoch": 0.76144,
      "grad_norm": 0.10295801609754562,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 4759
    },
    {
      "epoch": 0.7616,
      "grad_norm": 0.12861236929893494,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4760
    },
    {
      "epoch": 0.76176,
      "grad_norm": 0.10262233763933182,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4761
    },
    {
      "epoch": 0.76192,
      "grad_norm": 0.13094185292720795,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4762
    },
    {
      "epoch": 0.76208,
      "grad_norm": 0.08774740248918533,
      "learning_rate": 0.0001,
      "loss": 0.2988,
      "step": 4763
    },
    {
      "epoch": 0.76224,
      "grad_norm": 0.10639092326164246,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 4764
    },
    {
      "epoch": 0.7624,
      "grad_norm": 0.20396657288074493,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4765
    },
    {
      "epoch": 0.76256,
      "grad_norm": 0.08935409784317017,
      "learning_rate": 0.0001,
      "loss": 0.3007,
      "step": 4766
    },
    {
      "epoch": 0.76272,
      "grad_norm": 0.0871208906173706,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 4767
    },
    {
      "epoch": 0.76288,
      "grad_norm": 0.25484469532966614,
      "learning_rate": 0.0001,
      "loss": 0.3067,
      "step": 4768
    },
    {
      "epoch": 0.76304,
      "grad_norm": 0.10787945240736008,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 4769
    },
    {
      "epoch": 0.7632,
      "grad_norm": 0.3792290985584259,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 4770
    },
    {
      "epoch": 0.76336,
      "grad_norm": 0.15749233961105347,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 4771
    },
    {
      "epoch": 0.76352,
      "grad_norm": 1.5770177841186523,
      "learning_rate": 0.0001,
      "loss": 0.3577,
      "step": 4772
    },
    {
      "epoch": 0.76368,
      "grad_norm": 3.2684273719787598,
      "learning_rate": 0.0001,
      "loss": 0.459,
      "step": 4773
    },
    {
      "epoch": 0.76384,
      "grad_norm": 1.537289023399353,
      "learning_rate": 0.0001,
      "loss": 0.3676,
      "step": 4774
    },
    {
      "epoch": 0.764,
      "grad_norm": 0.914119303226471,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4775
    },
    {
      "epoch": 0.76416,
      "grad_norm": 2.243039846420288,
      "learning_rate": 0.0001,
      "loss": 0.4015,
      "step": 4776
    },
    {
      "epoch": 0.76432,
      "grad_norm": 1.5836855173110962,
      "learning_rate": 0.0001,
      "loss": 0.3781,
      "step": 4777
    },
    {
      "epoch": 0.76448,
      "grad_norm": 0.5582972168922424,
      "learning_rate": 0.0001,
      "loss": 0.3516,
      "step": 4778
    },
    {
      "epoch": 0.76464,
      "grad_norm": 2.1336557865142822,
      "learning_rate": 0.0001,
      "loss": 0.3588,
      "step": 4779
    },
    {
      "epoch": 0.7648,
      "grad_norm": 0.44309574365615845,
      "learning_rate": 0.0001,
      "loss": 0.3416,
      "step": 4780
    },
    {
      "epoch": 0.76496,
      "grad_norm": 0.8544241786003113,
      "learning_rate": 0.0001,
      "loss": 0.3408,
      "step": 4781
    },
    {
      "epoch": 0.76512,
      "grad_norm": 0.3121553957462311,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 4782
    },
    {
      "epoch": 0.76528,
      "grad_norm": 0.4272162616252899,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 4783
    },
    {
      "epoch": 0.76544,
      "grad_norm": 0.37591809034347534,
      "learning_rate": 0.0001,
      "loss": 0.345,
      "step": 4784
    },
    {
      "epoch": 0.7656,
      "grad_norm": 0.39795172214508057,
      "learning_rate": 0.0001,
      "loss": 0.3402,
      "step": 4785
    },
    {
      "epoch": 0.76576,
      "grad_norm": 0.2271474003791809,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 4786
    },
    {
      "epoch": 0.76592,
      "grad_norm": 0.3250545859336853,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 4787
    },
    {
      "epoch": 0.76608,
      "grad_norm": 0.1773812621831894,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4788
    },
    {
      "epoch": 0.76624,
      "grad_norm": 0.15519441664218903,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 4789
    },
    {
      "epoch": 0.7664,
      "grad_norm": 0.1562480479478836,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4790
    },
    {
      "epoch": 0.76656,
      "grad_norm": 0.24930553138256073,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 4791
    },
    {
      "epoch": 0.76672,
      "grad_norm": 0.22322702407836914,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 4792
    },
    {
      "epoch": 0.76688,
      "grad_norm": 0.19764314591884613,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 4793
    },
    {
      "epoch": 0.76704,
      "grad_norm": 0.18628928065299988,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 4794
    },
    {
      "epoch": 0.7672,
      "grad_norm": 0.14727897942066193,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4795
    },
    {
      "epoch": 0.76736,
      "grad_norm": 0.13227137923240662,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 4796
    },
    {
      "epoch": 0.76752,
      "grad_norm": 0.11555319279432297,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4797
    },
    {
      "epoch": 0.76768,
      "grad_norm": 0.1707257628440857,
      "learning_rate": 0.0001,
      "loss": 0.3362,
      "step": 4798
    },
    {
      "epoch": 0.76784,
      "grad_norm": 0.15465639531612396,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4799
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.24582625925540924,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 4800
    },
    {
      "epoch": 0.768,
      "eval_train_accuracy": 0.993,
      "eval_train_loss": 0.3204857110977173,
      "eval_train_runtime": 4.5433,
      "eval_train_samples_per_second": 1100.528,
      "eval_train_steps_per_second": 13.867,
      "step": 4800
    },
    {
      "epoch": 0.768,
      "eval_test_accuracy": 0.9954,
      "eval_test_loss": 0.31912118196487427,
      "eval_test_runtime": 4.8342,
      "eval_test_samples_per_second": 1034.301,
      "eval_test_steps_per_second": 13.032,
      "step": 4800
    },
    {
      "epoch": 0.76816,
      "grad_norm": 0.1384231299161911,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 4801
    },
    {
      "epoch": 0.76832,
      "grad_norm": 0.1366530954837799,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4802
    },
    {
      "epoch": 0.76848,
      "grad_norm": 0.20383745431900024,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 4803
    },
    {
      "epoch": 0.76864,
      "grad_norm": 0.10691969841718674,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4804
    },
    {
      "epoch": 0.7688,
      "grad_norm": 0.12400345504283905,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 4805
    },
    {
      "epoch": 0.76896,
      "grad_norm": 0.14926138520240784,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4806
    },
    {
      "epoch": 0.76912,
      "grad_norm": 0.1079876571893692,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 4807
    },
    {
      "epoch": 0.76928,
      "grad_norm": 0.14209792017936707,
      "learning_rate": 0.0001,
      "loss": 0.3388,
      "step": 4808
    },
    {
      "epoch": 0.76944,
      "grad_norm": 0.11462732404470444,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4809
    },
    {
      "epoch": 0.7696,
      "grad_norm": 0.10732949525117874,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 4810
    },
    {
      "epoch": 0.76976,
      "grad_norm": 0.11041686683893204,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4811
    },
    {
      "epoch": 0.76992,
      "grad_norm": 0.11254476755857468,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 4812
    },
    {
      "epoch": 0.77008,
      "grad_norm": 0.10024547576904297,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 4813
    },
    {
      "epoch": 0.77024,
      "grad_norm": 0.14314648509025574,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 4814
    },
    {
      "epoch": 0.7704,
      "grad_norm": 0.13561776280403137,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4815
    },
    {
      "epoch": 0.77056,
      "grad_norm": 0.1289338767528534,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 4816
    },
    {
      "epoch": 0.77072,
      "grad_norm": 0.10455493628978729,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 4817
    },
    {
      "epoch": 0.77088,
      "grad_norm": 0.1196906790137291,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 4818
    },
    {
      "epoch": 0.77104,
      "grad_norm": 0.11992813646793365,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 4819
    },
    {
      "epoch": 0.7712,
      "grad_norm": 0.11345738917589188,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 4820
    },
    {
      "epoch": 0.77136,
      "grad_norm": 0.12414460629224777,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4821
    },
    {
      "epoch": 0.77152,
      "grad_norm": 0.104376420378685,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 4822
    },
    {
      "epoch": 0.77168,
      "grad_norm": 0.10438279062509537,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4823
    },
    {
      "epoch": 0.77184,
      "grad_norm": 0.10064344853162766,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4824
    },
    {
      "epoch": 0.772,
      "grad_norm": 0.1099114790558815,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 4825
    },
    {
      "epoch": 0.77216,
      "grad_norm": 0.11649011820554733,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4826
    },
    {
      "epoch": 0.77232,
      "grad_norm": 0.11166156083345413,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4827
    },
    {
      "epoch": 0.77248,
      "grad_norm": 0.10716388374567032,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 4828
    },
    {
      "epoch": 0.77264,
      "grad_norm": 0.10209188610315323,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 4829
    },
    {
      "epoch": 0.7728,
      "grad_norm": 0.12508593499660492,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 4830
    },
    {
      "epoch": 0.77296,
      "grad_norm": 0.10430334508419037,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 4831
    },
    {
      "epoch": 0.77312,
      "grad_norm": 0.1469508409500122,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 4832
    },
    {
      "epoch": 0.77328,
      "grad_norm": 0.11329375952482224,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4833
    },
    {
      "epoch": 0.77344,
      "grad_norm": 0.10370690375566483,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4834
    },
    {
      "epoch": 0.7736,
      "grad_norm": 0.27916646003723145,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 4835
    },
    {
      "epoch": 0.77376,
      "grad_norm": 0.09860876202583313,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 4836
    },
    {
      "epoch": 0.77392,
      "grad_norm": 0.08643999695777893,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 4837
    },
    {
      "epoch": 0.77408,
      "grad_norm": 0.10856285691261292,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 4838
    },
    {
      "epoch": 0.77424,
      "grad_norm": 0.13753663003444672,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4839
    },
    {
      "epoch": 0.7744,
      "grad_norm": 0.13352376222610474,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 4840
    },
    {
      "epoch": 0.77456,
      "grad_norm": 0.11444846540689468,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 4841
    },
    {
      "epoch": 0.77472,
      "grad_norm": 0.10474361479282379,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 4842
    },
    {
      "epoch": 0.77488,
      "grad_norm": 0.11655612289905548,
      "learning_rate": 0.0001,
      "loss": 0.3374,
      "step": 4843
    },
    {
      "epoch": 0.77504,
      "grad_norm": 0.18873174488544464,
      "learning_rate": 0.0001,
      "loss": 0.3318,
      "step": 4844
    },
    {
      "epoch": 0.7752,
      "grad_norm": 0.10614168643951416,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4845
    },
    {
      "epoch": 0.77536,
      "grad_norm": 0.18086782097816467,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 4846
    },
    {
      "epoch": 0.77552,
      "grad_norm": 0.1349024474620819,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 4847
    },
    {
      "epoch": 0.77568,
      "grad_norm": 0.0985240638256073,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 4848
    },
    {
      "epoch": 0.77584,
      "grad_norm": 0.12810537219047546,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 4849
    },
    {
      "epoch": 0.776,
      "grad_norm": 0.10973110049962997,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 4850
    },
    {
      "epoch": 0.77616,
      "grad_norm": 0.11522618681192398,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4851
    },
    {
      "epoch": 0.77632,
      "grad_norm": 0.1562056690454483,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4852
    },
    {
      "epoch": 0.77648,
      "grad_norm": 0.1171346977353096,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4853
    },
    {
      "epoch": 0.77664,
      "grad_norm": 0.09374385327100754,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4854
    },
    {
      "epoch": 0.7768,
      "grad_norm": 0.09983653575181961,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 4855
    },
    {
      "epoch": 0.77696,
      "grad_norm": 0.09162121266126633,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4856
    },
    {
      "epoch": 0.77712,
      "grad_norm": 0.0934271588921547,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 4857
    },
    {
      "epoch": 0.77728,
      "grad_norm": 0.09217565506696701,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4858
    },
    {
      "epoch": 0.77744,
      "grad_norm": 0.10074356943368912,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4859
    },
    {
      "epoch": 0.7776,
      "grad_norm": 0.11429331451654434,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 4860
    },
    {
      "epoch": 0.77776,
      "grad_norm": 0.0951508954167366,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 4861
    },
    {
      "epoch": 0.77792,
      "grad_norm": 0.10101709514856339,
      "learning_rate": 0.0001,
      "loss": 0.3314,
      "step": 4862
    },
    {
      "epoch": 0.77808,
      "grad_norm": 0.09845547378063202,
      "learning_rate": 0.0001,
      "loss": 0.3031,
      "step": 4863
    },
    {
      "epoch": 0.77824,
      "grad_norm": 0.1044137105345726,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 4864
    },
    {
      "epoch": 0.7784,
      "grad_norm": 0.10064303874969482,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 4865
    },
    {
      "epoch": 0.77856,
      "grad_norm": 0.09500040858983994,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4866
    },
    {
      "epoch": 0.77872,
      "grad_norm": 0.08604655414819717,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 4867
    },
    {
      "epoch": 0.77888,
      "grad_norm": 0.09293153136968613,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4868
    },
    {
      "epoch": 0.77904,
      "grad_norm": 0.10565754771232605,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4869
    },
    {
      "epoch": 0.7792,
      "grad_norm": 0.10916978120803833,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4870
    },
    {
      "epoch": 0.77936,
      "grad_norm": 0.17685042321681976,
      "learning_rate": 0.0001,
      "loss": 0.3293,
      "step": 4871
    },
    {
      "epoch": 0.77952,
      "grad_norm": 0.08880025148391724,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 4872
    },
    {
      "epoch": 0.77968,
      "grad_norm": 0.15083195269107819,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 4873
    },
    {
      "epoch": 0.77984,
      "grad_norm": 0.08493831753730774,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 4874
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.10331351310014725,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 4875
    },
    {
      "epoch": 0.78016,
      "grad_norm": 0.08300274610519409,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 4876
    },
    {
      "epoch": 0.78032,
      "grad_norm": 0.10431111603975296,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 4877
    },
    {
      "epoch": 0.78048,
      "grad_norm": 0.11269956082105637,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 4878
    },
    {
      "epoch": 0.78064,
      "grad_norm": 0.11988617479801178,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 4879
    },
    {
      "epoch": 0.7808,
      "grad_norm": 0.09024703502655029,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4880
    },
    {
      "epoch": 0.78096,
      "grad_norm": 0.08698447048664093,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 4881
    },
    {
      "epoch": 0.78112,
      "grad_norm": 0.08129067718982697,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 4882
    },
    {
      "epoch": 0.78128,
      "grad_norm": 0.11548902094364166,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4883
    },
    {
      "epoch": 0.78144,
      "grad_norm": 0.08963558822870255,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4884
    },
    {
      "epoch": 0.7816,
      "grad_norm": 0.10058034956455231,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 4885
    },
    {
      "epoch": 0.78176,
      "grad_norm": 0.11207418143749237,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 4886
    },
    {
      "epoch": 0.78192,
      "grad_norm": 0.12816335260868073,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 4887
    },
    {
      "epoch": 0.78208,
      "grad_norm": 0.086207814514637,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4888
    },
    {
      "epoch": 0.78224,
      "grad_norm": 0.09640701115131378,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 4889
    },
    {
      "epoch": 0.7824,
      "grad_norm": 0.10079745203256607,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 4890
    },
    {
      "epoch": 0.78256,
      "grad_norm": 0.1238512471318245,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4891
    },
    {
      "epoch": 0.78272,
      "grad_norm": 0.11520782858133316,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 4892
    },
    {
      "epoch": 0.78288,
      "grad_norm": 0.08328002691268921,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4893
    },
    {
      "epoch": 0.78304,
      "grad_norm": 0.09782937914133072,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 4894
    },
    {
      "epoch": 0.7832,
      "grad_norm": 0.11975551396608353,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 4895
    },
    {
      "epoch": 0.78336,
      "grad_norm": 0.10134831070899963,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4896
    },
    {
      "epoch": 0.78352,
      "grad_norm": 0.08735881745815277,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4897
    },
    {
      "epoch": 0.78368,
      "grad_norm": 0.10918376594781876,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4898
    },
    {
      "epoch": 0.78384,
      "grad_norm": 0.1026046946644783,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 4899
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.11433179676532745,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 4900
    },
    {
      "epoch": 0.784,
      "eval_train_accuracy": 0.9946,
      "eval_train_loss": 0.3188247084617615,
      "eval_train_runtime": 4.3904,
      "eval_train_samples_per_second": 1138.846,
      "eval_train_steps_per_second": 14.349,
      "step": 4900
    },
    {
      "epoch": 0.784,
      "eval_test_accuracy": 0.9932,
      "eval_test_loss": 0.31748437881469727,
      "eval_test_runtime": 4.7813,
      "eval_test_samples_per_second": 1045.731,
      "eval_test_steps_per_second": 13.176,
      "step": 4900
    },
    {
      "epoch": 0.78416,
      "grad_norm": 0.10086668282747269,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 4901
    },
    {
      "epoch": 0.78432,
      "grad_norm": 0.10099458694458008,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 4902
    },
    {
      "epoch": 0.78448,
      "grad_norm": 0.09385483711957932,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4903
    },
    {
      "epoch": 0.78464,
      "grad_norm": 0.12549763917922974,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 4904
    },
    {
      "epoch": 0.7848,
      "grad_norm": 0.09488105773925781,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 4905
    },
    {
      "epoch": 0.78496,
      "grad_norm": 0.08914946019649506,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4906
    },
    {
      "epoch": 0.78512,
      "grad_norm": 0.09844493865966797,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 4907
    },
    {
      "epoch": 0.78528,
      "grad_norm": 0.14636731147766113,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4908
    },
    {
      "epoch": 0.78544,
      "grad_norm": 0.12220357358455658,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 4909
    },
    {
      "epoch": 0.7856,
      "grad_norm": 0.1127210333943367,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 4910
    },
    {
      "epoch": 0.78576,
      "grad_norm": 0.10468387603759766,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 4911
    },
    {
      "epoch": 0.78592,
      "grad_norm": 0.0874345451593399,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 4912
    },
    {
      "epoch": 0.78608,
      "grad_norm": 0.08957214653491974,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 4913
    },
    {
      "epoch": 0.78624,
      "grad_norm": 0.0843328908085823,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 4914
    },
    {
      "epoch": 0.7864,
      "grad_norm": 0.11390437185764313,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 4915
    },
    {
      "epoch": 0.78656,
      "grad_norm": 0.1039254441857338,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4916
    },
    {
      "epoch": 0.78672,
      "grad_norm": 0.08772982656955719,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 4917
    },
    {
      "epoch": 0.78688,
      "grad_norm": 0.10128934681415558,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 4918
    },
    {
      "epoch": 0.78704,
      "grad_norm": 0.09339425712823868,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 4919
    },
    {
      "epoch": 0.7872,
      "grad_norm": 0.09315010160207748,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 4920
    },
    {
      "epoch": 0.78736,
      "grad_norm": 0.09081374853849411,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 4921
    },
    {
      "epoch": 0.78752,
      "grad_norm": 0.11613871902227402,
      "learning_rate": 0.0001,
      "loss": 0.3285,
      "step": 4922
    },
    {
      "epoch": 0.78768,
      "grad_norm": 0.098422110080719,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 4923
    },
    {
      "epoch": 0.78784,
      "grad_norm": 0.08410276472568512,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 4924
    },
    {
      "epoch": 0.788,
      "grad_norm": 0.08568599820137024,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 4925
    },
    {
      "epoch": 0.78816,
      "grad_norm": 0.0950135588645935,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 4926
    },
    {
      "epoch": 0.78832,
      "grad_norm": 0.09998608380556107,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 4927
    },
    {
      "epoch": 0.78848,
      "grad_norm": 0.09910102933645248,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4928
    },
    {
      "epoch": 0.78864,
      "grad_norm": 0.08987825363874435,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 4929
    },
    {
      "epoch": 0.7888,
      "grad_norm": 0.10249117016792297,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 4930
    },
    {
      "epoch": 0.78896,
      "grad_norm": 0.09478149563074112,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4931
    },
    {
      "epoch": 0.78912,
      "grad_norm": 0.09624830633401871,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 4932
    },
    {
      "epoch": 0.78928,
      "grad_norm": 0.10172207653522491,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 4933
    },
    {
      "epoch": 0.78944,
      "grad_norm": 0.094708651304245,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 4934
    },
    {
      "epoch": 0.7896,
      "grad_norm": 0.08283574134111404,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4935
    },
    {
      "epoch": 0.78976,
      "grad_norm": 0.09147519618272781,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 4936
    },
    {
      "epoch": 0.78992,
      "grad_norm": 0.09146446734666824,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 4937
    },
    {
      "epoch": 0.79008,
      "grad_norm": 0.0967271700501442,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 4938
    },
    {
      "epoch": 0.79024,
      "grad_norm": 0.10138624161481857,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "step": 4939
    },
    {
      "epoch": 0.7904,
      "grad_norm": 0.08224252611398697,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4940
    },
    {
      "epoch": 0.79056,
      "grad_norm": 0.08139421045780182,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 4941
    },
    {
      "epoch": 0.79072,
      "grad_norm": 0.08917902410030365,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4942
    },
    {
      "epoch": 0.79088,
      "grad_norm": 0.09868939220905304,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 4943
    },
    {
      "epoch": 0.79104,
      "grad_norm": 0.12094339728355408,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 4944
    },
    {
      "epoch": 0.7912,
      "grad_norm": 0.09596320241689682,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 4945
    },
    {
      "epoch": 0.79136,
      "grad_norm": 0.09707621484994888,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 4946
    },
    {
      "epoch": 0.79152,
      "grad_norm": 0.08820607513189316,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 4947
    },
    {
      "epoch": 0.79168,
      "grad_norm": 0.08510152995586395,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4948
    },
    {
      "epoch": 0.79184,
      "grad_norm": 0.08392080664634705,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 4949
    },
    {
      "epoch": 0.792,
      "grad_norm": 0.09832148253917694,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 4950
    },
    {
      "epoch": 0.79216,
      "grad_norm": 0.09059302508831024,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 4951
    },
    {
      "epoch": 0.79232,
      "grad_norm": 0.09543486684560776,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 4952
    },
    {
      "epoch": 0.79248,
      "grad_norm": 0.08485331386327744,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 4953
    },
    {
      "epoch": 0.79264,
      "grad_norm": 0.0882594957947731,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 4954
    },
    {
      "epoch": 0.7928,
      "grad_norm": 0.09852428734302521,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 4955
    },
    {
      "epoch": 0.79296,
      "grad_norm": 0.08895737677812576,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 4956
    },
    {
      "epoch": 0.79312,
      "grad_norm": 0.08375488221645355,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 4957
    },
    {
      "epoch": 0.79328,
      "grad_norm": 0.08501444011926651,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4958
    },
    {
      "epoch": 0.79344,
      "grad_norm": 0.08725177496671677,
      "learning_rate": 0.0001,
      "loss": 0.3049,
      "step": 4959
    },
    {
      "epoch": 0.7936,
      "grad_norm": 0.09571154415607452,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 4960
    },
    {
      "epoch": 0.79376,
      "grad_norm": 0.08622036874294281,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 4961
    },
    {
      "epoch": 0.79392,
      "grad_norm": 0.08795792609453201,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 4962
    },
    {
      "epoch": 0.79408,
      "grad_norm": 0.08447200804948807,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 4963
    },
    {
      "epoch": 0.79424,
      "grad_norm": 0.08680392056703568,
      "learning_rate": 0.0001,
      "loss": 0.3355,
      "step": 4964
    },
    {
      "epoch": 0.7944,
      "grad_norm": 0.08922921866178513,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 4965
    },
    {
      "epoch": 0.79456,
      "grad_norm": 0.10466951876878738,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4966
    },
    {
      "epoch": 0.79472,
      "grad_norm": 0.13912852108478546,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 4967
    },
    {
      "epoch": 0.79488,
      "grad_norm": 0.08750949800014496,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 4968
    },
    {
      "epoch": 0.79504,
      "grad_norm": 0.09696832299232483,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 4969
    },
    {
      "epoch": 0.7952,
      "grad_norm": 0.0980362817645073,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 4970
    },
    {
      "epoch": 0.79536,
      "grad_norm": 0.09062964469194412,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4971
    },
    {
      "epoch": 0.79552,
      "grad_norm": 0.09196721017360687,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 4972
    },
    {
      "epoch": 0.79568,
      "grad_norm": 0.08830029517412186,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 4973
    },
    {
      "epoch": 0.79584,
      "grad_norm": 0.09771175682544708,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 4974
    },
    {
      "epoch": 0.796,
      "grad_norm": 0.09539026767015457,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 4975
    },
    {
      "epoch": 0.79616,
      "grad_norm": 0.0943714827299118,
      "learning_rate": 0.0001,
      "loss": 0.3309,
      "step": 4976
    },
    {
      "epoch": 0.79632,
      "grad_norm": 0.09515266120433807,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 4977
    },
    {
      "epoch": 0.79648,
      "grad_norm": 0.08793389797210693,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 4978
    },
    {
      "epoch": 0.79664,
      "grad_norm": 0.10008806735277176,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 4979
    },
    {
      "epoch": 0.7968,
      "grad_norm": 0.07714810222387314,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4980
    },
    {
      "epoch": 0.79696,
      "grad_norm": 0.10264909267425537,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 4981
    },
    {
      "epoch": 0.79712,
      "grad_norm": 0.09564785659313202,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 4982
    },
    {
      "epoch": 0.79728,
      "grad_norm": 0.08307547122240067,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 4983
    },
    {
      "epoch": 0.79744,
      "grad_norm": 0.08834308385848999,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 4984
    },
    {
      "epoch": 0.7976,
      "grad_norm": 0.10013481974601746,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 4985
    },
    {
      "epoch": 0.79776,
      "grad_norm": 0.11039945483207703,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 4986
    },
    {
      "epoch": 0.79792,
      "grad_norm": 0.09663939476013184,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 4987
    },
    {
      "epoch": 0.79808,
      "grad_norm": 0.10865239799022675,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 4988
    },
    {
      "epoch": 0.79824,
      "grad_norm": 0.08841155469417572,
      "learning_rate": 0.0001,
      "loss": 0.3307,
      "step": 4989
    },
    {
      "epoch": 0.7984,
      "grad_norm": 0.09621857851743698,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 4990
    },
    {
      "epoch": 0.79856,
      "grad_norm": 0.11561285704374313,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 4991
    },
    {
      "epoch": 0.79872,
      "grad_norm": 0.11703277379274368,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 4992
    },
    {
      "epoch": 0.79888,
      "grad_norm": 0.10319402068853378,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 4993
    },
    {
      "epoch": 0.79904,
      "grad_norm": 0.0853702574968338,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 4994
    },
    {
      "epoch": 0.7992,
      "grad_norm": 0.08663171529769897,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 4995
    },
    {
      "epoch": 0.79936,
      "grad_norm": 0.0878574401140213,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 4996
    },
    {
      "epoch": 0.79952,
      "grad_norm": 0.09463632106781006,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 4997
    },
    {
      "epoch": 0.79968,
      "grad_norm": 0.09608563780784607,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 4998
    },
    {
      "epoch": 0.79984,
      "grad_norm": 0.10360148549079895,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 4999
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.09366890043020248,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5000
    },
    {
      "epoch": 0.8,
      "eval_train_accuracy": 0.994,
      "eval_train_loss": 0.31866806745529175,
      "eval_train_runtime": 4.4098,
      "eval_train_samples_per_second": 1133.849,
      "eval_train_steps_per_second": 14.286,
      "step": 5000
    },
    {
      "epoch": 0.8,
      "eval_test_accuracy": 0.9914,
      "eval_test_loss": 0.31745582818984985,
      "eval_test_runtime": 4.4681,
      "eval_test_samples_per_second": 1119.049,
      "eval_test_steps_per_second": 14.1,
      "step": 5000
    },
    {
      "epoch": 0.80016,
      "grad_norm": 0.09551219642162323,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5001
    },
    {
      "epoch": 0.80032,
      "grad_norm": 0.08445841819047928,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5002
    },
    {
      "epoch": 0.80048,
      "grad_norm": 0.09136825054883957,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5003
    },
    {
      "epoch": 0.80064,
      "grad_norm": 0.10538504272699356,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5004
    },
    {
      "epoch": 0.8008,
      "grad_norm": 0.09968700259923935,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5005
    },
    {
      "epoch": 0.80096,
      "grad_norm": 0.09259788691997528,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5006
    },
    {
      "epoch": 0.80112,
      "grad_norm": 0.14153337478637695,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5007
    },
    {
      "epoch": 0.80128,
      "grad_norm": 0.08651275187730789,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5008
    },
    {
      "epoch": 0.80144,
      "grad_norm": 0.09179575741291046,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5009
    },
    {
      "epoch": 0.8016,
      "grad_norm": 0.1003040000796318,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 5010
    },
    {
      "epoch": 0.80176,
      "grad_norm": 0.09834509342908859,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5011
    },
    {
      "epoch": 0.80192,
      "grad_norm": 0.08735908567905426,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5012
    },
    {
      "epoch": 0.80208,
      "grad_norm": 0.08670365065336227,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5013
    },
    {
      "epoch": 0.80224,
      "grad_norm": 0.08428778499364853,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5014
    },
    {
      "epoch": 0.8024,
      "grad_norm": 0.1625930368900299,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 5015
    },
    {
      "epoch": 0.80256,
      "grad_norm": 0.09183500707149506,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5016
    },
    {
      "epoch": 0.80272,
      "grad_norm": 0.08732911199331284,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5017
    },
    {
      "epoch": 0.80288,
      "grad_norm": 0.09959534555673599,
      "learning_rate": 0.0001,
      "loss": 0.2992,
      "step": 5018
    },
    {
      "epoch": 0.80304,
      "grad_norm": 0.09394816309213638,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 5019
    },
    {
      "epoch": 0.8032,
      "grad_norm": 0.12634670734405518,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5020
    },
    {
      "epoch": 0.80336,
      "grad_norm": 0.08554095774888992,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5021
    },
    {
      "epoch": 0.80352,
      "grad_norm": 0.09003924578428268,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5022
    },
    {
      "epoch": 0.80368,
      "grad_norm": 0.09134276956319809,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 5023
    },
    {
      "epoch": 0.80384,
      "grad_norm": 0.13121110200881958,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 5024
    },
    {
      "epoch": 0.804,
      "grad_norm": 0.0910525843501091,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 5025
    },
    {
      "epoch": 0.80416,
      "grad_norm": 0.08240585029125214,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5026
    },
    {
      "epoch": 0.80432,
      "grad_norm": 0.08846616744995117,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5027
    },
    {
      "epoch": 0.80448,
      "grad_norm": 0.10902832448482513,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5028
    },
    {
      "epoch": 0.80464,
      "grad_norm": 0.09753473103046417,
      "learning_rate": 0.0001,
      "loss": 0.2996,
      "step": 5029
    },
    {
      "epoch": 0.8048,
      "grad_norm": 0.08728086203336716,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5030
    },
    {
      "epoch": 0.80496,
      "grad_norm": 0.09220942854881287,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5031
    },
    {
      "epoch": 0.80512,
      "grad_norm": 0.10172832012176514,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 5032
    },
    {
      "epoch": 0.80528,
      "grad_norm": 0.09733505547046661,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 5033
    },
    {
      "epoch": 0.80544,
      "grad_norm": 0.11249060928821564,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5034
    },
    {
      "epoch": 0.8056,
      "grad_norm": 0.09780292958021164,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5035
    },
    {
      "epoch": 0.80576,
      "grad_norm": 0.11729934811592102,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5036
    },
    {
      "epoch": 0.80592,
      "grad_norm": 0.09379757195711136,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5037
    },
    {
      "epoch": 0.80608,
      "grad_norm": 0.08987466990947723,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5038
    },
    {
      "epoch": 0.80624,
      "grad_norm": 0.08565675467252731,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5039
    },
    {
      "epoch": 0.8064,
      "grad_norm": 0.10559573769569397,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5040
    },
    {
      "epoch": 0.80656,
      "grad_norm": 0.12208446860313416,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5041
    },
    {
      "epoch": 0.80672,
      "grad_norm": 0.0814911276102066,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5042
    },
    {
      "epoch": 0.80688,
      "grad_norm": 0.08024351298809052,
      "learning_rate": 0.0001,
      "loss": 0.3011,
      "step": 5043
    },
    {
      "epoch": 0.80704,
      "grad_norm": 0.10384313762187958,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5044
    },
    {
      "epoch": 0.8072,
      "grad_norm": 0.08415801078081131,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5045
    },
    {
      "epoch": 0.80736,
      "grad_norm": 0.09506968408823013,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5046
    },
    {
      "epoch": 0.80752,
      "grad_norm": 0.1198795959353447,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 5047
    },
    {
      "epoch": 0.80768,
      "grad_norm": 0.08846960961818695,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 5048
    },
    {
      "epoch": 0.80784,
      "grad_norm": 0.08693090826272964,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5049
    },
    {
      "epoch": 0.808,
      "grad_norm": 0.11622575670480728,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5050
    },
    {
      "epoch": 0.80816,
      "grad_norm": 0.10692652314901352,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5051
    },
    {
      "epoch": 0.80832,
      "grad_norm": 0.08644638955593109,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5052
    },
    {
      "epoch": 0.80848,
      "grad_norm": 0.0794009268283844,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5053
    },
    {
      "epoch": 0.80864,
      "grad_norm": 0.09137442708015442,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5054
    },
    {
      "epoch": 0.8088,
      "grad_norm": 0.08136992901563644,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5055
    },
    {
      "epoch": 0.80896,
      "grad_norm": 0.07951422035694122,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5056
    },
    {
      "epoch": 0.80912,
      "grad_norm": 0.09031828492879868,
      "learning_rate": 0.0001,
      "loss": 0.336,
      "step": 5057
    },
    {
      "epoch": 0.80928,
      "grad_norm": 0.10343579202890396,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 5058
    },
    {
      "epoch": 0.80944,
      "grad_norm": 0.11107916384935379,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5059
    },
    {
      "epoch": 0.8096,
      "grad_norm": 0.08987952023744583,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5060
    },
    {
      "epoch": 0.80976,
      "grad_norm": 0.0909866914153099,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5061
    },
    {
      "epoch": 0.80992,
      "grad_norm": 0.0929773673415184,
      "learning_rate": 0.0001,
      "loss": 0.3089,
      "step": 5062
    },
    {
      "epoch": 0.81008,
      "grad_norm": 0.10751919448375702,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 5063
    },
    {
      "epoch": 0.81024,
      "grad_norm": 0.08710049837827682,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5064
    },
    {
      "epoch": 0.8104,
      "grad_norm": 0.0917166993021965,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5065
    },
    {
      "epoch": 0.81056,
      "grad_norm": 0.08921311795711517,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5066
    },
    {
      "epoch": 0.81072,
      "grad_norm": 0.0974995419383049,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5067
    },
    {
      "epoch": 0.81088,
      "grad_norm": 0.10705617815256119,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 5068
    },
    {
      "epoch": 0.81104,
      "grad_norm": 0.0882636159658432,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5069
    },
    {
      "epoch": 0.8112,
      "grad_norm": 0.09356384724378586,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5070
    },
    {
      "epoch": 0.81136,
      "grad_norm": 0.10544760525226593,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 5071
    },
    {
      "epoch": 0.81152,
      "grad_norm": 0.08905152976512909,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5072
    },
    {
      "epoch": 0.81168,
      "grad_norm": 0.09101969003677368,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5073
    },
    {
      "epoch": 0.81184,
      "grad_norm": 0.08008582144975662,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5074
    },
    {
      "epoch": 0.812,
      "grad_norm": 0.09297715127468109,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5075
    },
    {
      "epoch": 0.81216,
      "grad_norm": 0.08759532123804092,
      "learning_rate": 0.0001,
      "loss": 0.3033,
      "step": 5076
    },
    {
      "epoch": 0.81232,
      "grad_norm": 0.10618048906326294,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5077
    },
    {
      "epoch": 0.81248,
      "grad_norm": 0.08580777049064636,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5078
    },
    {
      "epoch": 0.81264,
      "grad_norm": 0.08768175542354584,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5079
    },
    {
      "epoch": 0.8128,
      "grad_norm": 0.09932674467563629,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 5080
    },
    {
      "epoch": 0.81296,
      "grad_norm": 0.0861344188451767,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5081
    },
    {
      "epoch": 0.81312,
      "grad_norm": 0.08979775011539459,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5082
    },
    {
      "epoch": 0.81328,
      "grad_norm": 0.0894060730934143,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5083
    },
    {
      "epoch": 0.81344,
      "grad_norm": 0.08623382449150085,
      "learning_rate": 0.0001,
      "loss": 0.3036,
      "step": 5084
    },
    {
      "epoch": 0.8136,
      "grad_norm": 0.08498960733413696,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5085
    },
    {
      "epoch": 0.81376,
      "grad_norm": 0.0930132195353508,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5086
    },
    {
      "epoch": 0.81392,
      "grad_norm": 0.08968270570039749,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5087
    },
    {
      "epoch": 0.81408,
      "grad_norm": 0.08721902966499329,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5088
    },
    {
      "epoch": 0.81424,
      "grad_norm": 0.08850039541721344,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5089
    },
    {
      "epoch": 0.8144,
      "grad_norm": 0.09856928884983063,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5090
    },
    {
      "epoch": 0.81456,
      "grad_norm": 0.10065120458602905,
      "learning_rate": 0.0001,
      "loss": 0.3315,
      "step": 5091
    },
    {
      "epoch": 0.81472,
      "grad_norm": 0.09178777039051056,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5092
    },
    {
      "epoch": 0.81488,
      "grad_norm": 0.08227062225341797,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5093
    },
    {
      "epoch": 0.81504,
      "grad_norm": 0.09584279358386993,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 5094
    },
    {
      "epoch": 0.8152,
      "grad_norm": 0.08688873052597046,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 5095
    },
    {
      "epoch": 0.81536,
      "grad_norm": 0.08637923747301102,
      "learning_rate": 0.0001,
      "loss": 0.3076,
      "step": 5096
    },
    {
      "epoch": 0.81552,
      "grad_norm": 0.08105123043060303,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5097
    },
    {
      "epoch": 0.81568,
      "grad_norm": 0.09411504119634628,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5098
    },
    {
      "epoch": 0.81584,
      "grad_norm": 0.08475331217050552,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 5099
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.09478020668029785,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 5100
    },
    {
      "epoch": 0.816,
      "eval_train_accuracy": 0.994,
      "eval_train_loss": 0.3184407353401184,
      "eval_train_runtime": 4.3472,
      "eval_train_samples_per_second": 1150.167,
      "eval_train_steps_per_second": 14.492,
      "step": 5100
    },
    {
      "epoch": 0.816,
      "eval_test_accuracy": 0.9954,
      "eval_test_loss": 0.31704336404800415,
      "eval_test_runtime": 4.7487,
      "eval_test_samples_per_second": 1052.917,
      "eval_test_steps_per_second": 13.267,
      "step": 5100
    },
    {
      "epoch": 0.81616,
      "grad_norm": 0.0855853408575058,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5101
    },
    {
      "epoch": 0.81632,
      "grad_norm": 0.10725446790456772,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 5102
    },
    {
      "epoch": 0.81648,
      "grad_norm": 0.09877315908670425,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5103
    },
    {
      "epoch": 0.81664,
      "grad_norm": 0.07845265418291092,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5104
    },
    {
      "epoch": 0.8168,
      "grad_norm": 0.09471181035041809,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5105
    },
    {
      "epoch": 0.81696,
      "grad_norm": 0.10346268862485886,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5106
    },
    {
      "epoch": 0.81712,
      "grad_norm": 0.10211725533008575,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5107
    },
    {
      "epoch": 0.81728,
      "grad_norm": 0.08474289625883102,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 5108
    },
    {
      "epoch": 0.81744,
      "grad_norm": 0.07892286777496338,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5109
    },
    {
      "epoch": 0.8176,
      "grad_norm": 0.08616027981042862,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5110
    },
    {
      "epoch": 0.81776,
      "grad_norm": 0.1297927051782608,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 5111
    },
    {
      "epoch": 0.81792,
      "grad_norm": 0.0910802036523819,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5112
    },
    {
      "epoch": 0.81808,
      "grad_norm": 0.08663970977067947,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 5113
    },
    {
      "epoch": 0.81824,
      "grad_norm": 0.09101357311010361,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 5114
    },
    {
      "epoch": 0.8184,
      "grad_norm": 0.08933225274085999,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5115
    },
    {
      "epoch": 0.81856,
      "grad_norm": 0.09251472353935242,
      "learning_rate": 0.0001,
      "loss": 0.3014,
      "step": 5116
    },
    {
      "epoch": 0.81872,
      "grad_norm": 0.10740511864423752,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5117
    },
    {
      "epoch": 0.81888,
      "grad_norm": 0.10017968714237213,
      "learning_rate": 0.0001,
      "loss": 0.3349,
      "step": 5118
    },
    {
      "epoch": 0.81904,
      "grad_norm": 0.09506126493215561,
      "learning_rate": 0.0001,
      "loss": 0.3019,
      "step": 5119
    },
    {
      "epoch": 0.8192,
      "grad_norm": 0.07915867120027542,
      "learning_rate": 0.0001,
      "loss": 0.3051,
      "step": 5120
    },
    {
      "epoch": 0.81936,
      "grad_norm": 0.07979176193475723,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5121
    },
    {
      "epoch": 0.81952,
      "grad_norm": 0.09743945300579071,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 5122
    },
    {
      "epoch": 0.81968,
      "grad_norm": 0.10345318913459778,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5123
    },
    {
      "epoch": 0.81984,
      "grad_norm": 0.09037943184375763,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5124
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.09651589393615723,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5125
    },
    {
      "epoch": 0.82016,
      "grad_norm": 0.0906033143401146,
      "learning_rate": 0.0001,
      "loss": 0.3009,
      "step": 5126
    },
    {
      "epoch": 0.82032,
      "grad_norm": 0.0838095173239708,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5127
    },
    {
      "epoch": 0.82048,
      "grad_norm": 0.08893038332462311,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5128
    },
    {
      "epoch": 0.82064,
      "grad_norm": 0.09726157784461975,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 5129
    },
    {
      "epoch": 0.8208,
      "grad_norm": 0.10545772314071655,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 5130
    },
    {
      "epoch": 0.82096,
      "grad_norm": 0.09503746032714844,
      "learning_rate": 0.0001,
      "loss": 0.3056,
      "step": 5131
    },
    {
      "epoch": 0.82112,
      "grad_norm": 0.090279221534729,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5132
    },
    {
      "epoch": 0.82128,
      "grad_norm": 0.10420023649930954,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 5133
    },
    {
      "epoch": 0.82144,
      "grad_norm": 0.10158950835466385,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5134
    },
    {
      "epoch": 0.8216,
      "grad_norm": 0.08460650593042374,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5135
    },
    {
      "epoch": 0.82176,
      "grad_norm": 0.08675636351108551,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5136
    },
    {
      "epoch": 0.82192,
      "grad_norm": 0.08661255985498428,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5137
    },
    {
      "epoch": 0.82208,
      "grad_norm": 0.09562702476978302,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5138
    },
    {
      "epoch": 0.82224,
      "grad_norm": 0.0921904444694519,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 5139
    },
    {
      "epoch": 0.8224,
      "grad_norm": 0.08826535195112228,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5140
    },
    {
      "epoch": 0.82256,
      "grad_norm": 0.09343822300434113,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5141
    },
    {
      "epoch": 0.82272,
      "grad_norm": 0.09326218068599701,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5142
    },
    {
      "epoch": 0.82288,
      "grad_norm": 0.09068894386291504,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5143
    },
    {
      "epoch": 0.82304,
      "grad_norm": 0.08545368909835815,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5144
    },
    {
      "epoch": 0.8232,
      "grad_norm": 0.08079798519611359,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5145
    },
    {
      "epoch": 0.82336,
      "grad_norm": 0.09516076743602753,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 5146
    },
    {
      "epoch": 0.82352,
      "grad_norm": 0.0894613265991211,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5147
    },
    {
      "epoch": 0.82368,
      "grad_norm": 0.08796482533216476,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5148
    },
    {
      "epoch": 0.82384,
      "grad_norm": 0.08517075330018997,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5149
    },
    {
      "epoch": 0.824,
      "grad_norm": 0.08228468149900436,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5150
    },
    {
      "epoch": 0.82416,
      "grad_norm": 0.08048657327890396,
      "learning_rate": 0.0001,
      "loss": 0.3017,
      "step": 5151
    },
    {
      "epoch": 0.82432,
      "grad_norm": 0.09075888991355896,
      "learning_rate": 0.0001,
      "loss": 0.331,
      "step": 5152
    },
    {
      "epoch": 0.82448,
      "grad_norm": 0.08572172373533249,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5153
    },
    {
      "epoch": 0.82464,
      "grad_norm": 0.09380882233381271,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5154
    },
    {
      "epoch": 0.8248,
      "grad_norm": 0.08581963181495667,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5155
    },
    {
      "epoch": 0.82496,
      "grad_norm": 0.11004650592803955,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5156
    },
    {
      "epoch": 0.82512,
      "grad_norm": 0.08557622134685516,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5157
    },
    {
      "epoch": 0.82528,
      "grad_norm": 0.08658608049154282,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5158
    },
    {
      "epoch": 0.82544,
      "grad_norm": 0.08487065136432648,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5159
    },
    {
      "epoch": 0.8256,
      "grad_norm": 0.08918175846338272,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5160
    },
    {
      "epoch": 0.82576,
      "grad_norm": 0.09456969797611237,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5161
    },
    {
      "epoch": 0.82592,
      "grad_norm": 0.08445755392313004,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 5162
    },
    {
      "epoch": 0.82608,
      "grad_norm": 0.09084457159042358,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5163
    },
    {
      "epoch": 0.82624,
      "grad_norm": 0.08924268931150436,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5164
    },
    {
      "epoch": 0.8264,
      "grad_norm": 0.09669320285320282,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5165
    },
    {
      "epoch": 0.82656,
      "grad_norm": 0.08768032491207123,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5166
    },
    {
      "epoch": 0.82672,
      "grad_norm": 0.087617889046669,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5167
    },
    {
      "epoch": 0.82688,
      "grad_norm": 0.09123419225215912,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5168
    },
    {
      "epoch": 0.82704,
      "grad_norm": 0.08651856333017349,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5169
    },
    {
      "epoch": 0.8272,
      "grad_norm": 0.09306824207305908,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5170
    },
    {
      "epoch": 0.82736,
      "grad_norm": 0.18490543961524963,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5171
    },
    {
      "epoch": 0.82752,
      "grad_norm": 0.10222750902175903,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5172
    },
    {
      "epoch": 0.82768,
      "grad_norm": 0.086286760866642,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5173
    },
    {
      "epoch": 0.82784,
      "grad_norm": 0.09399499744176865,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 5174
    },
    {
      "epoch": 0.828,
      "grad_norm": 0.0784485936164856,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5175
    },
    {
      "epoch": 0.82816,
      "grad_norm": 0.0937870591878891,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 5176
    },
    {
      "epoch": 0.82832,
      "grad_norm": 0.1389462649822235,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5177
    },
    {
      "epoch": 0.82848,
      "grad_norm": 0.08276539295911789,
      "learning_rate": 0.0001,
      "loss": 0.3013,
      "step": 5178
    },
    {
      "epoch": 0.82864,
      "grad_norm": 0.10503154247999191,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5179
    },
    {
      "epoch": 0.8288,
      "grad_norm": 0.09633313864469528,
      "learning_rate": 0.0001,
      "loss": 0.3335,
      "step": 5180
    },
    {
      "epoch": 0.82896,
      "grad_norm": 0.07648599147796631,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 5181
    },
    {
      "epoch": 0.82912,
      "grad_norm": 0.0885428860783577,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5182
    },
    {
      "epoch": 0.82928,
      "grad_norm": 0.0968417078256607,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5183
    },
    {
      "epoch": 0.82944,
      "grad_norm": 0.10435530543327332,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5184
    },
    {
      "epoch": 0.8296,
      "grad_norm": 0.11644434183835983,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5185
    },
    {
      "epoch": 0.82976,
      "grad_norm": 0.08947423845529556,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5186
    },
    {
      "epoch": 0.82992,
      "grad_norm": 0.08145859092473984,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5187
    },
    {
      "epoch": 0.83008,
      "grad_norm": 0.0884641483426094,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5188
    },
    {
      "epoch": 0.83024,
      "grad_norm": 0.12575241923332214,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 5189
    },
    {
      "epoch": 0.8304,
      "grad_norm": 0.09404688328504562,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5190
    },
    {
      "epoch": 0.83056,
      "grad_norm": 0.1624474674463272,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5191
    },
    {
      "epoch": 0.83072,
      "grad_norm": 0.0978095680475235,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5192
    },
    {
      "epoch": 0.83088,
      "grad_norm": 0.14280618727207184,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 5193
    },
    {
      "epoch": 0.83104,
      "grad_norm": 0.09575767070055008,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5194
    },
    {
      "epoch": 0.8312,
      "grad_norm": 0.09649721533060074,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 5195
    },
    {
      "epoch": 0.83136,
      "grad_norm": 0.08583376556634903,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5196
    },
    {
      "epoch": 0.83152,
      "grad_norm": 0.12636496126651764,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5197
    },
    {
      "epoch": 0.83168,
      "grad_norm": 0.09209879487752914,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 5198
    },
    {
      "epoch": 0.83184,
      "grad_norm": 0.09790763258934021,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5199
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.10822033137083054,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 5200
    },
    {
      "epoch": 0.832,
      "eval_train_accuracy": 0.9938,
      "eval_train_loss": 0.3183784782886505,
      "eval_train_runtime": 4.6554,
      "eval_train_samples_per_second": 1074.028,
      "eval_train_steps_per_second": 13.533,
      "step": 5200
    },
    {
      "epoch": 0.832,
      "eval_test_accuracy": 0.9942,
      "eval_test_loss": 0.31717562675476074,
      "eval_test_runtime": 4.472,
      "eval_test_samples_per_second": 1118.062,
      "eval_test_steps_per_second": 14.088,
      "step": 5200
    },
    {
      "epoch": 0.83216,
      "grad_norm": 0.08409928530454636,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5201
    },
    {
      "epoch": 0.83232,
      "grad_norm": 0.0834561288356781,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5202
    },
    {
      "epoch": 0.83248,
      "grad_norm": 0.08386529982089996,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 5203
    },
    {
      "epoch": 0.83264,
      "grad_norm": 0.10197395831346512,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5204
    },
    {
      "epoch": 0.8328,
      "grad_norm": 0.08125186711549759,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5205
    },
    {
      "epoch": 0.83296,
      "grad_norm": 0.11427489668130875,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5206
    },
    {
      "epoch": 0.83312,
      "grad_norm": 0.1108061671257019,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 5207
    },
    {
      "epoch": 0.83328,
      "grad_norm": 0.08625295758247375,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5208
    },
    {
      "epoch": 0.83344,
      "grad_norm": 0.0938042476773262,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5209
    },
    {
      "epoch": 0.8336,
      "grad_norm": 0.09195749461650848,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 5210
    },
    {
      "epoch": 0.83376,
      "grad_norm": 0.19868847727775574,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 5211
    },
    {
      "epoch": 0.83392,
      "grad_norm": 0.08930796384811401,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 5212
    },
    {
      "epoch": 0.83408,
      "grad_norm": 0.0893668532371521,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5213
    },
    {
      "epoch": 0.83424,
      "grad_norm": 0.08221638202667236,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5214
    },
    {
      "epoch": 0.8344,
      "grad_norm": 0.12213000655174255,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5215
    },
    {
      "epoch": 0.83456,
      "grad_norm": 0.10589136928319931,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5216
    },
    {
      "epoch": 0.83472,
      "grad_norm": 0.08842061460018158,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5217
    },
    {
      "epoch": 0.83488,
      "grad_norm": 0.10372159630060196,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5218
    },
    {
      "epoch": 0.83504,
      "grad_norm": 0.10000298172235489,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 5219
    },
    {
      "epoch": 0.8352,
      "grad_norm": 0.098520427942276,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 5220
    },
    {
      "epoch": 0.83536,
      "grad_norm": 0.1014028787612915,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5221
    },
    {
      "epoch": 0.83552,
      "grad_norm": 0.11305703222751617,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5222
    },
    {
      "epoch": 0.83568,
      "grad_norm": 0.10072937607765198,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5223
    },
    {
      "epoch": 0.83584,
      "grad_norm": 0.09767698496580124,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5224
    },
    {
      "epoch": 0.836,
      "grad_norm": 0.08192136883735657,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5225
    },
    {
      "epoch": 0.83616,
      "grad_norm": 0.10547022521495819,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 5226
    },
    {
      "epoch": 0.83632,
      "grad_norm": 0.09220154583454132,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 5227
    },
    {
      "epoch": 0.83648,
      "grad_norm": 0.10141555219888687,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5228
    },
    {
      "epoch": 0.83664,
      "grad_norm": 0.10467591136693954,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 5229
    },
    {
      "epoch": 0.8368,
      "grad_norm": 0.10115475952625275,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 5230
    },
    {
      "epoch": 0.83696,
      "grad_norm": 0.08054140955209732,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5231
    },
    {
      "epoch": 0.83712,
      "grad_norm": 0.09414023160934448,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5232
    },
    {
      "epoch": 0.83728,
      "grad_norm": 0.10609610378742218,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5233
    },
    {
      "epoch": 0.83744,
      "grad_norm": 0.11090481281280518,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5234
    },
    {
      "epoch": 0.8376,
      "grad_norm": 0.08969883620738983,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5235
    },
    {
      "epoch": 0.83776,
      "grad_norm": 0.09289693087339401,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5236
    },
    {
      "epoch": 0.83792,
      "grad_norm": 0.10076908767223358,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5237
    },
    {
      "epoch": 0.83808,
      "grad_norm": 0.09449582546949387,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5238
    },
    {
      "epoch": 0.83824,
      "grad_norm": 0.09815779328346252,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5239
    },
    {
      "epoch": 0.8384,
      "grad_norm": 0.09201355278491974,
      "learning_rate": 0.0001,
      "loss": 0.3045,
      "step": 5240
    },
    {
      "epoch": 0.83856,
      "grad_norm": 0.08944724500179291,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5241
    },
    {
      "epoch": 0.83872,
      "grad_norm": 0.09523887932300568,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5242
    },
    {
      "epoch": 0.83888,
      "grad_norm": 0.08127455413341522,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5243
    },
    {
      "epoch": 0.83904,
      "grad_norm": 0.09665218740701675,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5244
    },
    {
      "epoch": 0.8392,
      "grad_norm": 0.09868079423904419,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5245
    },
    {
      "epoch": 0.83936,
      "grad_norm": 0.08354288339614868,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5246
    },
    {
      "epoch": 0.83952,
      "grad_norm": 0.3485310971736908,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5247
    },
    {
      "epoch": 0.83968,
      "grad_norm": 0.08954183012247086,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5248
    },
    {
      "epoch": 0.83984,
      "grad_norm": 0.089389368891716,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 5249
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.09888885170221329,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5250
    },
    {
      "epoch": 0.84016,
      "grad_norm": 0.1197657659649849,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5251
    },
    {
      "epoch": 0.84032,
      "grad_norm": 0.0955028161406517,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5252
    },
    {
      "epoch": 0.84048,
      "grad_norm": 0.1492050439119339,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5253
    },
    {
      "epoch": 0.84064,
      "grad_norm": 0.1402685046195984,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 5254
    },
    {
      "epoch": 0.8408,
      "grad_norm": 0.08903397619724274,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5255
    },
    {
      "epoch": 0.84096,
      "grad_norm": 0.08262941986322403,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 5256
    },
    {
      "epoch": 0.84112,
      "grad_norm": 0.08694138377904892,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5257
    },
    {
      "epoch": 0.84128,
      "grad_norm": 0.18332168459892273,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5258
    },
    {
      "epoch": 0.84144,
      "grad_norm": 0.08304083347320557,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5259
    },
    {
      "epoch": 0.8416,
      "grad_norm": 0.08611738681793213,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5260
    },
    {
      "epoch": 0.84176,
      "grad_norm": 0.09894539415836334,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5261
    },
    {
      "epoch": 0.84192,
      "grad_norm": 0.09362605214118958,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5262
    },
    {
      "epoch": 0.84208,
      "grad_norm": 0.11842280626296997,
      "learning_rate": 0.0001,
      "loss": 0.308,
      "step": 5263
    },
    {
      "epoch": 0.84224,
      "grad_norm": 0.1008390337228775,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5264
    },
    {
      "epoch": 0.8424,
      "grad_norm": 0.09695923328399658,
      "learning_rate": 0.0001,
      "loss": 0.3021,
      "step": 5265
    },
    {
      "epoch": 0.84256,
      "grad_norm": 0.09399078786373138,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5266
    },
    {
      "epoch": 0.84272,
      "grad_norm": 0.09856431931257248,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5267
    },
    {
      "epoch": 0.84288,
      "grad_norm": 0.08995747566223145,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5268
    },
    {
      "epoch": 0.84304,
      "grad_norm": 0.0843757763504982,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5269
    },
    {
      "epoch": 0.8432,
      "grad_norm": 0.11898665130138397,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5270
    },
    {
      "epoch": 0.84336,
      "grad_norm": 0.36385631561279297,
      "learning_rate": 0.0001,
      "loss": 0.3008,
      "step": 5271
    },
    {
      "epoch": 0.84352,
      "grad_norm": 0.10301851481199265,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5272
    },
    {
      "epoch": 0.84368,
      "grad_norm": 0.09775439649820328,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5273
    },
    {
      "epoch": 0.84384,
      "grad_norm": 0.10351157188415527,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 5274
    },
    {
      "epoch": 0.844,
      "grad_norm": 0.15488757193088531,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5275
    },
    {
      "epoch": 0.84416,
      "grad_norm": 0.1231466606259346,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5276
    },
    {
      "epoch": 0.84432,
      "grad_norm": 0.0952155590057373,
      "learning_rate": 0.0001,
      "loss": 0.3346,
      "step": 5277
    },
    {
      "epoch": 0.84448,
      "grad_norm": 0.10985848307609558,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5278
    },
    {
      "epoch": 0.84464,
      "grad_norm": 0.14503294229507446,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5279
    },
    {
      "epoch": 0.8448,
      "grad_norm": 0.11936130374670029,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5280
    },
    {
      "epoch": 0.84496,
      "grad_norm": 0.10208450257778168,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 5281
    },
    {
      "epoch": 0.84512,
      "grad_norm": 0.19375506043434143,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5282
    },
    {
      "epoch": 0.84528,
      "grad_norm": 0.10780103504657745,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5283
    },
    {
      "epoch": 0.84544,
      "grad_norm": 0.09798772633075714,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5284
    },
    {
      "epoch": 0.8456,
      "grad_norm": 0.1296798437833786,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5285
    },
    {
      "epoch": 0.84576,
      "grad_norm": 0.08985310047864914,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 5286
    },
    {
      "epoch": 0.84592,
      "grad_norm": 0.17480190098285675,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5287
    },
    {
      "epoch": 0.84608,
      "grad_norm": 0.11290761083364487,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5288
    },
    {
      "epoch": 0.84624,
      "grad_norm": 0.12596073746681213,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5289
    },
    {
      "epoch": 0.8464,
      "grad_norm": 0.09966212511062622,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5290
    },
    {
      "epoch": 0.84656,
      "grad_norm": 0.11732281744480133,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5291
    },
    {
      "epoch": 0.84672,
      "grad_norm": 0.09616146981716156,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 5292
    },
    {
      "epoch": 0.84688,
      "grad_norm": 0.214269757270813,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5293
    },
    {
      "epoch": 0.84704,
      "grad_norm": 0.10667584836483002,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 5294
    },
    {
      "epoch": 0.8472,
      "grad_norm": 0.1512930542230606,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5295
    },
    {
      "epoch": 0.84736,
      "grad_norm": 0.09920428693294525,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5296
    },
    {
      "epoch": 0.84752,
      "grad_norm": 0.1393357366323471,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5297
    },
    {
      "epoch": 0.84768,
      "grad_norm": 0.08303118497133255,
      "learning_rate": 0.0001,
      "loss": 0.3029,
      "step": 5298
    },
    {
      "epoch": 0.84784,
      "grad_norm": 0.1810784488916397,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5299
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.09551709145307541,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5300
    },
    {
      "epoch": 0.848,
      "eval_train_accuracy": 0.9924,
      "eval_train_loss": 0.3191348612308502,
      "eval_train_runtime": 4.4474,
      "eval_train_samples_per_second": 1124.245,
      "eval_train_steps_per_second": 14.165,
      "step": 5300
    },
    {
      "epoch": 0.848,
      "eval_test_accuracy": 0.9946,
      "eval_test_loss": 0.31770917773246765,
      "eval_test_runtime": 4.7412,
      "eval_test_samples_per_second": 1054.58,
      "eval_test_steps_per_second": 13.288,
      "step": 5300
    },
    {
      "epoch": 0.84816,
      "grad_norm": 0.09761784225702286,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 5301
    },
    {
      "epoch": 0.84832,
      "grad_norm": 0.09990902990102768,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5302
    },
    {
      "epoch": 0.84848,
      "grad_norm": 0.09819607436656952,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5303
    },
    {
      "epoch": 0.84864,
      "grad_norm": 0.09623153507709503,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5304
    },
    {
      "epoch": 0.8488,
      "grad_norm": 0.110774464905262,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5305
    },
    {
      "epoch": 0.84896,
      "grad_norm": 0.11293426901102066,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5306
    },
    {
      "epoch": 0.84912,
      "grad_norm": 0.3692503273487091,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5307
    },
    {
      "epoch": 0.84928,
      "grad_norm": 0.12041962146759033,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5308
    },
    {
      "epoch": 0.84944,
      "grad_norm": 0.08595119416713715,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 5309
    },
    {
      "epoch": 0.8496,
      "grad_norm": 0.16701184213161469,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5310
    },
    {
      "epoch": 0.84976,
      "grad_norm": 0.24150586128234863,
      "learning_rate": 0.0001,
      "loss": 0.3296,
      "step": 5311
    },
    {
      "epoch": 0.84992,
      "grad_norm": 0.10313564538955688,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5312
    },
    {
      "epoch": 0.85008,
      "grad_norm": 0.7279236912727356,
      "learning_rate": 0.0001,
      "loss": 0.3379,
      "step": 5313
    },
    {
      "epoch": 0.85024,
      "grad_norm": 0.15173202753067017,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5314
    },
    {
      "epoch": 0.8504,
      "grad_norm": 0.18305182456970215,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5315
    },
    {
      "epoch": 0.85056,
      "grad_norm": 0.2376597821712494,
      "learning_rate": 0.0001,
      "loss": 0.3417,
      "step": 5316
    },
    {
      "epoch": 0.85072,
      "grad_norm": 0.2284819632768631,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5317
    },
    {
      "epoch": 0.85088,
      "grad_norm": 0.1121896356344223,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5318
    },
    {
      "epoch": 0.85104,
      "grad_norm": 0.15530896186828613,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 5319
    },
    {
      "epoch": 0.8512,
      "grad_norm": 0.1395048052072525,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 5320
    },
    {
      "epoch": 0.85136,
      "grad_norm": 0.18134593963623047,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5321
    },
    {
      "epoch": 0.85152,
      "grad_norm": 0.12867294251918793,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5322
    },
    {
      "epoch": 0.85168,
      "grad_norm": 0.09399531781673431,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 5323
    },
    {
      "epoch": 0.85184,
      "grad_norm": 0.16251227259635925,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5324
    },
    {
      "epoch": 0.852,
      "grad_norm": 0.10859722644090652,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 5325
    },
    {
      "epoch": 0.85216,
      "grad_norm": 0.11079970002174377,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 5326
    },
    {
      "epoch": 0.85232,
      "grad_norm": 0.09237715601921082,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5327
    },
    {
      "epoch": 0.85248,
      "grad_norm": 0.13331498205661774,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5328
    },
    {
      "epoch": 0.85264,
      "grad_norm": 0.11503586918115616,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 5329
    },
    {
      "epoch": 0.8528,
      "grad_norm": 0.1306457221508026,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5330
    },
    {
      "epoch": 0.85296,
      "grad_norm": 0.14534582197666168,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5331
    },
    {
      "epoch": 0.85312,
      "grad_norm": 0.20941734313964844,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5332
    },
    {
      "epoch": 0.85328,
      "grad_norm": 0.10089428722858429,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 5333
    },
    {
      "epoch": 0.85344,
      "grad_norm": 0.11608274281024933,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 5334
    },
    {
      "epoch": 0.8536,
      "grad_norm": 0.29504701495170593,
      "learning_rate": 0.0001,
      "loss": 0.3347,
      "step": 5335
    },
    {
      "epoch": 0.85376,
      "grad_norm": 0.11967954784631729,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 5336
    },
    {
      "epoch": 0.85392,
      "grad_norm": 0.12241855263710022,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5337
    },
    {
      "epoch": 0.85408,
      "grad_norm": 0.09369517862796783,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5338
    },
    {
      "epoch": 0.85424,
      "grad_norm": 0.32730892300605774,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 5339
    },
    {
      "epoch": 0.8544,
      "grad_norm": 0.09831291437149048,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5340
    },
    {
      "epoch": 0.85456,
      "grad_norm": 0.2615917921066284,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5341
    },
    {
      "epoch": 0.85472,
      "grad_norm": 0.09854412078857422,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5342
    },
    {
      "epoch": 0.85488,
      "grad_norm": 0.15394870936870575,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5343
    },
    {
      "epoch": 0.85504,
      "grad_norm": 0.12941789627075195,
      "learning_rate": 0.0001,
      "loss": 0.34,
      "step": 5344
    },
    {
      "epoch": 0.8552,
      "grad_norm": 0.10847528278827667,
      "learning_rate": 0.0001,
      "loss": 0.3017,
      "step": 5345
    },
    {
      "epoch": 0.85536,
      "grad_norm": 0.3408457040786743,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5346
    },
    {
      "epoch": 0.85552,
      "grad_norm": 0.13527445495128632,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5347
    },
    {
      "epoch": 0.85568,
      "grad_norm": 0.11875628679990768,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5348
    },
    {
      "epoch": 0.85584,
      "grad_norm": 0.1240248829126358,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5349
    },
    {
      "epoch": 0.856,
      "grad_norm": 0.15165206789970398,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5350
    },
    {
      "epoch": 0.85616,
      "grad_norm": 0.09783688932657242,
      "learning_rate": 0.0001,
      "loss": 0.3317,
      "step": 5351
    },
    {
      "epoch": 0.85632,
      "grad_norm": 0.12115470319986343,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5352
    },
    {
      "epoch": 0.85648,
      "grad_norm": 0.11352241784334183,
      "learning_rate": 0.0001,
      "loss": 0.2999,
      "step": 5353
    },
    {
      "epoch": 0.85664,
      "grad_norm": 0.09635674208402634,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5354
    },
    {
      "epoch": 0.8568,
      "grad_norm": 0.16538970172405243,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5355
    },
    {
      "epoch": 0.85696,
      "grad_norm": 0.10559576749801636,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5356
    },
    {
      "epoch": 0.85712,
      "grad_norm": 0.12348829209804535,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5357
    },
    {
      "epoch": 0.85728,
      "grad_norm": 0.12626275420188904,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5358
    },
    {
      "epoch": 0.85744,
      "grad_norm": 0.10304615646600723,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5359
    },
    {
      "epoch": 0.8576,
      "grad_norm": 0.08986850827932358,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 5360
    },
    {
      "epoch": 0.85776,
      "grad_norm": 0.1381075233221054,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5361
    },
    {
      "epoch": 0.85792,
      "grad_norm": 0.14560090005397797,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5362
    },
    {
      "epoch": 0.85808,
      "grad_norm": 0.11341336369514465,
      "learning_rate": 0.0001,
      "loss": 0.3023,
      "step": 5363
    },
    {
      "epoch": 0.85824,
      "grad_norm": 0.09873119741678238,
      "learning_rate": 0.0001,
      "loss": 0.3322,
      "step": 5364
    },
    {
      "epoch": 0.8584,
      "grad_norm": 0.11515266448259354,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5365
    },
    {
      "epoch": 0.85856,
      "grad_norm": 0.11008169502019882,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5366
    },
    {
      "epoch": 0.85872,
      "grad_norm": 0.08722478896379471,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5367
    },
    {
      "epoch": 0.85888,
      "grad_norm": 0.08701229840517044,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5368
    },
    {
      "epoch": 0.85904,
      "grad_norm": 0.1041063666343689,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5369
    },
    {
      "epoch": 0.8592,
      "grad_norm": 0.10160644352436066,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5370
    },
    {
      "epoch": 0.85936,
      "grad_norm": 0.1147754043340683,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 5371
    },
    {
      "epoch": 0.85952,
      "grad_norm": 0.09551151841878891,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5372
    },
    {
      "epoch": 0.85968,
      "grad_norm": 0.10690590739250183,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5373
    },
    {
      "epoch": 0.85984,
      "grad_norm": 0.21152062714099884,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 5374
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.09972129762172699,
      "learning_rate": 0.0001,
      "loss": 0.3154,
      "step": 5375
    },
    {
      "epoch": 0.86016,
      "grad_norm": 0.1013520210981369,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 5376
    },
    {
      "epoch": 0.86032,
      "grad_norm": 0.11948513984680176,
      "learning_rate": 0.0001,
      "loss": 0.3056,
      "step": 5377
    },
    {
      "epoch": 0.86048,
      "grad_norm": 0.11418382078409195,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5378
    },
    {
      "epoch": 0.86064,
      "grad_norm": 0.09709185361862183,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5379
    },
    {
      "epoch": 0.8608,
      "grad_norm": 0.09237130731344223,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5380
    },
    {
      "epoch": 0.86096,
      "grad_norm": 0.12254548817873001,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5381
    },
    {
      "epoch": 0.86112,
      "grad_norm": 0.11016875505447388,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 5382
    },
    {
      "epoch": 0.86128,
      "grad_norm": 0.09580940753221512,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 5383
    },
    {
      "epoch": 0.86144,
      "grad_norm": 0.16660547256469727,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 5384
    },
    {
      "epoch": 0.8616,
      "grad_norm": 0.10253959894180298,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5385
    },
    {
      "epoch": 0.86176,
      "grad_norm": 0.08512118458747864,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5386
    },
    {
      "epoch": 0.86192,
      "grad_norm": 0.0947844609618187,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5387
    },
    {
      "epoch": 0.86208,
      "grad_norm": 0.09636285156011581,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5388
    },
    {
      "epoch": 0.86224,
      "grad_norm": 0.1048225536942482,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5389
    },
    {
      "epoch": 0.8624,
      "grad_norm": 0.08394527435302734,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5390
    },
    {
      "epoch": 0.86256,
      "grad_norm": 0.08991201221942902,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5391
    },
    {
      "epoch": 0.86272,
      "grad_norm": 0.10143548995256424,
      "learning_rate": 0.0001,
      "loss": 0.3302,
      "step": 5392
    },
    {
      "epoch": 0.86288,
      "grad_norm": 0.09829548746347427,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5393
    },
    {
      "epoch": 0.86304,
      "grad_norm": 0.10010647773742676,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5394
    },
    {
      "epoch": 0.8632,
      "grad_norm": 0.20725896954536438,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5395
    },
    {
      "epoch": 0.86336,
      "grad_norm": 0.09384088218212128,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5396
    },
    {
      "epoch": 0.86352,
      "grad_norm": 0.0825372263789177,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5397
    },
    {
      "epoch": 0.86368,
      "grad_norm": 0.09144973754882812,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 5398
    },
    {
      "epoch": 0.86384,
      "grad_norm": 0.10698828101158142,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 5399
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.10172034800052643,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 5400
    },
    {
      "epoch": 0.864,
      "eval_train_accuracy": 0.9964,
      "eval_train_loss": 0.31863558292388916,
      "eval_train_runtime": 4.6446,
      "eval_train_samples_per_second": 1076.53,
      "eval_train_steps_per_second": 13.564,
      "step": 5400
    },
    {
      "epoch": 0.864,
      "eval_test_accuracy": 0.9956,
      "eval_test_loss": 0.3175203502178192,
      "eval_test_runtime": 4.7664,
      "eval_test_samples_per_second": 1049.002,
      "eval_test_steps_per_second": 13.217,
      "step": 5400
    },
    {
      "epoch": 0.86416,
      "grad_norm": 0.13780716061592102,
      "learning_rate": 0.0001,
      "loss": 0.3372,
      "step": 5401
    },
    {
      "epoch": 0.86432,
      "grad_norm": 0.0849367082118988,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5402
    },
    {
      "epoch": 0.86448,
      "grad_norm": 0.09279408305883408,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5403
    },
    {
      "epoch": 0.86464,
      "grad_norm": 0.0945124551653862,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 5404
    },
    {
      "epoch": 0.8648,
      "grad_norm": 0.20224113762378693,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 5405
    },
    {
      "epoch": 0.86496,
      "grad_norm": 0.09488994628190994,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5406
    },
    {
      "epoch": 0.86512,
      "grad_norm": 0.0921308845281601,
      "learning_rate": 0.0001,
      "loss": 0.325,
      "step": 5407
    },
    {
      "epoch": 0.86528,
      "grad_norm": 0.09871204942464828,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5408
    },
    {
      "epoch": 0.86544,
      "grad_norm": 0.08627369999885559,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 5409
    },
    {
      "epoch": 0.8656,
      "grad_norm": 0.08822678029537201,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5410
    },
    {
      "epoch": 0.86576,
      "grad_norm": 0.10363169014453888,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5411
    },
    {
      "epoch": 0.86592,
      "grad_norm": 0.08791615813970566,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5412
    },
    {
      "epoch": 0.86608,
      "grad_norm": 0.17624160647392273,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5413
    },
    {
      "epoch": 0.86624,
      "grad_norm": 0.11920908093452454,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5414
    },
    {
      "epoch": 0.8664,
      "grad_norm": 0.08554107695817947,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5415
    },
    {
      "epoch": 0.86656,
      "grad_norm": 0.15706051886081696,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5416
    },
    {
      "epoch": 0.86672,
      "grad_norm": 0.1093313917517662,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5417
    },
    {
      "epoch": 0.86688,
      "grad_norm": 0.10473180562257767,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5418
    },
    {
      "epoch": 0.86704,
      "grad_norm": 0.11078774183988571,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5419
    },
    {
      "epoch": 0.8672,
      "grad_norm": 0.10285768657922745,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5420
    },
    {
      "epoch": 0.86736,
      "grad_norm": 0.10065813362598419,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5421
    },
    {
      "epoch": 0.86752,
      "grad_norm": 0.09763187170028687,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5422
    },
    {
      "epoch": 0.86768,
      "grad_norm": 0.10262809693813324,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5423
    },
    {
      "epoch": 0.86784,
      "grad_norm": 0.13666969537734985,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5424
    },
    {
      "epoch": 0.868,
      "grad_norm": 0.09928509593009949,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5425
    },
    {
      "epoch": 0.86816,
      "grad_norm": 0.10475218296051025,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 5426
    },
    {
      "epoch": 0.86832,
      "grad_norm": 0.1622096300125122,
      "learning_rate": 0.0001,
      "loss": 0.3227,
      "step": 5427
    },
    {
      "epoch": 0.86848,
      "grad_norm": 0.11663373559713364,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5428
    },
    {
      "epoch": 0.86864,
      "grad_norm": 0.10314041376113892,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5429
    },
    {
      "epoch": 0.8688,
      "grad_norm": 0.10782629251480103,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5430
    },
    {
      "epoch": 0.86896,
      "grad_norm": 0.13711261749267578,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 5431
    },
    {
      "epoch": 0.86912,
      "grad_norm": 0.12001324445009232,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5432
    },
    {
      "epoch": 0.86928,
      "grad_norm": 0.11349727213382721,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5433
    },
    {
      "epoch": 0.86944,
      "grad_norm": 0.11491749435663223,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 5434
    },
    {
      "epoch": 0.8696,
      "grad_norm": 0.20980167388916016,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5435
    },
    {
      "epoch": 0.86976,
      "grad_norm": 0.08837110549211502,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5436
    },
    {
      "epoch": 0.86992,
      "grad_norm": 0.10550075024366379,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 5437
    },
    {
      "epoch": 0.87008,
      "grad_norm": 0.09833159297704697,
      "learning_rate": 0.0001,
      "loss": 0.3407,
      "step": 5438
    },
    {
      "epoch": 0.87024,
      "grad_norm": 0.14626066386699677,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5439
    },
    {
      "epoch": 0.8704,
      "grad_norm": 0.11406292021274567,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5440
    },
    {
      "epoch": 0.87056,
      "grad_norm": 0.10798458755016327,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5441
    },
    {
      "epoch": 0.87072,
      "grad_norm": 0.09510090947151184,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 5442
    },
    {
      "epoch": 0.87088,
      "grad_norm": 0.09216544777154922,
      "learning_rate": 0.0001,
      "loss": 0.3245,
      "step": 5443
    },
    {
      "epoch": 0.87104,
      "grad_norm": 0.09304486960172653,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 5444
    },
    {
      "epoch": 0.8712,
      "grad_norm": 0.29509639739990234,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5445
    },
    {
      "epoch": 0.87136,
      "grad_norm": 0.10214777290821075,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5446
    },
    {
      "epoch": 0.87152,
      "grad_norm": 0.11114201694726944,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5447
    },
    {
      "epoch": 0.87168,
      "grad_norm": 0.10214073210954666,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5448
    },
    {
      "epoch": 0.87184,
      "grad_norm": 0.10255427658557892,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 5449
    },
    {
      "epoch": 0.872,
      "grad_norm": 0.08714643120765686,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5450
    },
    {
      "epoch": 0.87216,
      "grad_norm": 0.08364832401275635,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 5451
    },
    {
      "epoch": 0.87232,
      "grad_norm": 0.09024902433156967,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5452
    },
    {
      "epoch": 0.87248,
      "grad_norm": 0.08514988422393799,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5453
    },
    {
      "epoch": 0.87264,
      "grad_norm": 0.0989699512720108,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5454
    },
    {
      "epoch": 0.8728,
      "grad_norm": 0.11254919320344925,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 5455
    },
    {
      "epoch": 0.87296,
      "grad_norm": 0.09284546971321106,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5456
    },
    {
      "epoch": 0.87312,
      "grad_norm": 0.0803339034318924,
      "learning_rate": 0.0001,
      "loss": 0.3019,
      "step": 5457
    },
    {
      "epoch": 0.87328,
      "grad_norm": 0.09506368637084961,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5458
    },
    {
      "epoch": 0.87344,
      "grad_norm": 0.08987674117088318,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5459
    },
    {
      "epoch": 0.8736,
      "grad_norm": 0.09448346495628357,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5460
    },
    {
      "epoch": 0.87376,
      "grad_norm": 0.08442070335149765,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5461
    },
    {
      "epoch": 0.87392,
      "grad_norm": 0.08372539281845093,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5462
    },
    {
      "epoch": 0.87408,
      "grad_norm": 0.188045933842659,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5463
    },
    {
      "epoch": 0.87424,
      "grad_norm": 0.09772320091724396,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 5464
    },
    {
      "epoch": 0.8744,
      "grad_norm": 0.08613372594118118,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 5465
    },
    {
      "epoch": 0.87456,
      "grad_norm": 0.10821469873189926,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5466
    },
    {
      "epoch": 0.87472,
      "grad_norm": 0.11651276051998138,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5467
    },
    {
      "epoch": 0.87488,
      "grad_norm": 0.08300040662288666,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5468
    },
    {
      "epoch": 0.87504,
      "grad_norm": 0.11688849329948425,
      "learning_rate": 0.0001,
      "loss": 0.3316,
      "step": 5469
    },
    {
      "epoch": 0.8752,
      "grad_norm": 0.08871366828680038,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5470
    },
    {
      "epoch": 0.87536,
      "grad_norm": 0.09943516552448273,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5471
    },
    {
      "epoch": 0.87552,
      "grad_norm": 0.07909811288118362,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5472
    },
    {
      "epoch": 0.87568,
      "grad_norm": 0.09697126597166061,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5473
    },
    {
      "epoch": 0.87584,
      "grad_norm": 0.1424948126077652,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5474
    },
    {
      "epoch": 0.876,
      "grad_norm": 0.0798136293888092,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5475
    },
    {
      "epoch": 0.87616,
      "grad_norm": 0.11790519952774048,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5476
    },
    {
      "epoch": 0.87632,
      "grad_norm": 0.10484752058982849,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 5477
    },
    {
      "epoch": 0.87648,
      "grad_norm": 0.09047337621450424,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5478
    },
    {
      "epoch": 0.87664,
      "grad_norm": 0.09940692782402039,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5479
    },
    {
      "epoch": 0.8768,
      "grad_norm": 0.27894237637519836,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 5480
    },
    {
      "epoch": 0.87696,
      "grad_norm": 0.09051692485809326,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5481
    },
    {
      "epoch": 0.87712,
      "grad_norm": 0.08753430843353271,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5482
    },
    {
      "epoch": 0.87728,
      "grad_norm": 0.08567536622285843,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5483
    },
    {
      "epoch": 0.87744,
      "grad_norm": 0.1051664724946022,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5484
    },
    {
      "epoch": 0.8776,
      "grad_norm": 0.10693895816802979,
      "learning_rate": 0.0001,
      "loss": 0.3325,
      "step": 5485
    },
    {
      "epoch": 0.87776,
      "grad_norm": 0.08165109902620316,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5486
    },
    {
      "epoch": 0.87792,
      "grad_norm": 0.09822522103786469,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 5487
    },
    {
      "epoch": 0.87808,
      "grad_norm": 0.27203869819641113,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5488
    },
    {
      "epoch": 0.87824,
      "grad_norm": 0.08774159103631973,
      "learning_rate": 0.0001,
      "loss": 0.3321,
      "step": 5489
    },
    {
      "epoch": 0.8784,
      "grad_norm": 0.10805196315050125,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5490
    },
    {
      "epoch": 0.87856,
      "grad_norm": 0.13580571115016937,
      "learning_rate": 0.0001,
      "loss": 0.3054,
      "step": 5491
    },
    {
      "epoch": 0.87872,
      "grad_norm": 0.10037331283092499,
      "learning_rate": 0.0001,
      "loss": 0.3055,
      "step": 5492
    },
    {
      "epoch": 0.87888,
      "grad_norm": 0.14701245725154877,
      "learning_rate": 0.0001,
      "loss": 0.305,
      "step": 5493
    },
    {
      "epoch": 0.87904,
      "grad_norm": 0.093939408659935,
      "learning_rate": 0.0001,
      "loss": 0.3289,
      "step": 5494
    },
    {
      "epoch": 0.8792,
      "grad_norm": 0.09354017674922943,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 5495
    },
    {
      "epoch": 0.87936,
      "grad_norm": 0.09351664036512375,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5496
    },
    {
      "epoch": 0.87952,
      "grad_norm": 0.09441014379262924,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 5497
    },
    {
      "epoch": 0.87968,
      "grad_norm": 0.10500092059373856,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5498
    },
    {
      "epoch": 0.87984,
      "grad_norm": 0.0823269709944725,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5499
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.08864995837211609,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 5500
    },
    {
      "epoch": 0.88,
      "eval_train_accuracy": 0.997,
      "eval_train_loss": 0.31859686970710754,
      "eval_train_runtime": 4.5334,
      "eval_train_samples_per_second": 1102.931,
      "eval_train_steps_per_second": 13.897,
      "step": 5500
    },
    {
      "epoch": 0.88,
      "eval_test_accuracy": 0.9964,
      "eval_test_loss": 0.3175275921821594,
      "eval_test_runtime": 4.8876,
      "eval_test_samples_per_second": 1022.992,
      "eval_test_steps_per_second": 12.89,
      "step": 5500
    },
    {
      "epoch": 0.88016,
      "grad_norm": 0.10077417641878128,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5501
    },
    {
      "epoch": 0.88032,
      "grad_norm": 0.17558491230010986,
      "learning_rate": 0.0001,
      "loss": 0.3348,
      "step": 5502
    },
    {
      "epoch": 0.88048,
      "grad_norm": 0.10883791744709015,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5503
    },
    {
      "epoch": 0.88064,
      "grad_norm": 0.10840262472629547,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5504
    },
    {
      "epoch": 0.8808,
      "grad_norm": 0.09436506778001785,
      "learning_rate": 0.0001,
      "loss": 0.3311,
      "step": 5505
    },
    {
      "epoch": 0.88096,
      "grad_norm": 0.08406113088130951,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5506
    },
    {
      "epoch": 0.88112,
      "grad_norm": 0.13492591679096222,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5507
    },
    {
      "epoch": 0.88128,
      "grad_norm": 0.10550593584775925,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 5508
    },
    {
      "epoch": 0.88144,
      "grad_norm": 0.08234775811433792,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5509
    },
    {
      "epoch": 0.8816,
      "grad_norm": 0.10148994624614716,
      "learning_rate": 0.0001,
      "loss": 0.303,
      "step": 5510
    },
    {
      "epoch": 0.88176,
      "grad_norm": 0.07980062067508698,
      "learning_rate": 0.0001,
      "loss": 0.2965,
      "step": 5511
    },
    {
      "epoch": 0.88192,
      "grad_norm": 0.11385858803987503,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5512
    },
    {
      "epoch": 0.88208,
      "grad_norm": 0.10648974776268005,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 5513
    },
    {
      "epoch": 0.88224,
      "grad_norm": 0.09305322915315628,
      "learning_rate": 0.0001,
      "loss": 0.3002,
      "step": 5514
    },
    {
      "epoch": 0.8824,
      "grad_norm": 0.08677306026220322,
      "learning_rate": 0.0001,
      "loss": 0.3106,
      "step": 5515
    },
    {
      "epoch": 0.88256,
      "grad_norm": 0.14412543177604675,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5516
    },
    {
      "epoch": 0.88272,
      "grad_norm": 0.0825442522764206,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 5517
    },
    {
      "epoch": 0.88288,
      "grad_norm": 0.08237290382385254,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5518
    },
    {
      "epoch": 0.88304,
      "grad_norm": 0.08320928364992142,
      "learning_rate": 0.0001,
      "loss": 0.3012,
      "step": 5519
    },
    {
      "epoch": 0.8832,
      "grad_norm": 0.113480344414711,
      "learning_rate": 0.0001,
      "loss": 0.3118,
      "step": 5520
    },
    {
      "epoch": 0.88336,
      "grad_norm": 0.08152353018522263,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5521
    },
    {
      "epoch": 0.88352,
      "grad_norm": 0.1054665744304657,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5522
    },
    {
      "epoch": 0.88368,
      "grad_norm": 0.08638298511505127,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5523
    },
    {
      "epoch": 0.88384,
      "grad_norm": 0.09763207286596298,
      "learning_rate": 0.0001,
      "loss": 0.3247,
      "step": 5524
    },
    {
      "epoch": 0.884,
      "grad_norm": 0.09585892409086227,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 5525
    },
    {
      "epoch": 0.88416,
      "grad_norm": 0.07876233756542206,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 5526
    },
    {
      "epoch": 0.88432,
      "grad_norm": 0.0943833440542221,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5527
    },
    {
      "epoch": 0.88448,
      "grad_norm": 0.09347213059663773,
      "learning_rate": 0.0001,
      "loss": 0.3013,
      "step": 5528
    },
    {
      "epoch": 0.88464,
      "grad_norm": 0.08878114819526672,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5529
    },
    {
      "epoch": 0.8848,
      "grad_norm": 0.09711182117462158,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5530
    },
    {
      "epoch": 0.88496,
      "grad_norm": 0.08213390409946442,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 5531
    },
    {
      "epoch": 0.88512,
      "grad_norm": 0.08291475474834442,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5532
    },
    {
      "epoch": 0.88528,
      "grad_norm": 0.1106300950050354,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5533
    },
    {
      "epoch": 0.88544,
      "grad_norm": 0.0932840034365654,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5534
    },
    {
      "epoch": 0.8856,
      "grad_norm": 0.09865741431713104,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5535
    },
    {
      "epoch": 0.88576,
      "grad_norm": 0.08031829446554184,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5536
    },
    {
      "epoch": 0.88592,
      "grad_norm": 0.09694170206785202,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 5537
    },
    {
      "epoch": 0.88608,
      "grad_norm": 0.0999419316649437,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5538
    },
    {
      "epoch": 0.88624,
      "grad_norm": 0.10137289762496948,
      "learning_rate": 0.0001,
      "loss": 0.338,
      "step": 5539
    },
    {
      "epoch": 0.8864,
      "grad_norm": 0.09301432222127914,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5540
    },
    {
      "epoch": 0.88656,
      "grad_norm": 0.08387291431427002,
      "learning_rate": 0.0001,
      "loss": 0.3083,
      "step": 5541
    },
    {
      "epoch": 0.88672,
      "grad_norm": 0.11321181058883667,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5542
    },
    {
      "epoch": 0.88688,
      "grad_norm": 0.0812169685959816,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5543
    },
    {
      "epoch": 0.88704,
      "grad_norm": 0.11020880192518234,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5544
    },
    {
      "epoch": 0.8872,
      "grad_norm": 0.09432787448167801,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 5545
    },
    {
      "epoch": 0.88736,
      "grad_norm": 0.08024191111326218,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 5546
    },
    {
      "epoch": 0.88752,
      "grad_norm": 0.07617233693599701,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 5547
    },
    {
      "epoch": 0.88768,
      "grad_norm": 0.07767176628112793,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5548
    },
    {
      "epoch": 0.88784,
      "grad_norm": 0.09989520907402039,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5549
    },
    {
      "epoch": 0.888,
      "grad_norm": 0.08715298026800156,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5550
    },
    {
      "epoch": 0.88816,
      "grad_norm": 0.09487947076559067,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5551
    },
    {
      "epoch": 0.88832,
      "grad_norm": 0.0979752242565155,
      "learning_rate": 0.0001,
      "loss": 0.2997,
      "step": 5552
    },
    {
      "epoch": 0.88848,
      "grad_norm": 0.09498177468776703,
      "learning_rate": 0.0001,
      "loss": 0.2996,
      "step": 5553
    },
    {
      "epoch": 0.88864,
      "grad_norm": 0.09489686042070389,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5554
    },
    {
      "epoch": 0.8888,
      "grad_norm": 0.08726407587528229,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 5555
    },
    {
      "epoch": 0.88896,
      "grad_norm": 0.08964768797159195,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5556
    },
    {
      "epoch": 0.88912,
      "grad_norm": 0.07513447850942612,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 5557
    },
    {
      "epoch": 0.88928,
      "grad_norm": 0.09263870120048523,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 5558
    },
    {
      "epoch": 0.88944,
      "grad_norm": 0.09826408326625824,
      "learning_rate": 0.0001,
      "loss": 0.3096,
      "step": 5559
    },
    {
      "epoch": 0.8896,
      "grad_norm": 0.09816820919513702,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5560
    },
    {
      "epoch": 0.88976,
      "grad_norm": 0.094457246363163,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5561
    },
    {
      "epoch": 0.88992,
      "grad_norm": 0.08548688888549805,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5562
    },
    {
      "epoch": 0.89008,
      "grad_norm": 0.08813681453466415,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5563
    },
    {
      "epoch": 0.89024,
      "grad_norm": 0.11055602133274078,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5564
    },
    {
      "epoch": 0.8904,
      "grad_norm": 0.0875617116689682,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 5565
    },
    {
      "epoch": 0.89056,
      "grad_norm": 0.0886491909623146,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5566
    },
    {
      "epoch": 0.89072,
      "grad_norm": 0.09311185032129288,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5567
    },
    {
      "epoch": 0.89088,
      "grad_norm": 0.08342479914426804,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5568
    },
    {
      "epoch": 0.89104,
      "grad_norm": 0.09499087184667587,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 5569
    },
    {
      "epoch": 0.8912,
      "grad_norm": 0.09803574532270432,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5570
    },
    {
      "epoch": 0.89136,
      "grad_norm": 0.0806155726313591,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 5571
    },
    {
      "epoch": 0.89152,
      "grad_norm": 0.10232007503509521,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5572
    },
    {
      "epoch": 0.89168,
      "grad_norm": 0.08390064537525177,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5573
    },
    {
      "epoch": 0.89184,
      "grad_norm": 0.1343134641647339,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5574
    },
    {
      "epoch": 0.892,
      "grad_norm": 0.11023431271314621,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 5575
    },
    {
      "epoch": 0.89216,
      "grad_norm": 0.08775272220373154,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5576
    },
    {
      "epoch": 0.89232,
      "grad_norm": 0.08762961626052856,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5577
    },
    {
      "epoch": 0.89248,
      "grad_norm": 0.10515540838241577,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5578
    },
    {
      "epoch": 0.89264,
      "grad_norm": 0.12106365710496902,
      "learning_rate": 0.0001,
      "loss": 0.3213,
      "step": 5579
    },
    {
      "epoch": 0.8928,
      "grad_norm": 0.09768827259540558,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5580
    },
    {
      "epoch": 0.89296,
      "grad_norm": 0.0827208161354065,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5581
    },
    {
      "epoch": 0.89312,
      "grad_norm": 0.09283141791820526,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 5582
    },
    {
      "epoch": 0.89328,
      "grad_norm": 0.10416396707296371,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 5583
    },
    {
      "epoch": 0.89344,
      "grad_norm": 0.08935447037220001,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 5584
    },
    {
      "epoch": 0.8936,
      "grad_norm": 0.09017512202262878,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5585
    },
    {
      "epoch": 0.89376,
      "grad_norm": 0.08772944658994675,
      "learning_rate": 0.0001,
      "loss": 0.3328,
      "step": 5586
    },
    {
      "epoch": 0.89392,
      "grad_norm": 0.09031284600496292,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5587
    },
    {
      "epoch": 0.89408,
      "grad_norm": 0.08115077018737793,
      "learning_rate": 0.0001,
      "loss": 0.3165,
      "step": 5588
    },
    {
      "epoch": 0.89424,
      "grad_norm": 0.10019856691360474,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 5589
    },
    {
      "epoch": 0.8944,
      "grad_norm": 0.09755358844995499,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 5590
    },
    {
      "epoch": 0.89456,
      "grad_norm": 0.09368044883012772,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5591
    },
    {
      "epoch": 0.89472,
      "grad_norm": 0.08063043653964996,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 5592
    },
    {
      "epoch": 0.89488,
      "grad_norm": 0.10111082345247269,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5593
    },
    {
      "epoch": 0.89504,
      "grad_norm": 0.11257438361644745,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5594
    },
    {
      "epoch": 0.8952,
      "grad_norm": 0.08814045786857605,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5595
    },
    {
      "epoch": 0.89536,
      "grad_norm": 0.08605801314115524,
      "learning_rate": 0.0001,
      "loss": 0.314,
      "step": 5596
    },
    {
      "epoch": 0.89552,
      "grad_norm": 0.09176827222108841,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5597
    },
    {
      "epoch": 0.89568,
      "grad_norm": 0.0877721905708313,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 5598
    },
    {
      "epoch": 0.89584,
      "grad_norm": 0.11956220120191574,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 5599
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.08411902189254761,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5600
    },
    {
      "epoch": 0.896,
      "eval_train_accuracy": 0.9982,
      "eval_train_loss": 0.31789156794548035,
      "eval_train_runtime": 4.4271,
      "eval_train_samples_per_second": 1129.411,
      "eval_train_steps_per_second": 14.231,
      "step": 5600
    },
    {
      "epoch": 0.896,
      "eval_test_accuracy": 0.9992,
      "eval_test_loss": 0.3169325292110443,
      "eval_test_runtime": 4.8404,
      "eval_test_samples_per_second": 1032.966,
      "eval_test_steps_per_second": 13.015,
      "step": 5600
    },
    {
      "epoch": 0.89616,
      "grad_norm": 0.1008201539516449,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5601
    },
    {
      "epoch": 0.89632,
      "grad_norm": 0.0780177116394043,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5602
    },
    {
      "epoch": 0.89648,
      "grad_norm": 0.07842835783958435,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5603
    },
    {
      "epoch": 0.89664,
      "grad_norm": 0.13096463680267334,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5604
    },
    {
      "epoch": 0.8968,
      "grad_norm": 0.08713670074939728,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5605
    },
    {
      "epoch": 0.89696,
      "grad_norm": 0.09190300852060318,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5606
    },
    {
      "epoch": 0.89712,
      "grad_norm": 0.0890781432390213,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5607
    },
    {
      "epoch": 0.89728,
      "grad_norm": 0.09064286947250366,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5608
    },
    {
      "epoch": 0.89744,
      "grad_norm": 0.09295739233493805,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5609
    },
    {
      "epoch": 0.8976,
      "grad_norm": 0.10399933159351349,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5610
    },
    {
      "epoch": 0.89776,
      "grad_norm": 0.08455201238393784,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5611
    },
    {
      "epoch": 0.89792,
      "grad_norm": 0.08565761893987656,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 5612
    },
    {
      "epoch": 0.89808,
      "grad_norm": 0.08374327421188354,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5613
    },
    {
      "epoch": 0.89824,
      "grad_norm": 0.08837972581386566,
      "learning_rate": 0.0001,
      "loss": 0.3339,
      "step": 5614
    },
    {
      "epoch": 0.8984,
      "grad_norm": 0.07823824137449265,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5615
    },
    {
      "epoch": 0.89856,
      "grad_norm": 0.08426230400800705,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5616
    },
    {
      "epoch": 0.89872,
      "grad_norm": 0.10735345631837845,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5617
    },
    {
      "epoch": 0.89888,
      "grad_norm": 0.09931185841560364,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5618
    },
    {
      "epoch": 0.89904,
      "grad_norm": 0.07961676269769669,
      "learning_rate": 0.0001,
      "loss": 0.3063,
      "step": 5619
    },
    {
      "epoch": 0.8992,
      "grad_norm": 0.08608543127775192,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5620
    },
    {
      "epoch": 0.89936,
      "grad_norm": 0.09094835817813873,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5621
    },
    {
      "epoch": 0.89952,
      "grad_norm": 0.08948581665754318,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5622
    },
    {
      "epoch": 0.89968,
      "grad_norm": 0.10124526917934418,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 5623
    },
    {
      "epoch": 0.89984,
      "grad_norm": 0.0780581682920456,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5624
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.1868886649608612,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5625
    },
    {
      "epoch": 0.90016,
      "grad_norm": 0.0962744653224945,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5626
    },
    {
      "epoch": 0.90032,
      "grad_norm": 0.09576177597045898,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5627
    },
    {
      "epoch": 0.90048,
      "grad_norm": 0.088690847158432,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 5628
    },
    {
      "epoch": 0.90064,
      "grad_norm": 0.08727822452783585,
      "learning_rate": 0.0001,
      "loss": 0.3031,
      "step": 5629
    },
    {
      "epoch": 0.9008,
      "grad_norm": 0.11438705027103424,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5630
    },
    {
      "epoch": 0.90096,
      "grad_norm": 0.08354919403791428,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5631
    },
    {
      "epoch": 0.90112,
      "grad_norm": 0.17451858520507812,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5632
    },
    {
      "epoch": 0.90128,
      "grad_norm": 0.10273659974336624,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5633
    },
    {
      "epoch": 0.90144,
      "grad_norm": 0.14301852881908417,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5634
    },
    {
      "epoch": 0.9016,
      "grad_norm": 0.07775988429784775,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5635
    },
    {
      "epoch": 0.90176,
      "grad_norm": 0.10570503026247025,
      "learning_rate": 0.0001,
      "loss": 0.3336,
      "step": 5636
    },
    {
      "epoch": 0.90192,
      "grad_norm": 0.07701663672924042,
      "learning_rate": 0.0001,
      "loss": 0.2964,
      "step": 5637
    },
    {
      "epoch": 0.90208,
      "grad_norm": 0.09438125789165497,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 5638
    },
    {
      "epoch": 0.90224,
      "grad_norm": 0.08651195466518402,
      "learning_rate": 0.0001,
      "loss": 0.295,
      "step": 5639
    },
    {
      "epoch": 0.9024,
      "grad_norm": 0.09465718269348145,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 5640
    },
    {
      "epoch": 0.90256,
      "grad_norm": 0.08440475910902023,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 5641
    },
    {
      "epoch": 0.90272,
      "grad_norm": 0.09951136261224747,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 5642
    },
    {
      "epoch": 0.90288,
      "grad_norm": 0.08004042506217957,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 5643
    },
    {
      "epoch": 0.90304,
      "grad_norm": 0.13110250234603882,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5644
    },
    {
      "epoch": 0.9032,
      "grad_norm": 0.09869332611560822,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 5645
    },
    {
      "epoch": 0.90336,
      "grad_norm": 0.10682398825883865,
      "learning_rate": 0.0001,
      "loss": 0.3205,
      "step": 5646
    },
    {
      "epoch": 0.90352,
      "grad_norm": 0.09220881760120392,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 5647
    },
    {
      "epoch": 0.90368,
      "grad_norm": 0.09256557375192642,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5648
    },
    {
      "epoch": 0.90384,
      "grad_norm": 0.10729309916496277,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 5649
    },
    {
      "epoch": 0.904,
      "grad_norm": 0.09943915903568268,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5650
    },
    {
      "epoch": 0.90416,
      "grad_norm": 0.08540883660316467,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 5651
    },
    {
      "epoch": 0.90432,
      "grad_norm": 0.08018655329942703,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5652
    },
    {
      "epoch": 0.90448,
      "grad_norm": 0.08155220746994019,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5653
    },
    {
      "epoch": 0.90464,
      "grad_norm": 0.11078403890132904,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5654
    },
    {
      "epoch": 0.9048,
      "grad_norm": 0.09626654535531998,
      "learning_rate": 0.0001,
      "loss": 0.3326,
      "step": 5655
    },
    {
      "epoch": 0.90496,
      "grad_norm": 0.10497073829174042,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5656
    },
    {
      "epoch": 0.90512,
      "grad_norm": 0.08839573711156845,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 5657
    },
    {
      "epoch": 0.90528,
      "grad_norm": 0.07720046490430832,
      "learning_rate": 0.0001,
      "loss": 0.3135,
      "step": 5658
    },
    {
      "epoch": 0.90544,
      "grad_norm": 0.18491534888744354,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 5659
    },
    {
      "epoch": 0.9056,
      "grad_norm": 0.09815841168165207,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 5660
    },
    {
      "epoch": 0.90576,
      "grad_norm": 0.09094805270433426,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5661
    },
    {
      "epoch": 0.90592,
      "grad_norm": 0.09707660228013992,
      "learning_rate": 0.0001,
      "loss": 0.3359,
      "step": 5662
    },
    {
      "epoch": 0.90608,
      "grad_norm": 0.09185978770256042,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5663
    },
    {
      "epoch": 0.90624,
      "grad_norm": 0.0860450267791748,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5664
    },
    {
      "epoch": 0.9064,
      "grad_norm": 0.09451049566268921,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5665
    },
    {
      "epoch": 0.90656,
      "grad_norm": 0.1216094046831131,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5666
    },
    {
      "epoch": 0.90672,
      "grad_norm": 0.07908891141414642,
      "learning_rate": 0.0001,
      "loss": 0.3312,
      "step": 5667
    },
    {
      "epoch": 0.90688,
      "grad_norm": 0.10125668346881866,
      "learning_rate": 0.0001,
      "loss": 0.3081,
      "step": 5668
    },
    {
      "epoch": 0.90704,
      "grad_norm": 0.08993136137723923,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5669
    },
    {
      "epoch": 0.9072,
      "grad_norm": 0.08775677531957626,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5670
    },
    {
      "epoch": 0.90736,
      "grad_norm": 0.11600635200738907,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5671
    },
    {
      "epoch": 0.90752,
      "grad_norm": 0.09756673872470856,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5672
    },
    {
      "epoch": 0.90768,
      "grad_norm": 0.0788409560918808,
      "learning_rate": 0.0001,
      "loss": 0.3152,
      "step": 5673
    },
    {
      "epoch": 0.90784,
      "grad_norm": 0.0911654531955719,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5674
    },
    {
      "epoch": 0.908,
      "grad_norm": 0.11264172941446304,
      "learning_rate": 0.0001,
      "loss": 0.3091,
      "step": 5675
    },
    {
      "epoch": 0.90816,
      "grad_norm": 0.07797394692897797,
      "learning_rate": 0.0001,
      "loss": 0.304,
      "step": 5676
    },
    {
      "epoch": 0.90832,
      "grad_norm": 0.08886151015758514,
      "learning_rate": 0.0001,
      "loss": 0.3287,
      "step": 5677
    },
    {
      "epoch": 0.90848,
      "grad_norm": 0.08172276616096497,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5678
    },
    {
      "epoch": 0.90864,
      "grad_norm": 0.10087789595127106,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5679
    },
    {
      "epoch": 0.9088,
      "grad_norm": 0.08384804427623749,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 5680
    },
    {
      "epoch": 0.90896,
      "grad_norm": 0.0861351266503334,
      "learning_rate": 0.0001,
      "loss": 0.3294,
      "step": 5681
    },
    {
      "epoch": 0.90912,
      "grad_norm": 0.08133983612060547,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5682
    },
    {
      "epoch": 0.90928,
      "grad_norm": 0.10204635560512543,
      "learning_rate": 0.0001,
      "loss": 0.3431,
      "step": 5683
    },
    {
      "epoch": 0.90944,
      "grad_norm": 0.09219467639923096,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 5684
    },
    {
      "epoch": 0.9096,
      "grad_norm": 0.08925560116767883,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5685
    },
    {
      "epoch": 0.90976,
      "grad_norm": 0.0777108296751976,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 5686
    },
    {
      "epoch": 0.90992,
      "grad_norm": 0.11130182445049286,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5687
    },
    {
      "epoch": 0.91008,
      "grad_norm": 0.10232515633106232,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5688
    },
    {
      "epoch": 0.91024,
      "grad_norm": 0.07788290828466415,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5689
    },
    {
      "epoch": 0.9104,
      "grad_norm": 0.10933859646320343,
      "learning_rate": 0.0001,
      "loss": 0.3097,
      "step": 5690
    },
    {
      "epoch": 0.91056,
      "grad_norm": 0.08700333535671234,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 5691
    },
    {
      "epoch": 0.91072,
      "grad_norm": 0.08934193849563599,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5692
    },
    {
      "epoch": 0.91088,
      "grad_norm": 0.10416191071271896,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 5693
    },
    {
      "epoch": 0.91104,
      "grad_norm": 0.0893019586801529,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5694
    },
    {
      "epoch": 0.9112,
      "grad_norm": 0.10539635270833969,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5695
    },
    {
      "epoch": 0.91136,
      "grad_norm": 0.08192454278469086,
      "learning_rate": 0.0001,
      "loss": 0.3073,
      "step": 5696
    },
    {
      "epoch": 0.91152,
      "grad_norm": 0.10180340707302094,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5697
    },
    {
      "epoch": 0.91168,
      "grad_norm": 0.1098184809088707,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5698
    },
    {
      "epoch": 0.91184,
      "grad_norm": 0.1031416654586792,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 5699
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.09632571041584015,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5700
    },
    {
      "epoch": 0.912,
      "eval_train_accuracy": 0.9974,
      "eval_train_loss": 0.3183691203594208,
      "eval_train_runtime": 4.3549,
      "eval_train_samples_per_second": 1148.144,
      "eval_train_steps_per_second": 14.467,
      "step": 5700
    },
    {
      "epoch": 0.912,
      "eval_test_accuracy": 0.9966,
      "eval_test_loss": 0.31709393858909607,
      "eval_test_runtime": 4.9477,
      "eval_test_samples_per_second": 1010.578,
      "eval_test_steps_per_second": 12.733,
      "step": 5700
    },
    {
      "epoch": 0.91216,
      "grad_norm": 0.11070220917463303,
      "learning_rate": 0.0001,
      "loss": 0.3151,
      "step": 5701
    },
    {
      "epoch": 0.91232,
      "grad_norm": 0.09399683773517609,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5702
    },
    {
      "epoch": 0.91248,
      "grad_norm": 0.12880048155784607,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5703
    },
    {
      "epoch": 0.91264,
      "grad_norm": 0.09217521548271179,
      "learning_rate": 0.0001,
      "loss": 0.334,
      "step": 5704
    },
    {
      "epoch": 0.9128,
      "grad_norm": 0.09669730812311172,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5705
    },
    {
      "epoch": 0.91296,
      "grad_norm": 0.11119718849658966,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5706
    },
    {
      "epoch": 0.91312,
      "grad_norm": 0.09806018322706223,
      "learning_rate": 0.0001,
      "loss": 0.3427,
      "step": 5707
    },
    {
      "epoch": 0.91328,
      "grad_norm": 0.08591051399707794,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5708
    },
    {
      "epoch": 0.91344,
      "grad_norm": 0.09568224847316742,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 5709
    },
    {
      "epoch": 0.9136,
      "grad_norm": 0.08560393005609512,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5710
    },
    {
      "epoch": 0.91376,
      "grad_norm": 0.08722620457410812,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5711
    },
    {
      "epoch": 0.91392,
      "grad_norm": 0.10246738791465759,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5712
    },
    {
      "epoch": 0.91408,
      "grad_norm": 0.09324218332767487,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 5713
    },
    {
      "epoch": 0.91424,
      "grad_norm": 0.09536025673151016,
      "learning_rate": 0.0001,
      "loss": 0.3087,
      "step": 5714
    },
    {
      "epoch": 0.9144,
      "grad_norm": 0.08290360122919083,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5715
    },
    {
      "epoch": 0.91456,
      "grad_norm": 0.09087742120027542,
      "learning_rate": 0.0001,
      "loss": 0.3105,
      "step": 5716
    },
    {
      "epoch": 0.91472,
      "grad_norm": 0.09034767001867294,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5717
    },
    {
      "epoch": 0.91488,
      "grad_norm": 0.08309534937143326,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 5718
    },
    {
      "epoch": 0.91504,
      "grad_norm": 0.09524782747030258,
      "learning_rate": 0.0001,
      "loss": 0.3249,
      "step": 5719
    },
    {
      "epoch": 0.9152,
      "grad_norm": 0.08823714405298233,
      "learning_rate": 0.0001,
      "loss": 0.3406,
      "step": 5720
    },
    {
      "epoch": 0.91536,
      "grad_norm": 0.09044934809207916,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5721
    },
    {
      "epoch": 0.91552,
      "grad_norm": 0.08393222838640213,
      "learning_rate": 0.0001,
      "loss": 0.3037,
      "step": 5722
    },
    {
      "epoch": 0.91568,
      "grad_norm": 0.09703473001718521,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5723
    },
    {
      "epoch": 0.91584,
      "grad_norm": 0.07691820710897446,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5724
    },
    {
      "epoch": 0.916,
      "grad_norm": 0.07724699378013611,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 5725
    },
    {
      "epoch": 0.91616,
      "grad_norm": 0.08662812411785126,
      "learning_rate": 0.0001,
      "loss": 0.3268,
      "step": 5726
    },
    {
      "epoch": 0.91632,
      "grad_norm": 0.0798042044043541,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5727
    },
    {
      "epoch": 0.91648,
      "grad_norm": 0.09703269600868225,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 5728
    },
    {
      "epoch": 0.91664,
      "grad_norm": 0.08744139969348907,
      "learning_rate": 0.0001,
      "loss": 0.3244,
      "step": 5729
    },
    {
      "epoch": 0.9168,
      "grad_norm": 0.09903344511985779,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 5730
    },
    {
      "epoch": 0.91696,
      "grad_norm": 0.08936963230371475,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 5731
    },
    {
      "epoch": 0.91712,
      "grad_norm": 0.10357838869094849,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5732
    },
    {
      "epoch": 0.91728,
      "grad_norm": 0.08188513666391373,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 5733
    },
    {
      "epoch": 0.91744,
      "grad_norm": 0.08504655212163925,
      "learning_rate": 0.0001,
      "loss": 0.3377,
      "step": 5734
    },
    {
      "epoch": 0.9176,
      "grad_norm": 0.10190056264400482,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5735
    },
    {
      "epoch": 0.91776,
      "grad_norm": 0.11401950567960739,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5736
    },
    {
      "epoch": 0.91792,
      "grad_norm": 0.09230636805295944,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5737
    },
    {
      "epoch": 0.91808,
      "grad_norm": 0.0894840732216835,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5738
    },
    {
      "epoch": 0.91824,
      "grad_norm": 0.09375499933958054,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5739
    },
    {
      "epoch": 0.9184,
      "grad_norm": 0.0965130552649498,
      "learning_rate": 0.0001,
      "loss": 0.3181,
      "step": 5740
    },
    {
      "epoch": 0.91856,
      "grad_norm": 0.11852584779262543,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5741
    },
    {
      "epoch": 0.91872,
      "grad_norm": 0.07697036862373352,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5742
    },
    {
      "epoch": 0.91888,
      "grad_norm": 0.07924924790859222,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5743
    },
    {
      "epoch": 0.91904,
      "grad_norm": 0.0819949135184288,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 5744
    },
    {
      "epoch": 0.9192,
      "grad_norm": 0.09785095602273941,
      "learning_rate": 0.0001,
      "loss": 0.3042,
      "step": 5745
    },
    {
      "epoch": 0.91936,
      "grad_norm": 0.09297206997871399,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5746
    },
    {
      "epoch": 0.91952,
      "grad_norm": 0.08368002623319626,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5747
    },
    {
      "epoch": 0.91968,
      "grad_norm": 0.08906645327806473,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5748
    },
    {
      "epoch": 0.91984,
      "grad_norm": 0.08385014533996582,
      "learning_rate": 0.0001,
      "loss": 0.3044,
      "step": 5749
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.0939672440290451,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5750
    },
    {
      "epoch": 0.92016,
      "grad_norm": 0.08723121881484985,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 5751
    },
    {
      "epoch": 0.92032,
      "grad_norm": 0.08058786392211914,
      "learning_rate": 0.0001,
      "loss": 0.311,
      "step": 5752
    },
    {
      "epoch": 0.92048,
      "grad_norm": 0.08452966809272766,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5753
    },
    {
      "epoch": 0.92064,
      "grad_norm": 0.08836214989423752,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5754
    },
    {
      "epoch": 0.9208,
      "grad_norm": 0.09047272801399231,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5755
    },
    {
      "epoch": 0.92096,
      "grad_norm": 0.08854677528142929,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5756
    },
    {
      "epoch": 0.92112,
      "grad_norm": 0.08211562037467957,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5757
    },
    {
      "epoch": 0.92128,
      "grad_norm": 0.07870050519704819,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 5758
    },
    {
      "epoch": 0.92144,
      "grad_norm": 0.0911351665854454,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 5759
    },
    {
      "epoch": 0.9216,
      "grad_norm": 0.09753792732954025,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5760
    },
    {
      "epoch": 0.92176,
      "grad_norm": 0.0917070060968399,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 5761
    },
    {
      "epoch": 0.92192,
      "grad_norm": 0.11042778193950653,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5762
    },
    {
      "epoch": 0.92208,
      "grad_norm": 0.08360276371240616,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 5763
    },
    {
      "epoch": 0.92224,
      "grad_norm": 0.08503421396017075,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5764
    },
    {
      "epoch": 0.9224,
      "grad_norm": 0.08998312801122665,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 5765
    },
    {
      "epoch": 0.92256,
      "grad_norm": 0.0846061259508133,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 5766
    },
    {
      "epoch": 0.92272,
      "grad_norm": 0.07950598746538162,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 5767
    },
    {
      "epoch": 0.92288,
      "grad_norm": 0.08141899853944778,
      "learning_rate": 0.0001,
      "loss": 0.2966,
      "step": 5768
    },
    {
      "epoch": 0.92304,
      "grad_norm": 0.08910369127988815,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 5769
    },
    {
      "epoch": 0.9232,
      "grad_norm": 0.08141563832759857,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 5770
    },
    {
      "epoch": 0.92336,
      "grad_norm": 0.07321044057607651,
      "learning_rate": 0.0001,
      "loss": 0.3027,
      "step": 5771
    },
    {
      "epoch": 0.92352,
      "grad_norm": 0.08653130382299423,
      "learning_rate": 0.0001,
      "loss": 0.3174,
      "step": 5772
    },
    {
      "epoch": 0.92368,
      "grad_norm": 0.08275284618139267,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 5773
    },
    {
      "epoch": 0.92384,
      "grad_norm": 0.0796700119972229,
      "learning_rate": 0.0001,
      "loss": 0.3155,
      "step": 5774
    },
    {
      "epoch": 0.924,
      "grad_norm": 0.08257818967103958,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5775
    },
    {
      "epoch": 0.92416,
      "grad_norm": 0.08709804713726044,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 5776
    },
    {
      "epoch": 0.92432,
      "grad_norm": 0.09480691701173782,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5777
    },
    {
      "epoch": 0.92448,
      "grad_norm": 0.07482069730758667,
      "learning_rate": 0.0001,
      "loss": 0.3192,
      "step": 5778
    },
    {
      "epoch": 0.92464,
      "grad_norm": 0.12447765469551086,
      "learning_rate": 0.0001,
      "loss": 0.3041,
      "step": 5779
    },
    {
      "epoch": 0.9248,
      "grad_norm": 0.08694623410701752,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 5780
    },
    {
      "epoch": 0.92496,
      "grad_norm": 0.08823484182357788,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 5781
    },
    {
      "epoch": 0.92512,
      "grad_norm": 0.08063323050737381,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 5782
    },
    {
      "epoch": 0.92528,
      "grad_norm": 0.10231219232082367,
      "learning_rate": 0.0001,
      "loss": 0.3262,
      "step": 5783
    },
    {
      "epoch": 0.92544,
      "grad_norm": 0.08640880137681961,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 5784
    },
    {
      "epoch": 0.9256,
      "grad_norm": 0.09038243442773819,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5785
    },
    {
      "epoch": 0.92576,
      "grad_norm": 0.09186383336782455,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5786
    },
    {
      "epoch": 0.92592,
      "grad_norm": 0.09211521595716476,
      "learning_rate": 0.0001,
      "loss": 0.3343,
      "step": 5787
    },
    {
      "epoch": 0.92608,
      "grad_norm": 0.09041940420866013,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 5788
    },
    {
      "epoch": 0.92624,
      "grad_norm": 0.13778185844421387,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5789
    },
    {
      "epoch": 0.9264,
      "grad_norm": 0.08145210146903992,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 5790
    },
    {
      "epoch": 0.92656,
      "grad_norm": 0.09382551163434982,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5791
    },
    {
      "epoch": 0.92672,
      "grad_norm": 0.10073965042829514,
      "learning_rate": 0.0001,
      "loss": 0.2971,
      "step": 5792
    },
    {
      "epoch": 0.92688,
      "grad_norm": 0.16142770648002625,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 5793
    },
    {
      "epoch": 0.92704,
      "grad_norm": 0.09272817522287369,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 5794
    },
    {
      "epoch": 0.9272,
      "grad_norm": 0.09329315274953842,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 5795
    },
    {
      "epoch": 0.92736,
      "grad_norm": 0.10842079669237137,
      "learning_rate": 0.0001,
      "loss": 0.303,
      "step": 5796
    },
    {
      "epoch": 0.92752,
      "grad_norm": 0.09305154532194138,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 5797
    },
    {
      "epoch": 0.92768,
      "grad_norm": 0.09883110970258713,
      "learning_rate": 0.0001,
      "loss": 0.3065,
      "step": 5798
    },
    {
      "epoch": 0.92784,
      "grad_norm": 0.11819588392972946,
      "learning_rate": 0.0001,
      "loss": 0.3251,
      "step": 5799
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.08472274243831635,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5800
    },
    {
      "epoch": 0.928,
      "eval_train_accuracy": 1.0,
      "eval_train_loss": 0.3179984986782074,
      "eval_train_runtime": 4.7102,
      "eval_train_samples_per_second": 1061.516,
      "eval_train_steps_per_second": 13.375,
      "step": 5800
    },
    {
      "epoch": 0.928,
      "eval_test_accuracy": 1.0,
      "eval_test_loss": 0.3165913224220276,
      "eval_test_runtime": 4.8246,
      "eval_test_samples_per_second": 1036.347,
      "eval_test_steps_per_second": 13.058,
      "step": 5800
    },
    {
      "epoch": 0.92816,
      "grad_norm": 0.183805912733078,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 5801
    },
    {
      "epoch": 0.92832,
      "grad_norm": 0.09376142919063568,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 5802
    },
    {
      "epoch": 0.92848,
      "grad_norm": 0.09697798639535904,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5803
    },
    {
      "epoch": 0.92864,
      "grad_norm": 0.07746405899524689,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 5804
    },
    {
      "epoch": 0.9288,
      "grad_norm": 0.08893013000488281,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 5805
    },
    {
      "epoch": 0.92896,
      "grad_norm": 0.10549095273017883,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5806
    },
    {
      "epoch": 0.92912,
      "grad_norm": 0.08597879111766815,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5807
    },
    {
      "epoch": 0.92928,
      "grad_norm": 0.08029810339212418,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 5808
    },
    {
      "epoch": 0.92944,
      "grad_norm": 0.08638405799865723,
      "learning_rate": 0.0001,
      "loss": 0.3274,
      "step": 5809
    },
    {
      "epoch": 0.9296,
      "grad_norm": 0.092493437230587,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5810
    },
    {
      "epoch": 0.92976,
      "grad_norm": 0.08280692249536514,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 5811
    },
    {
      "epoch": 0.92992,
      "grad_norm": 0.0878964364528656,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 5812
    },
    {
      "epoch": 0.93008,
      "grad_norm": 0.08192504197359085,
      "learning_rate": 0.0001,
      "loss": 0.3211,
      "step": 5813
    },
    {
      "epoch": 0.93024,
      "grad_norm": 0.08639296889305115,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 5814
    },
    {
      "epoch": 0.9304,
      "grad_norm": 0.10999900847673416,
      "learning_rate": 0.0001,
      "loss": 0.3351,
      "step": 5815
    },
    {
      "epoch": 0.93056,
      "grad_norm": 0.08600591123104095,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5816
    },
    {
      "epoch": 0.93072,
      "grad_norm": 0.08237100392580032,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5817
    },
    {
      "epoch": 0.93088,
      "grad_norm": 0.0792708620429039,
      "learning_rate": 0.0001,
      "loss": 0.3129,
      "step": 5818
    },
    {
      "epoch": 0.93104,
      "grad_norm": 0.0929526835680008,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5819
    },
    {
      "epoch": 0.9312,
      "grad_norm": 0.09119720757007599,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 5820
    },
    {
      "epoch": 0.93136,
      "grad_norm": 0.08300862461328506,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 5821
    },
    {
      "epoch": 0.93152,
      "grad_norm": 0.0845833271741867,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5822
    },
    {
      "epoch": 0.93168,
      "grad_norm": 0.10794808715581894,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 5823
    },
    {
      "epoch": 0.93184,
      "grad_norm": 0.11795816570520401,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5824
    },
    {
      "epoch": 0.932,
      "grad_norm": 0.0909949243068695,
      "learning_rate": 0.0001,
      "loss": 0.3086,
      "step": 5825
    },
    {
      "epoch": 0.93216,
      "grad_norm": 0.09957364946603775,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5826
    },
    {
      "epoch": 0.93232,
      "grad_norm": 0.07347457855939865,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 5827
    },
    {
      "epoch": 0.93248,
      "grad_norm": 0.08614649623632431,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 5828
    },
    {
      "epoch": 0.93264,
      "grad_norm": 0.08592027425765991,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 5829
    },
    {
      "epoch": 0.9328,
      "grad_norm": 0.09114342927932739,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 5830
    },
    {
      "epoch": 0.93296,
      "grad_norm": 0.09605129063129425,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 5831
    },
    {
      "epoch": 0.93312,
      "grad_norm": 0.0857820212841034,
      "learning_rate": 0.0001,
      "loss": 0.309,
      "step": 5832
    },
    {
      "epoch": 0.93328,
      "grad_norm": 0.090204156935215,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 5833
    },
    {
      "epoch": 0.93344,
      "grad_norm": 0.08602617681026459,
      "learning_rate": 0.0001,
      "loss": 0.3143,
      "step": 5834
    },
    {
      "epoch": 0.9336,
      "grad_norm": 0.07778538018465042,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5835
    },
    {
      "epoch": 0.93376,
      "grad_norm": 0.10482165962457657,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 5836
    },
    {
      "epoch": 0.93392,
      "grad_norm": 0.09256355464458466,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5837
    },
    {
      "epoch": 0.93408,
      "grad_norm": 0.08540263772010803,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 5838
    },
    {
      "epoch": 0.93424,
      "grad_norm": 0.09219232946634293,
      "learning_rate": 0.0001,
      "loss": 0.3258,
      "step": 5839
    },
    {
      "epoch": 0.9344,
      "grad_norm": 0.0832190290093422,
      "learning_rate": 0.0001,
      "loss": 0.3208,
      "step": 5840
    },
    {
      "epoch": 0.93456,
      "grad_norm": 0.09435150027275085,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5841
    },
    {
      "epoch": 0.93472,
      "grad_norm": 0.09030929207801819,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 5842
    },
    {
      "epoch": 0.93488,
      "grad_norm": 0.08531787246465683,
      "learning_rate": 0.0001,
      "loss": 0.2981,
      "step": 5843
    },
    {
      "epoch": 0.93504,
      "grad_norm": 0.07589699327945709,
      "learning_rate": 0.0001,
      "loss": 0.3064,
      "step": 5844
    },
    {
      "epoch": 0.9352,
      "grad_norm": 0.08242355287075043,
      "learning_rate": 0.0001,
      "loss": 0.32,
      "step": 5845
    },
    {
      "epoch": 0.93536,
      "grad_norm": 0.08795441687107086,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5846
    },
    {
      "epoch": 0.93552,
      "grad_norm": 0.08752873539924622,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 5847
    },
    {
      "epoch": 0.93568,
      "grad_norm": 0.08070354908704758,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 5848
    },
    {
      "epoch": 0.93584,
      "grad_norm": 0.07938961684703827,
      "learning_rate": 0.0001,
      "loss": 0.3057,
      "step": 5849
    },
    {
      "epoch": 0.936,
      "grad_norm": 0.0859682485461235,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5850
    },
    {
      "epoch": 0.93616,
      "grad_norm": 0.10428307950496674,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5851
    },
    {
      "epoch": 0.93632,
      "grad_norm": 0.09594784677028656,
      "learning_rate": 0.0001,
      "loss": 0.3259,
      "step": 5852
    },
    {
      "epoch": 0.93648,
      "grad_norm": 0.08506721258163452,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5853
    },
    {
      "epoch": 0.93664,
      "grad_norm": 0.09768711030483246,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 5854
    },
    {
      "epoch": 0.9368,
      "grad_norm": 0.11555830389261246,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5855
    },
    {
      "epoch": 0.93696,
      "grad_norm": 0.0798339992761612,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 5856
    },
    {
      "epoch": 0.93712,
      "grad_norm": 0.0922788456082344,
      "learning_rate": 0.0001,
      "loss": 0.3331,
      "step": 5857
    },
    {
      "epoch": 0.93728,
      "grad_norm": 0.09523415565490723,
      "learning_rate": 0.0001,
      "loss": 0.3161,
      "step": 5858
    },
    {
      "epoch": 0.93744,
      "grad_norm": 0.07647727429866791,
      "learning_rate": 0.0001,
      "loss": 0.3119,
      "step": 5859
    },
    {
      "epoch": 0.9376,
      "grad_norm": 0.07826539874076843,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5860
    },
    {
      "epoch": 0.93776,
      "grad_norm": 0.08760127425193787,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 5861
    },
    {
      "epoch": 0.93792,
      "grad_norm": 0.08731770515441895,
      "learning_rate": 0.0001,
      "loss": 0.3291,
      "step": 5862
    },
    {
      "epoch": 0.93808,
      "grad_norm": 0.08368581533432007,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 5863
    },
    {
      "epoch": 0.93824,
      "grad_norm": 0.09070730954408646,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5864
    },
    {
      "epoch": 0.9384,
      "grad_norm": 0.0820237398147583,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 5865
    },
    {
      "epoch": 0.93856,
      "grad_norm": 0.08512545377016068,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 5866
    },
    {
      "epoch": 0.93872,
      "grad_norm": 0.08482607454061508,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 5867
    },
    {
      "epoch": 0.93888,
      "grad_norm": 0.08185004442930222,
      "learning_rate": 0.0001,
      "loss": 0.3102,
      "step": 5868
    },
    {
      "epoch": 0.93904,
      "grad_norm": 0.08697304874658585,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 5869
    },
    {
      "epoch": 0.9392,
      "grad_norm": 0.09051934629678726,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 5870
    },
    {
      "epoch": 0.93936,
      "grad_norm": 0.0899847224354744,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 5871
    },
    {
      "epoch": 0.93952,
      "grad_norm": 0.08692610263824463,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 5872
    },
    {
      "epoch": 0.93968,
      "grad_norm": 0.08606941252946854,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5873
    },
    {
      "epoch": 0.93984,
      "grad_norm": 0.07310619205236435,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5874
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.08912794291973114,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 5875
    },
    {
      "epoch": 0.94016,
      "grad_norm": 0.0841640904545784,
      "learning_rate": 0.0001,
      "loss": 0.3038,
      "step": 5876
    },
    {
      "epoch": 0.94032,
      "grad_norm": 0.0927715003490448,
      "learning_rate": 0.0001,
      "loss": 0.3132,
      "step": 5877
    },
    {
      "epoch": 0.94048,
      "grad_norm": 0.09167324751615524,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5878
    },
    {
      "epoch": 0.94064,
      "grad_norm": 0.09639650583267212,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5879
    },
    {
      "epoch": 0.9408,
      "grad_norm": 0.08280546963214874,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 5880
    },
    {
      "epoch": 0.94096,
      "grad_norm": 0.08545498549938202,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5881
    },
    {
      "epoch": 0.94112,
      "grad_norm": 0.10162647813558578,
      "learning_rate": 0.0001,
      "loss": 0.3052,
      "step": 5882
    },
    {
      "epoch": 0.94128,
      "grad_norm": 0.09271629899740219,
      "learning_rate": 0.0001,
      "loss": 0.3099,
      "step": 5883
    },
    {
      "epoch": 0.94144,
      "grad_norm": 0.09337297081947327,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 5884
    },
    {
      "epoch": 0.9416,
      "grad_norm": 0.09067563712596893,
      "learning_rate": 0.0001,
      "loss": 0.3117,
      "step": 5885
    },
    {
      "epoch": 0.94176,
      "grad_norm": 0.09479086846113205,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 5886
    },
    {
      "epoch": 0.94192,
      "grad_norm": 0.08553056418895721,
      "learning_rate": 0.0001,
      "loss": 0.3206,
      "step": 5887
    },
    {
      "epoch": 0.94208,
      "grad_norm": 0.08059786260128021,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 5888
    },
    {
      "epoch": 0.94224,
      "grad_norm": 0.08475883305072784,
      "learning_rate": 0.0001,
      "loss": 0.3313,
      "step": 5889
    },
    {
      "epoch": 0.9424,
      "grad_norm": 0.08071329444646835,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 5890
    },
    {
      "epoch": 0.94256,
      "grad_norm": 0.08729074895381927,
      "learning_rate": 0.0001,
      "loss": 0.3071,
      "step": 5891
    },
    {
      "epoch": 0.94272,
      "grad_norm": 0.09083770960569382,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 5892
    },
    {
      "epoch": 0.94288,
      "grad_norm": 0.08250414580106735,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5893
    },
    {
      "epoch": 0.94304,
      "grad_norm": 0.147757425904274,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5894
    },
    {
      "epoch": 0.9432,
      "grad_norm": 0.09745360165834427,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5895
    },
    {
      "epoch": 0.94336,
      "grad_norm": 0.07856624573469162,
      "learning_rate": 0.0001,
      "loss": 0.3002,
      "step": 5896
    },
    {
      "epoch": 0.94352,
      "grad_norm": 0.0825372263789177,
      "learning_rate": 0.0001,
      "loss": 0.2948,
      "step": 5897
    },
    {
      "epoch": 0.94368,
      "grad_norm": 0.09847649931907654,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5898
    },
    {
      "epoch": 0.94384,
      "grad_norm": 0.09890235960483551,
      "learning_rate": 0.0001,
      "loss": 0.3248,
      "step": 5899
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.11287804692983627,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5900
    },
    {
      "epoch": 0.944,
      "eval_train_accuracy": 0.992,
      "eval_train_loss": 0.31783413887023926,
      "eval_train_runtime": 4.6966,
      "eval_train_samples_per_second": 1064.598,
      "eval_train_steps_per_second": 13.414,
      "step": 5900
    },
    {
      "epoch": 0.944,
      "eval_test_accuracy": 0.993,
      "eval_test_loss": 0.3165894150733948,
      "eval_test_runtime": 4.7822,
      "eval_test_samples_per_second": 1045.538,
      "eval_test_steps_per_second": 13.174,
      "step": 5900
    },
    {
      "epoch": 0.94416,
      "grad_norm": 0.09579228609800339,
      "learning_rate": 0.0001,
      "loss": 0.33,
      "step": 5901
    },
    {
      "epoch": 0.94432,
      "grad_norm": 0.09871324896812439,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 5902
    },
    {
      "epoch": 0.94448,
      "grad_norm": 0.08352524787187576,
      "learning_rate": 0.0001,
      "loss": 0.3273,
      "step": 5903
    },
    {
      "epoch": 0.94464,
      "grad_norm": 0.07882853597402573,
      "learning_rate": 0.0001,
      "loss": 0.3356,
      "step": 5904
    },
    {
      "epoch": 0.9448,
      "grad_norm": 0.09011224657297134,
      "learning_rate": 0.0001,
      "loss": 0.3279,
      "step": 5905
    },
    {
      "epoch": 0.94496,
      "grad_norm": 0.10042973607778549,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 5906
    },
    {
      "epoch": 0.94512,
      "grad_norm": 0.09441083669662476,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 5907
    },
    {
      "epoch": 0.94528,
      "grad_norm": 0.08196225762367249,
      "learning_rate": 0.0001,
      "loss": 0.2994,
      "step": 5908
    },
    {
      "epoch": 0.94544,
      "grad_norm": 0.08431072533130646,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 5909
    },
    {
      "epoch": 0.9456,
      "grad_norm": 0.09200604259967804,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 5910
    },
    {
      "epoch": 0.94576,
      "grad_norm": 0.09715263545513153,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5911
    },
    {
      "epoch": 0.94592,
      "grad_norm": 0.10779322683811188,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5912
    },
    {
      "epoch": 0.94608,
      "grad_norm": 0.0983855128288269,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5913
    },
    {
      "epoch": 0.94624,
      "grad_norm": 0.07780325412750244,
      "learning_rate": 0.0001,
      "loss": 0.3043,
      "step": 5914
    },
    {
      "epoch": 0.9464,
      "grad_norm": 0.09086446464061737,
      "learning_rate": 0.0001,
      "loss": 0.3107,
      "step": 5915
    },
    {
      "epoch": 0.94656,
      "grad_norm": 0.18744760751724243,
      "learning_rate": 0.0001,
      "loss": 0.3256,
      "step": 5916
    },
    {
      "epoch": 0.94672,
      "grad_norm": 0.09504491835832596,
      "learning_rate": 0.0001,
      "loss": 0.3142,
      "step": 5917
    },
    {
      "epoch": 0.94688,
      "grad_norm": 0.09395348280668259,
      "learning_rate": 0.0001,
      "loss": 0.3128,
      "step": 5918
    },
    {
      "epoch": 0.94704,
      "grad_norm": 0.09749431908130646,
      "learning_rate": 0.0001,
      "loss": 0.3173,
      "step": 5919
    },
    {
      "epoch": 0.9472,
      "grad_norm": 0.17245084047317505,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 5920
    },
    {
      "epoch": 0.94736,
      "grad_norm": 0.09243863821029663,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5921
    },
    {
      "epoch": 0.94752,
      "grad_norm": 0.10283613204956055,
      "learning_rate": 0.0001,
      "loss": 0.3149,
      "step": 5922
    },
    {
      "epoch": 0.94768,
      "grad_norm": 0.10194889456033707,
      "learning_rate": 0.0001,
      "loss": 0.3193,
      "step": 5923
    },
    {
      "epoch": 0.94784,
      "grad_norm": 0.08908150345087051,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 5924
    },
    {
      "epoch": 0.948,
      "grad_norm": 0.09386210143566132,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 5925
    },
    {
      "epoch": 0.94816,
      "grad_norm": 0.1124715507030487,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 5926
    },
    {
      "epoch": 0.94832,
      "grad_norm": 0.09894341230392456,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 5927
    },
    {
      "epoch": 0.94848,
      "grad_norm": 0.097646564245224,
      "learning_rate": 0.0001,
      "loss": 0.3186,
      "step": 5928
    },
    {
      "epoch": 0.94864,
      "grad_norm": 0.08118018507957458,
      "learning_rate": 0.0001,
      "loss": 0.3269,
      "step": 5929
    },
    {
      "epoch": 0.9488,
      "grad_norm": 0.09543995559215546,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 5930
    },
    {
      "epoch": 0.94896,
      "grad_norm": 0.12032531201839447,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 5931
    },
    {
      "epoch": 0.94912,
      "grad_norm": 0.0960376039147377,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 5932
    },
    {
      "epoch": 0.94928,
      "grad_norm": 0.08095362037420273,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 5933
    },
    {
      "epoch": 0.94944,
      "grad_norm": 0.1250857412815094,
      "learning_rate": 0.0001,
      "loss": 0.3079,
      "step": 5934
    },
    {
      "epoch": 0.9496,
      "grad_norm": 0.16263417899608612,
      "learning_rate": 0.0001,
      "loss": 0.3168,
      "step": 5935
    },
    {
      "epoch": 0.94976,
      "grad_norm": 0.10180605202913284,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5936
    },
    {
      "epoch": 0.94992,
      "grad_norm": 0.10988010466098785,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 5937
    },
    {
      "epoch": 0.95008,
      "grad_norm": 0.13341140747070312,
      "learning_rate": 0.0001,
      "loss": 0.3329,
      "step": 5938
    },
    {
      "epoch": 0.95024,
      "grad_norm": 0.11238908022642136,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 5939
    },
    {
      "epoch": 0.9504,
      "grad_norm": 0.10772000998258591,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 5940
    },
    {
      "epoch": 0.95056,
      "grad_norm": 0.08711977303028107,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 5941
    },
    {
      "epoch": 0.95072,
      "grad_norm": 0.08486949652433395,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 5942
    },
    {
      "epoch": 0.95088,
      "grad_norm": 0.08523062616586685,
      "learning_rate": 0.0001,
      "loss": 0.3114,
      "step": 5943
    },
    {
      "epoch": 0.95104,
      "grad_norm": 0.10413430631160736,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 5944
    },
    {
      "epoch": 0.9512,
      "grad_norm": 0.09795306622982025,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 5945
    },
    {
      "epoch": 0.95136,
      "grad_norm": 0.10614945739507675,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 5946
    },
    {
      "epoch": 0.95152,
      "grad_norm": 0.09073259681463242,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5947
    },
    {
      "epoch": 0.95168,
      "grad_norm": 0.08704839646816254,
      "learning_rate": 0.0001,
      "loss": 0.3082,
      "step": 5948
    },
    {
      "epoch": 0.95184,
      "grad_norm": 0.08112536370754242,
      "learning_rate": 0.0001,
      "loss": 0.3062,
      "step": 5949
    },
    {
      "epoch": 0.952,
      "grad_norm": 0.08955105394124985,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 5950
    },
    {
      "epoch": 0.95216,
      "grad_norm": 0.08760225027799606,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 5951
    },
    {
      "epoch": 0.95232,
      "grad_norm": 0.10262975096702576,
      "learning_rate": 0.0001,
      "loss": 0.3098,
      "step": 5952
    },
    {
      "epoch": 0.95248,
      "grad_norm": 0.09615936130285263,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 5953
    },
    {
      "epoch": 0.95264,
      "grad_norm": 0.09205673635005951,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 5954
    },
    {
      "epoch": 0.9528,
      "grad_norm": 0.15153905749320984,
      "learning_rate": 0.0001,
      "loss": 0.3332,
      "step": 5955
    },
    {
      "epoch": 0.95296,
      "grad_norm": 0.09039397537708282,
      "learning_rate": 0.0001,
      "loss": 0.3253,
      "step": 5956
    },
    {
      "epoch": 0.95312,
      "grad_norm": 0.09130235016345978,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5957
    },
    {
      "epoch": 0.95328,
      "grad_norm": 0.0971447229385376,
      "learning_rate": 0.0001,
      "loss": 0.332,
      "step": 5958
    },
    {
      "epoch": 0.95344,
      "grad_norm": 0.1133119985461235,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 5959
    },
    {
      "epoch": 0.9536,
      "grad_norm": 0.10609930753707886,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5960
    },
    {
      "epoch": 0.95376,
      "grad_norm": 0.0950242280960083,
      "learning_rate": 0.0001,
      "loss": 0.323,
      "step": 5961
    },
    {
      "epoch": 0.95392,
      "grad_norm": 0.09121523797512054,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 5962
    },
    {
      "epoch": 0.95408,
      "grad_norm": 0.09599610418081284,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5963
    },
    {
      "epoch": 0.95424,
      "grad_norm": 0.10223162174224854,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 5964
    },
    {
      "epoch": 0.9544,
      "grad_norm": 0.1055547446012497,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 5965
    },
    {
      "epoch": 0.95456,
      "grad_norm": 0.09226767718791962,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 5966
    },
    {
      "epoch": 0.95472,
      "grad_norm": 0.10926516354084015,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5967
    },
    {
      "epoch": 0.95488,
      "grad_norm": 0.10006243735551834,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 5968
    },
    {
      "epoch": 0.95504,
      "grad_norm": 0.18715611100196838,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5969
    },
    {
      "epoch": 0.9552,
      "grad_norm": 0.10199892520904541,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 5970
    },
    {
      "epoch": 0.95536,
      "grad_norm": 0.09172210097312927,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 5971
    },
    {
      "epoch": 0.95552,
      "grad_norm": 0.07419732213020325,
      "learning_rate": 0.0001,
      "loss": 0.2985,
      "step": 5972
    },
    {
      "epoch": 0.95568,
      "grad_norm": 0.10544513911008835,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 5973
    },
    {
      "epoch": 0.95584,
      "grad_norm": 0.08514024317264557,
      "learning_rate": 0.0001,
      "loss": 0.3178,
      "step": 5974
    },
    {
      "epoch": 0.956,
      "grad_norm": 0.11411809921264648,
      "learning_rate": 0.0001,
      "loss": 0.3123,
      "step": 5975
    },
    {
      "epoch": 0.95616,
      "grad_norm": 0.09726326912641525,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 5976
    },
    {
      "epoch": 0.95632,
      "grad_norm": 0.08868099004030228,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 5977
    },
    {
      "epoch": 0.95648,
      "grad_norm": 0.08872941881418228,
      "learning_rate": 0.0001,
      "loss": 0.3308,
      "step": 5978
    },
    {
      "epoch": 0.95664,
      "grad_norm": 0.09705913066864014,
      "learning_rate": 0.0001,
      "loss": 0.3077,
      "step": 5979
    },
    {
      "epoch": 0.9568,
      "grad_norm": 0.12250841408967972,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 5980
    },
    {
      "epoch": 0.95696,
      "grad_norm": 0.08504091948270798,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 5981
    },
    {
      "epoch": 0.95712,
      "grad_norm": 0.10454638302326202,
      "learning_rate": 0.0001,
      "loss": 0.3297,
      "step": 5982
    },
    {
      "epoch": 0.95728,
      "grad_norm": 0.11012464016675949,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 5983
    },
    {
      "epoch": 0.95744,
      "grad_norm": 0.09723253548145294,
      "learning_rate": 0.0001,
      "loss": 0.335,
      "step": 5984
    },
    {
      "epoch": 0.9576,
      "grad_norm": 0.08339864760637283,
      "learning_rate": 0.0001,
      "loss": 0.303,
      "step": 5985
    },
    {
      "epoch": 0.95776,
      "grad_norm": 0.09696661680936813,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 5986
    },
    {
      "epoch": 0.95792,
      "grad_norm": 0.07516058534383774,
      "learning_rate": 0.0001,
      "loss": 0.3047,
      "step": 5987
    },
    {
      "epoch": 0.95808,
      "grad_norm": 0.09728609770536423,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 5988
    },
    {
      "epoch": 0.95824,
      "grad_norm": 0.0907592698931694,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 5989
    },
    {
      "epoch": 0.9584,
      "grad_norm": 0.09535933285951614,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 5990
    },
    {
      "epoch": 0.95856,
      "grad_norm": 0.23037727177143097,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 5991
    },
    {
      "epoch": 0.95872,
      "grad_norm": 0.08279197663068771,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 5992
    },
    {
      "epoch": 0.95888,
      "grad_norm": 0.08433305472135544,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 5993
    },
    {
      "epoch": 0.95904,
      "grad_norm": 0.0778060182929039,
      "learning_rate": 0.0001,
      "loss": 0.3121,
      "step": 5994
    },
    {
      "epoch": 0.9592,
      "grad_norm": 0.1329975724220276,
      "learning_rate": 0.0001,
      "loss": 0.3182,
      "step": 5995
    },
    {
      "epoch": 0.95936,
      "grad_norm": 0.08505203574895859,
      "learning_rate": 0.0001,
      "loss": 0.317,
      "step": 5996
    },
    {
      "epoch": 0.95952,
      "grad_norm": 0.09461748600006104,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 5997
    },
    {
      "epoch": 0.95968,
      "grad_norm": 0.09476642310619354,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 5998
    },
    {
      "epoch": 0.95984,
      "grad_norm": 0.1527746170759201,
      "learning_rate": 0.0001,
      "loss": 0.3236,
      "step": 5999
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.10437428206205368,
      "learning_rate": 0.0001,
      "loss": 0.3232,
      "step": 6000
    },
    {
      "epoch": 0.96,
      "eval_train_accuracy": 1.0,
      "eval_train_loss": 0.3181493580341339,
      "eval_train_runtime": 4.7192,
      "eval_train_samples_per_second": 1059.51,
      "eval_train_steps_per_second": 13.35,
      "step": 6000
    },
    {
      "epoch": 0.96,
      "eval_test_accuracy": 0.9998,
      "eval_test_loss": 0.3169364631175995,
      "eval_test_runtime": 4.7704,
      "eval_test_samples_per_second": 1048.122,
      "eval_test_steps_per_second": 13.206,
      "step": 6000
    },
    {
      "epoch": 0.96016,
      "grad_norm": 0.0808408185839653,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 6001
    },
    {
      "epoch": 0.96032,
      "grad_norm": 0.1361035704612732,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 6002
    },
    {
      "epoch": 0.96048,
      "grad_norm": 0.10769873112440109,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 6003
    },
    {
      "epoch": 0.96064,
      "grad_norm": 0.12908194959163666,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 6004
    },
    {
      "epoch": 0.9608,
      "grad_norm": 0.08346050977706909,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 6005
    },
    {
      "epoch": 0.96096,
      "grad_norm": 0.08041510730981827,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 6006
    },
    {
      "epoch": 0.96112,
      "grad_norm": 0.10038337856531143,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6007
    },
    {
      "epoch": 0.96128,
      "grad_norm": 0.1111547201871872,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 6008
    },
    {
      "epoch": 0.96144,
      "grad_norm": 0.09567449241876602,
      "learning_rate": 0.0001,
      "loss": 0.3238,
      "step": 6009
    },
    {
      "epoch": 0.9616,
      "grad_norm": 0.09834767878055573,
      "learning_rate": 0.0001,
      "loss": 0.318,
      "step": 6010
    },
    {
      "epoch": 0.96176,
      "grad_norm": 0.13540136814117432,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "step": 6011
    },
    {
      "epoch": 0.96192,
      "grad_norm": 0.11769577860832214,
      "learning_rate": 0.0001,
      "loss": 0.3115,
      "step": 6012
    },
    {
      "epoch": 0.96208,
      "grad_norm": 0.1748325675725937,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 6013
    },
    {
      "epoch": 0.96224,
      "grad_norm": 0.09221711754798889,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 6014
    },
    {
      "epoch": 0.9624,
      "grad_norm": 0.1078108549118042,
      "learning_rate": 0.0001,
      "loss": 0.313,
      "step": 6015
    },
    {
      "epoch": 0.96256,
      "grad_norm": 0.1084638461470604,
      "learning_rate": 0.0001,
      "loss": 0.3368,
      "step": 6016
    },
    {
      "epoch": 0.96272,
      "grad_norm": 0.08769117295742035,
      "learning_rate": 0.0001,
      "loss": 0.3125,
      "step": 6017
    },
    {
      "epoch": 0.96288,
      "grad_norm": 0.1110832467675209,
      "learning_rate": 0.0001,
      "loss": 0.328,
      "step": 6018
    },
    {
      "epoch": 0.96304,
      "grad_norm": 0.09908690303564072,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 6019
    },
    {
      "epoch": 0.9632,
      "grad_norm": 0.15392570197582245,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6020
    },
    {
      "epoch": 0.96336,
      "grad_norm": 0.08575733751058578,
      "learning_rate": 0.0001,
      "loss": 0.3275,
      "step": 6021
    },
    {
      "epoch": 0.96352,
      "grad_norm": 0.10639676451683044,
      "learning_rate": 0.0001,
      "loss": 0.3327,
      "step": 6022
    },
    {
      "epoch": 0.96368,
      "grad_norm": 0.4306185245513916,
      "learning_rate": 0.0001,
      "loss": 0.326,
      "step": 6023
    },
    {
      "epoch": 0.96384,
      "grad_norm": 0.10716353356838226,
      "learning_rate": 0.0001,
      "loss": 0.3338,
      "step": 6024
    },
    {
      "epoch": 0.964,
      "grad_norm": 0.09999151527881622,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 6025
    },
    {
      "epoch": 0.96416,
      "grad_norm": 0.09891584515571594,
      "learning_rate": 0.0001,
      "loss": 0.3191,
      "step": 6026
    },
    {
      "epoch": 0.96432,
      "grad_norm": 0.14396628737449646,
      "learning_rate": 0.0001,
      "loss": 0.3053,
      "step": 6027
    },
    {
      "epoch": 0.96448,
      "grad_norm": 0.09724709391593933,
      "learning_rate": 0.0001,
      "loss": 0.3271,
      "step": 6028
    },
    {
      "epoch": 0.96464,
      "grad_norm": 0.14167004823684692,
      "learning_rate": 0.0001,
      "loss": 0.3185,
      "step": 6029
    },
    {
      "epoch": 0.9648,
      "grad_norm": 0.12821541726589203,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6030
    },
    {
      "epoch": 0.96496,
      "grad_norm": 0.19745169579982758,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6031
    },
    {
      "epoch": 0.96512,
      "grad_norm": 0.10796940326690674,
      "learning_rate": 0.0001,
      "loss": 0.3028,
      "step": 6032
    },
    {
      "epoch": 0.96528,
      "grad_norm": 0.12937311828136444,
      "learning_rate": 0.0001,
      "loss": 0.3228,
      "step": 6033
    },
    {
      "epoch": 0.96544,
      "grad_norm": 0.17331121861934662,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 6034
    },
    {
      "epoch": 0.9656,
      "grad_norm": 0.09730671346187592,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 6035
    },
    {
      "epoch": 0.96576,
      "grad_norm": 0.13772594928741455,
      "learning_rate": 0.0001,
      "loss": 0.3219,
      "step": 6036
    },
    {
      "epoch": 0.96592,
      "grad_norm": 0.12352446466684341,
      "learning_rate": 0.0001,
      "loss": 0.3141,
      "step": 6037
    },
    {
      "epoch": 0.96608,
      "grad_norm": 0.09901843965053558,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 6038
    },
    {
      "epoch": 0.96624,
      "grad_norm": 0.08624832332134247,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 6039
    },
    {
      "epoch": 0.9664,
      "grad_norm": 0.10678897798061371,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 6040
    },
    {
      "epoch": 0.96656,
      "grad_norm": 0.08598915487527847,
      "learning_rate": 0.0001,
      "loss": 0.3226,
      "step": 6041
    },
    {
      "epoch": 0.96672,
      "grad_norm": 0.07782559096813202,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 6042
    },
    {
      "epoch": 0.96688,
      "grad_norm": 0.0905391052365303,
      "learning_rate": 0.0001,
      "loss": 0.3184,
      "step": 6043
    },
    {
      "epoch": 0.96704,
      "grad_norm": 0.09097809344530106,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 6044
    },
    {
      "epoch": 0.9672,
      "grad_norm": 0.08916568756103516,
      "learning_rate": 0.0001,
      "loss": 0.3153,
      "step": 6045
    },
    {
      "epoch": 0.96736,
      "grad_norm": 0.12956932187080383,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6046
    },
    {
      "epoch": 0.96752,
      "grad_norm": 0.09616968780755997,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 6047
    },
    {
      "epoch": 0.96768,
      "grad_norm": 0.11997243762016296,
      "learning_rate": 0.0001,
      "loss": 0.3116,
      "step": 6048
    },
    {
      "epoch": 0.96784,
      "grad_norm": 0.08579380810260773,
      "learning_rate": 0.0001,
      "loss": 0.3212,
      "step": 6049
    },
    {
      "epoch": 0.968,
      "grad_norm": 0.11961778253316879,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 6050
    },
    {
      "epoch": 0.96816,
      "grad_norm": 0.10508653521537781,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 6051
    },
    {
      "epoch": 0.96832,
      "grad_norm": 0.0961785539984703,
      "learning_rate": 0.0001,
      "loss": 0.3354,
      "step": 6052
    },
    {
      "epoch": 0.96848,
      "grad_norm": 0.1007697656750679,
      "learning_rate": 0.0001,
      "loss": 0.3366,
      "step": 6053
    },
    {
      "epoch": 0.96864,
      "grad_norm": 0.12850509583950043,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6054
    },
    {
      "epoch": 0.9688,
      "grad_norm": 0.08451370149850845,
      "learning_rate": 0.0001,
      "loss": 0.3272,
      "step": 6055
    },
    {
      "epoch": 0.96896,
      "grad_norm": 0.07795635610818863,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "step": 6056
    },
    {
      "epoch": 0.96912,
      "grad_norm": 0.09388439357280731,
      "learning_rate": 0.0001,
      "loss": 0.3137,
      "step": 6057
    },
    {
      "epoch": 0.96928,
      "grad_norm": 0.08124735951423645,
      "learning_rate": 0.0001,
      "loss": 0.3277,
      "step": 6058
    },
    {
      "epoch": 0.96944,
      "grad_norm": 0.09239117801189423,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 6059
    },
    {
      "epoch": 0.9696,
      "grad_norm": 0.08060452342033386,
      "learning_rate": 0.0001,
      "loss": 0.3241,
      "step": 6060
    },
    {
      "epoch": 0.96976,
      "grad_norm": 0.07844451069831848,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 6061
    },
    {
      "epoch": 0.96992,
      "grad_norm": 0.09738610684871674,
      "learning_rate": 0.0001,
      "loss": 0.3303,
      "step": 6062
    },
    {
      "epoch": 0.97008,
      "grad_norm": 0.1052205041050911,
      "learning_rate": 0.0001,
      "loss": 0.3112,
      "step": 6063
    },
    {
      "epoch": 0.97024,
      "grad_norm": 0.09529343247413635,
      "learning_rate": 0.0001,
      "loss": 0.3009,
      "step": 6064
    },
    {
      "epoch": 0.9704,
      "grad_norm": 0.07704504579305649,
      "learning_rate": 0.0001,
      "loss": 0.3199,
      "step": 6065
    },
    {
      "epoch": 0.97056,
      "grad_norm": 0.08571379631757736,
      "learning_rate": 0.0001,
      "loss": 0.3194,
      "step": 6066
    },
    {
      "epoch": 0.97072,
      "grad_norm": 0.08915655314922333,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 6067
    },
    {
      "epoch": 0.97088,
      "grad_norm": 0.08566360175609589,
      "learning_rate": 0.0001,
      "loss": 0.3176,
      "step": 6068
    },
    {
      "epoch": 0.97104,
      "grad_norm": 0.08238441497087479,
      "learning_rate": 0.0001,
      "loss": 0.3069,
      "step": 6069
    },
    {
      "epoch": 0.9712,
      "grad_norm": 0.08686460554599762,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 6070
    },
    {
      "epoch": 0.97136,
      "grad_norm": 0.07266097515821457,
      "learning_rate": 0.0001,
      "loss": 0.3039,
      "step": 6071
    },
    {
      "epoch": 0.97152,
      "grad_norm": 0.0917883962392807,
      "learning_rate": 0.0001,
      "loss": 0.3032,
      "step": 6072
    },
    {
      "epoch": 0.97168,
      "grad_norm": 0.07837457954883575,
      "learning_rate": 0.0001,
      "loss": 0.3288,
      "step": 6073
    },
    {
      "epoch": 0.97184,
      "grad_norm": 0.08504591137170792,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 6074
    },
    {
      "epoch": 0.972,
      "grad_norm": 0.07853404432535172,
      "learning_rate": 0.0001,
      "loss": 0.3203,
      "step": 6075
    },
    {
      "epoch": 0.97216,
      "grad_norm": 0.08456028997898102,
      "learning_rate": 0.0001,
      "loss": 0.3239,
      "step": 6076
    },
    {
      "epoch": 0.97232,
      "grad_norm": 0.07589658349752426,
      "learning_rate": 0.0001,
      "loss": 0.3218,
      "step": 6077
    },
    {
      "epoch": 0.97248,
      "grad_norm": 0.08795267343521118,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 6078
    },
    {
      "epoch": 0.97264,
      "grad_norm": 0.0843781977891922,
      "learning_rate": 0.0001,
      "loss": 0.3088,
      "step": 6079
    },
    {
      "epoch": 0.9728,
      "grad_norm": 0.09512197226285934,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 6080
    },
    {
      "epoch": 0.97296,
      "grad_norm": 0.09029458463191986,
      "learning_rate": 0.0001,
      "loss": 0.3101,
      "step": 6081
    },
    {
      "epoch": 0.97312,
      "grad_norm": 0.08440206944942474,
      "learning_rate": 0.0001,
      "loss": 0.3278,
      "step": 6082
    },
    {
      "epoch": 0.97328,
      "grad_norm": 0.08644735813140869,
      "learning_rate": 0.0001,
      "loss": 0.3305,
      "step": 6083
    },
    {
      "epoch": 0.97344,
      "grad_norm": 0.10221734642982483,
      "learning_rate": 0.0001,
      "loss": 0.3085,
      "step": 6084
    },
    {
      "epoch": 0.9736,
      "grad_norm": 0.1018296331167221,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "step": 6085
    },
    {
      "epoch": 0.97376,
      "grad_norm": 0.0838622897863388,
      "learning_rate": 0.0001,
      "loss": 0.3266,
      "step": 6086
    },
    {
      "epoch": 0.97392,
      "grad_norm": 0.08847519755363464,
      "learning_rate": 0.0001,
      "loss": 0.3177,
      "step": 6087
    },
    {
      "epoch": 0.97408,
      "grad_norm": 0.09029752761125565,
      "learning_rate": 0.0001,
      "loss": 0.3364,
      "step": 6088
    },
    {
      "epoch": 0.97424,
      "grad_norm": 0.10998311638832092,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 6089
    },
    {
      "epoch": 0.9744,
      "grad_norm": 0.09148372709751129,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 6090
    },
    {
      "epoch": 0.97456,
      "grad_norm": 0.08869180083274841,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 6091
    },
    {
      "epoch": 0.97472,
      "grad_norm": 0.0954744815826416,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 6092
    },
    {
      "epoch": 0.97488,
      "grad_norm": 0.08961150050163269,
      "learning_rate": 0.0001,
      "loss": 0.3084,
      "step": 6093
    },
    {
      "epoch": 0.97504,
      "grad_norm": 0.0782371535897255,
      "learning_rate": 0.0001,
      "loss": 0.3037,
      "step": 6094
    },
    {
      "epoch": 0.9752,
      "grad_norm": 0.08251526206731796,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 6095
    },
    {
      "epoch": 0.97536,
      "grad_norm": 0.09115887433290482,
      "learning_rate": 0.0001,
      "loss": 0.3095,
      "step": 6096
    },
    {
      "epoch": 0.97552,
      "grad_norm": 0.08716315031051636,
      "learning_rate": 0.0001,
      "loss": 0.2973,
      "step": 6097
    },
    {
      "epoch": 0.97568,
      "grad_norm": 0.09093058109283447,
      "learning_rate": 0.0001,
      "loss": 0.3264,
      "step": 6098
    },
    {
      "epoch": 0.97584,
      "grad_norm": 0.08366470783948898,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 6099
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.07187218964099884,
      "learning_rate": 0.0001,
      "loss": 0.3072,
      "step": 6100
    },
    {
      "epoch": 0.976,
      "eval_train_accuracy": 1.0,
      "eval_train_loss": 0.3177967667579651,
      "eval_train_runtime": 4.6096,
      "eval_train_samples_per_second": 1084.685,
      "eval_train_steps_per_second": 13.667,
      "step": 6100
    },
    {
      "epoch": 0.976,
      "eval_test_accuracy": 1.0,
      "eval_test_loss": 0.3167685270309448,
      "eval_test_runtime": 5.0856,
      "eval_test_samples_per_second": 983.172,
      "eval_test_steps_per_second": 12.388,
      "step": 6100
    },
    {
      "epoch": 0.97616,
      "grad_norm": 0.09505581110715866,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 6101
    },
    {
      "epoch": 0.97632,
      "grad_norm": 0.09982280433177948,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 6102
    },
    {
      "epoch": 0.97648,
      "grad_norm": 0.08175292611122131,
      "learning_rate": 0.0001,
      "loss": 0.3078,
      "step": 6103
    },
    {
      "epoch": 0.97664,
      "grad_norm": 0.09066876769065857,
      "learning_rate": 0.0001,
      "loss": 0.3237,
      "step": 6104
    },
    {
      "epoch": 0.9768,
      "grad_norm": 0.09191980957984924,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 6105
    },
    {
      "epoch": 0.97696,
      "grad_norm": 0.09841528534889221,
      "learning_rate": 0.0001,
      "loss": 0.3276,
      "step": 6106
    },
    {
      "epoch": 0.97712,
      "grad_norm": 0.08111461251974106,
      "learning_rate": 0.0001,
      "loss": 0.3162,
      "step": 6107
    },
    {
      "epoch": 0.97728,
      "grad_norm": 0.07878833264112473,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 6108
    },
    {
      "epoch": 0.97744,
      "grad_norm": 0.08988095074892044,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 6109
    },
    {
      "epoch": 0.9776,
      "grad_norm": 0.09609858691692352,
      "learning_rate": 0.0001,
      "loss": 0.3187,
      "step": 6110
    },
    {
      "epoch": 0.97776,
      "grad_norm": 0.09286114573478699,
      "learning_rate": 0.0001,
      "loss": 0.3183,
      "step": 6111
    },
    {
      "epoch": 0.97792,
      "grad_norm": 0.10665607452392578,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 6112
    },
    {
      "epoch": 0.97808,
      "grad_norm": 0.09962426126003265,
      "learning_rate": 0.0001,
      "loss": 0.3045,
      "step": 6113
    },
    {
      "epoch": 0.97824,
      "grad_norm": 0.09837494790554047,
      "learning_rate": 0.0001,
      "loss": 0.3234,
      "step": 6114
    },
    {
      "epoch": 0.9784,
      "grad_norm": 0.08704091608524323,
      "learning_rate": 0.0001,
      "loss": 0.3169,
      "step": 6115
    },
    {
      "epoch": 0.97856,
      "grad_norm": 0.07600941509008408,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 6116
    },
    {
      "epoch": 0.97872,
      "grad_norm": 0.0966196209192276,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 6117
    },
    {
      "epoch": 0.97888,
      "grad_norm": 0.09558391571044922,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 6118
    },
    {
      "epoch": 0.97904,
      "grad_norm": 0.09562348574399948,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 6119
    },
    {
      "epoch": 0.9792,
      "grad_norm": 0.08349063247442245,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 6120
    },
    {
      "epoch": 0.97936,
      "grad_norm": 0.08653650432825089,
      "learning_rate": 0.0001,
      "loss": 0.3124,
      "step": 6121
    },
    {
      "epoch": 0.97952,
      "grad_norm": 0.12277454137802124,
      "learning_rate": 0.0001,
      "loss": 0.3222,
      "step": 6122
    },
    {
      "epoch": 0.97968,
      "grad_norm": 0.09696561843156815,
      "learning_rate": 0.0001,
      "loss": 0.3156,
      "step": 6123
    },
    {
      "epoch": 0.97984,
      "grad_norm": 0.10757189989089966,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 6124
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.07903209328651428,
      "learning_rate": 0.0001,
      "loss": 0.3167,
      "step": 6125
    },
    {
      "epoch": 0.98016,
      "grad_norm": 0.08592557162046432,
      "learning_rate": 0.0001,
      "loss": 0.3066,
      "step": 6126
    },
    {
      "epoch": 0.98032,
      "grad_norm": 0.09308972209692001,
      "learning_rate": 0.0001,
      "loss": 0.319,
      "step": 6127
    },
    {
      "epoch": 0.98048,
      "grad_norm": 0.0913030281662941,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 6128
    },
    {
      "epoch": 0.98064,
      "grad_norm": 0.11248878389596939,
      "learning_rate": 0.0001,
      "loss": 0.3229,
      "step": 6129
    },
    {
      "epoch": 0.9808,
      "grad_norm": 0.09251877665519714,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 6130
    },
    {
      "epoch": 0.98096,
      "grad_norm": 0.08137056976556778,
      "learning_rate": 0.0001,
      "loss": 0.3265,
      "step": 6131
    },
    {
      "epoch": 0.98112,
      "grad_norm": 0.07897459715604782,
      "learning_rate": 0.0001,
      "loss": 0.3024,
      "step": 6132
    },
    {
      "epoch": 0.98128,
      "grad_norm": 0.08412577956914902,
      "learning_rate": 0.0001,
      "loss": 0.3233,
      "step": 6133
    },
    {
      "epoch": 0.98144,
      "grad_norm": 0.07391072809696198,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 6134
    },
    {
      "epoch": 0.9816,
      "grad_norm": 0.09912022203207016,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6135
    },
    {
      "epoch": 0.98176,
      "grad_norm": 0.08832069486379623,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 6136
    },
    {
      "epoch": 0.98192,
      "grad_norm": 0.10168633610010147,
      "learning_rate": 0.0001,
      "loss": 0.321,
      "step": 6137
    },
    {
      "epoch": 0.98208,
      "grad_norm": 0.10042863339185715,
      "learning_rate": 0.0001,
      "loss": 0.3216,
      "step": 6138
    },
    {
      "epoch": 0.98224,
      "grad_norm": 0.09837277978658676,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6139
    },
    {
      "epoch": 0.9824,
      "grad_norm": 0.09446047991514206,
      "learning_rate": 0.0001,
      "loss": 0.3333,
      "step": 6140
    },
    {
      "epoch": 0.98256,
      "grad_norm": 0.0889773964881897,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6141
    },
    {
      "epoch": 0.98272,
      "grad_norm": 0.08914550393819809,
      "learning_rate": 0.0001,
      "loss": 0.3324,
      "step": 6142
    },
    {
      "epoch": 0.98288,
      "grad_norm": 0.09978744387626648,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 6143
    },
    {
      "epoch": 0.98304,
      "grad_norm": 0.09397321939468384,
      "learning_rate": 0.0001,
      "loss": 0.3035,
      "step": 6144
    },
    {
      "epoch": 0.9832,
      "grad_norm": 0.09726536273956299,
      "learning_rate": 0.0001,
      "loss": 0.316,
      "step": 6145
    },
    {
      "epoch": 0.98336,
      "grad_norm": 0.0873497873544693,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 6146
    },
    {
      "epoch": 0.98352,
      "grad_norm": 0.08071915805339813,
      "learning_rate": 0.0001,
      "loss": 0.3003,
      "step": 6147
    },
    {
      "epoch": 0.98368,
      "grad_norm": 0.10641004145145416,
      "learning_rate": 0.0001,
      "loss": 0.3157,
      "step": 6148
    },
    {
      "epoch": 0.98384,
      "grad_norm": 0.08731212466955185,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 6149
    },
    {
      "epoch": 0.984,
      "grad_norm": 0.08529006689786911,
      "learning_rate": 0.0001,
      "loss": 0.3295,
      "step": 6150
    },
    {
      "epoch": 0.98416,
      "grad_norm": 0.08853036165237427,
      "learning_rate": 0.0001,
      "loss": 0.3175,
      "step": 6151
    },
    {
      "epoch": 0.98432,
      "grad_norm": 0.07462329417467117,
      "learning_rate": 0.0001,
      "loss": 0.3246,
      "step": 6152
    },
    {
      "epoch": 0.98448,
      "grad_norm": 0.08737123012542725,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 6153
    },
    {
      "epoch": 0.98464,
      "grad_norm": 0.09490329027175903,
      "learning_rate": 0.0001,
      "loss": 0.3286,
      "step": 6154
    },
    {
      "epoch": 0.9848,
      "grad_norm": 0.08048555999994278,
      "learning_rate": 0.0001,
      "loss": 0.2976,
      "step": 6155
    },
    {
      "epoch": 0.98496,
      "grad_norm": 0.1037028506398201,
      "learning_rate": 0.0001,
      "loss": 0.3394,
      "step": 6156
    },
    {
      "epoch": 0.98512,
      "grad_norm": 0.08986812084913254,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 6157
    },
    {
      "epoch": 0.98528,
      "grad_norm": 0.08566956222057343,
      "learning_rate": 0.0001,
      "loss": 0.3024,
      "step": 6158
    },
    {
      "epoch": 0.98544,
      "grad_norm": 0.08067505806684494,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 6159
    },
    {
      "epoch": 0.9856,
      "grad_norm": 0.08437549322843552,
      "learning_rate": 0.0001,
      "loss": 0.3139,
      "step": 6160
    },
    {
      "epoch": 0.98576,
      "grad_norm": 0.11088725924491882,
      "learning_rate": 0.0001,
      "loss": 0.3292,
      "step": 6161
    },
    {
      "epoch": 0.98592,
      "grad_norm": 0.09054724127054214,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 6162
    },
    {
      "epoch": 0.98608,
      "grad_norm": 0.07737773656845093,
      "learning_rate": 0.0001,
      "loss": 0.3029,
      "step": 6163
    },
    {
      "epoch": 0.98624,
      "grad_norm": 0.07715108245611191,
      "learning_rate": 0.0001,
      "loss": 0.3369,
      "step": 6164
    },
    {
      "epoch": 0.9864,
      "grad_norm": 0.08422770351171494,
      "learning_rate": 0.0001,
      "loss": 0.3133,
      "step": 6165
    },
    {
      "epoch": 0.98656,
      "grad_norm": 0.09240544587373734,
      "learning_rate": 0.0001,
      "loss": 0.3215,
      "step": 6166
    },
    {
      "epoch": 0.98672,
      "grad_norm": 0.10217367857694626,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 6167
    },
    {
      "epoch": 0.98688,
      "grad_norm": 0.08593204617500305,
      "learning_rate": 0.0001,
      "loss": 0.324,
      "step": 6168
    },
    {
      "epoch": 0.98704,
      "grad_norm": 0.10601739585399628,
      "learning_rate": 0.0001,
      "loss": 0.3131,
      "step": 6169
    },
    {
      "epoch": 0.9872,
      "grad_norm": 0.07301807403564453,
      "learning_rate": 0.0001,
      "loss": 0.3016,
      "step": 6170
    },
    {
      "epoch": 0.98736,
      "grad_norm": 0.09335695207118988,
      "learning_rate": 0.0001,
      "loss": 0.3198,
      "step": 6171
    },
    {
      "epoch": 0.98752,
      "grad_norm": 0.08978022634983063,
      "learning_rate": 0.0001,
      "loss": 0.3267,
      "step": 6172
    },
    {
      "epoch": 0.98768,
      "grad_norm": 0.1065535619854927,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 6173
    },
    {
      "epoch": 0.98784,
      "grad_norm": 0.07844308018684387,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 6174
    },
    {
      "epoch": 0.988,
      "grad_norm": 0.08144239336252213,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6175
    },
    {
      "epoch": 0.98816,
      "grad_norm": 0.08956369012594223,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 6176
    },
    {
      "epoch": 0.98832,
      "grad_norm": 0.08234377950429916,
      "learning_rate": 0.0001,
      "loss": 0.3146,
      "step": 6177
    },
    {
      "epoch": 0.98848,
      "grad_norm": 0.11053300648927689,
      "learning_rate": 0.0001,
      "loss": 0.3252,
      "step": 6178
    },
    {
      "epoch": 0.98864,
      "grad_norm": 0.09087154269218445,
      "learning_rate": 0.0001,
      "loss": 0.3299,
      "step": 6179
    },
    {
      "epoch": 0.9888,
      "grad_norm": 0.07581128925085068,
      "learning_rate": 0.0001,
      "loss": 0.3092,
      "step": 6180
    },
    {
      "epoch": 0.98896,
      "grad_norm": 0.10624229162931442,
      "learning_rate": 0.0001,
      "loss": 0.3113,
      "step": 6181
    },
    {
      "epoch": 0.98912,
      "grad_norm": 0.08416386693716049,
      "learning_rate": 0.0001,
      "loss": 0.3298,
      "step": 6182
    },
    {
      "epoch": 0.98928,
      "grad_norm": 0.09045441448688507,
      "learning_rate": 0.0001,
      "loss": 0.3201,
      "step": 6183
    },
    {
      "epoch": 0.98944,
      "grad_norm": 0.09148074686527252,
      "learning_rate": 0.0001,
      "loss": 0.3145,
      "step": 6184
    },
    {
      "epoch": 0.9896,
      "grad_norm": 0.09030362218618393,
      "learning_rate": 0.0001,
      "loss": 0.3207,
      "step": 6185
    },
    {
      "epoch": 0.98976,
      "grad_norm": 0.1213812381029129,
      "learning_rate": 0.0001,
      "loss": 0.3144,
      "step": 6186
    },
    {
      "epoch": 0.98992,
      "grad_norm": 0.09487131983041763,
      "learning_rate": 0.0001,
      "loss": 0.3352,
      "step": 6187
    },
    {
      "epoch": 0.99008,
      "grad_norm": 0.08423824608325958,
      "learning_rate": 0.0001,
      "loss": 0.3221,
      "step": 6188
    },
    {
      "epoch": 0.99024,
      "grad_norm": 0.11404532939195633,
      "learning_rate": 0.0001,
      "loss": 0.3235,
      "step": 6189
    },
    {
      "epoch": 0.9904,
      "grad_norm": 0.0901913270354271,
      "learning_rate": 0.0001,
      "loss": 0.3158,
      "step": 6190
    },
    {
      "epoch": 0.99056,
      "grad_norm": 0.12315334379673004,
      "learning_rate": 0.0001,
      "loss": 0.3257,
      "step": 6191
    },
    {
      "epoch": 0.99072,
      "grad_norm": 0.0827651172876358,
      "learning_rate": 0.0001,
      "loss": 0.3026,
      "step": 6192
    },
    {
      "epoch": 0.99088,
      "grad_norm": 0.0959140732884407,
      "learning_rate": 0.0001,
      "loss": 0.3126,
      "step": 6193
    },
    {
      "epoch": 0.99104,
      "grad_norm": 0.09819387644529343,
      "learning_rate": 0.0001,
      "loss": 0.3217,
      "step": 6194
    },
    {
      "epoch": 0.9912,
      "grad_norm": 0.10179852694272995,
      "learning_rate": 0.0001,
      "loss": 0.3425,
      "step": 6195
    },
    {
      "epoch": 0.99136,
      "grad_norm": 0.09226756542921066,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 6196
    },
    {
      "epoch": 0.99152,
      "grad_norm": 0.08662654459476471,
      "learning_rate": 0.0001,
      "loss": 0.315,
      "step": 6197
    },
    {
      "epoch": 0.99168,
      "grad_norm": 0.09282901883125305,
      "learning_rate": 0.0001,
      "loss": 0.3166,
      "step": 6198
    },
    {
      "epoch": 0.99184,
      "grad_norm": 0.08272334933280945,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "step": 6199
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.08286717534065247,
      "learning_rate": 0.0001,
      "loss": 0.31,
      "step": 6200
    },
    {
      "epoch": 0.992,
      "eval_train_accuracy": 1.0,
      "eval_train_loss": 0.3177495002746582,
      "eval_train_runtime": 4.6714,
      "eval_train_samples_per_second": 1070.339,
      "eval_train_steps_per_second": 13.486,
      "step": 6200
    },
    {
      "epoch": 0.992,
      "eval_test_accuracy": 0.9998,
      "eval_test_loss": 0.31651318073272705,
      "eval_test_runtime": 4.8909,
      "eval_test_samples_per_second": 1022.308,
      "eval_test_steps_per_second": 12.881,
      "step": 6200
    },
    {
      "epoch": 0.99216,
      "grad_norm": 0.08700265735387802,
      "learning_rate": 0.0001,
      "loss": 0.3093,
      "step": 6201
    },
    {
      "epoch": 0.99232,
      "grad_norm": 0.09006032347679138,
      "learning_rate": 0.0001,
      "loss": 0.3255,
      "step": 6202
    },
    {
      "epoch": 0.99248,
      "grad_norm": 0.07712739706039429,
      "learning_rate": 0.0001,
      "loss": 0.3179,
      "step": 6203
    },
    {
      "epoch": 0.99264,
      "grad_norm": 0.11609874665737152,
      "learning_rate": 0.0001,
      "loss": 0.3159,
      "step": 6204
    },
    {
      "epoch": 0.9928,
      "grad_norm": 0.09907791018486023,
      "learning_rate": 0.0001,
      "loss": 0.3196,
      "step": 6205
    },
    {
      "epoch": 0.99296,
      "grad_norm": 0.0763474777340889,
      "learning_rate": 0.0001,
      "loss": 0.3148,
      "step": 6206
    },
    {
      "epoch": 0.99312,
      "grad_norm": 0.07995226234197617,
      "learning_rate": 0.0001,
      "loss": 0.3058,
      "step": 6207
    },
    {
      "epoch": 0.99328,
      "grad_norm": 0.08185773342847824,
      "learning_rate": 0.0001,
      "loss": 0.3224,
      "step": 6208
    },
    {
      "epoch": 0.99344,
      "grad_norm": 0.13363249599933624,
      "learning_rate": 0.0001,
      "loss": 0.3094,
      "step": 6209
    },
    {
      "epoch": 0.9936,
      "grad_norm": 0.1054527536034584,
      "learning_rate": 0.0001,
      "loss": 0.3214,
      "step": 6210
    },
    {
      "epoch": 0.99376,
      "grad_norm": 0.09381313621997833,
      "learning_rate": 0.0001,
      "loss": 0.3109,
      "step": 6211
    },
    {
      "epoch": 0.99392,
      "grad_norm": 0.0803767517209053,
      "learning_rate": 0.0001,
      "loss": 0.3163,
      "step": 6212
    },
    {
      "epoch": 0.99408,
      "grad_norm": 0.0958738774061203,
      "learning_rate": 0.0001,
      "loss": 0.3263,
      "step": 6213
    },
    {
      "epoch": 0.99424,
      "grad_norm": 0.08453277498483658,
      "learning_rate": 0.0001,
      "loss": 0.3195,
      "step": 6214
    },
    {
      "epoch": 0.9944,
      "grad_norm": 0.09216673672199249,
      "learning_rate": 0.0001,
      "loss": 0.3172,
      "step": 6215
    },
    {
      "epoch": 0.99456,
      "grad_norm": 0.08431723713874817,
      "learning_rate": 0.0001,
      "loss": 0.3283,
      "step": 6216
    },
    {
      "epoch": 0.99472,
      "grad_norm": 0.08705800026655197,
      "learning_rate": 0.0001,
      "loss": 0.322,
      "step": 6217
    },
    {
      "epoch": 0.99488,
      "grad_norm": 0.10362721979618073,
      "learning_rate": 0.0001,
      "loss": 0.333,
      "step": 6218
    },
    {
      "epoch": 0.99504,
      "grad_norm": 0.08714055269956589,
      "learning_rate": 0.0001,
      "loss": 0.3188,
      "step": 6219
    },
    {
      "epoch": 0.9952,
      "grad_norm": 0.08566796779632568,
      "learning_rate": 0.0001,
      "loss": 0.3103,
      "step": 6220
    },
    {
      "epoch": 0.99536,
      "grad_norm": 0.08389200270175934,
      "learning_rate": 0.0001,
      "loss": 0.3147,
      "step": 6221
    },
    {
      "epoch": 0.99552,
      "grad_norm": 0.08323664963245392,
      "learning_rate": 0.0001,
      "loss": 0.3243,
      "step": 6222
    },
    {
      "epoch": 0.99568,
      "grad_norm": 0.10584814101457596,
      "learning_rate": 0.0001,
      "loss": 0.3134,
      "step": 6223
    },
    {
      "epoch": 0.99584,
      "grad_norm": 0.10928492248058319,
      "learning_rate": 0.0001,
      "loss": 0.3204,
      "step": 6224
    },
    {
      "epoch": 0.996,
      "grad_norm": 0.07801264524459839,
      "learning_rate": 0.0001,
      "loss": 0.329,
      "step": 6225
    },
    {
      "epoch": 0.99616,
      "grad_norm": 0.08036686480045319,
      "learning_rate": 0.0001,
      "loss": 0.3281,
      "step": 6226
    },
    {
      "epoch": 0.99632,
      "grad_norm": 0.08789928257465363,
      "learning_rate": 0.0001,
      "loss": 0.3111,
      "step": 6227
    },
    {
      "epoch": 0.99648,
      "grad_norm": 0.0909661054611206,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6228
    },
    {
      "epoch": 0.99664,
      "grad_norm": 0.09780367463827133,
      "learning_rate": 0.0001,
      "loss": 0.3284,
      "step": 6229
    },
    {
      "epoch": 0.9968,
      "grad_norm": 0.10678248107433319,
      "learning_rate": 0.0001,
      "loss": 0.327,
      "step": 6230
    },
    {
      "epoch": 0.99696,
      "grad_norm": 0.08733727782964706,
      "learning_rate": 0.0001,
      "loss": 0.3282,
      "step": 6231
    },
    {
      "epoch": 0.99712,
      "grad_norm": 0.08993217349052429,
      "learning_rate": 0.0001,
      "loss": 0.3254,
      "step": 6232
    },
    {
      "epoch": 0.99728,
      "grad_norm": 0.08096586912870407,
      "learning_rate": 0.0001,
      "loss": 0.3197,
      "step": 6233
    },
    {
      "epoch": 0.99744,
      "grad_norm": 0.08171314001083374,
      "learning_rate": 0.0001,
      "loss": 0.3189,
      "step": 6234
    },
    {
      "epoch": 0.9976,
      "grad_norm": 0.09057450294494629,
      "learning_rate": 0.0001,
      "loss": 0.3223,
      "step": 6235
    },
    {
      "epoch": 0.99776,
      "grad_norm": 0.07918648421764374,
      "learning_rate": 0.0001,
      "loss": 0.3104,
      "step": 6236
    },
    {
      "epoch": 0.99792,
      "grad_norm": 0.08604142814874649,
      "learning_rate": 0.0001,
      "loss": 0.312,
      "step": 6237
    },
    {
      "epoch": 0.99808,
      "grad_norm": 0.09373418986797333,
      "learning_rate": 0.0001,
      "loss": 0.3225,
      "step": 6238
    },
    {
      "epoch": 0.99824,
      "grad_norm": 0.07979341596364975,
      "learning_rate": 0.0001,
      "loss": 0.3231,
      "step": 6239
    },
    {
      "epoch": 0.9984,
      "grad_norm": 0.08162809908390045,
      "learning_rate": 0.0001,
      "loss": 0.3138,
      "step": 6240
    },
    {
      "epoch": 0.99856,
      "grad_norm": 0.07993357628583908,
      "learning_rate": 0.0001,
      "loss": 0.3108,
      "step": 6241
    },
    {
      "epoch": 0.99872,
      "grad_norm": 0.09486144036054611,
      "learning_rate": 0.0001,
      "loss": 0.3304,
      "step": 6242
    },
    {
      "epoch": 0.99888,
      "grad_norm": 0.09023323655128479,
      "learning_rate": 0.0001,
      "loss": 0.3242,
      "step": 6243
    },
    {
      "epoch": 0.99904,
      "grad_norm": 0.08527602255344391,
      "learning_rate": 0.0001,
      "loss": 0.3301,
      "step": 6244
    },
    {
      "epoch": 0.9992,
      "grad_norm": 0.08250780403614044,
      "learning_rate": 0.0001,
      "loss": 0.3164,
      "step": 6245
    },
    {
      "epoch": 0.99936,
      "grad_norm": 0.10621356219053268,
      "learning_rate": 0.0001,
      "loss": 0.3008,
      "step": 6246
    },
    {
      "epoch": 0.99952,
      "grad_norm": 0.08411465585231781,
      "learning_rate": 0.0001,
      "loss": 0.3171,
      "step": 6247
    },
    {
      "epoch": 0.99968,
      "grad_norm": 0.08628972619771957,
      "learning_rate": 0.0001,
      "loss": 0.3074,
      "step": 6248
    },
    {
      "epoch": 0.99984,
      "grad_norm": 0.09350275993347168,
      "learning_rate": 0.0001,
      "loss": 0.3127,
      "step": 6249
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.09219013154506683,
      "learning_rate": 0.0001,
      "loss": 0.3209,
      "step": 6250
    },
    {
      "before_init_mem_cpu": 1301233664,
      "before_init_mem_gpu": 596095488,
      "epoch": 1.0,
      "init_mem_cpu_alloc_delta": 655360,
      "init_mem_cpu_peaked_delta": 0,
      "init_mem_gpu_alloc_delta": 0,
      "init_mem_gpu_peaked_delta": 0,
      "step": 6250,
      "total_flos": 6.53232308224e+16,
      "train_loss": 0.3671976668977737,
      "train_mem_cpu_alloc_delta": 1977155584,
      "train_mem_cpu_peaked_delta": 623603712,
      "train_mem_gpu_alloc_delta": 1540037632,
      "train_mem_gpu_peaked_delta": 5834735616,
      "train_runtime": 3502.8638,
      "train_samples_per_second": 142.74,
      "train_steps_per_second": 1.784
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.53232308224e+16,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}
