{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1000.0,
  "eval_steps": 100,
  "global_step": 13000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.77,
      "learning_rate": 1e-05,
      "loss": 3.1995,
      "step": 10
    },
    {
      "epoch": 1.54,
      "learning_rate": 2e-05,
      "loss": 2.9306,
      "step": 20
    },
    {
      "epoch": 2.31,
      "learning_rate": 3e-05,
      "loss": 2.7335,
      "step": 30
    },
    {
      "epoch": 3.08,
      "learning_rate": 4e-05,
      "loss": 2.4573,
      "step": 40
    },
    {
      "epoch": 3.85,
      "learning_rate": 5e-05,
      "loss": 2.3312,
      "step": 50
    },
    {
      "epoch": 4.62,
      "learning_rate": 6e-05,
      "loss": 2.2974,
      "step": 60
    },
    {
      "epoch": 5.38,
      "learning_rate": 7e-05,
      "loss": 2.3354,
      "step": 70
    },
    {
      "epoch": 6.15,
      "learning_rate": 8e-05,
      "loss": 2.0716,
      "step": 80
    },
    {
      "epoch": 6.92,
      "learning_rate": 9e-05,
      "loss": 2.1392,
      "step": 90
    },
    {
      "epoch": 7.69,
      "learning_rate": 0.0001,
      "loss": 1.9663,
      "step": 100
    },
    {
      "epoch": 7.69,
      "eval_valid_eval_loss": 3.184727668762207,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 24.160707473754883,
      "eval_valid_eval_perplexity_res": 25.83807373046875,
      "eval_valid_eval_perplexity_seq": 24.160707473754883,
      "eval_valid_eval_reconstruction": 0.08974359184503555,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 3.184727907180786,
      "eval_valid_runtime": 0.3798,
      "eval_valid_samples_per_second": 2.633,
      "eval_valid_steps_per_second": 2.633,
      "step": 100
    },
    {
      "epoch": 7.69,
      "eval_train_eval_loss": 0.5967535376548767,
      "eval_train_eval_loss_<cls>": 4.587690830230713,
      "eval_train_eval_perplexity_batch": 1.8162130117416382,
      "eval_train_eval_perplexity_res": 5.601595878601074,
      "eval_train_eval_perplexity_seq": 2.285353660583496,
      "eval_train_eval_reconstruction": 0.01989677920937538,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.9410583972930908,
      "eval_train_runtime": 2.0999,
      "eval_train_samples_per_second": 47.144,
      "eval_train_steps_per_second": 6.191,
      "step": 100
    },
    {
      "epoch": 8.46,
      "learning_rate": 9.999983624426623e-05,
      "loss": 1.8955,
      "step": 110
    },
    {
      "epoch": 9.23,
      "learning_rate": 9.999934497813753e-05,
      "loss": 1.8206,
      "step": 120
    },
    {
      "epoch": 10.0,
      "learning_rate": 9.999852620483182e-05,
      "loss": 1.7714,
      "step": 130
    },
    {
      "epoch": 10.77,
      "learning_rate": 9.999737992971226e-05,
      "loss": 1.7012,
      "step": 140
    },
    {
      "epoch": 11.54,
      "learning_rate": 9.99959061602872e-05,
      "loss": 1.5889,
      "step": 150
    },
    {
      "epoch": 12.31,
      "learning_rate": 9.999410490621015e-05,
      "loss": 1.617,
      "step": 160
    },
    {
      "epoch": 13.08,
      "learning_rate": 9.999197617927978e-05,
      "loss": 1.6211,
      "step": 170
    },
    {
      "epoch": 13.85,
      "learning_rate": 9.998951999343973e-05,
      "loss": 1.4576,
      "step": 180
    },
    {
      "epoch": 14.62,
      "learning_rate": 9.998673636477855e-05,
      "loss": 1.6223,
      "step": 190
    },
    {
      "epoch": 15.38,
      "learning_rate": 9.998362531152967e-05,
      "loss": 1.5751,
      "step": 200
    },
    {
      "epoch": 15.38,
      "eval_valid_eval_loss": 2.9770302772521973,
      "eval_valid_eval_loss_<cls>": 5.450567722320557,
      "eval_valid_eval_perplexity_batch": 19.629436492919922,
      "eval_valid_eval_perplexity_res": 22.706436157226562,
      "eval_valid_eval_perplexity_seq": 19.629436492919922,
      "eval_valid_eval_reconstruction": 0.08865979313850403,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 2.9770302772521973,
      "eval_valid_runtime": 0.5354,
      "eval_valid_samples_per_second": 1.868,
      "eval_valid_steps_per_second": 1.868,
      "step": 200
    },
    {
      "epoch": 15.38,
      "eval_train_eval_loss": 0.8168385028839111,
      "eval_train_eval_loss_<cls>": 5.406922817230225,
      "eval_train_eval_perplexity_batch": 2.2633330821990967,
      "eval_train_eval_perplexity_res": 6.875979900360107,
      "eval_train_eval_perplexity_seq": 3.0815701484680176,
      "eval_train_eval_reconstruction": 0.03121259994804859,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.4219367504119873,
      "eval_train_runtime": 0.7891,
      "eval_train_samples_per_second": 125.467,
      "eval_train_steps_per_second": 16.475,
      "step": 200
    },
    {
      "epoch": 16.15,
      "learning_rate": 9.998018685407121e-05,
      "loss": 1.3946,
      "step": 210
    },
    {
      "epoch": 16.92,
      "learning_rate": 9.997642101492584e-05,
      "loss": 1.3534,
      "step": 220
    },
    {
      "epoch": 17.69,
      "learning_rate": 9.997232781876067e-05,
      "loss": 1.4552,
      "step": 230
    },
    {
      "epoch": 18.46,
      "learning_rate": 9.996790729238708e-05,
      "loss": 1.2697,
      "step": 240
    },
    {
      "epoch": 19.23,
      "learning_rate": 9.996315946476053e-05,
      "loss": 1.3098,
      "step": 250
    },
    {
      "epoch": 20.0,
      "learning_rate": 9.995808436698037e-05,
      "loss": 1.3062,
      "step": 260
    },
    {
      "epoch": 20.77,
      "learning_rate": 9.995268203228968e-05,
      "loss": 1.2363,
      "step": 270
    },
    {
      "epoch": 21.54,
      "learning_rate": 9.994695249607497e-05,
      "loss": 1.3981,
      "step": 280
    },
    {
      "epoch": 22.31,
      "learning_rate": 9.9940895795866e-05,
      "loss": 1.3146,
      "step": 290
    },
    {
      "epoch": 23.08,
      "learning_rate": 9.993451197133559e-05,
      "loss": 1.2124,
      "step": 300
    },
    {
      "epoch": 23.08,
      "eval_valid_eval_loss": 2.949476480484009,
      "eval_valid_eval_loss_<cls>": 6.059302806854248,
      "eval_valid_eval_perplexity_batch": 19.09595489501953,
      "eval_valid_eval_perplexity_res": 22.566938400268555,
      "eval_valid_eval_perplexity_seq": 19.09595489501953,
      "eval_valid_eval_reconstruction": 0.08627858757972717,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 2.949476957321167,
      "eval_valid_runtime": 0.3933,
      "eval_valid_samples_per_second": 2.543,
      "eval_valid_steps_per_second": 2.543,
      "step": 300
    },
    {
      "epoch": 23.08,
      "eval_train_eval_loss": 0.5014746189117432,
      "eval_train_eval_loss_<cls>": 5.932294845581055,
      "eval_train_eval_perplexity_batch": 1.6511542797088623,
      "eval_train_eval_perplexity_res": 4.505498886108398,
      "eval_train_eval_perplexity_seq": 1.9813560247421265,
      "eval_train_eval_reconstruction": 0.022077694535255432,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.2741684913635254,
      "eval_train_runtime": 0.7816,
      "eval_train_samples_per_second": 126.669,
      "eval_train_steps_per_second": 16.633,
      "step": 300
    },
    {
      "epoch": 23.85,
      "learning_rate": 9.992780106429922e-05,
      "loss": 1.1472,
      "step": 310
    },
    {
      "epoch": 24.62,
      "learning_rate": 9.99207631187149e-05,
      "loss": 1.3058,
      "step": 320
    },
    {
      "epoch": 25.38,
      "learning_rate": 9.991339818068275e-05,
      "loss": 1.2245,
      "step": 330
    },
    {
      "epoch": 26.15,
      "learning_rate": 9.990570629844482e-05,
      "loss": 1.2774,
      "step": 340
    },
    {
      "epoch": 26.92,
      "learning_rate": 9.989768752238472e-05,
      "loss": 1.0374,
      "step": 350
    },
    {
      "epoch": 27.69,
      "learning_rate": 9.988934190502726e-05,
      "loss": 1.3308,
      "step": 360
    },
    {
      "epoch": 28.46,
      "learning_rate": 9.988066950103813e-05,
      "loss": 1.1718,
      "step": 370
    },
    {
      "epoch": 29.23,
      "learning_rate": 9.987167036722358e-05,
      "loss": 1.4863,
      "step": 380
    },
    {
      "epoch": 30.0,
      "learning_rate": 9.986234456253001e-05,
      "loss": 1.1568,
      "step": 390
    },
    {
      "epoch": 30.77,
      "learning_rate": 9.985269214804357e-05,
      "loss": 1.2642,
      "step": 400
    },
    {
      "epoch": 30.77,
      "eval_valid_eval_loss": 2.9291465282440186,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 18.711654663085938,
      "eval_valid_eval_perplexity_res": 21.712574005126953,
      "eval_valid_eval_perplexity_seq": 18.711654663085938,
      "eval_valid_eval_reconstruction": 0.10683760792016983,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 2.9291465282440186,
      "eval_valid_runtime": 0.3619,
      "eval_valid_samples_per_second": 2.763,
      "eval_valid_steps_per_second": 2.763,
      "step": 400
    },
    {
      "epoch": 30.77,
      "eval_train_eval_loss": 0.5479980111122131,
      "eval_train_eval_loss_<cls>": 6.060770034790039,
      "eval_train_eval_perplexity_batch": 1.7297865152359009,
      "eval_train_eval_perplexity_res": 4.798701763153076,
      "eval_train_eval_perplexity_seq": 2.045001983642578,
      "eval_train_eval_reconstruction": 0.02601097896695137,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.230396032333374,
      "eval_train_runtime": 1.0209,
      "eval_train_samples_per_second": 96.974,
      "eval_train_steps_per_second": 12.734,
      "step": 400
    },
    {
      "epoch": 31.54,
      "learning_rate": 9.98427131869898e-05,
      "loss": 1.3369,
      "step": 410
    },
    {
      "epoch": 32.31,
      "learning_rate": 9.983240774473315e-05,
      "loss": 1.1657,
      "step": 420
    },
    {
      "epoch": 33.08,
      "learning_rate": 9.982177588877665e-05,
      "loss": 1.4413,
      "step": 430
    },
    {
      "epoch": 33.85,
      "learning_rate": 9.981081768876142e-05,
      "loss": 1.0859,
      "step": 440
    },
    {
      "epoch": 34.62,
      "learning_rate": 9.979953321646615e-05,
      "loss": 1.3878,
      "step": 450
    },
    {
      "epoch": 35.38,
      "learning_rate": 9.978792254580675e-05,
      "loss": 1.3226,
      "step": 460
    },
    {
      "epoch": 36.15,
      "learning_rate": 9.977598575283575e-05,
      "loss": 1.2938,
      "step": 470
    },
    {
      "epoch": 36.92,
      "learning_rate": 9.976372291574188e-05,
      "loss": 1.0942,
      "step": 480
    },
    {
      "epoch": 37.69,
      "learning_rate": 9.975113411484956e-05,
      "loss": 1.3404,
      "step": 490
    },
    {
      "epoch": 38.46,
      "learning_rate": 9.97382194326183e-05,
      "loss": 1.3423,
      "step": 500
    },
    {
      "epoch": 38.46,
      "eval_valid_eval_loss": 2.914052963256836,
      "eval_valid_eval_loss_<cls>": 6.373297214508057,
      "eval_valid_eval_perplexity_batch": 18.43134880065918,
      "eval_valid_eval_perplexity_res": 22.228487014770508,
      "eval_valid_eval_perplexity_seq": 18.43134880065918,
      "eval_valid_eval_reconstruction": 0.09336941689252853,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 2.9140524864196777,
      "eval_valid_runtime": 0.6479,
      "eval_valid_samples_per_second": 1.543,
      "eval_valid_steps_per_second": 1.543,
      "step": 500
    },
    {
      "epoch": 38.46,
      "eval_train_eval_loss": 0.49834978580474854,
      "eval_train_eval_loss_<cls>": 6.144237041473389,
      "eval_train_eval_perplexity_batch": 1.6460027694702148,
      "eval_train_eval_perplexity_res": 4.4217848777771,
      "eval_train_eval_perplexity_seq": 1.8569060564041138,
      "eval_train_eval_reconstruction": 0.024812471121549606,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.301446795463562,
      "eval_train_runtime": 1.0714,
      "eval_train_samples_per_second": 92.402,
      "eval_train_steps_per_second": 12.134,
      "step": 500
    },
    {
      "epoch": 39.23,
      "learning_rate": 9.972497895364224e-05,
      "loss": 1.1911,
      "step": 510
    },
    {
      "epoch": 40.0,
      "learning_rate": 9.971141276464955e-05,
      "loss": 1.3345,
      "step": 520
    },
    {
      "epoch": 40.77,
      "learning_rate": 9.969752095450189e-05,
      "loss": 1.3034,
      "step": 530
    },
    {
      "epoch": 41.54,
      "learning_rate": 9.96833036141938e-05,
      "loss": 1.2088,
      "step": 540
    },
    {
      "epoch": 42.31,
      "learning_rate": 9.96687608368521e-05,
      "loss": 1.3077,
      "step": 550
    },
    {
      "epoch": 43.08,
      "learning_rate": 9.965389271773535e-05,
      "loss": 1.2964,
      "step": 560
    },
    {
      "epoch": 43.85,
      "learning_rate": 9.963869935423312e-05,
      "loss": 1.1023,
      "step": 570
    },
    {
      "epoch": 44.62,
      "learning_rate": 9.962318084586541e-05,
      "loss": 1.249,
      "step": 580
    },
    {
      "epoch": 45.38,
      "learning_rate": 9.960733729428205e-05,
      "loss": 1.2765,
      "step": 590
    },
    {
      "epoch": 46.15,
      "learning_rate": 9.95911688032619e-05,
      "loss": 1.1439,
      "step": 600
    },
    {
      "epoch": 46.15,
      "eval_valid_eval_loss": 2.909369468688965,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 18.34522819519043,
      "eval_valid_eval_perplexity_res": 21.258039474487305,
      "eval_valid_eval_perplexity_seq": 18.34522819519043,
      "eval_valid_eval_reconstruction": 0.0882352963089943,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 2.9093692302703857,
      "eval_valid_runtime": 0.6509,
      "eval_valid_samples_per_second": 1.536,
      "eval_valid_steps_per_second": 1.536,
      "step": 600
    },
    {
      "epoch": 46.15,
      "eval_train_eval_loss": 0.5761028528213501,
      "eval_train_eval_loss_<cls>": 6.053499698638916,
      "eval_train_eval_perplexity_batch": 1.779091477394104,
      "eval_train_eval_perplexity_res": 4.975674152374268,
      "eval_train_eval_perplexity_seq": 2.011854648590088,
      "eval_train_eval_reconstruction": 0.03269527107477188,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.2629854679107666,
      "eval_train_runtime": 1.4425,
      "eval_train_samples_per_second": 68.629,
      "eval_train_steps_per_second": 9.012,
      "step": 600
    },
    {
      "epoch": 46.92,
      "learning_rate": 9.95746754787123e-05,
      "loss": 1.2729,
      "step": 610
    },
    {
      "epoch": 47.69,
      "learning_rate": 9.955785742866832e-05,
      "loss": 1.2332,
      "step": 620
    },
    {
      "epoch": 48.46,
      "learning_rate": 9.954071476329201e-05,
      "loss": 1.4278,
      "step": 630
    },
    {
      "epoch": 49.23,
      "learning_rate": 9.95232475948718e-05,
      "loss": 1.289,
      "step": 640
    },
    {
      "epoch": 50.0,
      "learning_rate": 9.950545603782162e-05,
      "loss": 1.299,
      "step": 650
    },
    {
      "epoch": 50.77,
      "learning_rate": 9.948734020868027e-05,
      "loss": 1.3831,
      "step": 660
    },
    {
      "epoch": 51.54,
      "learning_rate": 9.946890022611058e-05,
      "loss": 1.133,
      "step": 670
    },
    {
      "epoch": 52.31,
      "learning_rate": 9.945013621089864e-05,
      "loss": 1.1853,
      "step": 680
    },
    {
      "epoch": 53.08,
      "learning_rate": 9.94310482859531e-05,
      "loss": 1.1475,
      "step": 690
    },
    {
      "epoch": 53.85,
      "learning_rate": 9.941163657630419e-05,
      "loss": 1.1153,
      "step": 700
    },
    {
      "epoch": 53.85,
      "eval_valid_eval_loss": 2.938490629196167,
      "eval_valid_eval_loss_<cls>": 6.403172016143799,
      "eval_valid_eval_perplexity_batch": 18.887317657470703,
      "eval_valid_eval_perplexity_res": 24.8074893951416,
      "eval_valid_eval_perplexity_seq": 18.887317657470703,
      "eval_valid_eval_reconstruction": 0.08717519044876099,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 2.938490629196167,
      "eval_valid_runtime": 0.3242,
      "eval_valid_samples_per_second": 3.085,
      "eval_valid_steps_per_second": 3.085,
      "step": 700
    },
    {
      "epoch": 53.85,
      "eval_train_eval_loss": 0.6845220923423767,
      "eval_train_eval_loss_<cls>": 5.980774402618408,
      "eval_train_eval_perplexity_batch": 1.9828239679336548,
      "eval_train_eval_perplexity_res": 5.7949347496032715,
      "eval_train_eval_perplexity_seq": 2.512219190597534,
      "eval_train_eval_reconstruction": 0.043110400438308716,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.1350014209747314,
      "eval_train_runtime": 0.9334,
      "eval_train_samples_per_second": 106.063,
      "eval_train_steps_per_second": 13.927,
      "step": 700
    },
    {
      "epoch": 54.62,
      "learning_rate": 9.939190120910311e-05,
      "loss": 1.3598,
      "step": 710
    },
    {
      "epoch": 55.38,
      "learning_rate": 9.937184231362103e-05,
      "loss": 1.2831,
      "step": 720
    },
    {
      "epoch": 56.15,
      "learning_rate": 9.935146002124829e-05,
      "loss": 1.149,
      "step": 730
    },
    {
      "epoch": 56.92,
      "learning_rate": 9.93307544654936e-05,
      "loss": 1.1638,
      "step": 740
    },
    {
      "epoch": 57.69,
      "learning_rate": 9.93097257819831e-05,
      "loss": 1.305,
      "step": 750
    },
    {
      "epoch": 58.46,
      "learning_rate": 9.92883741084595e-05,
      "loss": 1.4991,
      "step": 760
    },
    {
      "epoch": 59.23,
      "learning_rate": 9.926669958478115e-05,
      "loss": 1.1485,
      "step": 770
    },
    {
      "epoch": 60.0,
      "learning_rate": 9.924470235292112e-05,
      "loss": 1.1128,
      "step": 780
    },
    {
      "epoch": 60.77,
      "learning_rate": 9.922238255696636e-05,
      "loss": 1.2147,
      "step": 790
    },
    {
      "epoch": 61.54,
      "learning_rate": 9.919974034311666e-05,
      "loss": 1.1664,
      "step": 800
    },
    {
      "epoch": 61.54,
      "eval_valid_eval_loss": 2.989870548248291,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 19.883108139038086,
      "eval_valid_eval_perplexity_res": 26.04182243347168,
      "eval_valid_eval_perplexity_seq": 19.883108139038086,
      "eval_valid_eval_reconstruction": 0.07169811427593231,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 2.989870309829712,
      "eval_valid_runtime": 0.3543,
      "eval_valid_samples_per_second": 2.822,
      "eval_valid_steps_per_second": 2.822,
      "step": 800
    },
    {
      "epoch": 61.54,
      "eval_train_eval_loss": 0.6157941222190857,
      "eval_train_eval_loss_<cls>": 5.753787517547607,
      "eval_train_eval_perplexity_batch": 1.8511260747909546,
      "eval_train_eval_perplexity_res": 5.2561936378479,
      "eval_train_eval_perplexity_seq": 2.1967759132385254,
      "eval_train_eval_reconstruction": 0.04596715047955513,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.1287609338760376,
      "eval_train_runtime": 0.9701,
      "eval_train_samples_per_second": 102.048,
      "eval_train_steps_per_second": 13.4,
      "step": 800
    },
    {
      "epoch": 62.31,
      "learning_rate": 9.917677585968367e-05,
      "loss": 1.2113,
      "step": 810
    },
    {
      "epoch": 63.08,
      "learning_rate": 9.915348925709006e-05,
      "loss": 1.4363,
      "step": 820
    },
    {
      "epoch": 63.85,
      "learning_rate": 9.912988068786841e-05,
      "loss": 1.0861,
      "step": 830
    },
    {
      "epoch": 64.62,
      "learning_rate": 9.910595030666026e-05,
      "loss": 1.2394,
      "step": 840
    },
    {
      "epoch": 65.38,
      "learning_rate": 9.908169827021511e-05,
      "loss": 1.0191,
      "step": 850
    },
    {
      "epoch": 66.15,
      "learning_rate": 9.905712473738933e-05,
      "loss": 1.1429,
      "step": 860
    },
    {
      "epoch": 66.92,
      "learning_rate": 9.903222986914523e-05,
      "loss": 1.1121,
      "step": 870
    },
    {
      "epoch": 67.69,
      "learning_rate": 9.900701382854986e-05,
      "loss": 1.2826,
      "step": 880
    },
    {
      "epoch": 68.46,
      "learning_rate": 9.898147678077413e-05,
      "loss": 1.221,
      "step": 890
    },
    {
      "epoch": 69.23,
      "learning_rate": 9.895561889309152e-05,
      "loss": 1.1326,
      "step": 900
    },
    {
      "epoch": 69.23,
      "eval_valid_eval_loss": 3.044008731842041,
      "eval_valid_eval_loss_<cls>": 6.404849529266357,
      "eval_valid_eval_perplexity_batch": 20.989215850830078,
      "eval_valid_eval_perplexity_res": 28.426130294799805,
      "eval_valid_eval_perplexity_seq": 20.989215850830078,
      "eval_valid_eval_reconstruction": 0.07290233671665192,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.044009208679199,
      "eval_valid_runtime": 0.3666,
      "eval_valid_samples_per_second": 2.728,
      "eval_valid_steps_per_second": 2.728,
      "step": 900
    },
    {
      "epoch": 69.23,
      "eval_train_eval_loss": 0.4589044749736786,
      "eval_train_eval_loss_<cls>": 5.733584880828857,
      "eval_train_eval_perplexity_batch": 1.5823395252227783,
      "eval_train_eval_perplexity_res": 4.137435436248779,
      "eval_train_eval_perplexity_seq": 1.8242491483688354,
      "eval_train_eval_reconstruction": 0.03824393451213837,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.08784019947052,
      "eval_train_runtime": 0.9351,
      "eval_train_samples_per_second": 105.87,
      "eval_train_steps_per_second": 13.902,
      "step": 900
    },
    {
      "epoch": 70.0,
      "learning_rate": 9.892944033487712e-05,
      "loss": 1.2626,
      "step": 910
    },
    {
      "epoch": 70.77,
      "learning_rate": 9.890294127760653e-05,
      "loss": 1.029,
      "step": 920
    },
    {
      "epoch": 71.54,
      "learning_rate": 9.88761218948546e-05,
      "loss": 1.1902,
      "step": 930
    },
    {
      "epoch": 72.31,
      "learning_rate": 9.884898236229448e-05,
      "loss": 1.2058,
      "step": 940
    },
    {
      "epoch": 73.08,
      "learning_rate": 9.882152285769633e-05,
      "loss": 1.1373,
      "step": 950
    },
    {
      "epoch": 73.85,
      "learning_rate": 9.879374356092617e-05,
      "loss": 1.2015,
      "step": 960
    },
    {
      "epoch": 74.62,
      "learning_rate": 9.87656446539448e-05,
      "loss": 1.1779,
      "step": 970
    },
    {
      "epoch": 75.38,
      "learning_rate": 9.873722632080648e-05,
      "loss": 1.0849,
      "step": 980
    },
    {
      "epoch": 76.15,
      "learning_rate": 9.870848874765783e-05,
      "loss": 1.063,
      "step": 990
    },
    {
      "epoch": 76.92,
      "learning_rate": 9.867943212273653e-05,
      "loss": 1.0519,
      "step": 1000
    },
    {
      "epoch": 76.92,
      "eval_valid_eval_loss": 3.088024377822876,
      "eval_valid_eval_loss_<cls>": 5.97885274887085,
      "eval_valid_eval_perplexity_batch": 21.93370246887207,
      "eval_valid_eval_perplexity_res": 30.738264083862305,
      "eval_valid_eval_perplexity_seq": 21.93370246887207,
      "eval_valid_eval_reconstruction": 0.0765199139714241,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.0880250930786133,
      "eval_valid_runtime": 0.3711,
      "eval_valid_samples_per_second": 2.695,
      "eval_valid_steps_per_second": 2.695,
      "step": 1000
    },
    {
      "epoch": 76.92,
      "eval_train_eval_loss": 0.47674667835235596,
      "eval_train_eval_loss_<cls>": 5.659303188323975,
      "eval_train_eval_perplexity_batch": 1.6108253002166748,
      "eval_train_eval_perplexity_res": 4.279344081878662,
      "eval_train_eval_perplexity_seq": 1.7931718826293945,
      "eval_train_eval_reconstruction": 0.049000486731529236,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.1785944700241089,
      "eval_train_runtime": 0.7873,
      "eval_train_samples_per_second": 125.752,
      "eval_train_steps_per_second": 16.513,
      "step": 1000
    },
    {
      "epoch": 77.69,
      "learning_rate": 9.865005663637015e-05,
      "loss": 1.1297,
      "step": 1010
    },
    {
      "epoch": 78.46,
      "learning_rate": 9.862036248097484e-05,
      "loss": 1.2046,
      "step": 1020
    },
    {
      "epoch": 79.23,
      "learning_rate": 9.859034985105415e-05,
      "loss": 1.112,
      "step": 1030
    },
    {
      "epoch": 80.0,
      "learning_rate": 9.856001894319771e-05,
      "loss": 1.1697,
      "step": 1040
    },
    {
      "epoch": 80.77,
      "learning_rate": 9.852936995607987e-05,
      "loss": 1.0932,
      "step": 1050
    },
    {
      "epoch": 81.54,
      "learning_rate": 9.849840309045857e-05,
      "loss": 1.1064,
      "step": 1060
    },
    {
      "epoch": 82.31,
      "learning_rate": 9.846711854917386e-05,
      "loss": 1.0687,
      "step": 1070
    },
    {
      "epoch": 83.08,
      "learning_rate": 9.843551653714665e-05,
      "loss": 1.0799,
      "step": 1080
    },
    {
      "epoch": 83.85,
      "learning_rate": 9.840359726137737e-05,
      "loss": 1.253,
      "step": 1090
    },
    {
      "epoch": 84.62,
      "learning_rate": 9.837136093094465e-05,
      "loss": 0.9574,
      "step": 1100
    },
    {
      "epoch": 84.62,
      "eval_valid_eval_loss": 3.155353546142578,
      "eval_valid_eval_loss_<cls>": 5.958154201507568,
      "eval_valid_eval_perplexity_batch": 23.46133041381836,
      "eval_valid_eval_perplexity_res": 33.691612243652344,
      "eval_valid_eval_perplexity_seq": 23.46133041381836,
      "eval_valid_eval_reconstruction": 0.06040992587804794,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.1553537845611572,
      "eval_valid_runtime": 0.3692,
      "eval_valid_samples_per_second": 2.709,
      "eval_valid_steps_per_second": 2.709,
      "step": 1100
    },
    {
      "epoch": 84.62,
      "eval_train_eval_loss": 0.4724369943141937,
      "eval_train_eval_loss_<cls>": 5.432304382324219,
      "eval_train_eval_perplexity_batch": 1.6038981676101685,
      "eval_train_eval_perplexity_res": 4.217350006103516,
      "eval_train_eval_perplexity_seq": 1.7808831930160522,
      "eval_train_eval_reconstruction": 0.05827289819717407,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.0234384536743164,
      "eval_train_runtime": 0.8073,
      "eval_train_samples_per_second": 122.638,
      "eval_train_steps_per_second": 16.104,
      "step": 1100
    },
    {
      "epoch": 85.38,
      "learning_rate": 9.833880775700379e-05,
      "loss": 1.1891,
      "step": 1110
    },
    {
      "epoch": 86.15,
      "learning_rate": 9.830593795278555e-05,
      "loss": 1.1489,
      "step": 1120
    },
    {
      "epoch": 86.92,
      "learning_rate": 9.827275173359471e-05,
      "loss": 1.0173,
      "step": 1130
    },
    {
      "epoch": 87.69,
      "learning_rate": 9.823924931680859e-05,
      "loss": 1.0391,
      "step": 1140
    },
    {
      "epoch": 88.46,
      "learning_rate": 9.820543092187572e-05,
      "loss": 0.9759,
      "step": 1150
    },
    {
      "epoch": 89.23,
      "learning_rate": 9.817129677031437e-05,
      "loss": 1.0172,
      "step": 1160
    },
    {
      "epoch": 90.0,
      "learning_rate": 9.813684708571102e-05,
      "loss": 1.0239,
      "step": 1170
    },
    {
      "epoch": 90.77,
      "learning_rate": 9.8102082093719e-05,
      "loss": 1.0386,
      "step": 1180
    },
    {
      "epoch": 91.54,
      "learning_rate": 9.806700202205702e-05,
      "loss": 1.0019,
      "step": 1190
    },
    {
      "epoch": 92.31,
      "learning_rate": 9.803160710050756e-05,
      "loss": 1.203,
      "step": 1200
    },
    {
      "epoch": 92.31,
      "eval_valid_eval_loss": 3.2173478603363037,
      "eval_valid_eval_loss_<cls>": 5.8359503746032715,
      "eval_valid_eval_perplexity_batch": 24.961830139160156,
      "eval_valid_eval_perplexity_res": 36.58826446533203,
      "eval_valid_eval_perplexity_seq": 24.961830139160156,
      "eval_valid_eval_reconstruction": 0.073804572224617,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.2173478603363037,
      "eval_valid_runtime": 0.3638,
      "eval_valid_samples_per_second": 2.748,
      "eval_valid_steps_per_second": 2.748,
      "step": 1200
    },
    {
      "epoch": 92.31,
      "eval_train_eval_loss": 0.36982378363609314,
      "eval_train_eval_loss_<cls>": 5.285909175872803,
      "eval_train_eval_perplexity_batch": 1.447479486465454,
      "eval_train_eval_perplexity_res": 3.49033522605896,
      "eval_train_eval_perplexity_seq": 1.5990747213363647,
      "eval_train_eval_reconstruction": 0.0553770549595356,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.9524680376052856,
      "eval_train_runtime": 0.9395,
      "eval_train_samples_per_second": 105.372,
      "eval_train_steps_per_second": 13.837,
      "step": 1200
    },
    {
      "epoch": 93.08,
      "learning_rate": 9.79958975609155e-05,
      "loss": 1.0997,
      "step": 1210
    },
    {
      "epoch": 93.85,
      "learning_rate": 9.795987363718651e-05,
      "loss": 1.0387,
      "step": 1220
    },
    {
      "epoch": 94.62,
      "learning_rate": 9.792353556528552e-05,
      "loss": 1.0056,
      "step": 1230
    },
    {
      "epoch": 95.38,
      "learning_rate": 9.788688358323528e-05,
      "loss": 1.0495,
      "step": 1240
    },
    {
      "epoch": 96.15,
      "learning_rate": 9.784991793111465e-05,
      "loss": 0.9646,
      "step": 1250
    },
    {
      "epoch": 96.92,
      "learning_rate": 9.781263885105714e-05,
      "loss": 1.0579,
      "step": 1260
    },
    {
      "epoch": 97.69,
      "learning_rate": 9.777504658724928e-05,
      "loss": 1.0357,
      "step": 1270
    },
    {
      "epoch": 98.46,
      "learning_rate": 9.773714138592902e-05,
      "loss": 1.0561,
      "step": 1280
    },
    {
      "epoch": 99.23,
      "learning_rate": 9.76989234953841e-05,
      "loss": 1.0955,
      "step": 1290
    },
    {
      "epoch": 100.0,
      "learning_rate": 9.766039316595048e-05,
      "loss": 0.974,
      "step": 1300
    },
    {
      "epoch": 100.0,
      "eval_valid_eval_loss": 3.248541831970215,
      "eval_valid_eval_loss_<cls>": 5.631791114807129,
      "eval_valid_eval_perplexity_batch": 25.75275993347168,
      "eval_valid_eval_perplexity_res": 40.0836067199707,
      "eval_valid_eval_perplexity_seq": 25.75275993347168,
      "eval_valid_eval_reconstruction": 0.07267441600561142,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.248542308807373,
      "eval_valid_runtime": 0.475,
      "eval_valid_samples_per_second": 2.105,
      "eval_valid_steps_per_second": 2.105,
      "step": 1300
    },
    {
      "epoch": 100.0,
      "eval_train_eval_loss": 0.5169622898101807,
      "eval_train_eval_loss_<cls>": 5.242598056793213,
      "eval_train_eval_perplexity_batch": 1.6769258975982666,
      "eval_train_eval_perplexity_res": 4.501459121704102,
      "eval_train_eval_perplexity_seq": 1.883366584777832,
      "eval_train_eval_reconstruction": 0.08655958622694016,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 1.012523889541626,
      "eval_train_runtime": 0.9206,
      "eval_train_samples_per_second": 107.542,
      "eval_train_steps_per_second": 14.122,
      "step": 1300
    },
    {
      "epoch": 100.77,
      "learning_rate": 9.762155065001068e-05,
      "loss": 0.939,
      "step": 1310
    },
    {
      "epoch": 101.54,
      "learning_rate": 9.758239620199205e-05,
      "loss": 1.1015,
      "step": 1320
    },
    {
      "epoch": 102.31,
      "learning_rate": 9.754293007836522e-05,
      "loss": 0.9926,
      "step": 1330
    },
    {
      "epoch": 103.08,
      "learning_rate": 9.750315253764235e-05,
      "loss": 0.9744,
      "step": 1340
    },
    {
      "epoch": 103.85,
      "learning_rate": 9.746306384037546e-05,
      "loss": 0.8763,
      "step": 1350
    },
    {
      "epoch": 104.62,
      "learning_rate": 9.742266424915472e-05,
      "loss": 0.9825,
      "step": 1360
    },
    {
      "epoch": 105.38,
      "learning_rate": 9.738195402860669e-05,
      "loss": 1.0959,
      "step": 1370
    },
    {
      "epoch": 106.15,
      "learning_rate": 9.734093344539268e-05,
      "loss": 0.9511,
      "step": 1380
    },
    {
      "epoch": 106.92,
      "learning_rate": 9.729960276820691e-05,
      "loss": 0.9462,
      "step": 1390
    },
    {
      "epoch": 107.69,
      "learning_rate": 9.725796226777477e-05,
      "loss": 0.9721,
      "step": 1400
    },
    {
      "epoch": 107.69,
      "eval_valid_eval_loss": 3.3722662925720215,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 29.144502639770508,
      "eval_valid_eval_perplexity_res": 46.087825775146484,
      "eval_valid_eval_perplexity_seq": 29.144502639770508,
      "eval_valid_eval_reconstruction": 0.06792452931404114,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 3.3722660541534424,
      "eval_valid_runtime": 0.4057,
      "eval_valid_samples_per_second": 2.465,
      "eval_valid_steps_per_second": 2.465,
      "step": 1400
    },
    {
      "epoch": 107.69,
      "eval_train_eval_loss": 0.4195198714733124,
      "eval_train_eval_loss_<cls>": 5.246524810791016,
      "eval_train_eval_perplexity_batch": 1.521230936050415,
      "eval_train_eval_perplexity_res": 3.9151694774627686,
      "eval_train_eval_perplexity_seq": 1.6912752389907837,
      "eval_train_eval_reconstruction": 0.07954417169094086,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.9214987754821777,
      "eval_train_runtime": 1.1487,
      "eval_train_samples_per_second": 86.187,
      "eval_train_steps_per_second": 11.318,
      "step": 1400
    },
    {
      "epoch": 108.46,
      "learning_rate": 9.721601221685113e-05,
      "loss": 0.9432,
      "step": 1410
    },
    {
      "epoch": 109.23,
      "learning_rate": 9.717375289021842e-05,
      "loss": 0.8857,
      "step": 1420
    },
    {
      "epoch": 110.0,
      "learning_rate": 9.713118456468492e-05,
      "loss": 0.9011,
      "step": 1430
    },
    {
      "epoch": 110.77,
      "learning_rate": 9.708830751908293e-05,
      "loss": 0.9023,
      "step": 1440
    },
    {
      "epoch": 111.54,
      "learning_rate": 9.704512203426695e-05,
      "loss": 0.9748,
      "step": 1450
    },
    {
      "epoch": 112.31,
      "learning_rate": 9.700162839311177e-05,
      "loss": 0.8793,
      "step": 1460
    },
    {
      "epoch": 113.08,
      "learning_rate": 9.695782688051075e-05,
      "loss": 0.8662,
      "step": 1470
    },
    {
      "epoch": 113.85,
      "learning_rate": 9.691371778337383e-05,
      "loss": 0.9829,
      "step": 1480
    },
    {
      "epoch": 114.62,
      "learning_rate": 9.686930139062572e-05,
      "loss": 0.8412,
      "step": 1490
    },
    {
      "epoch": 115.38,
      "learning_rate": 9.682457799320396e-05,
      "loss": 0.9726,
      "step": 1500
    },
    {
      "epoch": 115.38,
      "eval_valid_eval_loss": 3.383617877960205,
      "eval_valid_eval_loss_<cls>": 5.827253818511963,
      "eval_valid_eval_perplexity_batch": 29.477224349975586,
      "eval_valid_eval_perplexity_res": 51.15354919433594,
      "eval_valid_eval_perplexity_seq": 29.477224349975586,
      "eval_valid_eval_reconstruction": 0.0709812119603157,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.383617877960205,
      "eval_valid_runtime": 0.3417,
      "eval_valid_samples_per_second": 2.927,
      "eval_valid_steps_per_second": 2.927,
      "step": 1500
    },
    {
      "epoch": 115.38,
      "eval_train_eval_loss": 0.3619048595428467,
      "eval_train_eval_loss_<cls>": 5.180494785308838,
      "eval_train_eval_perplexity_batch": 1.4360623359680176,
      "eval_train_eval_perplexity_res": 3.5063095092773438,
      "eval_train_eval_perplexity_seq": 1.5398668050765991,
      "eval_train_eval_reconstruction": 0.07990248501300812,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.9110155701637268,
      "eval_train_runtime": 0.9204,
      "eval_train_samples_per_second": 107.56,
      "eval_train_steps_per_second": 14.124,
      "step": 1500
    },
    {
      "epoch": 116.15,
      "learning_rate": 9.67795478840571e-05,
      "loss": 0.9565,
      "step": 1510
    },
    {
      "epoch": 116.92,
      "learning_rate": 9.673421135814264e-05,
      "loss": 0.8797,
      "step": 1520
    },
    {
      "epoch": 117.69,
      "learning_rate": 9.668856871242524e-05,
      "loss": 0.909,
      "step": 1530
    },
    {
      "epoch": 118.46,
      "learning_rate": 9.664262024587472e-05,
      "loss": 0.9369,
      "step": 1540
    },
    {
      "epoch": 119.23,
      "learning_rate": 9.659636625946402e-05,
      "loss": 0.8509,
      "step": 1550
    },
    {
      "epoch": 120.0,
      "learning_rate": 9.65498070561674e-05,
      "loss": 1.0146,
      "step": 1560
    },
    {
      "epoch": 120.77,
      "learning_rate": 9.650294294095833e-05,
      "loss": 0.7797,
      "step": 1570
    },
    {
      "epoch": 121.54,
      "learning_rate": 9.645577422080748e-05,
      "loss": 0.9215,
      "step": 1580
    },
    {
      "epoch": 122.31,
      "learning_rate": 9.64083012046808e-05,
      "loss": 0.9672,
      "step": 1590
    },
    {
      "epoch": 123.08,
      "learning_rate": 9.636052420353744e-05,
      "loss": 0.8316,
      "step": 1600
    },
    {
      "epoch": 123.08,
      "eval_valid_eval_loss": 3.511475086212158,
      "eval_valid_eval_loss_<cls>": 5.276108741760254,
      "eval_valid_eval_perplexity_batch": 33.497642517089844,
      "eval_valid_eval_perplexity_res": 54.7573356628418,
      "eval_valid_eval_perplexity_seq": 33.497642517089844,
      "eval_valid_eval_reconstruction": 0.05999999865889549,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.5114753246307373,
      "eval_valid_runtime": 0.3383,
      "eval_valid_samples_per_second": 2.956,
      "eval_valid_steps_per_second": 2.956,
      "step": 1600
    },
    {
      "epoch": 123.08,
      "eval_train_eval_loss": 0.3313448429107666,
      "eval_train_eval_loss_<cls>": 5.080358982086182,
      "eval_train_eval_perplexity_batch": 1.392840027809143,
      "eval_train_eval_perplexity_res": 3.3294713497161865,
      "eval_train_eval_perplexity_seq": 1.5073585510253906,
      "eval_train_eval_reconstruction": 0.08478094637393951,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.8325678706169128,
      "eval_train_runtime": 0.9513,
      "eval_train_samples_per_second": 104.072,
      "eval_train_steps_per_second": 13.666,
      "step": 1600
    },
    {
      "epoch": 123.85,
      "learning_rate": 9.63124435303277e-05,
      "loss": 0.8591,
      "step": 1610
    },
    {
      "epoch": 124.62,
      "learning_rate": 9.626405949999103e-05,
      "loss": 0.8198,
      "step": 1620
    },
    {
      "epoch": 125.38,
      "learning_rate": 9.621537242945392e-05,
      "loss": 0.8926,
      "step": 1630
    },
    {
      "epoch": 126.15,
      "learning_rate": 9.616638263762785e-05,
      "loss": 0.7714,
      "step": 1640
    },
    {
      "epoch": 126.92,
      "learning_rate": 9.61170904454072e-05,
      "loss": 0.8115,
      "step": 1650
    },
    {
      "epoch": 127.69,
      "learning_rate": 9.606749617566711e-05,
      "loss": 0.878,
      "step": 1660
    },
    {
      "epoch": 128.46,
      "learning_rate": 9.601760015326145e-05,
      "loss": 0.8254,
      "step": 1670
    },
    {
      "epoch": 129.23,
      "learning_rate": 9.596740270502059e-05,
      "loss": 0.7762,
      "step": 1680
    },
    {
      "epoch": 130.0,
      "learning_rate": 9.591690415974933e-05,
      "loss": 0.8182,
      "step": 1690
    },
    {
      "epoch": 130.77,
      "learning_rate": 9.586610484822474e-05,
      "loss": 0.7965,
      "step": 1700
    },
    {
      "epoch": 130.77,
      "eval_valid_eval_loss": 3.461280584335327,
      "eval_valid_eval_loss_<cls>": 6.184008598327637,
      "eval_valid_eval_perplexity_batch": 31.857746124267578,
      "eval_valid_eval_perplexity_res": 59.89516067504883,
      "eval_valid_eval_perplexity_seq": 31.857746124267578,
      "eval_valid_eval_reconstruction": 0.07284767925739288,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.461280584335327,
      "eval_valid_runtime": 0.628,
      "eval_valid_samples_per_second": 1.592,
      "eval_valid_steps_per_second": 1.592,
      "step": 1700
    },
    {
      "epoch": 130.77,
      "eval_train_eval_loss": 0.2824122905731201,
      "eval_train_eval_loss_<cls>": 5.078222274780273,
      "eval_train_eval_perplexity_batch": 1.3263254165649414,
      "eval_train_eval_perplexity_res": 3.029505729675293,
      "eval_train_eval_perplexity_seq": 1.402225375175476,
      "eval_train_eval_reconstruction": 0.07994107902050018,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.8397864103317261,
      "eval_train_runtime": 1.1435,
      "eval_train_samples_per_second": 86.573,
      "eval_train_steps_per_second": 11.368,
      "step": 1700
    },
    {
      "epoch": 131.54,
      "learning_rate": 9.581500510319395e-05,
      "loss": 0.7798,
      "step": 1710
    },
    {
      "epoch": 132.31,
      "learning_rate": 9.576360525937201e-05,
      "loss": 0.8201,
      "step": 1720
    },
    {
      "epoch": 133.08,
      "learning_rate": 9.57119056534397e-05,
      "loss": 0.8858,
      "step": 1730
    },
    {
      "epoch": 133.85,
      "learning_rate": 9.565990662404128e-05,
      "loss": 0.7262,
      "step": 1740
    },
    {
      "epoch": 134.62,
      "learning_rate": 9.56076085117823e-05,
      "loss": 0.7143,
      "step": 1750
    },
    {
      "epoch": 135.38,
      "learning_rate": 9.555501165922742e-05,
      "loss": 0.7495,
      "step": 1760
    },
    {
      "epoch": 136.15,
      "learning_rate": 9.550211641089807e-05,
      "loss": 0.7427,
      "step": 1770
    },
    {
      "epoch": 136.92,
      "learning_rate": 9.54489231132703e-05,
      "loss": 0.6838,
      "step": 1780
    },
    {
      "epoch": 137.69,
      "learning_rate": 9.539543211477235e-05,
      "loss": 0.8099,
      "step": 1790
    },
    {
      "epoch": 138.46,
      "learning_rate": 9.534164376578257e-05,
      "loss": 0.7056,
      "step": 1800
    },
    {
      "epoch": 138.46,
      "eval_valid_eval_loss": 3.6223769187927246,
      "eval_valid_eval_loss_<cls>": 6.12917423248291,
      "eval_valid_eval_perplexity_batch": 37.426422119140625,
      "eval_valid_eval_perplexity_res": 70.1200180053711,
      "eval_valid_eval_perplexity_seq": 37.426422119140625,
      "eval_valid_eval_reconstruction": 0.06454316526651382,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.622377395629883,
      "eval_valid_runtime": 0.4537,
      "eval_valid_samples_per_second": 2.204,
      "eval_valid_steps_per_second": 2.204,
      "step": 1800
    },
    {
      "epoch": 138.46,
      "eval_train_eval_loss": 0.3004385828971863,
      "eval_train_eval_loss_<cls>": 5.104137420654297,
      "eval_train_eval_perplexity_batch": 1.3504509925842285,
      "eval_train_eval_perplexity_res": 3.279783010482788,
      "eval_train_eval_perplexity_seq": 1.431604266166687,
      "eval_train_eval_reconstruction": 0.09231783449649811,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.7800032496452332,
      "eval_train_runtime": 0.986,
      "eval_train_samples_per_second": 100.404,
      "eval_train_steps_per_second": 13.184,
      "step": 1800
    },
    {
      "epoch": 139.23,
      "learning_rate": 9.528755841862696e-05,
      "loss": 0.8164,
      "step": 1810
    },
    {
      "epoch": 140.0,
      "learning_rate": 9.523317642757695e-05,
      "loss": 0.7867,
      "step": 1820
    },
    {
      "epoch": 140.77,
      "learning_rate": 9.517849814884706e-05,
      "loss": 0.738,
      "step": 1830
    },
    {
      "epoch": 141.54,
      "learning_rate": 9.512352394059256e-05,
      "loss": 0.7725,
      "step": 1840
    },
    {
      "epoch": 142.31,
      "learning_rate": 9.506825416290712e-05,
      "loss": 0.7602,
      "step": 1850
    },
    {
      "epoch": 143.08,
      "learning_rate": 9.501268917782046e-05,
      "loss": 0.6824,
      "step": 1860
    },
    {
      "epoch": 143.85,
      "learning_rate": 9.495682934929598e-05,
      "loss": 0.7169,
      "step": 1870
    },
    {
      "epoch": 144.62,
      "learning_rate": 9.490067504322835e-05,
      "loss": 0.7494,
      "step": 1880
    },
    {
      "epoch": 145.38,
      "learning_rate": 9.484422662744117e-05,
      "loss": 0.6775,
      "step": 1890
    },
    {
      "epoch": 146.15,
      "learning_rate": 9.478748447168449e-05,
      "loss": 0.7195,
      "step": 1900
    },
    {
      "epoch": 146.15,
      "eval_valid_eval_loss": 3.69570255279541,
      "eval_valid_eval_loss_<cls>": 5.89847993850708,
      "eval_valid_eval_perplexity_batch": 40.27385711669922,
      "eval_valid_eval_perplexity_res": 77.54609680175781,
      "eval_valid_eval_perplexity_seq": 40.27385711669922,
      "eval_valid_eval_reconstruction": 0.06148867309093475,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.695702314376831,
      "eval_valid_runtime": 0.4633,
      "eval_valid_samples_per_second": 2.159,
      "eval_valid_steps_per_second": 2.159,
      "step": 1900
    },
    {
      "epoch": 146.15,
      "eval_train_eval_loss": 0.25279051065444946,
      "eval_train_eval_loss_<cls>": 5.017960071563721,
      "eval_train_eval_perplexity_batch": 1.2876135110855103,
      "eval_train_eval_perplexity_res": 2.921917200088501,
      "eval_train_eval_perplexity_seq": 1.3562239408493042,
      "eval_train_eval_reconstruction": 0.08979345113039017,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.6818684935569763,
      "eval_train_runtime": 1.078,
      "eval_train_samples_per_second": 91.833,
      "eval_train_steps_per_second": 12.059,
      "step": 1900
    },
    {
      "epoch": 146.92,
      "learning_rate": 9.473044894763248e-05,
      "loss": 0.7515,
      "step": 1910
    },
    {
      "epoch": 147.69,
      "learning_rate": 9.467312042888088e-05,
      "loss": 0.7467,
      "step": 1920
    },
    {
      "epoch": 148.46,
      "learning_rate": 9.461549929094464e-05,
      "loss": 0.7316,
      "step": 1930
    },
    {
      "epoch": 149.23,
      "learning_rate": 9.455758591125544e-05,
      "loss": 0.732,
      "step": 1940
    },
    {
      "epoch": 150.0,
      "learning_rate": 9.449938066915918e-05,
      "loss": 0.6223,
      "step": 1950
    },
    {
      "epoch": 150.77,
      "learning_rate": 9.444088394591355e-05,
      "loss": 0.6711,
      "step": 1960
    },
    {
      "epoch": 151.54,
      "learning_rate": 9.438209612468551e-05,
      "loss": 0.7031,
      "step": 1970
    },
    {
      "epoch": 152.31,
      "learning_rate": 9.432301759054878e-05,
      "loss": 0.6047,
      "step": 1980
    },
    {
      "epoch": 153.08,
      "learning_rate": 9.426364873048128e-05,
      "loss": 0.6973,
      "step": 1990
    },
    {
      "epoch": 153.85,
      "learning_rate": 9.420398993336269e-05,
      "loss": 0.6654,
      "step": 2000
    },
    {
      "epoch": 153.85,
      "eval_valid_eval_loss": 3.635303258895874,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 37.91334915161133,
      "eval_valid_eval_perplexity_res": 81.33296203613281,
      "eval_valid_eval_perplexity_seq": 37.91334915161133,
      "eval_valid_eval_reconstruction": 0.08695652335882187,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 3.635303258895874,
      "eval_valid_runtime": 0.4901,
      "eval_valid_samples_per_second": 2.04,
      "eval_valid_steps_per_second": 2.04,
      "step": 2000
    },
    {
      "epoch": 153.85,
      "eval_train_eval_loss": 0.28629395365715027,
      "eval_train_eval_loss_<cls>": 4.929717063903809,
      "eval_train_eval_perplexity_batch": 1.3314838409423828,
      "eval_train_eval_perplexity_res": 3.1397719383239746,
      "eval_train_eval_perplexity_seq": 1.3802140951156616,
      "eval_train_eval_reconstruction": 0.12479647248983383,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.6560508608818054,
      "eval_train_runtime": 0.9801,
      "eval_train_samples_per_second": 101.01,
      "eval_train_steps_per_second": 13.264,
      "step": 2000
    },
    {
      "epoch": 154.62,
      "learning_rate": 9.41440415899718e-05,
      "loss": 0.7117,
      "step": 2010
    },
    {
      "epoch": 155.38,
      "learning_rate": 9.408380409298401e-05,
      "loss": 0.6658,
      "step": 2020
    },
    {
      "epoch": 156.15,
      "learning_rate": 9.402327783696874e-05,
      "loss": 0.7254,
      "step": 2030
    },
    {
      "epoch": 156.92,
      "learning_rate": 9.396246321838684e-05,
      "loss": 0.6706,
      "step": 2040
    },
    {
      "epoch": 157.69,
      "learning_rate": 9.390136063558804e-05,
      "loss": 0.6808,
      "step": 2050
    },
    {
      "epoch": 158.46,
      "learning_rate": 9.383997048880823e-05,
      "loss": 0.6128,
      "step": 2060
    },
    {
      "epoch": 159.23,
      "learning_rate": 9.377829318016697e-05,
      "loss": 0.6641,
      "step": 2070
    },
    {
      "epoch": 160.0,
      "learning_rate": 9.37163291136648e-05,
      "loss": 0.6114,
      "step": 2080
    },
    {
      "epoch": 160.77,
      "learning_rate": 9.365407869518055e-05,
      "loss": 0.6592,
      "step": 2090
    },
    {
      "epoch": 161.54,
      "learning_rate": 9.359154233246872e-05,
      "loss": 0.6428,
      "step": 2100
    },
    {
      "epoch": 161.54,
      "eval_valid_eval_loss": 3.8475441932678223,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 46.87779998779297,
      "eval_valid_eval_perplexity_res": 92.81877136230469,
      "eval_valid_eval_perplexity_seq": 46.87779998779297,
      "eval_valid_eval_reconstruction": 0.056603774428367615,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 3.847543954849243,
      "eval_valid_runtime": 0.4283,
      "eval_valid_samples_per_second": 2.335,
      "eval_valid_steps_per_second": 2.335,
      "step": 2100
    },
    {
      "epoch": 161.54,
      "eval_train_eval_loss": 0.31911417841911316,
      "eval_train_eval_loss_<cls>": 5.064146041870117,
      "eval_train_eval_perplexity_batch": 1.375908374786377,
      "eval_train_eval_perplexity_res": 3.5622518062591553,
      "eval_train_eval_perplexity_seq": 1.450244426727295,
      "eval_train_eval_reconstruction": 0.14522694051265717,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.6431555151939392,
      "eval_train_runtime": 0.7796,
      "eval_train_samples_per_second": 126.995,
      "eval_train_steps_per_second": 16.676,
      "step": 2100
    },
    {
      "epoch": 162.31,
      "learning_rate": 9.352872043515683e-05,
      "loss": 0.6345,
      "step": 2110
    },
    {
      "epoch": 163.08,
      "learning_rate": 9.346561341474275e-05,
      "loss": 0.6988,
      "step": 2120
    },
    {
      "epoch": 163.85,
      "learning_rate": 9.34022216845919e-05,
      "loss": 0.5823,
      "step": 2130
    },
    {
      "epoch": 164.62,
      "learning_rate": 9.333854565993468e-05,
      "loss": 0.6039,
      "step": 2140
    },
    {
      "epoch": 165.38,
      "learning_rate": 9.327458575786364e-05,
      "loss": 0.6427,
      "step": 2150
    },
    {
      "epoch": 166.15,
      "learning_rate": 9.321034239733082e-05,
      "loss": 0.604,
      "step": 2160
    },
    {
      "epoch": 166.92,
      "learning_rate": 9.314581599914494e-05,
      "loss": 0.6309,
      "step": 2170
    },
    {
      "epoch": 167.69,
      "learning_rate": 9.308100698596873e-05,
      "loss": 0.6414,
      "step": 2180
    },
    {
      "epoch": 168.46,
      "learning_rate": 9.301591578231609e-05,
      "loss": 0.7487,
      "step": 2190
    },
    {
      "epoch": 169.23,
      "learning_rate": 9.295054281454931e-05,
      "loss": 0.593,
      "step": 2200
    },
    {
      "epoch": 169.23,
      "eval_valid_eval_loss": 3.933159112930298,
      "eval_valid_eval_loss_<cls>": 6.4867634773254395,
      "eval_valid_eval_perplexity_batch": 51.06805419921875,
      "eval_valid_eval_perplexity_res": 108.01132202148438,
      "eval_valid_eval_perplexity_seq": 51.06805419921875,
      "eval_valid_eval_reconstruction": 0.060824740678071976,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 3.933159112930298,
      "eval_valid_runtime": 0.4164,
      "eval_valid_samples_per_second": 2.402,
      "eval_valid_steps_per_second": 2.402,
      "step": 2200
    },
    {
      "epoch": 169.23,
      "eval_train_eval_loss": 0.30206385254859924,
      "eval_train_eval_loss_<cls>": 4.852409839630127,
      "eval_train_eval_perplexity_batch": 1.3526475429534912,
      "eval_train_eval_perplexity_res": 3.483583927154541,
      "eval_train_eval_perplexity_seq": 1.4339417219161987,
      "eval_train_eval_reconstruction": 0.15214188396930695,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.5264075398445129,
      "eval_train_runtime": 0.7631,
      "eval_train_samples_per_second": 129.738,
      "eval_train_steps_per_second": 17.036,
      "step": 2200
    },
    {
      "epoch": 170.0,
      "learning_rate": 9.288488851087635e-05,
      "loss": 0.6264,
      "step": 2210
    },
    {
      "epoch": 170.77,
      "learning_rate": 9.281895330134795e-05,
      "loss": 0.6354,
      "step": 2220
    },
    {
      "epoch": 171.54,
      "learning_rate": 9.275273761785485e-05,
      "loss": 0.6208,
      "step": 2230
    },
    {
      "epoch": 172.31,
      "learning_rate": 9.268624189412498e-05,
      "loss": 0.5496,
      "step": 2240
    },
    {
      "epoch": 173.08,
      "learning_rate": 9.261946656572055e-05,
      "loss": 0.5779,
      "step": 2250
    },
    {
      "epoch": 173.85,
      "learning_rate": 9.255241207003528e-05,
      "loss": 0.5307,
      "step": 2260
    },
    {
      "epoch": 174.62,
      "learning_rate": 9.248507884629152e-05,
      "loss": 0.5218,
      "step": 2270
    },
    {
      "epoch": 175.38,
      "learning_rate": 9.241746733553733e-05,
      "loss": 0.5368,
      "step": 2280
    },
    {
      "epoch": 176.15,
      "learning_rate": 9.234957798064359e-05,
      "loss": 0.5692,
      "step": 2290
    },
    {
      "epoch": 176.92,
      "learning_rate": 9.228141122630115e-05,
      "loss": 0.5358,
      "step": 2300
    },
    {
      "epoch": 176.92,
      "eval_valid_eval_loss": 4.0134968757629395,
      "eval_valid_eval_loss_<cls>": 5.39896821975708,
      "eval_valid_eval_perplexity_batch": 55.340049743652344,
      "eval_valid_eval_perplexity_res": 121.2594223022461,
      "eval_valid_eval_perplexity_seq": 55.340049743652344,
      "eval_valid_eval_reconstruction": 0.07377049326896667,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.0134968757629395,
      "eval_valid_runtime": 0.4347,
      "eval_valid_samples_per_second": 2.301,
      "eval_valid_steps_per_second": 2.301,
      "step": 2300
    },
    {
      "epoch": 176.92,
      "eval_train_eval_loss": 0.21920545399188995,
      "eval_train_eval_loss_<cls>": 4.823019027709961,
      "eval_train_eval_perplexity_batch": 1.2450870275497437,
      "eval_train_eval_perplexity_res": 2.845428466796875,
      "eval_train_eval_perplexity_seq": 1.2929084300994873,
      "eval_train_eval_reconstruction": 0.12404906749725342,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.5434402823448181,
      "eval_train_runtime": 0.8139,
      "eval_train_samples_per_second": 121.639,
      "eval_train_steps_per_second": 15.973,
      "step": 2300
    },
    {
      "epoch": 177.69,
      "learning_rate": 9.221296751901788e-05,
      "loss": 0.5603,
      "step": 2310
    },
    {
      "epoch": 178.46,
      "learning_rate": 9.214424730711578e-05,
      "loss": 0.541,
      "step": 2320
    },
    {
      "epoch": 179.23,
      "learning_rate": 9.207525104072799e-05,
      "loss": 0.5689,
      "step": 2330
    },
    {
      "epoch": 180.0,
      "learning_rate": 9.200597917179585e-05,
      "loss": 0.5415,
      "step": 2340
    },
    {
      "epoch": 180.77,
      "learning_rate": 9.193643215406604e-05,
      "loss": 0.5223,
      "step": 2350
    },
    {
      "epoch": 181.54,
      "learning_rate": 9.186661044308745e-05,
      "loss": 0.6145,
      "step": 2360
    },
    {
      "epoch": 182.31,
      "learning_rate": 9.17965144962083e-05,
      "loss": 0.6179,
      "step": 2370
    },
    {
      "epoch": 183.08,
      "learning_rate": 9.172614477257313e-05,
      "loss": 0.567,
      "step": 2380
    },
    {
      "epoch": 183.85,
      "learning_rate": 9.165550173311976e-05,
      "loss": 0.5598,
      "step": 2390
    },
    {
      "epoch": 184.62,
      "learning_rate": 9.15845858405763e-05,
      "loss": 0.5156,
      "step": 2400
    },
    {
      "epoch": 184.62,
      "eval_valid_eval_loss": 4.013978004455566,
      "eval_valid_eval_loss_<cls>": 7.067290306091309,
      "eval_valid_eval_perplexity_batch": 55.36668014526367,
      "eval_valid_eval_perplexity_res": 132.0686798095703,
      "eval_valid_eval_perplexity_seq": 55.36668014526367,
      "eval_valid_eval_reconstruction": 0.0746268630027771,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.013978481292725,
      "eval_valid_runtime": 0.4021,
      "eval_valid_samples_per_second": 2.487,
      "eval_valid_steps_per_second": 2.487,
      "step": 2400
    },
    {
      "epoch": 184.62,
      "eval_train_eval_loss": 0.29128512740135193,
      "eval_train_eval_loss_<cls>": 4.903323173522949,
      "eval_train_eval_perplexity_batch": 1.3381460905075073,
      "eval_train_eval_perplexity_res": 3.551617383956909,
      "eval_train_eval_perplexity_seq": 1.4018126726150513,
      "eval_train_eval_reconstruction": 0.17757825553417206,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.5266262888908386,
      "eval_train_runtime": 0.6996,
      "eval_train_samples_per_second": 141.507,
      "eval_train_steps_per_second": 18.582,
      "step": 2400
    },
    {
      "epoch": 185.38,
      "learning_rate": 9.151339755945811e-05,
      "loss": 0.5374,
      "step": 2410
    },
    {
      "epoch": 186.15,
      "learning_rate": 9.144193735606476e-05,
      "loss": 0.5427,
      "step": 2420
    },
    {
      "epoch": 186.92,
      "learning_rate": 9.137020569847698e-05,
      "loss": 0.5041,
      "step": 2430
    },
    {
      "epoch": 187.69,
      "learning_rate": 9.129820305655357e-05,
      "loss": 0.5487,
      "step": 2440
    },
    {
      "epoch": 188.46,
      "learning_rate": 9.122592990192835e-05,
      "loss": 0.5604,
      "step": 2450
    },
    {
      "epoch": 189.23,
      "learning_rate": 9.115338670800706e-05,
      "loss": 0.5135,
      "step": 2460
    },
    {
      "epoch": 190.0,
      "learning_rate": 9.108057394996426e-05,
      "loss": 0.4888,
      "step": 2470
    },
    {
      "epoch": 190.77,
      "learning_rate": 9.10074921047402e-05,
      "loss": 0.4654,
      "step": 2480
    },
    {
      "epoch": 191.54,
      "learning_rate": 9.093414165103774e-05,
      "loss": 0.5374,
      "step": 2490
    },
    {
      "epoch": 192.31,
      "learning_rate": 9.086052306931918e-05,
      "loss": 0.5561,
      "step": 2500
    },
    {
      "epoch": 192.31,
      "eval_valid_eval_loss": 4.129936218261719,
      "eval_valid_eval_loss_<cls>": 6.042158603668213,
      "eval_valid_eval_perplexity_batch": 62.17395782470703,
      "eval_valid_eval_perplexity_res": 147.02040100097656,
      "eval_valid_eval_perplexity_seq": 62.17395782470703,
      "eval_valid_eval_reconstruction": 0.0729253962635994,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.129936218261719,
      "eval_valid_runtime": 0.3945,
      "eval_valid_samples_per_second": 2.535,
      "eval_valid_steps_per_second": 2.535,
      "step": 2500
    },
    {
      "epoch": 192.31,
      "eval_train_eval_loss": 0.21113918721675873,
      "eval_train_eval_loss_<cls>": 4.742990016937256,
      "eval_train_eval_perplexity_batch": 1.2350842952728271,
      "eval_train_eval_perplexity_res": 2.918592691421509,
      "eval_train_eval_perplexity_seq": 1.2731993198394775,
      "eval_train_eval_reconstruction": 0.1425890028476715,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.41996094584465027,
      "eval_train_runtime": 0.7717,
      "eval_train_samples_per_second": 128.292,
      "eval_train_steps_per_second": 16.846,
      "step": 2500
    },
    {
      "epoch": 193.08,
      "learning_rate": 9.078663684180311e-05,
      "loss": 0.5257,
      "step": 2510
    },
    {
      "epoch": 193.85,
      "learning_rate": 9.071248345246126e-05,
      "loss": 0.438,
      "step": 2520
    },
    {
      "epoch": 194.62,
      "learning_rate": 9.063806338701534e-05,
      "loss": 0.4773,
      "step": 2530
    },
    {
      "epoch": 195.38,
      "learning_rate": 9.056337713293387e-05,
      "loss": 0.5184,
      "step": 2540
    },
    {
      "epoch": 196.15,
      "learning_rate": 9.04884251794289e-05,
      "loss": 0.4798,
      "step": 2550
    },
    {
      "epoch": 196.92,
      "learning_rate": 9.041320801745296e-05,
      "loss": 0.4331,
      "step": 2560
    },
    {
      "epoch": 197.69,
      "learning_rate": 9.033772613969567e-05,
      "loss": 0.4475,
      "step": 2570
    },
    {
      "epoch": 198.46,
      "learning_rate": 9.026198004058068e-05,
      "loss": 0.5039,
      "step": 2580
    },
    {
      "epoch": 199.23,
      "learning_rate": 9.018597021626227e-05,
      "loss": 0.5042,
      "step": 2590
    },
    {
      "epoch": 200.0,
      "learning_rate": 9.010969716462227e-05,
      "loss": 0.45,
      "step": 2600
    },
    {
      "epoch": 200.0,
      "eval_valid_eval_loss": 4.165721893310547,
      "eval_valid_eval_loss_<cls>": 6.187627792358398,
      "eval_valid_eval_perplexity_batch": 64.4391860961914,
      "eval_valid_eval_perplexity_res": 160.29910278320312,
      "eval_valid_eval_perplexity_seq": 64.4391860961914,
      "eval_valid_eval_reconstruction": 0.075239397585392,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.1657209396362305,
      "eval_valid_runtime": 0.3838,
      "eval_valid_samples_per_second": 2.606,
      "eval_valid_steps_per_second": 2.606,
      "step": 2600
    },
    {
      "epoch": 200.0,
      "eval_train_eval_loss": 0.22201888263225555,
      "eval_train_eval_loss_<cls>": 4.6673054695129395,
      "eval_train_eval_perplexity_batch": 1.2485949993133545,
      "eval_train_eval_perplexity_res": 3.044835329055786,
      "eval_train_eval_perplexity_seq": 1.2923314571380615,
      "eval_train_eval_reconstruction": 0.16159947216510773,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.42704641819000244,
      "eval_train_runtime": 0.7305,
      "eval_train_samples_per_second": 135.533,
      "eval_train_steps_per_second": 17.797,
      "step": 2600
    },
    {
      "epoch": 200.77,
      "learning_rate": 9.003316138526662e-05,
      "loss": 0.4085,
      "step": 2610
    },
    {
      "epoch": 201.54,
      "learning_rate": 8.995636337952226e-05,
      "loss": 0.4502,
      "step": 2620
    },
    {
      "epoch": 202.31,
      "learning_rate": 8.987930365043373e-05,
      "loss": 0.477,
      "step": 2630
    },
    {
      "epoch": 203.08,
      "learning_rate": 8.980198270275991e-05,
      "loss": 0.5058,
      "step": 2640
    },
    {
      "epoch": 203.85,
      "learning_rate": 8.972440104297077e-05,
      "loss": 0.4188,
      "step": 2650
    },
    {
      "epoch": 204.62,
      "learning_rate": 8.964655917924397e-05,
      "loss": 0.4488,
      "step": 2660
    },
    {
      "epoch": 205.38,
      "learning_rate": 8.956845762146154e-05,
      "loss": 0.4336,
      "step": 2670
    },
    {
      "epoch": 206.15,
      "learning_rate": 8.949009688120661e-05,
      "loss": 0.477,
      "step": 2680
    },
    {
      "epoch": 206.92,
      "learning_rate": 8.941147747176005e-05,
      "loss": 0.3886,
      "step": 2690
    },
    {
      "epoch": 207.69,
      "learning_rate": 8.933259990809697e-05,
      "loss": 0.3942,
      "step": 2700
    },
    {
      "epoch": 207.69,
      "eval_valid_eval_loss": 4.303371906280518,
      "eval_valid_eval_loss_<cls>": 7.261048316955566,
      "eval_valid_eval_perplexity_batch": 73.94872283935547,
      "eval_valid_eval_perplexity_res": 186.57406616210938,
      "eval_valid_eval_perplexity_seq": 73.94872283935547,
      "eval_valid_eval_reconstruction": 0.06802721321582794,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.303370952606201,
      "eval_valid_runtime": 0.6791,
      "eval_valid_samples_per_second": 1.473,
      "eval_valid_steps_per_second": 1.473,
      "step": 2700
    },
    {
      "epoch": 207.69,
      "eval_train_eval_loss": 0.324714332818985,
      "eval_train_eval_loss_<cls>": 4.526513576507568,
      "eval_train_eval_perplexity_batch": 1.3836352825164795,
      "eval_train_eval_perplexity_res": 4.070531368255615,
      "eval_train_eval_perplexity_seq": 1.4449659585952759,
      "eval_train_eval_reconstruction": 0.26023367047309875,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.4852467179298401,
      "eval_train_runtime": 1.0262,
      "eval_train_samples_per_second": 96.472,
      "eval_train_steps_per_second": 12.668,
      "step": 2700
    },
    {
      "epoch": 208.46,
      "learning_rate": 8.925346470688351e-05,
      "loss": 0.3885,
      "step": 2710
    },
    {
      "epoch": 209.23,
      "learning_rate": 8.91740723864734e-05,
      "loss": 0.4505,
      "step": 2720
    },
    {
      "epoch": 210.0,
      "learning_rate": 8.909442346690452e-05,
      "loss": 0.3813,
      "step": 2730
    },
    {
      "epoch": 210.77,
      "learning_rate": 8.90145184698956e-05,
      "loss": 0.4464,
      "step": 2740
    },
    {
      "epoch": 211.54,
      "learning_rate": 8.893435791884268e-05,
      "loss": 0.4236,
      "step": 2750
    },
    {
      "epoch": 212.31,
      "learning_rate": 8.885394233881574e-05,
      "loss": 0.4928,
      "step": 2760
    },
    {
      "epoch": 213.08,
      "learning_rate": 8.87732722565553e-05,
      "loss": 0.4283,
      "step": 2770
    },
    {
      "epoch": 213.85,
      "learning_rate": 8.869234820046888e-05,
      "loss": 0.3962,
      "step": 2780
    },
    {
      "epoch": 214.62,
      "learning_rate": 8.86111707006276e-05,
      "loss": 0.4506,
      "step": 2790
    },
    {
      "epoch": 215.38,
      "learning_rate": 8.852974028876272e-05,
      "loss": 0.3674,
      "step": 2800
    },
    {
      "epoch": 215.38,
      "eval_valid_eval_loss": 4.392819881439209,
      "eval_valid_eval_loss_<cls>": 7.305635929107666,
      "eval_valid_eval_perplexity_batch": 80.86813354492188,
      "eval_valid_eval_perplexity_res": 196.41175842285156,
      "eval_valid_eval_perplexity_seq": 80.86813354492188,
      "eval_valid_eval_reconstruction": 0.0595238097012043,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.392819404602051,
      "eval_valid_runtime": 0.4784,
      "eval_valid_samples_per_second": 2.09,
      "eval_valid_steps_per_second": 2.09,
      "step": 2800
    },
    {
      "epoch": 215.38,
      "eval_train_eval_loss": 0.24832451343536377,
      "eval_train_eval_loss_<cls>": 4.58762264251709,
      "eval_train_eval_perplexity_batch": 1.2818758487701416,
      "eval_train_eval_perplexity_res": 3.430211067199707,
      "eval_train_eval_perplexity_seq": 1.3350536823272705,
      "eval_train_eval_reconstruction": 0.21627312898635864,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.37679150700569153,
      "eval_train_runtime": 0.6966,
      "eval_train_samples_per_second": 142.114,
      "eval_train_steps_per_second": 18.661,
      "step": 2800
    },
    {
      "epoch": 216.15,
      "learning_rate": 8.844805749826212e-05,
      "loss": 0.4315,
      "step": 2810
    },
    {
      "epoch": 216.92,
      "learning_rate": 8.836612286416681e-05,
      "loss": 0.3866,
      "step": 2820
    },
    {
      "epoch": 217.69,
      "learning_rate": 8.828393692316741e-05,
      "loss": 0.3867,
      "step": 2830
    },
    {
      "epoch": 218.46,
      "learning_rate": 8.82015002136007e-05,
      "loss": 0.4101,
      "step": 2840
    },
    {
      "epoch": 219.23,
      "learning_rate": 8.811881327544604e-05,
      "loss": 0.4788,
      "step": 2850
    },
    {
      "epoch": 220.0,
      "learning_rate": 8.803587665032184e-05,
      "loss": 0.4102,
      "step": 2860
    },
    {
      "epoch": 220.77,
      "learning_rate": 8.795269088148199e-05,
      "loss": 0.4596,
      "step": 2870
    },
    {
      "epoch": 221.54,
      "learning_rate": 8.78692565138124e-05,
      "loss": 0.4419,
      "step": 2880
    },
    {
      "epoch": 222.31,
      "learning_rate": 8.778557409382726e-05,
      "loss": 0.3862,
      "step": 2890
    },
    {
      "epoch": 223.08,
      "learning_rate": 8.770164416966565e-05,
      "loss": 0.404,
      "step": 2900
    },
    {
      "epoch": 223.08,
      "eval_valid_eval_loss": 4.38576602935791,
      "eval_valid_eval_loss_<cls>": 5.653395175933838,
      "eval_valid_eval_perplexity_batch": 80.29971313476562,
      "eval_valid_eval_perplexity_res": 221.09242248535156,
      "eval_valid_eval_perplexity_seq": 80.29971313476562,
      "eval_valid_eval_reconstruction": 0.07023061066865921,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.38576602935791,
      "eval_valid_runtime": 0.4111,
      "eval_valid_samples_per_second": 2.433,
      "eval_valid_steps_per_second": 2.433,
      "step": 2900
    },
    {
      "epoch": 223.08,
      "eval_train_eval_loss": 0.1596442610025406,
      "eval_train_eval_loss_<cls>": 4.517099857330322,
      "eval_train_eval_perplexity_batch": 1.1730934381484985,
      "eval_train_eval_perplexity_res": 2.558664321899414,
      "eval_train_eval_perplexity_seq": 1.186792016029358,
      "eval_train_eval_reconstruction": 0.1548718810081482,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.4132130444049835,
      "eval_train_runtime": 0.7586,
      "eval_train_samples_per_second": 130.51,
      "eval_train_steps_per_second": 17.138,
      "step": 2900
    },
    {
      "epoch": 223.85,
      "learning_rate": 8.761746729108782e-05,
      "loss": 0.417,
      "step": 2910
    },
    {
      "epoch": 224.62,
      "learning_rate": 8.753304400947162e-05,
      "loss": 0.4036,
      "step": 2920
    },
    {
      "epoch": 225.38,
      "learning_rate": 8.744837487780892e-05,
      "loss": 0.3805,
      "step": 2930
    },
    {
      "epoch": 226.15,
      "learning_rate": 8.736346045070194e-05,
      "loss": 0.3796,
      "step": 2940
    },
    {
      "epoch": 226.92,
      "learning_rate": 8.727830128435966e-05,
      "loss": 0.367,
      "step": 2950
    },
    {
      "epoch": 227.69,
      "learning_rate": 8.719289793659414e-05,
      "loss": 0.3625,
      "step": 2960
    },
    {
      "epoch": 228.46,
      "learning_rate": 8.710725096681691e-05,
      "loss": 0.3399,
      "step": 2970
    },
    {
      "epoch": 229.23,
      "learning_rate": 8.702136093603524e-05,
      "loss": 0.4382,
      "step": 2980
    },
    {
      "epoch": 230.0,
      "learning_rate": 8.693522840684856e-05,
      "loss": 0.3488,
      "step": 2990
    },
    {
      "epoch": 230.77,
      "learning_rate": 8.684885394344469e-05,
      "loss": 0.3687,
      "step": 3000
    },
    {
      "epoch": 230.77,
      "eval_valid_eval_loss": 4.477060317993164,
      "eval_valid_eval_loss_<cls>": 7.085204601287842,
      "eval_valid_eval_perplexity_batch": 87.97566986083984,
      "eval_valid_eval_perplexity_res": 247.12074279785156,
      "eval_valid_eval_perplexity_seq": 87.97566986083984,
      "eval_valid_eval_reconstruction": 0.07459677755832672,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.477060317993164,
      "eval_valid_runtime": 0.4354,
      "eval_valid_samples_per_second": 2.297,
      "eval_valid_steps_per_second": 2.297,
      "step": 3000
    },
    {
      "epoch": 230.77,
      "eval_train_eval_loss": 0.14021773636341095,
      "eval_train_eval_loss_<cls>": 4.311463356018066,
      "eval_train_eval_perplexity_batch": 1.1505242586135864,
      "eval_train_eval_perplexity_res": 2.4697394371032715,
      "eval_train_eval_perplexity_seq": 1.1701689958572388,
      "eval_train_eval_reconstruction": 0.14064614474773407,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.35891932249069214,
      "eval_train_runtime": 0.9272,
      "eval_train_samples_per_second": 106.768,
      "eval_train_steps_per_second": 14.02,
      "step": 3000
    },
    {
      "epoch": 231.54,
      "learning_rate": 8.676223811159615e-05,
      "loss": 0.4138,
      "step": 3010
    },
    {
      "epoch": 232.31,
      "learning_rate": 8.667538147865653e-05,
      "loss": 0.3828,
      "step": 3020
    },
    {
      "epoch": 233.08,
      "learning_rate": 8.658828461355667e-05,
      "loss": 0.3554,
      "step": 3030
    },
    {
      "epoch": 233.85,
      "learning_rate": 8.650094808680103e-05,
      "loss": 0.3463,
      "step": 3040
    },
    {
      "epoch": 234.62,
      "learning_rate": 8.64133724704639e-05,
      "loss": 0.3786,
      "step": 3050
    },
    {
      "epoch": 235.38,
      "learning_rate": 8.632555833818563e-05,
      "loss": 0.3907,
      "step": 3060
    },
    {
      "epoch": 236.15,
      "learning_rate": 8.623750626516893e-05,
      "loss": 0.365,
      "step": 3070
    },
    {
      "epoch": 236.92,
      "learning_rate": 8.614921682817509e-05,
      "loss": 0.3505,
      "step": 3080
    },
    {
      "epoch": 237.69,
      "learning_rate": 8.606069060552017e-05,
      "loss": 0.3191,
      "step": 3090
    },
    {
      "epoch": 238.46,
      "learning_rate": 8.597192817707122e-05,
      "loss": 0.3458,
      "step": 3100
    },
    {
      "epoch": 238.46,
      "eval_valid_eval_loss": 4.4834980964660645,
      "eval_valid_eval_loss_<cls>": 6.110050201416016,
      "eval_valid_eval_perplexity_batch": 88.54386901855469,
      "eval_valid_eval_perplexity_res": 260.18450927734375,
      "eval_valid_eval_perplexity_seq": 88.54386901855469,
      "eval_valid_eval_reconstruction": 0.0708092451095581,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.483498573303223,
      "eval_valid_runtime": 0.4268,
      "eval_valid_samples_per_second": 2.343,
      "eval_valid_steps_per_second": 2.343,
      "step": 3100
    },
    {
      "epoch": 238.46,
      "eval_train_eval_loss": 0.15696126222610474,
      "eval_train_eval_loss_<cls>": 4.299846649169922,
      "eval_train_eval_perplexity_batch": 1.169950246810913,
      "eval_train_eval_perplexity_res": 2.670320749282837,
      "eval_train_eval_perplexity_seq": 1.183605670928955,
      "eval_train_eval_reconstruction": 0.17933592200279236,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.3705751895904541,
      "eval_train_runtime": 0.9682,
      "eval_train_samples_per_second": 102.248,
      "eval_train_steps_per_second": 13.427,
      "step": 3100
    },
    {
      "epoch": 239.23,
      "learning_rate": 8.58829301242425e-05,
      "loss": 0.3632,
      "step": 3110
    },
    {
      "epoch": 240.0,
      "learning_rate": 8.579369702999166e-05,
      "loss": 0.3243,
      "step": 3120
    },
    {
      "epoch": 240.77,
      "learning_rate": 8.570422947881597e-05,
      "loss": 0.3445,
      "step": 3130
    },
    {
      "epoch": 241.54,
      "learning_rate": 8.561452805674842e-05,
      "loss": 0.3595,
      "step": 3140
    },
    {
      "epoch": 242.31,
      "learning_rate": 8.552459335135381e-05,
      "loss": 0.3344,
      "step": 3150
    },
    {
      "epoch": 243.08,
      "learning_rate": 8.543442595172517e-05,
      "loss": 0.3523,
      "step": 3160
    },
    {
      "epoch": 243.85,
      "learning_rate": 8.534402644847962e-05,
      "loss": 0.3287,
      "step": 3170
    },
    {
      "epoch": 244.62,
      "learning_rate": 8.525339543375463e-05,
      "loss": 0.3051,
      "step": 3180
    },
    {
      "epoch": 245.38,
      "learning_rate": 8.516253350120416e-05,
      "loss": 0.3529,
      "step": 3190
    },
    {
      "epoch": 246.15,
      "learning_rate": 8.507144124599467e-05,
      "loss": 0.281,
      "step": 3200
    },
    {
      "epoch": 246.15,
      "eval_valid_eval_loss": 4.569522857666016,
      "eval_valid_eval_loss_<cls>": 6.606115341186523,
      "eval_valid_eval_perplexity_batch": 96.49805450439453,
      "eval_valid_eval_perplexity_res": 264.0279846191406,
      "eval_valid_eval_perplexity_seq": 96.49805450439453,
      "eval_valid_eval_reconstruction": 0.05496828630566597,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.569522857666016,
      "eval_valid_runtime": 0.3471,
      "eval_valid_samples_per_second": 2.881,
      "eval_valid_steps_per_second": 2.881,
      "step": 3200
    },
    {
      "epoch": 246.15,
      "eval_train_eval_loss": 0.12462347745895386,
      "eval_train_eval_loss_<cls>": 4.374582767486572,
      "eval_train_eval_perplexity_batch": 1.1327219009399414,
      "eval_train_eval_perplexity_res": 2.4504292011260986,
      "eval_train_eval_perplexity_seq": 1.152151346206665,
      "eval_train_eval_reconstruction": 0.14067314565181732,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.31919798254966736,
      "eval_train_runtime": 0.9554,
      "eval_train_samples_per_second": 103.625,
      "eval_train_steps_per_second": 13.607,
      "step": 3200
    },
    {
      "epoch": 246.92,
      "learning_rate": 8.498011926480137e-05,
      "loss": 0.3078,
      "step": 3210
    },
    {
      "epoch": 247.69,
      "learning_rate": 8.488856815580413e-05,
      "loss": 0.3286,
      "step": 3220
    },
    {
      "epoch": 248.46,
      "learning_rate": 8.479678851868376e-05,
      "loss": 0.3167,
      "step": 3230
    },
    {
      "epoch": 249.23,
      "learning_rate": 8.470478095461789e-05,
      "loss": 0.3176,
      "step": 3240
    },
    {
      "epoch": 250.0,
      "learning_rate": 8.46125460662772e-05,
      "loss": 0.32,
      "step": 3250
    },
    {
      "epoch": 250.77,
      "learning_rate": 8.452008445782134e-05,
      "loss": 0.2646,
      "step": 3260
    },
    {
      "epoch": 251.54,
      "learning_rate": 8.442739673489505e-05,
      "loss": 0.3131,
      "step": 3270
    },
    {
      "epoch": 252.31,
      "learning_rate": 8.43344835046242e-05,
      "loss": 0.3091,
      "step": 3280
    },
    {
      "epoch": 253.08,
      "learning_rate": 8.424134537561176e-05,
      "loss": 0.3086,
      "step": 3290
    },
    {
      "epoch": 253.85,
      "learning_rate": 8.41479829579338e-05,
      "loss": 0.2848,
      "step": 3300
    },
    {
      "epoch": 253.85,
      "eval_valid_eval_loss": 4.634016036987305,
      "eval_valid_eval_loss_<cls>": 5.964147567749023,
      "eval_valid_eval_perplexity_batch": 102.92658996582031,
      "eval_valid_eval_perplexity_res": 318.66949462890625,
      "eval_valid_eval_perplexity_seq": 102.92658996582031,
      "eval_valid_eval_reconstruction": 0.07371348887681961,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.634016513824463,
      "eval_valid_runtime": 0.3768,
      "eval_valid_samples_per_second": 2.654,
      "eval_valid_steps_per_second": 2.654,
      "step": 3300
    },
    {
      "epoch": 253.85,
      "eval_train_eval_loss": 0.13956759870052338,
      "eval_train_eval_loss_<cls>": 4.179720878601074,
      "eval_train_eval_perplexity_batch": 1.149776577949524,
      "eval_train_eval_perplexity_res": 2.647271156311035,
      "eval_train_eval_perplexity_seq": 1.167457938194275,
      "eval_train_eval_reconstruction": 0.1772788017988205,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.26891347765922546,
      "eval_train_runtime": 0.8096,
      "eval_train_samples_per_second": 122.284,
      "eval_train_steps_per_second": 16.058,
      "step": 3300
    },
    {
      "epoch": 254.62,
      "learning_rate": 8.405439686313558e-05,
      "loss": 0.3048,
      "step": 3310
    },
    {
      "epoch": 255.38,
      "learning_rate": 8.396058770422752e-05,
      "loss": 0.3029,
      "step": 3320
    },
    {
      "epoch": 256.15,
      "learning_rate": 8.386655609568106e-05,
      "loss": 0.3407,
      "step": 3330
    },
    {
      "epoch": 256.92,
      "learning_rate": 8.377230265342486e-05,
      "loss": 0.2851,
      "step": 3340
    },
    {
      "epoch": 257.69,
      "learning_rate": 8.367782799484057e-05,
      "loss": 0.2906,
      "step": 3350
    },
    {
      "epoch": 258.46,
      "learning_rate": 8.358313273875886e-05,
      "loss": 0.3529,
      "step": 3360
    },
    {
      "epoch": 259.23,
      "learning_rate": 8.348821750545539e-05,
      "loss": 0.3039,
      "step": 3370
    },
    {
      "epoch": 260.0,
      "learning_rate": 8.339308291664669e-05,
      "loss": 0.2892,
      "step": 3380
    },
    {
      "epoch": 260.77,
      "learning_rate": 8.329772959548614e-05,
      "loss": 0.3026,
      "step": 3390
    },
    {
      "epoch": 261.54,
      "learning_rate": 8.320215816655987e-05,
      "loss": 0.3027,
      "step": 3400
    },
    {
      "epoch": 261.54,
      "eval_valid_eval_loss": 4.680872917175293,
      "eval_valid_eval_loss_<cls>": 5.807539939880371,
      "eval_valid_eval_perplexity_batch": 107.86418914794922,
      "eval_valid_eval_perplexity_res": 339.8542175292969,
      "eval_valid_eval_perplexity_seq": 107.86418914794922,
      "eval_valid_eval_reconstruction": 0.07803468406200409,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.680871963500977,
      "eval_valid_runtime": 0.4024,
      "eval_valid_samples_per_second": 2.485,
      "eval_valid_steps_per_second": 2.485,
      "step": 3400
    },
    {
      "epoch": 261.54,
      "eval_train_eval_loss": 0.13213811814785004,
      "eval_train_eval_loss_<cls>": 4.108087539672852,
      "eval_train_eval_perplexity_batch": 1.1412659883499146,
      "eval_train_eval_perplexity_res": 2.6780126094818115,
      "eval_train_eval_perplexity_seq": 1.1615846157073975,
      "eval_train_eval_reconstruction": 0.17040303349494934,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.25738897919654846,
      "eval_train_runtime": 0.923,
      "eval_train_samples_per_second": 107.256,
      "eval_train_steps_per_second": 14.084,
      "step": 3400
    },
    {
      "epoch": 262.31,
      "learning_rate": 8.310636925588267e-05,
      "loss": 0.2811,
      "step": 3410
    },
    {
      "epoch": 263.08,
      "learning_rate": 8.301036349089384e-05,
      "loss": 0.315,
      "step": 3420
    },
    {
      "epoch": 263.85,
      "learning_rate": 8.29141415004532e-05,
      "loss": 0.2781,
      "step": 3430
    },
    {
      "epoch": 264.62,
      "learning_rate": 8.281770391483684e-05,
      "loss": 0.2751,
      "step": 3440
    },
    {
      "epoch": 265.38,
      "learning_rate": 8.272105136573303e-05,
      "loss": 0.2884,
      "step": 3450
    },
    {
      "epoch": 266.15,
      "learning_rate": 8.262418448623819e-05,
      "loss": 0.3286,
      "step": 3460
    },
    {
      "epoch": 266.92,
      "learning_rate": 8.252710391085257e-05,
      "loss": 0.2409,
      "step": 3470
    },
    {
      "epoch": 267.69,
      "learning_rate": 8.24298102754762e-05,
      "loss": 0.2683,
      "step": 3480
    },
    {
      "epoch": 268.46,
      "learning_rate": 8.233230421740471e-05,
      "loss": 0.3013,
      "step": 3490
    },
    {
      "epoch": 269.23,
      "learning_rate": 8.223458637532515e-05,
      "loss": 0.2799,
      "step": 3500
    },
    {
      "epoch": 269.23,
      "eval_valid_eval_loss": 4.771777629852295,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 118.12904357910156,
      "eval_valid_eval_perplexity_res": 360.90386962890625,
      "eval_valid_eval_perplexity_seq": 118.12904357910156,
      "eval_valid_eval_reconstruction": 0.06037735939025879,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 4.7717790603637695,
      "eval_valid_runtime": 0.3783,
      "eval_valid_samples_per_second": 2.644,
      "eval_valid_steps_per_second": 2.644,
      "step": 3500
    },
    {
      "epoch": 269.23,
      "eval_train_eval_loss": 0.09445569664239883,
      "eval_train_eval_loss_<cls>": 4.184533596038818,
      "eval_train_eval_perplexity_batch": 1.0990604162216187,
      "eval_train_eval_perplexity_res": 2.2407827377319336,
      "eval_train_eval_perplexity_seq": 1.1091591119766235,
      "eval_train_eval_reconstruction": 0.13135109841823578,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.26717662811279297,
      "eval_train_runtime": 0.8197,
      "eval_train_samples_per_second": 120.769,
      "eval_train_steps_per_second": 15.859,
      "step": 3500
    },
    {
      "epoch": 270.0,
      "learning_rate": 8.213665738931178e-05,
      "loss": 0.3067,
      "step": 3510
    },
    {
      "epoch": 270.77,
      "learning_rate": 8.203851790082195e-05,
      "loss": 0.2775,
      "step": 3520
    },
    {
      "epoch": 271.54,
      "learning_rate": 8.19401685526918e-05,
      "loss": 0.2728,
      "step": 3530
    },
    {
      "epoch": 272.31,
      "learning_rate": 8.184160998913211e-05,
      "loss": 0.2641,
      "step": 3540
    },
    {
      "epoch": 273.08,
      "learning_rate": 8.174284285572408e-05,
      "loss": 0.2643,
      "step": 3550
    },
    {
      "epoch": 273.85,
      "learning_rate": 8.164386779941508e-05,
      "loss": 0.254,
      "step": 3560
    },
    {
      "epoch": 274.62,
      "learning_rate": 8.154468546851446e-05,
      "loss": 0.2533,
      "step": 3570
    },
    {
      "epoch": 275.38,
      "learning_rate": 8.14452965126892e-05,
      "loss": 0.2532,
      "step": 3580
    },
    {
      "epoch": 276.15,
      "learning_rate": 8.134570158295975e-05,
      "loss": 0.2762,
      "step": 3590
    },
    {
      "epoch": 276.92,
      "learning_rate": 8.12459013316958e-05,
      "loss": 0.2404,
      "step": 3600
    },
    {
      "epoch": 276.92,
      "eval_valid_eval_loss": 4.832058429718018,
      "eval_valid_eval_loss_<cls>": 5.930058479309082,
      "eval_valid_eval_perplexity_batch": 125.46896362304688,
      "eval_valid_eval_perplexity_res": 402.608642578125,
      "eval_valid_eval_perplexity_seq": 125.46896362304688,
      "eval_valid_eval_reconstruction": 0.07460184395313263,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.832057952880859,
      "eval_valid_runtime": 0.3568,
      "eval_valid_samples_per_second": 2.803,
      "eval_valid_steps_per_second": 2.803,
      "step": 3600
    },
    {
      "epoch": 276.92,
      "eval_train_eval_loss": 0.10204389691352844,
      "eval_train_eval_loss_<cls>": 3.992995262145996,
      "eval_train_eval_perplexity_batch": 1.1074321269989014,
      "eval_train_eval_perplexity_res": 2.297297477722168,
      "eval_train_eval_perplexity_seq": 1.1132243871688843,
      "eval_train_eval_reconstruction": 0.1684117615222931,
      "eval_train_eval_reconstruction_<cls>": 0.004950494971126318,
      "eval_train_loss": 0.24759753048419952,
      "eval_train_runtime": 0.7651,
      "eval_train_samples_per_second": 129.395,
      "eval_train_steps_per_second": 16.991,
      "step": 3600
    },
    {
      "epoch": 277.69,
      "learning_rate": 8.11458964126118e-05,
      "loss": 0.2586,
      "step": 3610
    },
    {
      "epoch": 278.46,
      "learning_rate": 8.104568748076297e-05,
      "loss": 0.2518,
      "step": 3620
    },
    {
      "epoch": 279.23,
      "learning_rate": 8.09452751925408e-05,
      "loss": 0.2639,
      "step": 3630
    },
    {
      "epoch": 280.0,
      "learning_rate": 8.084466020566877e-05,
      "loss": 0.3039,
      "step": 3640
    },
    {
      "epoch": 280.77,
      "learning_rate": 8.074384317919812e-05,
      "loss": 0.2395,
      "step": 3650
    },
    {
      "epoch": 281.54,
      "learning_rate": 8.064282477350353e-05,
      "loss": 0.253,
      "step": 3660
    },
    {
      "epoch": 282.31,
      "learning_rate": 8.05416056502787e-05,
      "loss": 0.2453,
      "step": 3670
    },
    {
      "epoch": 283.08,
      "learning_rate": 8.044018647253211e-05,
      "loss": 0.2507,
      "step": 3680
    },
    {
      "epoch": 283.85,
      "learning_rate": 8.033856790458263e-05,
      "loss": 0.2242,
      "step": 3690
    },
    {
      "epoch": 284.62,
      "learning_rate": 8.023675061205519e-05,
      "loss": 0.2449,
      "step": 3700
    },
    {
      "epoch": 284.62,
      "eval_valid_eval_loss": 4.915607929229736,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 136.40220642089844,
      "eval_valid_eval_perplexity_res": 422.3777160644531,
      "eval_valid_eval_perplexity_seq": 136.40220642089844,
      "eval_valid_eval_reconstruction": 0.054621849209070206,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 4.915609359741211,
      "eval_valid_runtime": 0.3801,
      "eval_valid_samples_per_second": 2.631,
      "eval_valid_steps_per_second": 2.631,
      "step": 3700
    },
    {
      "epoch": 284.62,
      "eval_train_eval_loss": 0.11272455751895905,
      "eval_train_eval_loss_<cls>": 3.888201951980591,
      "eval_train_eval_perplexity_batch": 1.1193236112594604,
      "eval_train_eval_perplexity_res": 2.5347893238067627,
      "eval_train_eval_perplexity_seq": 1.128070592880249,
      "eval_train_eval_reconstruction": 0.1866622418165207,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.24044878780841827,
      "eval_train_runtime": 0.7358,
      "eval_train_samples_per_second": 134.546,
      "eval_train_steps_per_second": 17.668,
      "step": 3700
    },
    {
      "epoch": 285.38,
      "learning_rate": 8.013473526187641e-05,
      "loss": 0.2288,
      "step": 3710
    },
    {
      "epoch": 286.15,
      "learning_rate": 8.003252252227023e-05,
      "loss": 0.208,
      "step": 3720
    },
    {
      "epoch": 286.92,
      "learning_rate": 7.993011306275354e-05,
      "loss": 0.2344,
      "step": 3730
    },
    {
      "epoch": 287.69,
      "learning_rate": 7.982750755413177e-05,
      "loss": 0.2255,
      "step": 3740
    },
    {
      "epoch": 288.46,
      "learning_rate": 7.972470666849457e-05,
      "loss": 0.2308,
      "step": 3750
    },
    {
      "epoch": 289.23,
      "learning_rate": 7.962171107921128e-05,
      "loss": 0.2552,
      "step": 3760
    },
    {
      "epoch": 290.0,
      "learning_rate": 7.951852146092666e-05,
      "loss": 0.2416,
      "step": 3770
    },
    {
      "epoch": 290.77,
      "learning_rate": 7.941513848955635e-05,
      "loss": 0.2079,
      "step": 3780
    },
    {
      "epoch": 291.54,
      "learning_rate": 7.931156284228255e-05,
      "loss": 0.2186,
      "step": 3790
    },
    {
      "epoch": 292.31,
      "learning_rate": 7.920779519754948e-05,
      "loss": 0.227,
      "step": 3800
    },
    {
      "epoch": 292.31,
      "eval_valid_eval_loss": 4.899383068084717,
      "eval_valid_eval_loss_<cls>": 6.293835639953613,
      "eval_valid_eval_perplexity_batch": 134.2069549560547,
      "eval_valid_eval_perplexity_res": 467.0709533691406,
      "eval_valid_eval_perplexity_seq": 134.2069549560547,
      "eval_valid_eval_reconstruction": 0.07703488320112228,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 4.899383544921875,
      "eval_valid_runtime": 0.4059,
      "eval_valid_samples_per_second": 2.464,
      "eval_valid_steps_per_second": 2.464,
      "step": 3800
    },
    {
      "epoch": 292.31,
      "eval_train_eval_loss": 0.09294155240058899,
      "eval_train_eval_loss_<cls>": 3.9531517028808594,
      "eval_train_eval_perplexity_batch": 1.0973975658416748,
      "eval_train_eval_perplexity_res": 2.354480266571045,
      "eval_train_eval_perplexity_seq": 1.1051592826843262,
      "eval_train_eval_reconstruction": 0.16547125577926636,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.20310650765895844,
      "eval_train_runtime": 0.8066,
      "eval_train_samples_per_second": 122.733,
      "eval_train_steps_per_second": 16.116,
      "step": 3800
    },
    {
      "epoch": 293.08,
      "learning_rate": 7.910383623505904e-05,
      "loss": 0.244,
      "step": 3810
    },
    {
      "epoch": 293.85,
      "learning_rate": 7.899968663576626e-05,
      "loss": 0.2045,
      "step": 3820
    },
    {
      "epoch": 294.62,
      "learning_rate": 7.889534708187491e-05,
      "loss": 0.227,
      "step": 3830
    },
    {
      "epoch": 295.38,
      "learning_rate": 7.879081825683299e-05,
      "loss": 0.2322,
      "step": 3840
    },
    {
      "epoch": 296.15,
      "learning_rate": 7.868610084532828e-05,
      "loss": 0.229,
      "step": 3850
    },
    {
      "epoch": 296.92,
      "learning_rate": 7.858119553328383e-05,
      "loss": 0.2248,
      "step": 3860
    },
    {
      "epoch": 297.69,
      "learning_rate": 7.847610300785352e-05,
      "loss": 0.2223,
      "step": 3870
    },
    {
      "epoch": 298.46,
      "learning_rate": 7.837082395741748e-05,
      "loss": 0.2088,
      "step": 3880
    },
    {
      "epoch": 299.23,
      "learning_rate": 7.826535907157764e-05,
      "loss": 0.2357,
      "step": 3890
    },
    {
      "epoch": 300.0,
      "learning_rate": 7.815970904115319e-05,
      "loss": 0.1827,
      "step": 3900
    },
    {
      "epoch": 300.0,
      "eval_valid_eval_loss": 5.024557590484619,
      "eval_valid_eval_loss_<cls>": 6.711338043212891,
      "eval_valid_eval_perplexity_batch": 152.1029510498047,
      "eval_valid_eval_perplexity_res": 500.06195068359375,
      "eval_valid_eval_perplexity_seq": 152.1029510498047,
      "eval_valid_eval_reconstruction": 0.055913977324962616,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.024556636810303,
      "eval_valid_runtime": 0.3604,
      "eval_valid_samples_per_second": 2.775,
      "eval_valid_steps_per_second": 2.775,
      "step": 3900
    },
    {
      "epoch": 300.0,
      "eval_train_eval_loss": 0.1305294632911682,
      "eval_train_eval_loss_<cls>": 3.746674060821533,
      "eval_train_eval_perplexity_batch": 1.1394314765930176,
      "eval_train_eval_perplexity_res": 2.8394887447357178,
      "eval_train_eval_perplexity_seq": 1.147197961807251,
      "eval_train_eval_reconstruction": 0.2505452632904053,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.22598600387573242,
      "eval_train_runtime": 0.6539,
      "eval_train_samples_per_second": 151.394,
      "eval_train_steps_per_second": 19.88,
      "step": 3900
    },
    {
      "epoch": 300.77,
      "learning_rate": 7.805387455817606e-05,
      "loss": 0.2212,
      "step": 3910
    },
    {
      "epoch": 301.54,
      "learning_rate": 7.79478563158864e-05,
      "loss": 0.2078,
      "step": 3920
    },
    {
      "epoch": 302.31,
      "learning_rate": 7.784165500872796e-05,
      "loss": 0.2183,
      "step": 3930
    },
    {
      "epoch": 303.08,
      "learning_rate": 7.773527133234373e-05,
      "loss": 0.1925,
      "step": 3940
    },
    {
      "epoch": 303.85,
      "learning_rate": 7.762870598357115e-05,
      "loss": 0.2231,
      "step": 3950
    },
    {
      "epoch": 304.62,
      "learning_rate": 7.752195966043771e-05,
      "loss": 0.2202,
      "step": 3960
    },
    {
      "epoch": 305.38,
      "learning_rate": 7.741503306215629e-05,
      "loss": 0.1846,
      "step": 3970
    },
    {
      "epoch": 306.15,
      "learning_rate": 7.730792688912064e-05,
      "loss": 0.2442,
      "step": 3980
    },
    {
      "epoch": 306.92,
      "learning_rate": 7.720064184290076e-05,
      "loss": 0.2067,
      "step": 3990
    },
    {
      "epoch": 307.69,
      "learning_rate": 7.709317862623833e-05,
      "loss": 0.2461,
      "step": 4000
    },
    {
      "epoch": 307.69,
      "eval_valid_eval_loss": 5.031611442565918,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 153.17965698242188,
      "eval_valid_eval_perplexity_res": 483.2081298828125,
      "eval_valid_eval_perplexity_seq": 153.17965698242188,
      "eval_valid_eval_reconstruction": 0.05128205195069313,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.031611919403076,
      "eval_valid_runtime": 0.4092,
      "eval_valid_samples_per_second": 2.444,
      "eval_valid_steps_per_second": 2.444,
      "step": 4000
    },
    {
      "epoch": 307.69,
      "eval_train_eval_loss": 0.12302865087985992,
      "eval_train_eval_loss_<cls>": 3.5394062995910645,
      "eval_train_eval_perplexity_batch": 1.1309168338775635,
      "eval_train_eval_perplexity_res": 2.870748281478882,
      "eval_train_eval_perplexity_seq": 1.1417032480239868,
      "eval_train_eval_reconstruction": 0.24809807538986206,
      "eval_train_eval_reconstruction_<cls>": 0.017167381942272186,
      "eval_train_loss": 0.19543728232383728,
      "eval_train_runtime": 0.744,
      "eval_train_samples_per_second": 133.071,
      "eval_train_steps_per_second": 17.474,
      "step": 4000
    },
    {
      "epoch": 308.46,
      "learning_rate": 7.698553794304204e-05,
      "loss": 0.2094,
      "step": 4010
    },
    {
      "epoch": 309.23,
      "learning_rate": 7.687772049838307e-05,
      "loss": 0.192,
      "step": 4020
    },
    {
      "epoch": 310.0,
      "learning_rate": 7.676972699849039e-05,
      "loss": 0.2145,
      "step": 4030
    },
    {
      "epoch": 310.77,
      "learning_rate": 7.666155815074618e-05,
      "loss": 0.1951,
      "step": 4040
    },
    {
      "epoch": 311.54,
      "learning_rate": 7.655321466368126e-05,
      "loss": 0.1944,
      "step": 4050
    },
    {
      "epoch": 312.31,
      "learning_rate": 7.644469724697026e-05,
      "loss": 0.1971,
      "step": 4060
    },
    {
      "epoch": 313.08,
      "learning_rate": 7.633600661142718e-05,
      "loss": 0.213,
      "step": 4070
    },
    {
      "epoch": 313.85,
      "learning_rate": 7.622714346900062e-05,
      "loss": 0.1818,
      "step": 4080
    },
    {
      "epoch": 314.62,
      "learning_rate": 7.611810853276908e-05,
      "loss": 0.1883,
      "step": 4090
    },
    {
      "epoch": 315.38,
      "learning_rate": 7.600890251693645e-05,
      "loss": 0.1933,
      "step": 4100
    },
    {
      "epoch": 315.38,
      "eval_valid_eval_loss": 4.958859443664551,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 142.43125915527344,
      "eval_valid_eval_perplexity_res": 587.0364990234375,
      "eval_valid_eval_perplexity_seq": 142.43125915527344,
      "eval_valid_eval_reconstruction": 0.0810810774564743,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 4.958859920501709,
      "eval_valid_runtime": 0.4087,
      "eval_valid_samples_per_second": 2.447,
      "eval_valid_steps_per_second": 2.447,
      "step": 4100
    },
    {
      "epoch": 315.38,
      "eval_train_eval_loss": 0.08589540421962738,
      "eval_train_eval_loss_<cls>": 3.726341962814331,
      "eval_train_eval_perplexity_batch": 1.0896923542022705,
      "eval_train_eval_perplexity_res": 2.3778393268585205,
      "eval_train_eval_perplexity_seq": 1.0955955982208252,
      "eval_train_eval_reconstruction": 0.1836109459400177,
      "eval_train_eval_reconstruction_<cls>": 0.0,
      "eval_train_loss": 0.18926408886909485,
      "eval_train_runtime": 0.9879,
      "eval_train_samples_per_second": 100.213,
      "eval_train_steps_per_second": 13.159,
      "step": 4100
    },
    {
      "epoch": 316.15,
      "learning_rate": 7.589952613682715e-05,
      "loss": 0.2072,
      "step": 4110
    },
    {
      "epoch": 316.92,
      "learning_rate": 7.57899801088816e-05,
      "loss": 0.1726,
      "step": 4120
    },
    {
      "epoch": 317.69,
      "learning_rate": 7.568026515065135e-05,
      "loss": 0.1775,
      "step": 4130
    },
    {
      "epoch": 318.46,
      "learning_rate": 7.557038198079458e-05,
      "loss": 0.2148,
      "step": 4140
    },
    {
      "epoch": 319.23,
      "learning_rate": 7.546033131907122e-05,
      "loss": 0.1924,
      "step": 4150
    },
    {
      "epoch": 320.0,
      "learning_rate": 7.535011388633839e-05,
      "loss": 0.1824,
      "step": 4160
    },
    {
      "epoch": 320.77,
      "learning_rate": 7.523973040454551e-05,
      "loss": 0.1675,
      "step": 4170
    },
    {
      "epoch": 321.54,
      "learning_rate": 7.512918159672972e-05,
      "loss": 0.1976,
      "step": 4180
    },
    {
      "epoch": 322.31,
      "learning_rate": 7.501846818701106e-05,
      "loss": 0.173,
      "step": 4190
    },
    {
      "epoch": 323.08,
      "learning_rate": 7.490759090058778e-05,
      "loss": 0.1819,
      "step": 4200
    },
    {
      "epoch": 323.08,
      "eval_valid_eval_loss": 5.190311908721924,
      "eval_valid_eval_loss_<cls>": 6.050612926483154,
      "eval_valid_eval_perplexity_batch": 179.5245361328125,
      "eval_valid_eval_perplexity_res": 643.9727783203125,
      "eval_valid_eval_perplexity_seq": 179.5245361328125,
      "eval_valid_eval_reconstruction": 0.06784968823194504,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.190312385559082,
      "eval_valid_runtime": 0.4027,
      "eval_valid_samples_per_second": 2.483,
      "eval_valid_steps_per_second": 2.483,
      "step": 4200
    },
    {
      "epoch": 323.08,
      "eval_train_eval_loss": 0.0781521275639534,
      "eval_train_eval_loss_<cls>": 3.440129518508911,
      "eval_train_eval_perplexity_batch": 1.081287145614624,
      "eval_train_eval_perplexity_res": 2.3567545413970947,
      "eval_train_eval_perplexity_seq": 1.088571310043335,
      "eval_train_eval_reconstruction": 0.17049641907215118,
      "eval_train_eval_reconstruction_<cls>": 0.010204081423580647,
      "eval_train_loss": 0.15903161466121674,
      "eval_train_runtime": 0.7896,
      "eval_train_samples_per_second": 125.382,
      "eval_train_steps_per_second": 16.464,
      "step": 4200
    },
    {
      "epoch": 323.85,
      "learning_rate": 7.47965504637315e-05,
      "loss": 0.179,
      "step": 4210
    },
    {
      "epoch": 324.62,
      "learning_rate": 7.468534760378258e-05,
      "loss": 0.1819,
      "step": 4220
    },
    {
      "epoch": 325.38,
      "learning_rate": 7.457398304914524e-05,
      "loss": 0.1612,
      "step": 4230
    },
    {
      "epoch": 326.15,
      "learning_rate": 7.446245752928284e-05,
      "loss": 0.1955,
      "step": 4240
    },
    {
      "epoch": 326.92,
      "learning_rate": 7.435077177471315e-05,
      "loss": 0.1811,
      "step": 4250
    },
    {
      "epoch": 327.69,
      "learning_rate": 7.423892651700345e-05,
      "loss": 0.2323,
      "step": 4260
    },
    {
      "epoch": 328.46,
      "learning_rate": 7.412692248876584e-05,
      "loss": 0.1466,
      "step": 4270
    },
    {
      "epoch": 329.23,
      "learning_rate": 7.401476042365239e-05,
      "loss": 0.1606,
      "step": 4280
    },
    {
      "epoch": 330.0,
      "learning_rate": 7.390244105635036e-05,
      "loss": 0.1627,
      "step": 4290
    },
    {
      "epoch": 330.77,
      "learning_rate": 7.378996512257735e-05,
      "loss": 0.1474,
      "step": 4300
    },
    {
      "epoch": 330.77,
      "eval_valid_eval_loss": 5.211280822753906,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 183.32872009277344,
      "eval_valid_eval_perplexity_res": 655.4028930664062,
      "eval_valid_eval_perplexity_seq": 183.32872009277344,
      "eval_valid_eval_reconstruction": 0.04700854793190956,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.211280822753906,
      "eval_valid_runtime": 0.3683,
      "eval_valid_samples_per_second": 2.715,
      "eval_valid_steps_per_second": 2.715,
      "step": 4300
    },
    {
      "epoch": 330.77,
      "eval_train_eval_loss": 0.07100299000740051,
      "eval_train_eval_loss_<cls>": 3.3382935523986816,
      "eval_train_eval_perplexity_batch": 1.0735844373703003,
      "eval_train_eval_perplexity_res": 2.18603777885437,
      "eval_train_eval_perplexity_seq": 1.0794340372085571,
      "eval_train_eval_reconstruction": 0.1662573516368866,
      "eval_train_eval_reconstruction_<cls>": 0.030927835032343864,
      "eval_train_loss": 0.16973821818828583,
      "eval_train_runtime": 0.7559,
      "eval_train_samples_per_second": 130.976,
      "eval_train_steps_per_second": 17.199,
      "step": 4300
    },
    {
      "epoch": 331.54,
      "learning_rate": 7.367733335907654e-05,
      "loss": 0.1576,
      "step": 4310
    },
    {
      "epoch": 332.31,
      "learning_rate": 7.35645465036118e-05,
      "loss": 0.161,
      "step": 4320
    },
    {
      "epoch": 333.08,
      "learning_rate": 7.345160529496293e-05,
      "loss": 0.1917,
      "step": 4330
    },
    {
      "epoch": 333.85,
      "learning_rate": 7.333851047292068e-05,
      "loss": 0.1432,
      "step": 4340
    },
    {
      "epoch": 334.62,
      "learning_rate": 7.322526277828216e-05,
      "loss": 0.1817,
      "step": 4350
    },
    {
      "epoch": 335.38,
      "learning_rate": 7.311186295284568e-05,
      "loss": 0.1857,
      "step": 4360
    },
    {
      "epoch": 336.15,
      "learning_rate": 7.299831173940612e-05,
      "loss": 0.1801,
      "step": 4370
    },
    {
      "epoch": 336.92,
      "learning_rate": 7.288460988175e-05,
      "loss": 0.1632,
      "step": 4380
    },
    {
      "epoch": 337.69,
      "learning_rate": 7.277075812465054e-05,
      "loss": 0.1798,
      "step": 4390
    },
    {
      "epoch": 338.46,
      "learning_rate": 7.265675721386285e-05,
      "loss": 0.166,
      "step": 4400
    },
    {
      "epoch": 338.46,
      "eval_valid_eval_loss": 5.289268970489502,
      "eval_valid_eval_loss_<cls>": 6.390383720397949,
      "eval_valid_eval_perplexity_batch": 198.198486328125,
      "eval_valid_eval_perplexity_res": 736.2738647460938,
      "eval_valid_eval_perplexity_seq": 198.198486328125,
      "eval_valid_eval_reconstruction": 0.0637049451470375,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.289268970489502,
      "eval_valid_runtime": 0.3609,
      "eval_valid_samples_per_second": 2.771,
      "eval_valid_steps_per_second": 2.771,
      "step": 4400
    },
    {
      "epoch": 338.46,
      "eval_train_eval_loss": 0.07030191272497177,
      "eval_train_eval_loss_<cls>": 3.318791389465332,
      "eval_train_eval_perplexity_batch": 1.0728319883346558,
      "eval_train_eval_perplexity_res": 2.342437744140625,
      "eval_train_eval_perplexity_seq": 1.078352928161621,
      "eval_train_eval_reconstruction": 0.17371420562267303,
      "eval_train_eval_reconstruction_<cls>": 0.02463054098188877,
      "eval_train_loss": 0.1621524691581726,
      "eval_train_runtime": 0.7282,
      "eval_train_samples_per_second": 135.951,
      "eval_train_steps_per_second": 17.852,
      "step": 4400
    },
    {
      "epoch": 339.23,
      "learning_rate": 7.254260789611906e-05,
      "loss": 0.1597,
      "step": 4410
    },
    {
      "epoch": 340.0,
      "learning_rate": 7.242831091912339e-05,
      "loss": 0.1809,
      "step": 4420
    },
    {
      "epoch": 340.77,
      "learning_rate": 7.231386703154724e-05,
      "loss": 0.1381,
      "step": 4430
    },
    {
      "epoch": 341.54,
      "learning_rate": 7.219927698302432e-05,
      "loss": 0.1593,
      "step": 4440
    },
    {
      "epoch": 342.31,
      "learning_rate": 7.208454152414571e-05,
      "loss": 0.168,
      "step": 4450
    },
    {
      "epoch": 343.08,
      "learning_rate": 7.196966140645504e-05,
      "loss": 0.1579,
      "step": 4460
    },
    {
      "epoch": 343.85,
      "learning_rate": 7.185463738244337e-05,
      "loss": 0.1358,
      "step": 4470
    },
    {
      "epoch": 344.62,
      "learning_rate": 7.173947020554446e-05,
      "loss": 0.158,
      "step": 4480
    },
    {
      "epoch": 345.38,
      "learning_rate": 7.162416063012974e-05,
      "loss": 0.1483,
      "step": 4490
    },
    {
      "epoch": 346.15,
      "learning_rate": 7.150870941150336e-05,
      "loss": 0.1654,
      "step": 4500
    },
    {
      "epoch": 346.15,
      "eval_valid_eval_loss": 5.218707084655762,
      "eval_valid_eval_loss_<cls>": 7.226640701293945,
      "eval_valid_eval_perplexity_batch": 184.6952362060547,
      "eval_valid_eval_perplexity_res": 769.6434326171875,
      "eval_valid_eval_perplexity_seq": 184.6952362060547,
      "eval_valid_eval_reconstruction": 0.06783369928598404,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.218707084655762,
      "eval_valid_runtime": 0.42,
      "eval_valid_samples_per_second": 2.381,
      "eval_valid_steps_per_second": 2.381,
      "step": 4500
    },
    {
      "epoch": 346.15,
      "eval_train_eval_loss": 0.05766908824443817,
      "eval_train_eval_loss_<cls>": 3.100989580154419,
      "eval_train_eval_perplexity_batch": 1.0593644380569458,
      "eval_train_eval_perplexity_res": 2.1119022369384766,
      "eval_train_eval_perplexity_seq": 1.0630096197128296,
      "eval_train_eval_reconstruction": 0.14540399610996246,
      "eval_train_eval_reconstruction_<cls>": 0.0517241396009922,
      "eval_train_loss": 0.15468843281269073,
      "eval_train_runtime": 0.9373,
      "eval_train_samples_per_second": 105.619,
      "eval_train_steps_per_second": 13.869,
      "step": 4500
    },
    {
      "epoch": 346.92,
      "learning_rate": 7.139311730589728e-05,
      "loss": 0.1551,
      "step": 4510
    },
    {
      "epoch": 347.69,
      "learning_rate": 7.127738507046632e-05,
      "loss": 0.1634,
      "step": 4520
    },
    {
      "epoch": 348.46,
      "learning_rate": 7.116151346328315e-05,
      "loss": 0.147,
      "step": 4530
    },
    {
      "epoch": 349.23,
      "learning_rate": 7.10455032433334e-05,
      "loss": 0.1456,
      "step": 4540
    },
    {
      "epoch": 350.0,
      "learning_rate": 7.092935517051058e-05,
      "loss": 0.1426,
      "step": 4550
    },
    {
      "epoch": 350.77,
      "learning_rate": 7.081307000561121e-05,
      "loss": 0.1351,
      "step": 4560
    },
    {
      "epoch": 351.54,
      "learning_rate": 7.069664851032982e-05,
      "loss": 0.1604,
      "step": 4570
    },
    {
      "epoch": 352.31,
      "learning_rate": 7.058009144725388e-05,
      "loss": 0.1414,
      "step": 4580
    },
    {
      "epoch": 353.08,
      "learning_rate": 7.046339957985891e-05,
      "loss": 0.1408,
      "step": 4590
    },
    {
      "epoch": 353.85,
      "learning_rate": 7.034657367250337e-05,
      "loss": 0.1379,
      "step": 4600
    },
    {
      "epoch": 353.85,
      "eval_valid_eval_loss": 5.385720729827881,
      "eval_valid_eval_loss_<cls>": 5.879682540893555,
      "eval_valid_eval_perplexity_batch": 218.26736450195312,
      "eval_valid_eval_perplexity_res": 809.6633911132812,
      "eval_valid_eval_perplexity_seq": 218.26736450195312,
      "eval_valid_eval_reconstruction": 0.05548037961125374,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.385720252990723,
      "eval_valid_runtime": 0.5009,
      "eval_valid_samples_per_second": 1.997,
      "eval_valid_steps_per_second": 1.997,
      "step": 4600
    },
    {
      "epoch": 353.85,
      "eval_train_eval_loss": 0.04102059453725815,
      "eval_train_eval_loss_<cls>": 2.997161626815796,
      "eval_train_eval_perplexity_batch": 1.041873574256897,
      "eval_train_eval_perplexity_res": 1.8115848302841187,
      "eval_train_eval_perplexity_seq": 1.0437976121902466,
      "eval_train_eval_reconstruction": 0.11734797060489655,
      "eval_train_eval_reconstruction_<cls>": 0.058510638773441315,
      "eval_train_loss": 0.1439691185951233,
      "eval_train_runtime": 1.0363,
      "eval_train_samples_per_second": 95.531,
      "eval_train_steps_per_second": 12.544,
      "step": 4600
    },
    {
      "epoch": 354.62,
      "learning_rate": 7.022961449042376e-05,
      "loss": 0.1352,
      "step": 4610
    },
    {
      "epoch": 355.38,
      "learning_rate": 7.011252279972957e-05,
      "loss": 0.1354,
      "step": 4620
    },
    {
      "epoch": 356.15,
      "learning_rate": 6.99952993673982e-05,
      "loss": 0.1619,
      "step": 4630
    },
    {
      "epoch": 356.92,
      "learning_rate": 6.987794496127004e-05,
      "loss": 0.1417,
      "step": 4640
    },
    {
      "epoch": 357.69,
      "learning_rate": 6.976046035004335e-05,
      "loss": 0.1685,
      "step": 4650
    },
    {
      "epoch": 358.46,
      "learning_rate": 6.964284630326927e-05,
      "loss": 0.144,
      "step": 4660
    },
    {
      "epoch": 359.23,
      "learning_rate": 6.952510359134681e-05,
      "loss": 0.1525,
      "step": 4670
    },
    {
      "epoch": 360.0,
      "learning_rate": 6.940723298551773e-05,
      "loss": 0.135,
      "step": 4680
    },
    {
      "epoch": 360.77,
      "learning_rate": 6.928923525786151e-05,
      "loss": 0.1414,
      "step": 4690
    },
    {
      "epoch": 361.54,
      "learning_rate": 6.917111118129035e-05,
      "loss": 0.1525,
      "step": 4700
    },
    {
      "epoch": 361.54,
      "eval_valid_eval_loss": 5.4138264656066895,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 224.48895263671875,
      "eval_valid_eval_perplexity_res": 927.9046020507812,
      "eval_valid_eval_perplexity_seq": 224.48895263671875,
      "eval_valid_eval_reconstruction": 0.06792452931404114,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.413825988769531,
      "eval_valid_runtime": 0.4226,
      "eval_valid_samples_per_second": 2.366,
      "eval_valid_steps_per_second": 2.366,
      "step": 4700
    },
    {
      "epoch": 361.54,
      "eval_train_eval_loss": 0.06475549191236496,
      "eval_train_eval_loss_<cls>": 2.849402904510498,
      "eval_train_eval_perplexity_batch": 1.0668981075286865,
      "eval_train_eval_perplexity_res": 2.2904860973358154,
      "eval_train_eval_perplexity_seq": 1.069480299949646,
      "eval_train_eval_reconstruction": 0.20319849252700806,
      "eval_train_eval_reconstruction_<cls>": 0.10144927352666855,
      "eval_train_loss": 0.12847888469696045,
      "eval_train_runtime": 0.8742,
      "eval_train_samples_per_second": 113.248,
      "eval_train_steps_per_second": 14.871,
      "step": 4700
    },
    {
      "epoch": 362.31,
      "learning_rate": 6.905286152954402e-05,
      "loss": 0.1386,
      "step": 4710
    },
    {
      "epoch": 363.08,
      "learning_rate": 6.89344870771849e-05,
      "loss": 0.1578,
      "step": 4720
    },
    {
      "epoch": 363.85,
      "learning_rate": 6.881598859959277e-05,
      "loss": 0.1425,
      "step": 4730
    },
    {
      "epoch": 364.62,
      "learning_rate": 6.869736687295985e-05,
      "loss": 0.1365,
      "step": 4740
    },
    {
      "epoch": 365.38,
      "learning_rate": 6.857862267428563e-05,
      "loss": 0.1392,
      "step": 4750
    },
    {
      "epoch": 366.15,
      "learning_rate": 6.845975678137189e-05,
      "loss": 0.1474,
      "step": 4760
    },
    {
      "epoch": 366.92,
      "learning_rate": 6.834076997281745e-05,
      "loss": 0.1215,
      "step": 4770
    },
    {
      "epoch": 367.69,
      "learning_rate": 6.822166302801321e-05,
      "loss": 0.1377,
      "step": 4780
    },
    {
      "epoch": 368.46,
      "learning_rate": 6.810243672713699e-05,
      "loss": 0.1383,
      "step": 4790
    },
    {
      "epoch": 369.23,
      "learning_rate": 6.79830918511484e-05,
      "loss": 0.1294,
      "step": 4800
    },
    {
      "epoch": 369.23,
      "eval_valid_eval_loss": 5.463819980621338,
      "eval_valid_eval_loss_<cls>": 5.717157363891602,
      "eval_valid_eval_perplexity_batch": 235.99720764160156,
      "eval_valid_eval_perplexity_res": 961.1387329101562,
      "eval_valid_eval_perplexity_seq": 235.99720764160156,
      "eval_valid_eval_reconstruction": 0.06873428076505661,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.46381950378418,
      "eval_valid_runtime": 0.4732,
      "eval_valid_samples_per_second": 2.113,
      "eval_valid_steps_per_second": 2.113,
      "step": 4800
    },
    {
      "epoch": 369.23,
      "eval_train_eval_loss": 0.0844264104962349,
      "eval_train_eval_loss_<cls>": 2.734118700027466,
      "eval_train_eval_perplexity_batch": 1.0880928039550781,
      "eval_train_eval_perplexity_res": 2.681062698364258,
      "eval_train_eval_perplexity_seq": 1.0909754037857056,
      "eval_train_eval_reconstruction": 0.27052703499794006,
      "eval_train_eval_reconstruction_<cls>": 0.09004739671945572,
      "eval_train_loss": 0.1483813375234604,
      "eval_train_runtime": 0.9851,
      "eval_train_samples_per_second": 100.495,
      "eval_train_steps_per_second": 13.196,
      "step": 4800
    },
    {
      "epoch": 370.0,
      "learning_rate": 6.786362918178371e-05,
      "loss": 0.1336,
      "step": 4810
    },
    {
      "epoch": 370.77,
      "learning_rate": 6.774404950155087e-05,
      "loss": 0.1377,
      "step": 4820
    },
    {
      "epoch": 371.54,
      "learning_rate": 6.762435359372415e-05,
      "loss": 0.1307,
      "step": 4830
    },
    {
      "epoch": 372.31,
      "learning_rate": 6.750454224233924e-05,
      "loss": 0.1103,
      "step": 4840
    },
    {
      "epoch": 373.08,
      "learning_rate": 6.738461623218795e-05,
      "loss": 0.1318,
      "step": 4850
    },
    {
      "epoch": 373.85,
      "learning_rate": 6.726457634881316e-05,
      "loss": 0.1304,
      "step": 4860
    },
    {
      "epoch": 374.62,
      "learning_rate": 6.714442337850364e-05,
      "loss": 0.1362,
      "step": 4870
    },
    {
      "epoch": 375.38,
      "learning_rate": 6.702415810828889e-05,
      "loss": 0.1327,
      "step": 4880
    },
    {
      "epoch": 376.15,
      "learning_rate": 6.690378132593404e-05,
      "loss": 0.1313,
      "step": 4890
    },
    {
      "epoch": 376.92,
      "learning_rate": 6.678329381993458e-05,
      "loss": 0.1331,
      "step": 4900
    },
    {
      "epoch": 376.92,
      "eval_valid_eval_loss": 5.504154682159424,
      "eval_valid_eval_loss_<cls>": 4.979052543640137,
      "eval_valid_eval_perplexity_batch": 245.71066284179688,
      "eval_valid_eval_perplexity_res": 1067.1468505859375,
      "eval_valid_eval_perplexity_seq": 245.71066284179688,
      "eval_valid_eval_reconstruction": 0.07232267409563065,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.504154682159424,
      "eval_valid_runtime": 0.5567,
      "eval_valid_samples_per_second": 1.796,
      "eval_valid_steps_per_second": 1.796,
      "step": 4900
    },
    {
      "epoch": 376.92,
      "eval_train_eval_loss": 0.05440526455640793,
      "eval_train_eval_loss_<cls>": 2.731804370880127,
      "eval_train_eval_perplexity_batch": 1.0559124946594238,
      "eval_train_eval_perplexity_res": 2.2018914222717285,
      "eval_train_eval_perplexity_seq": 1.0588061809539795,
      "eval_train_eval_reconstruction": 0.17801494896411896,
      "eval_train_eval_reconstruction_<cls>": 0.09405940771102905,
      "eval_train_loss": 0.12663593888282776,
      "eval_train_runtime": 0.8238,
      "eval_train_samples_per_second": 120.169,
      "eval_train_steps_per_second": 15.78,
      "step": 4900
    },
    {
      "epoch": 377.69,
      "learning_rate": 6.666269637951134e-05,
      "loss": 0.1382,
      "step": 4910
    },
    {
      "epoch": 378.46,
      "learning_rate": 6.654198979460522e-05,
      "loss": 0.1279,
      "step": 4920
    },
    {
      "epoch": 379.23,
      "learning_rate": 6.642117485587202e-05,
      "loss": 0.1401,
      "step": 4930
    },
    {
      "epoch": 380.0,
      "learning_rate": 6.630025235467727e-05,
      "loss": 0.133,
      "step": 4940
    },
    {
      "epoch": 380.77,
      "learning_rate": 6.617922308309115e-05,
      "loss": 0.1118,
      "step": 4950
    },
    {
      "epoch": 381.54,
      "learning_rate": 6.605808783388308e-05,
      "loss": 0.1236,
      "step": 4960
    },
    {
      "epoch": 382.31,
      "learning_rate": 6.593684740051678e-05,
      "loss": 0.1328,
      "step": 4970
    },
    {
      "epoch": 383.08,
      "learning_rate": 6.581550257714488e-05,
      "loss": 0.1166,
      "step": 4980
    },
    {
      "epoch": 383.85,
      "learning_rate": 6.569405415860377e-05,
      "loss": 0.123,
      "step": 4990
    },
    {
      "epoch": 384.62,
      "learning_rate": 6.557250294040849e-05,
      "loss": 0.108,
      "step": 5000
    },
    {
      "epoch": 384.62,
      "eval_valid_eval_loss": 5.547236919403076,
      "eval_valid_eval_loss_<cls>": 5.649626731872559,
      "eval_valid_eval_perplexity_batch": 256.52777099609375,
      "eval_valid_eval_perplexity_res": 1100.5430908203125,
      "eval_valid_eval_perplexity_seq": 256.52777099609375,
      "eval_valid_eval_reconstruction": 0.06676136702299118,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.547236919403076,
      "eval_valid_runtime": 0.4972,
      "eval_valid_samples_per_second": 2.011,
      "eval_valid_steps_per_second": 2.011,
      "step": 5000
    },
    {
      "epoch": 384.62,
      "eval_train_eval_loss": 0.05736321210861206,
      "eval_train_eval_loss_<cls>": 2.786989688873291,
      "eval_train_eval_perplexity_batch": 1.0590404272079468,
      "eval_train_eval_perplexity_res": 2.348323345184326,
      "eval_train_eval_perplexity_seq": 1.0613439083099365,
      "eval_train_eval_reconstruction": 0.20445255935192108,
      "eval_train_eval_reconstruction_<cls>": 0.14418604969978333,
      "eval_train_loss": 0.12245148420333862,
      "eval_train_runtime": 0.7931,
      "eval_train_samples_per_second": 124.827,
      "eval_train_steps_per_second": 16.391,
      "step": 5000
    },
    {
      "epoch": 385.38,
      "learning_rate": 6.545084971874738e-05,
      "loss": 0.1163,
      "step": 5010
    },
    {
      "epoch": 386.15,
      "learning_rate": 6.532909529047692e-05,
      "loss": 0.1237,
      "step": 5020
    },
    {
      "epoch": 386.92,
      "learning_rate": 6.520724045311659e-05,
      "loss": 0.132,
      "step": 5030
    },
    {
      "epoch": 387.69,
      "learning_rate": 6.508528600484348e-05,
      "loss": 0.124,
      "step": 5040
    },
    {
      "epoch": 388.46,
      "learning_rate": 6.496323274448721e-05,
      "loss": 0.1417,
      "step": 5050
    },
    {
      "epoch": 389.23,
      "learning_rate": 6.484108147152466e-05,
      "loss": 0.1175,
      "step": 5060
    },
    {
      "epoch": 390.0,
      "learning_rate": 6.471883298607462e-05,
      "loss": 0.1183,
      "step": 5070
    },
    {
      "epoch": 390.77,
      "learning_rate": 6.459648808889274e-05,
      "loss": 0.1046,
      "step": 5080
    },
    {
      "epoch": 391.54,
      "learning_rate": 6.447404758136617e-05,
      "loss": 0.1262,
      "step": 5090
    },
    {
      "epoch": 392.31,
      "learning_rate": 6.435151226550829e-05,
      "loss": 0.0965,
      "step": 5100
    },
    {
      "epoch": 392.31,
      "eval_valid_eval_loss": 5.570240020751953,
      "eval_valid_eval_loss_<cls>": 6.278763771057129,
      "eval_valid_eval_perplexity_batch": 262.4971008300781,
      "eval_valid_eval_perplexity_res": 1211.8935546875,
      "eval_valid_eval_perplexity_seq": 262.4971008300781,
      "eval_valid_eval_reconstruction": 0.07510431110858917,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.570240020751953,
      "eval_valid_runtime": 0.3872,
      "eval_valid_samples_per_second": 2.583,
      "eval_valid_steps_per_second": 2.583,
      "step": 5100
    },
    {
      "epoch": 392.31,
      "eval_train_eval_loss": 0.04414914548397064,
      "eval_train_eval_loss_<cls>": 2.5545096397399902,
      "eval_train_eval_perplexity_batch": 1.0451382398605347,
      "eval_train_eval_perplexity_res": 2.035720109939575,
      "eval_train_eval_perplexity_seq": 1.047135353088379,
      "eval_train_eval_reconstruction": 0.16011960804462433,
      "eval_train_eval_reconstruction_<cls>": 0.13901345431804657,
      "eval_train_loss": 0.10615496337413788,
      "eval_train_runtime": 1.0071,
      "eval_train_samples_per_second": 98.3,
      "eval_train_steps_per_second": 12.908,
      "step": 5100
    },
    {
      "epoch": 393.08,
      "learning_rate": 6.422888294395356e-05,
      "loss": 0.1183,
      "step": 5110
    },
    {
      "epoch": 393.85,
      "learning_rate": 6.410616041995212e-05,
      "loss": 0.1033,
      "step": 5120
    },
    {
      "epoch": 394.62,
      "learning_rate": 6.398334549736466e-05,
      "loss": 0.1172,
      "step": 5130
    },
    {
      "epoch": 395.38,
      "learning_rate": 6.38604389806571e-05,
      "loss": 0.1131,
      "step": 5140
    },
    {
      "epoch": 396.15,
      "learning_rate": 6.373744167489531e-05,
      "loss": 0.1298,
      "step": 5150
    },
    {
      "epoch": 396.92,
      "learning_rate": 6.361435438573986e-05,
      "loss": 0.1188,
      "step": 5160
    },
    {
      "epoch": 397.69,
      "learning_rate": 6.34911779194407e-05,
      "loss": 0.1336,
      "step": 5170
    },
    {
      "epoch": 398.46,
      "learning_rate": 6.336791308283196e-05,
      "loss": 0.1199,
      "step": 5180
    },
    {
      "epoch": 399.23,
      "learning_rate": 6.32445606833266e-05,
      "loss": 0.1191,
      "step": 5190
    },
    {
      "epoch": 400.0,
      "learning_rate": 6.312112152891107e-05,
      "loss": 0.1359,
      "step": 5200
    },
    {
      "epoch": 400.0,
      "eval_valid_eval_loss": 5.547768592834473,
      "eval_valid_eval_loss_<cls>": 5.1680073738098145,
      "eval_valid_eval_perplexity_batch": 256.6641845703125,
      "eval_valid_eval_perplexity_res": 1274.7515869140625,
      "eval_valid_eval_perplexity_seq": 256.6641845703125,
      "eval_valid_eval_reconstruction": 0.07122092694044113,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.547769069671631,
      "eval_valid_runtime": 0.4098,
      "eval_valid_samples_per_second": 2.44,
      "eval_valid_steps_per_second": 2.44,
      "step": 5200
    },
    {
      "epoch": 400.0,
      "eval_train_eval_loss": 0.06661508977413177,
      "eval_train_eval_loss_<cls>": 2.4227821826934814,
      "eval_train_eval_perplexity_batch": 1.068884015083313,
      "eval_train_eval_perplexity_res": 2.9036037921905518,
      "eval_train_eval_perplexity_seq": 1.0719225406646729,
      "eval_train_eval_reconstruction": 0.2629091739654541,
      "eval_train_eval_reconstruction_<cls>": 0.23645320534706116,
      "eval_train_loss": 0.11396721005439758,
      "eval_train_runtime": 1.1407,
      "eval_train_samples_per_second": 86.789,
      "eval_train_steps_per_second": 11.397,
      "step": 5200
    },
    {
      "epoch": 400.77,
      "learning_rate": 6.29975964281402e-05,
      "loss": 0.1081,
      "step": 5210
    },
    {
      "epoch": 401.54,
      "learning_rate": 6.287398619013172e-05,
      "loss": 0.1054,
      "step": 5220
    },
    {
      "epoch": 402.31,
      "learning_rate": 6.275029162456103e-05,
      "loss": 0.1212,
      "step": 5230
    },
    {
      "epoch": 403.08,
      "learning_rate": 6.26265135416559e-05,
      "loss": 0.1076,
      "step": 5240
    },
    {
      "epoch": 403.85,
      "learning_rate": 6.250265275219116e-05,
      "loss": 0.0893,
      "step": 5250
    },
    {
      "epoch": 404.62,
      "learning_rate": 6.23787100674834e-05,
      "loss": 0.116,
      "step": 5260
    },
    {
      "epoch": 405.38,
      "learning_rate": 6.225468629938564e-05,
      "loss": 0.1107,
      "step": 5270
    },
    {
      "epoch": 406.15,
      "learning_rate": 6.213058226028198e-05,
      "loss": 0.1089,
      "step": 5280
    },
    {
      "epoch": 406.92,
      "learning_rate": 6.200639876308236e-05,
      "loss": 0.106,
      "step": 5290
    },
    {
      "epoch": 407.69,
      "learning_rate": 6.188213662121716e-05,
      "loss": 0.0989,
      "step": 5300
    },
    {
      "epoch": 407.69,
      "eval_valid_eval_loss": 5.69902229309082,
      "eval_valid_eval_loss_<cls>": 6.309311866760254,
      "eval_valid_eval_perplexity_batch": 298.5753479003906,
      "eval_valid_eval_perplexity_res": 1360.0440673828125,
      "eval_valid_eval_perplexity_seq": 298.5753479003906,
      "eval_valid_eval_reconstruction": 0.07311827689409256,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.699024200439453,
      "eval_valid_runtime": 0.4503,
      "eval_valid_samples_per_second": 2.221,
      "eval_valid_steps_per_second": 2.221,
      "step": 5300
    },
    {
      "epoch": 407.69,
      "eval_train_eval_loss": 0.039702821522951126,
      "eval_train_eval_loss_<cls>": 2.462547540664673,
      "eval_train_eval_perplexity_batch": 1.0405014753341675,
      "eval_train_eval_perplexity_res": 2.086106538772583,
      "eval_train_eval_perplexity_seq": 1.0416423082351685,
      "eval_train_eval_reconstruction": 0.16322660446166992,
      "eval_train_eval_reconstruction_<cls>": 0.19617225229740143,
      "eval_train_loss": 0.09964016824960709,
      "eval_train_runtime": 0.8604,
      "eval_train_samples_per_second": 115.062,
      "eval_train_steps_per_second": 15.109,
      "step": 5300
    },
    {
      "epoch": 408.46,
      "learning_rate": 6.175779664863191e-05,
      "loss": 0.1111,
      "step": 5310
    },
    {
      "epoch": 409.23,
      "learning_rate": 6.163337965978194e-05,
      "loss": 0.0997,
      "step": 5320
    },
    {
      "epoch": 410.0,
      "learning_rate": 6.150888646962708e-05,
      "loss": 0.101,
      "step": 5330
    },
    {
      "epoch": 410.77,
      "learning_rate": 6.138431789362627e-05,
      "loss": 0.0973,
      "step": 5340
    },
    {
      "epoch": 411.54,
      "learning_rate": 6.125967474773223e-05,
      "loss": 0.1223,
      "step": 5350
    },
    {
      "epoch": 412.31,
      "learning_rate": 6.113495784838619e-05,
      "loss": 0.1148,
      "step": 5360
    },
    {
      "epoch": 413.08,
      "learning_rate": 6.1010168012512425e-05,
      "loss": 0.1026,
      "step": 5370
    },
    {
      "epoch": 413.85,
      "learning_rate": 6.088530605751298e-05,
      "loss": 0.0915,
      "step": 5380
    },
    {
      "epoch": 414.62,
      "learning_rate": 6.07603728012623e-05,
      "loss": 0.1059,
      "step": 5390
    },
    {
      "epoch": 415.38,
      "learning_rate": 6.0635369062101875e-05,
      "loss": 0.108,
      "step": 5400
    },
    {
      "epoch": 415.38,
      "eval_valid_eval_loss": 5.75724983215332,
      "eval_valid_eval_loss_<cls>": 6.6129841804504395,
      "eval_valid_eval_perplexity_batch": 316.4767761230469,
      "eval_valid_eval_perplexity_res": 1432.4720458984375,
      "eval_valid_eval_perplexity_seq": 316.4767761230469,
      "eval_valid_eval_reconstruction": 0.05999999865889549,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.757249355316162,
      "eval_valid_runtime": 0.4008,
      "eval_valid_samples_per_second": 2.495,
      "eval_valid_steps_per_second": 2.495,
      "step": 5400
    },
    {
      "epoch": 415.38,
      "eval_train_eval_loss": 0.050082962960004807,
      "eval_train_eval_loss_<cls>": 2.221273183822632,
      "eval_train_eval_perplexity_batch": 1.0513583421707153,
      "eval_train_eval_perplexity_res": 2.4318180084228516,
      "eval_train_eval_perplexity_seq": 1.0527881383895874,
      "eval_train_eval_reconstruction": 0.21915635466575623,
      "eval_train_eval_reconstruction_<cls>": 0.25106382369995117,
      "eval_train_loss": 0.10203404724597931,
      "eval_train_runtime": 0.8,
      "eval_train_samples_per_second": 123.748,
      "eval_train_steps_per_second": 16.25,
      "step": 5400
    },
    {
      "epoch": 416.15,
      "learning_rate": 6.051029565883486e-05,
      "loss": 0.1103,
      "step": 5410
    },
    {
      "epoch": 416.92,
      "learning_rate": 6.038515341072073e-05,
      "loss": 0.0912,
      "step": 5420
    },
    {
      "epoch": 417.69,
      "learning_rate": 6.025994313746991e-05,
      "loss": 0.096,
      "step": 5430
    },
    {
      "epoch": 418.46,
      "learning_rate": 6.013466565923841e-05,
      "loss": 0.1133,
      "step": 5440
    },
    {
      "epoch": 419.23,
      "learning_rate": 6.0009321796622444e-05,
      "loss": 0.0836,
      "step": 5450
    },
    {
      "epoch": 420.0,
      "learning_rate": 5.988391237065306e-05,
      "loss": 0.0985,
      "step": 5460
    },
    {
      "epoch": 420.77,
      "learning_rate": 5.9758438202790764e-05,
      "loss": 0.1003,
      "step": 5470
    },
    {
      "epoch": 421.54,
      "learning_rate": 5.9632900114920134e-05,
      "loss": 0.1004,
      "step": 5480
    },
    {
      "epoch": 422.31,
      "learning_rate": 5.950729892934444e-05,
      "loss": 0.0948,
      "step": 5490
    },
    {
      "epoch": 423.08,
      "learning_rate": 5.938163546878024e-05,
      "loss": 0.0977,
      "step": 5500
    },
    {
      "epoch": 423.08,
      "eval_valid_eval_loss": 5.750737190246582,
      "eval_valid_eval_loss_<cls>": 5.33015251159668,
      "eval_valid_eval_perplexity_batch": 314.42236328125,
      "eval_valid_eval_perplexity_res": 1459.096435546875,
      "eval_valid_eval_perplexity_seq": 314.42236328125,
      "eval_valid_eval_reconstruction": 0.0708092451095581,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.750737190246582,
      "eval_valid_runtime": 0.4373,
      "eval_valid_samples_per_second": 2.287,
      "eval_valid_steps_per_second": 2.287,
      "step": 5500
    },
    {
      "epoch": 423.08,
      "eval_train_eval_loss": 0.037140943109989166,
      "eval_train_eval_loss_<cls>": 2.3256821632385254,
      "eval_train_eval_perplexity_batch": 1.0378392934799194,
      "eval_train_eval_perplexity_res": 2.0733633041381836,
      "eval_train_eval_perplexity_seq": 1.0389666557312012,
      "eval_train_eval_reconstruction": 0.16478146612644196,
      "eval_train_eval_reconstruction_<cls>": 0.3047619163990021,
      "eval_train_loss": 0.09645234793424606,
      "eval_train_runtime": 0.8071,
      "eval_train_samples_per_second": 122.666,
      "eval_train_steps_per_second": 16.108,
      "step": 5500
    },
    {
      "epoch": 423.85,
      "learning_rate": 5.9255910556352026e-05,
      "loss": 0.0929,
      "step": 5510
    },
    {
      "epoch": 424.62,
      "learning_rate": 5.913012501558682e-05,
      "loss": 0.098,
      "step": 5520
    },
    {
      "epoch": 425.38,
      "learning_rate": 5.9004279670408766e-05,
      "loss": 0.101,
      "step": 5530
    },
    {
      "epoch": 426.15,
      "learning_rate": 5.887837534513372e-05,
      "loss": 0.0974,
      "step": 5540
    },
    {
      "epoch": 426.92,
      "learning_rate": 5.87524128644639e-05,
      "loss": 0.0919,
      "step": 5550
    },
    {
      "epoch": 427.69,
      "learning_rate": 5.8626393053482445e-05,
      "loss": 0.101,
      "step": 5560
    },
    {
      "epoch": 428.46,
      "learning_rate": 5.850031673764801e-05,
      "loss": 0.109,
      "step": 5570
    },
    {
      "epoch": 429.23,
      "learning_rate": 5.837418474278939e-05,
      "loss": 0.08,
      "step": 5580
    },
    {
      "epoch": 430.0,
      "learning_rate": 5.824799789510007e-05,
      "loss": 0.0942,
      "step": 5590
    },
    {
      "epoch": 430.77,
      "learning_rate": 5.812175702113286e-05,
      "loss": 0.0815,
      "step": 5600
    },
    {
      "epoch": 430.77,
      "eval_valid_eval_loss": 5.7288055419921875,
      "eval_valid_eval_loss_<cls>": 5.561684608459473,
      "eval_valid_eval_perplexity_batch": 307.60162353515625,
      "eval_valid_eval_perplexity_res": 1643.28857421875,
      "eval_valid_eval_perplexity_seq": 307.60162353515625,
      "eval_valid_eval_reconstruction": 0.07053942233324051,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.7288055419921875,
      "eval_valid_runtime": 0.3886,
      "eval_valid_samples_per_second": 2.573,
      "eval_valid_steps_per_second": 2.573,
      "step": 5600
    },
    {
      "epoch": 430.77,
      "eval_train_eval_loss": 0.03375554457306862,
      "eval_train_eval_loss_<cls>": 2.1189939975738525,
      "eval_train_eval_perplexity_batch": 1.0343316793441772,
      "eval_train_eval_perplexity_res": 2.0662460327148438,
      "eval_train_eval_perplexity_seq": 1.0353779792785645,
      "eval_train_eval_reconstruction": 0.16719630360603333,
      "eval_train_eval_reconstruction_<cls>": 0.3317756950855255,
      "eval_train_loss": 0.082293301820755,
      "eval_train_runtime": 0.8231,
      "eval_train_samples_per_second": 120.277,
      "eval_train_steps_per_second": 15.794,
      "step": 5600
    },
    {
      "epoch": 431.54,
      "learning_rate": 5.799546294779442e-05,
      "loss": 0.079,
      "step": 5610
    },
    {
      "epoch": 432.31,
      "learning_rate": 5.7869116502339905e-05,
      "loss": 0.0895,
      "step": 5620
    },
    {
      "epoch": 433.08,
      "learning_rate": 5.7742718512367514e-05,
      "loss": 0.0898,
      "step": 5630
    },
    {
      "epoch": 433.85,
      "learning_rate": 5.761626980581305e-05,
      "loss": 0.0812,
      "step": 5640
    },
    {
      "epoch": 434.62,
      "learning_rate": 5.7489771210944564e-05,
      "loss": 0.079,
      "step": 5650
    },
    {
      "epoch": 435.38,
      "learning_rate": 5.7363223556356884e-05,
      "loss": 0.0925,
      "step": 5660
    },
    {
      "epoch": 436.15,
      "learning_rate": 5.7236627670966125e-05,
      "loss": 0.0791,
      "step": 5670
    },
    {
      "epoch": 436.92,
      "learning_rate": 5.710998438400439e-05,
      "loss": 0.0872,
      "step": 5680
    },
    {
      "epoch": 437.69,
      "learning_rate": 5.6983294525014254e-05,
      "loss": 0.0831,
      "step": 5690
    },
    {
      "epoch": 438.46,
      "learning_rate": 5.6856558923843364e-05,
      "loss": 0.0802,
      "step": 5700
    },
    {
      "epoch": 438.46,
      "eval_valid_eval_loss": 5.795541763305664,
      "eval_valid_eval_loss_<cls>": 5.41426944732666,
      "eval_valid_eval_perplexity_batch": 328.8302917480469,
      "eval_valid_eval_perplexity_res": 1662.107421875,
      "eval_valid_eval_perplexity_seq": 328.8302917480469,
      "eval_valid_eval_reconstruction": 0.07307171821594238,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.795542240142822,
      "eval_valid_runtime": 0.3613,
      "eval_valid_samples_per_second": 2.768,
      "eval_valid_steps_per_second": 2.768,
      "step": 5700
    },
    {
      "epoch": 438.46,
      "eval_train_eval_loss": 0.04141640663146973,
      "eval_train_eval_loss_<cls>": 2.028834342956543,
      "eval_train_eval_perplexity_batch": 1.0422860383987427,
      "eval_train_eval_perplexity_res": 2.1778883934020996,
      "eval_train_eval_perplexity_seq": 1.0432944297790527,
      "eval_train_eval_reconstruction": 0.2090272456407547,
      "eval_train_eval_reconstruction_<cls>": 0.30978259444236755,
      "eval_train_loss": 0.08503185957670212,
      "eval_train_runtime": 0.717,
      "eval_train_samples_per_second": 138.071,
      "eval_train_steps_per_second": 18.131,
      "step": 5700
    },
    {
      "epoch": 439.23,
      "learning_rate": 5.6729778410638964e-05,
      "loss": 0.0915,
      "step": 5710
    },
    {
      "epoch": 440.0,
      "learning_rate": 5.660295381584249e-05,
      "loss": 0.0844,
      "step": 5720
    },
    {
      "epoch": 440.77,
      "learning_rate": 5.647608597018412e-05,
      "loss": 0.0908,
      "step": 5730
    },
    {
      "epoch": 441.54,
      "learning_rate": 5.634917570467735e-05,
      "loss": 0.0849,
      "step": 5740
    },
    {
      "epoch": 442.31,
      "learning_rate": 5.622222385061353e-05,
      "loss": 0.0943,
      "step": 5750
    },
    {
      "epoch": 443.08,
      "learning_rate": 5.609523123955641e-05,
      "loss": 0.0975,
      "step": 5760
    },
    {
      "epoch": 443.85,
      "learning_rate": 5.596819870333673e-05,
      "loss": 0.0865,
      "step": 5770
    },
    {
      "epoch": 444.62,
      "learning_rate": 5.5841127074046716e-05,
      "loss": 0.0882,
      "step": 5780
    },
    {
      "epoch": 445.38,
      "learning_rate": 5.5714017184034715e-05,
      "loss": 0.0817,
      "step": 5790
    },
    {
      "epoch": 446.15,
      "learning_rate": 5.558686986589963e-05,
      "loss": 0.095,
      "step": 5800
    },
    {
      "epoch": 446.15,
      "eval_valid_eval_loss": 5.823139667510986,
      "eval_valid_eval_loss_<cls>": 5.409822463989258,
      "eval_valid_eval_perplexity_batch": 338.0317077636719,
      "eval_valid_eval_perplexity_res": 1837.3271484375,
      "eval_valid_eval_perplexity_seq": 338.0317077636719,
      "eval_valid_eval_reconstruction": 0.07369942218065262,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.823139667510986,
      "eval_valid_runtime": 0.3964,
      "eval_valid_samples_per_second": 2.523,
      "eval_valid_steps_per_second": 2.523,
      "step": 5800
    },
    {
      "epoch": 446.15,
      "eval_train_eval_loss": 0.04630754142999649,
      "eval_train_eval_loss_<cls>": 1.9258754253387451,
      "eval_train_eval_perplexity_batch": 1.0473964214324951,
      "eval_train_eval_perplexity_res": 2.5238287448883057,
      "eval_train_eval_perplexity_seq": 1.0490585565567017,
      "eval_train_eval_reconstruction": 0.23352298140525818,
      "eval_train_eval_reconstruction_<cls>": 0.369047611951828,
      "eval_train_loss": 0.08034275472164154,
      "eval_train_runtime": 0.7618,
      "eval_train_samples_per_second": 129.958,
      "eval_train_steps_per_second": 17.065,
      "step": 5800
    },
    {
      "epoch": 446.92,
      "learning_rate": 5.5459685952485574e-05,
      "loss": 0.0796,
      "step": 5810
    },
    {
      "epoch": 447.69,
      "learning_rate": 5.533246627687634e-05,
      "loss": 0.0781,
      "step": 5820
    },
    {
      "epoch": 448.46,
      "learning_rate": 5.520521167238998e-05,
      "loss": 0.0771,
      "step": 5830
    },
    {
      "epoch": 449.23,
      "learning_rate": 5.507792297257335e-05,
      "loss": 0.0847,
      "step": 5840
    },
    {
      "epoch": 450.0,
      "learning_rate": 5.495060101119662e-05,
      "loss": 0.0932,
      "step": 5850
    },
    {
      "epoch": 450.77,
      "learning_rate": 5.4823246622247845e-05,
      "loss": 0.0759,
      "step": 5860
    },
    {
      "epoch": 451.54,
      "learning_rate": 5.469586063992748e-05,
      "loss": 0.0841,
      "step": 5870
    },
    {
      "epoch": 452.31,
      "learning_rate": 5.456844389864292e-05,
      "loss": 0.0766,
      "step": 5880
    },
    {
      "epoch": 453.08,
      "learning_rate": 5.4440997233003044e-05,
      "loss": 0.0796,
      "step": 5890
    },
    {
      "epoch": 453.85,
      "learning_rate": 5.431352147781275e-05,
      "loss": 0.0682,
      "step": 5900
    },
    {
      "epoch": 453.85,
      "eval_valid_eval_loss": 5.964574337005615,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 389.3872375488281,
      "eval_valid_eval_perplexity_res": 1924.3489990234375,
      "eval_valid_eval_perplexity_seq": 389.3872375488281,
      "eval_valid_eval_reconstruction": 0.056603774428367615,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.964574813842773,
      "eval_valid_runtime": 0.4129,
      "eval_valid_samples_per_second": 2.422,
      "eval_valid_steps_per_second": 2.422,
      "step": 5900
    },
    {
      "epoch": 453.85,
      "eval_train_eval_loss": 0.035005223006010056,
      "eval_train_eval_loss_<cls>": 1.8794214725494385,
      "eval_train_eval_perplexity_batch": 1.0356251001358032,
      "eval_train_eval_perplexity_res": 2.198676586151123,
      "eval_train_eval_perplexity_seq": 1.036454200744629,
      "eval_train_eval_reconstruction": 0.19399377703666687,
      "eval_train_eval_reconstruction_<cls>": 0.3729729652404785,
      "eval_train_loss": 0.06897992640733719,
      "eval_train_runtime": 0.7354,
      "eval_train_samples_per_second": 134.619,
      "eval_train_steps_per_second": 17.677,
      "step": 5900
    },
    {
      "epoch": 454.62,
      "learning_rate": 5.418601746806745e-05,
      "loss": 0.0796,
      "step": 5910
    },
    {
      "epoch": 455.38,
      "learning_rate": 5.4058486038947685e-05,
      "loss": 0.075,
      "step": 5920
    },
    {
      "epoch": 456.15,
      "learning_rate": 5.3930928025813545e-05,
      "loss": 0.095,
      "step": 5930
    },
    {
      "epoch": 456.92,
      "learning_rate": 5.380334426419926e-05,
      "loss": 0.085,
      "step": 5940
    },
    {
      "epoch": 457.69,
      "learning_rate": 5.367573558980775e-05,
      "loss": 0.0713,
      "step": 5950
    },
    {
      "epoch": 458.46,
      "learning_rate": 5.3548102838505096e-05,
      "loss": 0.0895,
      "step": 5960
    },
    {
      "epoch": 459.23,
      "learning_rate": 5.3420446846315084e-05,
      "loss": 0.0881,
      "step": 5970
    },
    {
      "epoch": 460.0,
      "learning_rate": 5.3292768449413745e-05,
      "loss": 0.0755,
      "step": 5980
    },
    {
      "epoch": 460.77,
      "learning_rate": 5.316506848412386e-05,
      "loss": 0.0705,
      "step": 5990
    },
    {
      "epoch": 461.54,
      "learning_rate": 5.3037347786909495e-05,
      "loss": 0.0722,
      "step": 6000
    },
    {
      "epoch": 461.54,
      "eval_valid_eval_loss": 5.9489827156066895,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 383.3631591796875,
      "eval_valid_eval_perplexity_res": 1894.587890625,
      "eval_valid_eval_perplexity_seq": 383.3631591796875,
      "eval_valid_eval_reconstruction": 0.0882352963089943,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.9489827156066895,
      "eval_valid_runtime": 0.3705,
      "eval_valid_samples_per_second": 2.699,
      "eval_valid_steps_per_second": 2.699,
      "step": 6000
    },
    {
      "epoch": 461.54,
      "eval_train_eval_loss": 0.027445543557405472,
      "eval_train_eval_loss_<cls>": 1.832075834274292,
      "eval_train_eval_perplexity_batch": 1.0278255939483643,
      "eval_train_eval_perplexity_res": 1.9311237335205078,
      "eval_train_eval_perplexity_seq": 1.0286478996276855,
      "eval_train_eval_reconstruction": 0.14742054045200348,
      "eval_train_eval_reconstruction_<cls>": 0.39461883902549744,
      "eval_train_loss": 0.07938006520271301,
      "eval_train_runtime": 0.8352,
      "eval_train_samples_per_second": 118.53,
      "eval_train_steps_per_second": 15.565,
      "step": 6000
    },
    {
      "epoch": 462.31,
      "learning_rate": 5.290960719437051e-05,
      "loss": 0.0773,
      "step": 6010
    },
    {
      "epoch": 463.08,
      "learning_rate": 5.2781847543237085e-05,
      "loss": 0.0754,
      "step": 6020
    },
    {
      "epoch": 463.85,
      "learning_rate": 5.2654069670364226e-05,
      "loss": 0.0781,
      "step": 6030
    },
    {
      "epoch": 464.62,
      "learning_rate": 5.2526274412726305e-05,
      "loss": 0.0802,
      "step": 6040
    },
    {
      "epoch": 465.38,
      "learning_rate": 5.239846260741158e-05,
      "loss": 0.0737,
      "step": 6050
    },
    {
      "epoch": 466.15,
      "learning_rate": 5.227063509161668e-05,
      "loss": 0.0674,
      "step": 6060
    },
    {
      "epoch": 466.92,
      "learning_rate": 5.2142792702641175e-05,
      "loss": 0.0888,
      "step": 6070
    },
    {
      "epoch": 467.69,
      "learning_rate": 5.2014936277882e-05,
      "loss": 0.0797,
      "step": 6080
    },
    {
      "epoch": 468.46,
      "learning_rate": 5.188706665482809e-05,
      "loss": 0.0733,
      "step": 6090
    },
    {
      "epoch": 469.23,
      "learning_rate": 5.1759184671054785e-05,
      "loss": 0.0728,
      "step": 6100
    },
    {
      "epoch": 469.23,
      "eval_valid_eval_loss": 5.941854953765869,
      "eval_valid_eval_loss_<cls>": 3.890920400619507,
      "eval_valid_eval_perplexity_batch": 380.6403503417969,
      "eval_valid_eval_perplexity_res": 2110.02734375,
      "eval_valid_eval_perplexity_seq": 380.6403503417969,
      "eval_valid_eval_reconstruction": 0.08242949843406677,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.941855430603027,
      "eval_valid_runtime": 0.4067,
      "eval_valid_samples_per_second": 2.459,
      "eval_valid_steps_per_second": 2.459,
      "step": 6100
    },
    {
      "epoch": 469.23,
      "eval_train_eval_loss": 0.030707143247127533,
      "eval_train_eval_loss_<cls>": 1.7429094314575195,
      "eval_train_eval_perplexity_batch": 1.0311834812164307,
      "eval_train_eval_perplexity_res": 2.094050407409668,
      "eval_train_eval_perplexity_seq": 1.0318663120269775,
      "eval_train_eval_reconstruction": 0.18061816692352295,
      "eval_train_eval_reconstruction_<cls>": 0.48543688654899597,
      "eval_train_loss": 0.0724242553114891,
      "eval_train_runtime": 0.7654,
      "eval_train_samples_per_second": 129.338,
      "eval_train_steps_per_second": 16.984,
      "step": 6100
    },
    {
      "epoch": 470.0,
      "learning_rate": 5.163129116421841e-05,
      "loss": 0.0773,
      "step": 6110
    },
    {
      "epoch": 470.77,
      "learning_rate": 5.150338697205077e-05,
      "loss": 0.0738,
      "step": 6120
    },
    {
      "epoch": 471.54,
      "learning_rate": 5.137547293235366e-05,
      "loss": 0.0648,
      "step": 6130
    },
    {
      "epoch": 472.31,
      "learning_rate": 5.1247549882993386e-05,
      "loss": 0.0799,
      "step": 6140
    },
    {
      "epoch": 473.08,
      "learning_rate": 5.111961866189524e-05,
      "loss": 0.0726,
      "step": 6150
    },
    {
      "epoch": 473.85,
      "learning_rate": 5.0991680107038083e-05,
      "loss": 0.0645,
      "step": 6160
    },
    {
      "epoch": 474.62,
      "learning_rate": 5.086373505644877e-05,
      "loss": 0.0601,
      "step": 6170
    },
    {
      "epoch": 475.38,
      "learning_rate": 5.073578434819676e-05,
      "loss": 0.0809,
      "step": 6180
    },
    {
      "epoch": 476.15,
      "learning_rate": 5.0607828820388506e-05,
      "loss": 0.071,
      "step": 6190
    },
    {
      "epoch": 476.92,
      "learning_rate": 5.047986931116205e-05,
      "loss": 0.0622,
      "step": 6200
    },
    {
      "epoch": 476.92,
      "eval_valid_eval_loss": 5.995825290679932,
      "eval_valid_eval_loss_<cls>": 5.79777717590332,
      "eval_valid_eval_perplexity_batch": 401.74810791015625,
      "eval_valid_eval_perplexity_res": 2235.1416015625,
      "eval_valid_eval_perplexity_seq": 401.74810791015625,
      "eval_valid_eval_reconstruction": 0.07119741290807724,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 5.995825290679932,
      "eval_valid_runtime": 0.4019,
      "eval_valid_samples_per_second": 2.488,
      "eval_valid_steps_per_second": 2.488,
      "step": 6200
    },
    {
      "epoch": 476.92,
      "eval_train_eval_loss": 0.03186070919036865,
      "eval_train_eval_loss_<cls>": 1.6746495962142944,
      "eval_train_eval_perplexity_batch": 1.0323736667633057,
      "eval_train_eval_perplexity_res": 2.2998242378234863,
      "eval_train_eval_perplexity_seq": 1.0333245992660522,
      "eval_train_eval_reconstruction": 0.192092165350914,
      "eval_train_eval_reconstruction_<cls>": 0.5069767236709595,
      "eval_train_loss": 0.06600341200828552,
      "eval_train_runtime": 0.8197,
      "eval_train_samples_per_second": 120.775,
      "eval_train_steps_per_second": 15.859,
      "step": 6200
    },
    {
      "epoch": 477.69,
      "learning_rate": 5.035190665868157e-05,
      "loss": 0.0653,
      "step": 6210
    },
    {
      "epoch": 478.46,
      "learning_rate": 5.0223941701131764e-05,
      "loss": 0.0748,
      "step": 6220
    },
    {
      "epoch": 479.23,
      "learning_rate": 5.009597527671245e-05,
      "loss": 0.0731,
      "step": 6230
    },
    {
      "epoch": 480.0,
      "learning_rate": 4.9968008223633065e-05,
      "loss": 0.0787,
      "step": 6240
    },
    {
      "epoch": 480.77,
      "learning_rate": 4.984004138010715e-05,
      "loss": 0.0602,
      "step": 6250
    },
    {
      "epoch": 481.54,
      "learning_rate": 4.971207558434688e-05,
      "loss": 0.0759,
      "step": 6260
    },
    {
      "epoch": 482.31,
      "learning_rate": 4.9584111674557566e-05,
      "loss": 0.0662,
      "step": 6270
    },
    {
      "epoch": 483.08,
      "learning_rate": 4.945615048893218e-05,
      "loss": 0.0684,
      "step": 6280
    },
    {
      "epoch": 483.85,
      "learning_rate": 4.93281928656458e-05,
      "loss": 0.0643,
      "step": 6290
    },
    {
      "epoch": 484.62,
      "learning_rate": 4.920023964285025e-05,
      "loss": 0.0634,
      "step": 6300
    },
    {
      "epoch": 484.62,
      "eval_valid_eval_loss": 5.901607513427734,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 365.624755859375,
      "eval_valid_eval_perplexity_res": 2163.5205078125,
      "eval_valid_eval_perplexity_seq": 365.624755859375,
      "eval_valid_eval_reconstruction": 0.0810810774564743,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.901607036590576,
      "eval_valid_runtime": 0.3461,
      "eval_valid_samples_per_second": 2.89,
      "eval_valid_steps_per_second": 2.89,
      "step": 6300
    },
    {
      "epoch": 484.62,
      "eval_train_eval_loss": 0.03326883167028427,
      "eval_train_eval_loss_<cls>": 1.5730355978012085,
      "eval_train_eval_perplexity_batch": 1.0338283777236938,
      "eval_train_eval_perplexity_res": 2.286679267883301,
      "eval_train_eval_perplexity_seq": 1.0346128940582275,
      "eval_train_eval_reconstruction": 0.21157003939151764,
      "eval_train_eval_reconstruction_<cls>": 0.5223214030265808,
      "eval_train_loss": 0.06828676164150238,
      "eval_train_runtime": 0.754,
      "eval_train_samples_per_second": 131.296,
      "eval_train_steps_per_second": 17.241,
      "step": 6300
    },
    {
      "epoch": 485.38,
      "learning_rate": 4.907229165866846e-05,
      "loss": 0.0705,
      "step": 6310
    },
    {
      "epoch": 486.15,
      "learning_rate": 4.894434975118908e-05,
      "loss": 0.0739,
      "step": 6320
    },
    {
      "epoch": 486.92,
      "learning_rate": 4.881641475846096e-05,
      "loss": 0.0617,
      "step": 6330
    },
    {
      "epoch": 487.69,
      "learning_rate": 4.868848751848761e-05,
      "loss": 0.0607,
      "step": 6340
    },
    {
      "epoch": 488.46,
      "learning_rate": 4.8560568869221805e-05,
      "loss": 0.063,
      "step": 6350
    },
    {
      "epoch": 489.23,
      "learning_rate": 4.843265964856005e-05,
      "loss": 0.0683,
      "step": 6360
    },
    {
      "epoch": 490.0,
      "learning_rate": 4.830476069433707e-05,
      "loss": 0.0717,
      "step": 6370
    },
    {
      "epoch": 490.77,
      "learning_rate": 4.8176872844320334e-05,
      "loss": 0.066,
      "step": 6380
    },
    {
      "epoch": 491.54,
      "learning_rate": 4.8048996936204614e-05,
      "loss": 0.0698,
      "step": 6390
    },
    {
      "epoch": 492.31,
      "learning_rate": 4.7921133807606424e-05,
      "loss": 0.064,
      "step": 6400
    },
    {
      "epoch": 492.31,
      "eval_valid_eval_loss": 6.082616806030273,
      "eval_valid_eval_loss_<cls>": 6.027974605560303,
      "eval_valid_eval_perplexity_batch": 438.17431640625,
      "eval_valid_eval_perplexity_res": 2475.693359375,
      "eval_valid_eval_perplexity_seq": 438.17431640625,
      "eval_valid_eval_reconstruction": 0.07319587469100952,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.082615852355957,
      "eval_valid_runtime": 0.3879,
      "eval_valid_samples_per_second": 2.578,
      "eval_valid_steps_per_second": 2.578,
      "step": 6400
    },
    {
      "epoch": 492.31,
      "eval_train_eval_loss": 0.024521226063370705,
      "eval_train_eval_loss_<cls>": 1.5867549180984497,
      "eval_train_eval_perplexity_batch": 1.0248243808746338,
      "eval_train_eval_perplexity_res": 2.005047082901001,
      "eval_train_eval_perplexity_seq": 1.0251356363296509,
      "eval_train_eval_reconstruction": 0.16961120069026947,
      "eval_train_eval_reconstruction_<cls>": 0.5570776462554932,
      "eval_train_loss": 0.0660979151725769,
      "eval_train_runtime": 0.7637,
      "eval_train_samples_per_second": 129.638,
      "eval_train_steps_per_second": 17.023,
      "step": 6400
    },
    {
      "epoch": 493.08,
      "learning_rate": 4.77932842960586e-05,
      "loss": 0.0722,
      "step": 6410
    },
    {
      "epoch": 493.85,
      "learning_rate": 4.766544923900473e-05,
      "loss": 0.061,
      "step": 6420
    },
    {
      "epoch": 494.62,
      "learning_rate": 4.7537629473793784e-05,
      "loss": 0.0724,
      "step": 6430
    },
    {
      "epoch": 495.38,
      "learning_rate": 4.7409825837674524e-05,
      "loss": 0.0641,
      "step": 6440
    },
    {
      "epoch": 496.15,
      "learning_rate": 4.728203916779009e-05,
      "loss": 0.0697,
      "step": 6450
    },
    {
      "epoch": 496.92,
      "learning_rate": 4.715427030117248e-05,
      "loss": 0.0608,
      "step": 6460
    },
    {
      "epoch": 497.69,
      "learning_rate": 4.702652007473707e-05,
      "loss": 0.0648,
      "step": 6470
    },
    {
      "epoch": 498.46,
      "learning_rate": 4.6898789325277134e-05,
      "loss": 0.0605,
      "step": 6480
    },
    {
      "epoch": 499.23,
      "learning_rate": 4.6771078889458385e-05,
      "loss": 0.0692,
      "step": 6490
    },
    {
      "epoch": 500.0,
      "learning_rate": 4.6643389603813486e-05,
      "loss": 0.0545,
      "step": 6500
    },
    {
      "epoch": 500.0,
      "eval_valid_eval_loss": 5.9195170402526855,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 372.2319030761719,
      "eval_valid_eval_perplexity_res": 2366.679931640625,
      "eval_valid_eval_perplexity_seq": 372.2319030761719,
      "eval_valid_eval_reconstruction": 0.0810810774564743,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 5.919517517089844,
      "eval_valid_runtime": 0.4157,
      "eval_valid_samples_per_second": 2.406,
      "eval_valid_steps_per_second": 2.406,
      "step": 6500
    },
    {
      "epoch": 500.0,
      "eval_train_eval_loss": 0.029200518503785133,
      "eval_train_eval_loss_<cls>": 1.4231513738632202,
      "eval_train_eval_perplexity_batch": 1.0296310186386108,
      "eval_train_eval_perplexity_res": 2.089128255844116,
      "eval_train_eval_perplexity_seq": 1.0302464962005615,
      "eval_train_eval_reconstruction": 0.20304647088050842,
      "eval_train_eval_reconstruction_<cls>": 0.5763546824455261,
      "eval_train_loss": 0.05977306887507439,
      "eval_train_runtime": 0.7482,
      "eval_train_samples_per_second": 132.324,
      "eval_train_steps_per_second": 17.376,
      "step": 6500
    },
    {
      "epoch": 500.77,
      "learning_rate": 4.651572230473651e-05,
      "loss": 0.0604,
      "step": 6510
    },
    {
      "epoch": 501.54,
      "learning_rate": 4.638807782847755e-05,
      "loss": 0.0573,
      "step": 6520
    },
    {
      "epoch": 502.31,
      "learning_rate": 4.626045701113722e-05,
      "loss": 0.0594,
      "step": 6530
    },
    {
      "epoch": 503.08,
      "learning_rate": 4.613286068866114e-05,
      "loss": 0.0644,
      "step": 6540
    },
    {
      "epoch": 503.85,
      "learning_rate": 4.600528969683448e-05,
      "loss": 0.0655,
      "step": 6550
    },
    {
      "epoch": 504.62,
      "learning_rate": 4.5877744871276484e-05,
      "loss": 0.0685,
      "step": 6560
    },
    {
      "epoch": 505.38,
      "learning_rate": 4.575022704743503e-05,
      "loss": 0.0623,
      "step": 6570
    },
    {
      "epoch": 506.15,
      "learning_rate": 4.562273706058112e-05,
      "loss": 0.0659,
      "step": 6580
    },
    {
      "epoch": 506.92,
      "learning_rate": 4.54952757458034e-05,
      "loss": 0.056,
      "step": 6590
    },
    {
      "epoch": 507.69,
      "learning_rate": 4.5367843938002694e-05,
      "loss": 0.0654,
      "step": 6600
    },
    {
      "epoch": 507.69,
      "eval_valid_eval_loss": 6.1267547607421875,
      "eval_valid_eval_loss_<cls>": 5.769625663757324,
      "eval_valid_eval_perplexity_batch": 457.9476013183594,
      "eval_valid_eval_perplexity_res": 2791.13134765625,
      "eval_valid_eval_perplexity_seq": 457.9476013183594,
      "eval_valid_eval_reconstruction": 0.08381503075361252,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.1267547607421875,
      "eval_valid_runtime": 0.4062,
      "eval_valid_samples_per_second": 2.462,
      "eval_valid_steps_per_second": 2.462,
      "step": 6600
    },
    {
      "epoch": 507.69,
      "eval_train_eval_loss": 0.02414872869849205,
      "eval_train_eval_loss_<cls>": 1.452972650527954,
      "eval_train_eval_perplexity_batch": 1.0244426727294922,
      "eval_train_eval_perplexity_res": 2.112896680831909,
      "eval_train_eval_perplexity_seq": 1.0249748229980469,
      "eval_train_eval_reconstruction": 0.174394890666008,
      "eval_train_eval_reconstruction_<cls>": 0.6310679316520691,
      "eval_train_loss": 0.05365924537181854,
      "eval_train_runtime": 0.8079,
      "eval_train_samples_per_second": 122.541,
      "eval_train_steps_per_second": 16.091,
      "step": 6600
    },
    {
      "epoch": 508.46,
      "learning_rate": 4.524044247188658e-05,
      "loss": 0.0528,
      "step": 6610
    },
    {
      "epoch": 509.23,
      "learning_rate": 4.511307218196389e-05,
      "loss": 0.0588,
      "step": 6620
    },
    {
      "epoch": 510.0,
      "learning_rate": 4.498573390253922e-05,
      "loss": 0.0649,
      "step": 6630
    },
    {
      "epoch": 510.77,
      "learning_rate": 4.4858428467707525e-05,
      "loss": 0.0573,
      "step": 6640
    },
    {
      "epoch": 511.54,
      "learning_rate": 4.473115671134859e-05,
      "loss": 0.0602,
      "step": 6650
    },
    {
      "epoch": 512.31,
      "learning_rate": 4.4603919467121615e-05,
      "loss": 0.0606,
      "step": 6660
    },
    {
      "epoch": 513.08,
      "learning_rate": 4.447671756845973e-05,
      "loss": 0.0584,
      "step": 6670
    },
    {
      "epoch": 513.85,
      "learning_rate": 4.4349551848564544e-05,
      "loss": 0.0559,
      "step": 6680
    },
    {
      "epoch": 514.62,
      "learning_rate": 4.422242314040067e-05,
      "loss": 0.0557,
      "step": 6690
    },
    {
      "epoch": 515.38,
      "learning_rate": 4.409533227669033e-05,
      "loss": 0.0605,
      "step": 6700
    },
    {
      "epoch": 515.38,
      "eval_valid_eval_loss": 6.151332378387451,
      "eval_valid_eval_loss_<cls>": 5.577587127685547,
      "eval_valid_eval_perplexity_batch": 469.3423156738281,
      "eval_valid_eval_perplexity_res": 2769.3837890625,
      "eval_valid_eval_perplexity_seq": 469.3423156738281,
      "eval_valid_eval_reconstruction": 0.06789606064558029,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.151332855224609,
      "eval_valid_runtime": 0.437,
      "eval_valid_samples_per_second": 2.288,
      "eval_valid_steps_per_second": 2.288,
      "step": 6700
    },
    {
      "epoch": 515.38,
      "eval_train_eval_loss": 0.020084353163838387,
      "eval_train_eval_loss_<cls>": 1.37746000289917,
      "eval_train_eval_perplexity_batch": 1.0202873945236206,
      "eval_train_eval_perplexity_res": 1.896010398864746,
      "eval_train_eval_perplexity_seq": 1.0206480026245117,
      "eval_train_eval_reconstruction": 0.15031176805496216,
      "eval_train_eval_reconstruction_<cls>": 0.6597937941551208,
      "eval_train_loss": 0.053500041365623474,
      "eval_train_runtime": 0.8427,
      "eval_train_samples_per_second": 117.483,
      "eval_train_steps_per_second": 15.427,
      "step": 6700
    },
    {
      "epoch": 516.15,
      "learning_rate": 4.396828008990782e-05,
      "loss": 0.064,
      "step": 6710
    },
    {
      "epoch": 516.92,
      "learning_rate": 4.38412674122741e-05,
      "loss": 0.0609,
      "step": 6720
    },
    {
      "epoch": 517.69,
      "learning_rate": 4.3714295075751345e-05,
      "loss": 0.0584,
      "step": 6730
    },
    {
      "epoch": 518.46,
      "learning_rate": 4.3587363912037475e-05,
      "loss": 0.0592,
      "step": 6740
    },
    {
      "epoch": 519.23,
      "learning_rate": 4.3460474752560724e-05,
      "loss": 0.0505,
      "step": 6750
    },
    {
      "epoch": 520.0,
      "learning_rate": 4.3333628428474216e-05,
      "loss": 0.0587,
      "step": 6760
    },
    {
      "epoch": 520.77,
      "learning_rate": 4.320682577065042e-05,
      "loss": 0.0542,
      "step": 6770
    },
    {
      "epoch": 521.54,
      "learning_rate": 4.308006760967585e-05,
      "loss": 0.0592,
      "step": 6780
    },
    {
      "epoch": 522.31,
      "learning_rate": 4.295335477584553e-05,
      "loss": 0.0526,
      "step": 6790
    },
    {
      "epoch": 523.08,
      "learning_rate": 4.282668809915758e-05,
      "loss": 0.0615,
      "step": 6800
    },
    {
      "epoch": 523.08,
      "eval_valid_eval_loss": 6.1739068031311035,
      "eval_valid_eval_loss_<cls>": 5.528611183166504,
      "eval_valid_eval_perplexity_batch": 480.0579528808594,
      "eval_valid_eval_perplexity_res": 2999.542236328125,
      "eval_valid_eval_perplexity_seq": 480.0579528808594,
      "eval_valid_eval_reconstruction": 0.07963117957115173,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.17390775680542,
      "eval_valid_runtime": 0.5822,
      "eval_valid_samples_per_second": 1.718,
      "eval_valid_steps_per_second": 1.718,
      "step": 6800
    },
    {
      "epoch": 523.08,
      "eval_train_eval_loss": 0.030394116416573524,
      "eval_train_eval_loss_<cls>": 1.4534364938735962,
      "eval_train_eval_perplexity_batch": 1.0308607816696167,
      "eval_train_eval_perplexity_res": 2.6651055812835693,
      "eval_train_eval_perplexity_seq": 1.0315899848937988,
      "eval_train_eval_reconstruction": 0.23111197352409363,
      "eval_train_eval_reconstruction_<cls>": 0.649289071559906,
      "eval_train_loss": 0.05313250422477722,
      "eval_train_runtime": 0.7733,
      "eval_train_samples_per_second": 128.017,
      "eval_train_steps_per_second": 16.81,
      "step": 6800
    },
    {
      "epoch": 523.85,
      "learning_rate": 4.2700068409307795e-05,
      "loss": 0.0529,
      "step": 6810
    },
    {
      "epoch": 524.62,
      "learning_rate": 4.257349653568417e-05,
      "loss": 0.0541,
      "step": 6820
    },
    {
      "epoch": 525.38,
      "learning_rate": 4.244697330736151e-05,
      "loss": 0.0583,
      "step": 6830
    },
    {
      "epoch": 526.15,
      "learning_rate": 4.232049955309601e-05,
      "loss": 0.0636,
      "step": 6840
    },
    {
      "epoch": 526.92,
      "learning_rate": 4.219407610131971e-05,
      "loss": 0.0568,
      "step": 6850
    },
    {
      "epoch": 527.69,
      "learning_rate": 4.2067703780135246e-05,
      "loss": 0.0595,
      "step": 6860
    },
    {
      "epoch": 528.46,
      "learning_rate": 4.194138341731031e-05,
      "loss": 0.0661,
      "step": 6870
    },
    {
      "epoch": 529.23,
      "learning_rate": 4.181511584027225e-05,
      "loss": 0.0568,
      "step": 6880
    },
    {
      "epoch": 530.0,
      "learning_rate": 4.168890187610263e-05,
      "loss": 0.0511,
      "step": 6890
    },
    {
      "epoch": 530.77,
      "learning_rate": 4.156274235153189e-05,
      "loss": 0.0499,
      "step": 6900
    },
    {
      "epoch": 530.77,
      "eval_valid_eval_loss": 6.199530601501465,
      "eval_valid_eval_loss_<cls>": 5.033731937408447,
      "eval_valid_eval_perplexity_batch": 492.5177917480469,
      "eval_valid_eval_perplexity_res": 3198.796630859375,
      "eval_valid_eval_perplexity_seq": 492.5177917480469,
      "eval_valid_eval_reconstruction": 0.0803757831454277,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.199530601501465,
      "eval_valid_runtime": 1.2032,
      "eval_valid_samples_per_second": 0.831,
      "eval_valid_steps_per_second": 0.831,
      "step": 6900
    },
    {
      "epoch": 530.77,
      "eval_train_eval_loss": 0.018178343772888184,
      "eval_train_eval_loss_<cls>": 1.4608032703399658,
      "eval_train_eval_perplexity_batch": 1.018344521522522,
      "eval_train_eval_perplexity_res": 1.9349170923233032,
      "eval_train_eval_perplexity_seq": 1.0187708139419556,
      "eval_train_eval_reconstruction": 0.14346472918987274,
      "eval_train_eval_reconstruction_<cls>": 0.5671641826629639,
      "eval_train_loss": 0.053956884890794754,
      "eval_train_runtime": 0.9346,
      "eval_train_samples_per_second": 105.923,
      "eval_train_steps_per_second": 13.909,
      "step": 6900
    },
    {
      "epoch": 531.54,
      "learning_rate": 4.1436638092933846e-05,
      "loss": 0.0501,
      "step": 6910
    },
    {
      "epoch": 532.31,
      "learning_rate": 4.131058992632031e-05,
      "loss": 0.0487,
      "step": 6920
    },
    {
      "epoch": 533.08,
      "learning_rate": 4.11845986773357e-05,
      "loss": 0.0567,
      "step": 6930
    },
    {
      "epoch": 533.85,
      "learning_rate": 4.1058665171251554e-05,
      "loss": 0.0476,
      "step": 6940
    },
    {
      "epoch": 534.62,
      "learning_rate": 4.0932790232961235e-05,
      "loss": 0.0513,
      "step": 6950
    },
    {
      "epoch": 535.38,
      "learning_rate": 4.080697468697448e-05,
      "loss": 0.0591,
      "step": 6960
    },
    {
      "epoch": 536.15,
      "learning_rate": 4.068121935741195e-05,
      "loss": 0.0523,
      "step": 6970
    },
    {
      "epoch": 536.92,
      "learning_rate": 4.0555525067999896e-05,
      "loss": 0.0496,
      "step": 6980
    },
    {
      "epoch": 537.69,
      "learning_rate": 4.0429892642064745e-05,
      "loss": 0.0551,
      "step": 6990
    },
    {
      "epoch": 538.46,
      "learning_rate": 4.030432290252771e-05,
      "loss": 0.0521,
      "step": 7000
    },
    {
      "epoch": 538.46,
      "eval_valid_eval_loss": 6.305184364318848,
      "eval_valid_eval_loss_<cls>": 7.334475517272949,
      "eval_valid_eval_perplexity_batch": 547.4025268554688,
      "eval_valid_eval_perplexity_res": 3475.37744140625,
      "eval_valid_eval_perplexity_seq": 547.4025268554688,
      "eval_valid_eval_reconstruction": 0.07188160717487335,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.305184841156006,
      "eval_valid_runtime": 0.4071,
      "eval_valid_samples_per_second": 2.456,
      "eval_valid_steps_per_second": 2.456,
      "step": 7000
    },
    {
      "epoch": 538.46,
      "eval_train_eval_loss": 0.024060208350419998,
      "eval_train_eval_loss_<cls>": 1.3393515348434448,
      "eval_train_eval_perplexity_batch": 1.024351954460144,
      "eval_train_eval_perplexity_res": 2.3751471042633057,
      "eval_train_eval_perplexity_seq": 1.024810552597046,
      "eval_train_eval_reconstruction": 0.2072654515504837,
      "eval_train_eval_reconstruction_<cls>": 0.7075471878051758,
      "eval_train_loss": 0.046524759382009506,
      "eval_train_runtime": 0.7595,
      "eval_train_samples_per_second": 130.342,
      "eval_train_steps_per_second": 17.116,
      "step": 7000
    },
    {
      "epoch": 539.23,
      "learning_rate": 4.0178816671899384e-05,
      "loss": 0.0549,
      "step": 7010
    },
    {
      "epoch": 540.0,
      "learning_rate": 4.005337477227436e-05,
      "loss": 0.0558,
      "step": 7020
    },
    {
      "epoch": 540.77,
      "learning_rate": 3.992799802532584e-05,
      "loss": 0.0501,
      "step": 7030
    },
    {
      "epoch": 541.54,
      "learning_rate": 3.980268725230027e-05,
      "loss": 0.0495,
      "step": 7040
    },
    {
      "epoch": 542.31,
      "learning_rate": 3.967744327401197e-05,
      "loss": 0.0493,
      "step": 7050
    },
    {
      "epoch": 543.08,
      "learning_rate": 3.955226691083772e-05,
      "loss": 0.0491,
      "step": 7060
    },
    {
      "epoch": 543.85,
      "learning_rate": 3.9427158982711396e-05,
      "loss": 0.0489,
      "step": 7070
    },
    {
      "epoch": 544.62,
      "learning_rate": 3.930212030911863e-05,
      "loss": 0.0485,
      "step": 7080
    },
    {
      "epoch": 545.38,
      "learning_rate": 3.917715170909141e-05,
      "loss": 0.0521,
      "step": 7090
    },
    {
      "epoch": 546.15,
      "learning_rate": 3.9052254001202746e-05,
      "loss": 0.0519,
      "step": 7100
    },
    {
      "epoch": 546.15,
      "eval_valid_eval_loss": 6.3465800285339355,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 570.5381469726562,
      "eval_valid_eval_perplexity_res": 3793.958740234375,
      "eval_valid_eval_perplexity_seq": 570.5381469726562,
      "eval_valid_eval_reconstruction": 0.0714285746216774,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.346579074859619,
      "eval_valid_runtime": 0.4203,
      "eval_valid_samples_per_second": 2.379,
      "eval_valid_steps_per_second": 2.379,
      "step": 7100
    },
    {
      "epoch": 546.15,
      "eval_train_eval_loss": 0.021709300577640533,
      "eval_train_eval_loss_<cls>": 1.2620952129364014,
      "eval_train_eval_perplexity_batch": 1.021946668624878,
      "eval_train_eval_perplexity_res": 2.1185250282287598,
      "eval_train_eval_perplexity_seq": 1.0223037004470825,
      "eval_train_eval_reconstruction": 0.18501877784729004,
      "eval_train_eval_reconstruction_<cls>": 0.7568807601928711,
      "eval_train_loss": 0.051615435630083084,
      "eval_train_runtime": 0.7595,
      "eval_train_samples_per_second": 130.346,
      "eval_train_steps_per_second": 17.116,
      "step": 7100
    },
    {
      "epoch": 546.92,
      "learning_rate": 3.892742800356124e-05,
      "loss": 0.0573,
      "step": 7110
    },
    {
      "epoch": 547.69,
      "learning_rate": 3.880267453380581e-05,
      "loss": 0.051,
      "step": 7120
    },
    {
      "epoch": 548.46,
      "learning_rate": 3.8677994409100307e-05,
      "loss": 0.0454,
      "step": 7130
    },
    {
      "epoch": 549.23,
      "learning_rate": 3.8553388446128136e-05,
      "loss": 0.0531,
      "step": 7140
    },
    {
      "epoch": 550.0,
      "learning_rate": 3.842885746108693e-05,
      "loss": 0.0509,
      "step": 7150
    },
    {
      "epoch": 550.77,
      "learning_rate": 3.830440226968322e-05,
      "loss": 0.0419,
      "step": 7160
    },
    {
      "epoch": 551.54,
      "learning_rate": 3.818002368712704e-05,
      "loss": 0.0522,
      "step": 7170
    },
    {
      "epoch": 552.31,
      "learning_rate": 3.8055722528126624e-05,
      "loss": 0.0484,
      "step": 7180
    },
    {
      "epoch": 553.08,
      "learning_rate": 3.793149960688311e-05,
      "loss": 0.0492,
      "step": 7190
    },
    {
      "epoch": 553.85,
      "learning_rate": 3.780735573708508e-05,
      "loss": 0.0434,
      "step": 7200
    },
    {
      "epoch": 553.85,
      "eval_valid_eval_loss": 6.243791103363037,
      "eval_valid_eval_loss_<cls>": 5.438448905944824,
      "eval_valid_eval_perplexity_batch": 514.8065185546875,
      "eval_valid_eval_perplexity_res": 3655.913818359375,
      "eval_valid_eval_perplexity_seq": 514.8065185546875,
      "eval_valid_eval_reconstruction": 0.07927677035331726,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.243791103363037,
      "eval_valid_runtime": 0.3359,
      "eval_valid_samples_per_second": 2.977,
      "eval_valid_steps_per_second": 2.977,
      "step": 7200
    },
    {
      "epoch": 553.85,
      "eval_train_eval_loss": 0.018282849341630936,
      "eval_train_eval_loss_<cls>": 1.2122036218643188,
      "eval_train_eval_perplexity_batch": 1.0184509754180908,
      "eval_train_eval_perplexity_res": 2.202291488647461,
      "eval_train_eval_perplexity_seq": 1.01868736743927,
      "eval_train_eval_reconstruction": 0.1704547256231308,
      "eval_train_eval_reconstruction_<cls>": 0.6853448152542114,
      "eval_train_loss": 0.04780210554599762,
      "eval_train_runtime": 0.7363,
      "eval_train_samples_per_second": 134.452,
      "eval_train_steps_per_second": 17.655,
      "step": 7200
    },
    {
      "epoch": 554.62,
      "learning_rate": 3.7683291731903356e-05,
      "loss": 0.0443,
      "step": 7210
    },
    {
      "epoch": 555.38,
      "learning_rate": 3.755930840398565e-05,
      "loss": 0.0497,
      "step": 7220
    },
    {
      "epoch": 556.15,
      "learning_rate": 3.743540656545118e-05,
      "loss": 0.0496,
      "step": 7230
    },
    {
      "epoch": 556.92,
      "learning_rate": 3.731158702788541e-05,
      "loss": 0.0501,
      "step": 7240
    },
    {
      "epoch": 557.69,
      "learning_rate": 3.718785060233471e-05,
      "loss": 0.0447,
      "step": 7250
    },
    {
      "epoch": 558.46,
      "learning_rate": 3.7064198099301045e-05,
      "loss": 0.0467,
      "step": 7260
    },
    {
      "epoch": 559.23,
      "learning_rate": 3.6940630328736684e-05,
      "loss": 0.0484,
      "step": 7270
    },
    {
      "epoch": 560.0,
      "learning_rate": 3.6817148100038835e-05,
      "loss": 0.0441,
      "step": 7280
    },
    {
      "epoch": 560.77,
      "learning_rate": 3.6693752222044435e-05,
      "loss": 0.0422,
      "step": 7290
    },
    {
      "epoch": 561.54,
      "learning_rate": 3.657044350302479e-05,
      "loss": 0.0498,
      "step": 7300
    },
    {
      "epoch": 561.54,
      "eval_valid_eval_loss": 6.334130764007568,
      "eval_valid_eval_loss_<cls>": 5.838603496551514,
      "eval_valid_eval_perplexity_batch": 563.4793701171875,
      "eval_valid_eval_perplexity_res": 3927.00732421875,
      "eval_valid_eval_perplexity_seq": 563.4793701171875,
      "eval_valid_eval_reconstruction": 0.07442347705364227,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.33413028717041,
      "eval_valid_runtime": 0.3859,
      "eval_valid_samples_per_second": 2.591,
      "eval_valid_steps_per_second": 2.591,
      "step": 7300
    },
    {
      "epoch": 561.54,
      "eval_train_eval_loss": 0.023061400279402733,
      "eval_train_eval_loss_<cls>": 1.201711893081665,
      "eval_train_eval_perplexity_batch": 1.0233293771743774,
      "eval_train_eval_perplexity_res": 2.3472161293029785,
      "eval_train_eval_perplexity_seq": 1.0236272811889648,
      "eval_train_eval_reconstruction": 0.21509531140327454,
      "eval_train_eval_reconstruction_<cls>": 0.6952381134033203,
      "eval_train_loss": 0.04401828348636627,
      "eval_train_runtime": 0.6844,
      "eval_train_samples_per_second": 144.649,
      "eval_train_steps_per_second": 18.994,
      "step": 7300
    },
    {
      "epoch": 562.31,
      "learning_rate": 3.644722275068027e-05,
      "loss": 0.0472,
      "step": 7310
    },
    {
      "epoch": 563.08,
      "learning_rate": 3.632409077213509e-05,
      "loss": 0.0556,
      "step": 7320
    },
    {
      "epoch": 563.85,
      "learning_rate": 3.6201048373931937e-05,
      "loss": 0.0445,
      "step": 7330
    },
    {
      "epoch": 564.62,
      "learning_rate": 3.6078096362026725e-05,
      "loss": 0.0432,
      "step": 7340
    },
    {
      "epoch": 565.38,
      "learning_rate": 3.595523554178336e-05,
      "loss": 0.0436,
      "step": 7350
    },
    {
      "epoch": 566.15,
      "learning_rate": 3.5832466717968396e-05,
      "loss": 0.0495,
      "step": 7360
    },
    {
      "epoch": 566.92,
      "learning_rate": 3.570979069474575e-05,
      "loss": 0.0468,
      "step": 7370
    },
    {
      "epoch": 567.69,
      "learning_rate": 3.558720827567153e-05,
      "loss": 0.0458,
      "step": 7380
    },
    {
      "epoch": 568.46,
      "learning_rate": 3.546472026368869e-05,
      "loss": 0.0463,
      "step": 7390
    },
    {
      "epoch": 569.23,
      "learning_rate": 3.5342327461121805e-05,
      "loss": 0.0457,
      "step": 7400
    },
    {
      "epoch": 569.23,
      "eval_valid_eval_loss": 6.398515701293945,
      "eval_valid_eval_loss_<cls>": 6.956682205200195,
      "eval_valid_eval_perplexity_batch": 600.952392578125,
      "eval_valid_eval_perplexity_res": 4228.4599609375,
      "eval_valid_eval_perplexity_seq": 600.952392578125,
      "eval_valid_eval_reconstruction": 0.08071135729551315,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.39851713180542,
      "eval_valid_runtime": 0.3721,
      "eval_valid_samples_per_second": 2.688,
      "eval_valid_steps_per_second": 2.688,
      "step": 7400
    },
    {
      "epoch": 569.23,
      "eval_train_eval_loss": 0.01493360847234726,
      "eval_train_eval_loss_<cls>": 1.1788151264190674,
      "eval_train_eval_perplexity_batch": 1.0150456428527832,
      "eval_train_eval_perplexity_res": 2.0255043506622314,
      "eval_train_eval_perplexity_seq": 1.0152907371520996,
      "eval_train_eval_reconstruction": 0.13425937294960022,
      "eval_train_eval_reconstruction_<cls>": 0.699999988079071,
      "eval_train_loss": 0.04231799766421318,
      "eval_train_runtime": 0.9781,
      "eval_train_samples_per_second": 101.216,
      "eval_train_steps_per_second": 13.291,
      "step": 7400
    },
    {
      "epoch": 570.0,
      "learning_rate": 3.522003066967181e-05,
      "loss": 0.0487,
      "step": 7410
    },
    {
      "epoch": 570.77,
      "learning_rate": 3.5097830690410724e-05,
      "loss": 0.0429,
      "step": 7420
    },
    {
      "epoch": 571.54,
      "learning_rate": 3.497572832377645e-05,
      "loss": 0.0513,
      "step": 7430
    },
    {
      "epoch": 572.31,
      "learning_rate": 3.4853724369567495e-05,
      "loss": 0.0441,
      "step": 7440
    },
    {
      "epoch": 573.08,
      "learning_rate": 3.473181962693773e-05,
      "loss": 0.0503,
      "step": 7450
    },
    {
      "epoch": 573.85,
      "learning_rate": 3.4610014894391177e-05,
      "loss": 0.0458,
      "step": 7460
    },
    {
      "epoch": 574.62,
      "learning_rate": 3.4488310969776784e-05,
      "loss": 0.0472,
      "step": 7470
    },
    {
      "epoch": 575.38,
      "learning_rate": 3.436670865028315e-05,
      "loss": 0.0429,
      "step": 7480
    },
    {
      "epoch": 576.15,
      "learning_rate": 3.424520873243338e-05,
      "loss": 0.0445,
      "step": 7490
    },
    {
      "epoch": 576.92,
      "learning_rate": 3.412381201207979e-05,
      "loss": 0.0423,
      "step": 7500
    },
    {
      "epoch": 576.92,
      "eval_valid_eval_loss": 6.3771467208862305,
      "eval_valid_eval_loss_<cls>": 5.718277931213379,
      "eval_valid_eval_perplexity_batch": 588.2468872070312,
      "eval_valid_eval_perplexity_res": 4124.48974609375,
      "eval_valid_eval_perplexity_seq": 588.2468872070312,
      "eval_valid_eval_reconstruction": 0.07412790507078171,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.3771467208862305,
      "eval_valid_runtime": 0.4187,
      "eval_valid_samples_per_second": 2.388,
      "eval_valid_steps_per_second": 2.388,
      "step": 7500
    },
    {
      "epoch": 576.92,
      "eval_train_eval_loss": 0.02715759538114071,
      "eval_train_eval_loss_<cls>": 1.0365954637527466,
      "eval_train_eval_perplexity_batch": 1.0275297164916992,
      "eval_train_eval_perplexity_res": 2.864764928817749,
      "eval_train_eval_perplexity_seq": 1.0278874635696411,
      "eval_train_eval_reconstruction": 0.2817579209804535,
      "eval_train_eval_reconstruction_<cls>": 0.8097165822982788,
      "eval_train_loss": 0.043766237795352936,
      "eval_train_runtime": 0.9199,
      "eval_train_samples_per_second": 107.616,
      "eval_train_steps_per_second": 14.131,
      "step": 7500
    },
    {
      "epoch": 577.69,
      "learning_rate": 3.4002519284398736e-05,
      "loss": 0.0421,
      "step": 7510
    },
    {
      "epoch": 578.46,
      "learning_rate": 3.388133134388542e-05,
      "loss": 0.0368,
      "step": 7520
    },
    {
      "epoch": 579.23,
      "learning_rate": 3.3760248984348644e-05,
      "loss": 0.0393,
      "step": 7530
    },
    {
      "epoch": 580.0,
      "learning_rate": 3.3639272998905614e-05,
      "loss": 0.0494,
      "step": 7540
    },
    {
      "epoch": 580.77,
      "learning_rate": 3.351840417997679e-05,
      "loss": 0.0384,
      "step": 7550
    },
    {
      "epoch": 581.54,
      "learning_rate": 3.3397643319280665e-05,
      "loss": 0.0392,
      "step": 7560
    },
    {
      "epoch": 582.31,
      "learning_rate": 3.327699120782856e-05,
      "loss": 0.0502,
      "step": 7570
    },
    {
      "epoch": 583.08,
      "learning_rate": 3.315644863591949e-05,
      "loss": 0.0458,
      "step": 7580
    },
    {
      "epoch": 583.85,
      "learning_rate": 3.303601639313494e-05,
      "loss": 0.0391,
      "step": 7590
    },
    {
      "epoch": 584.62,
      "learning_rate": 3.291569526833372e-05,
      "loss": 0.0385,
      "step": 7600
    },
    {
      "epoch": 584.62,
      "eval_valid_eval_loss": 6.493314743041992,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 660.7098388671875,
      "eval_valid_eval_perplexity_res": 5068.4697265625,
      "eval_valid_eval_perplexity_seq": 660.7098388671875,
      "eval_valid_eval_reconstruction": 0.0714285746216774,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.493314266204834,
      "eval_valid_runtime": 0.3596,
      "eval_valid_samples_per_second": 2.781,
      "eval_valid_steps_per_second": 2.781,
      "step": 7600
    },
    {
      "epoch": 584.62,
      "eval_train_eval_loss": 0.01883377507328987,
      "eval_train_eval_loss_<cls>": 1.1035805940628052,
      "eval_train_eval_perplexity_batch": 1.019012212753296,
      "eval_train_eval_perplexity_res": 2.4042742252349854,
      "eval_train_eval_perplexity_seq": 1.0193101167678833,
      "eval_train_eval_reconstruction": 0.19091586768627167,
      "eval_train_eval_reconstruction_<cls>": 0.7559808492660522,
      "eval_train_loss": 0.04107162356376648,
      "eval_train_runtime": 0.8646,
      "eval_train_samples_per_second": 114.506,
      "eval_train_steps_per_second": 15.036,
      "step": 7600
    },
    {
      "epoch": 585.38,
      "learning_rate": 3.27954860496468e-05,
      "loss": 0.0491,
      "step": 7610
    },
    {
      "epoch": 586.15,
      "learning_rate": 3.267538952447215e-05,
      "loss": 0.0433,
      "step": 7620
    },
    {
      "epoch": 586.92,
      "learning_rate": 3.2555406479469505e-05,
      "loss": 0.0372,
      "step": 7630
    },
    {
      "epoch": 587.69,
      "learning_rate": 3.243553770055536e-05,
      "loss": 0.0417,
      "step": 7640
    },
    {
      "epoch": 588.46,
      "learning_rate": 3.231578397289772e-05,
      "loss": 0.0435,
      "step": 7650
    },
    {
      "epoch": 589.23,
      "learning_rate": 3.219614608091094e-05,
      "loss": 0.0458,
      "step": 7660
    },
    {
      "epoch": 590.0,
      "learning_rate": 3.2076624808250666e-05,
      "loss": 0.041,
      "step": 7670
    },
    {
      "epoch": 590.77,
      "learning_rate": 3.195722093780864e-05,
      "loss": 0.0422,
      "step": 7680
    },
    {
      "epoch": 591.54,
      "learning_rate": 3.1837935251707615e-05,
      "loss": 0.0456,
      "step": 7690
    },
    {
      "epoch": 592.31,
      "learning_rate": 3.1718768531296196e-05,
      "loss": 0.0413,
      "step": 7700
    },
    {
      "epoch": 592.31,
      "eval_valid_eval_loss": 6.470393657684326,
      "eval_valid_eval_loss_<cls>": 7.043214797973633,
      "eval_valid_eval_perplexity_batch": 645.7378540039062,
      "eval_valid_eval_perplexity_res": 4725.55859375,
      "eval_valid_eval_perplexity_seq": 645.7378540039062,
      "eval_valid_eval_reconstruction": 0.07250341773033142,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.470394611358643,
      "eval_valid_runtime": 0.388,
      "eval_valid_samples_per_second": 2.577,
      "eval_valid_steps_per_second": 2.577,
      "step": 7700
    },
    {
      "epoch": 592.31,
      "eval_train_eval_loss": 0.017711054533720016,
      "eval_train_eval_loss_<cls>": 1.159847378730774,
      "eval_train_eval_perplexity_batch": 1.0178688764572144,
      "eval_train_eval_perplexity_res": 2.0504860877990723,
      "eval_train_eval_perplexity_seq": 1.0180370807647705,
      "eval_train_eval_reconstruction": 0.19451798498630524,
      "eval_train_eval_reconstruction_<cls>": 0.7788461446762085,
      "eval_train_loss": 0.040643252432346344,
      "eval_train_runtime": 0.7933,
      "eval_train_samples_per_second": 124.789,
      "eval_train_steps_per_second": 16.386,
      "step": 7700
    },
    {
      "epoch": 593.08,
      "learning_rate": 3.159972155714369e-05,
      "loss": 0.043,
      "step": 7710
    },
    {
      "epoch": 593.85,
      "learning_rate": 3.148079510903512e-05,
      "loss": 0.0401,
      "step": 7720
    },
    {
      "epoch": 594.62,
      "learning_rate": 3.1361989965965974e-05,
      "loss": 0.0453,
      "step": 7730
    },
    {
      "epoch": 595.38,
      "learning_rate": 3.1243306906137215e-05,
      "loss": 0.049,
      "step": 7740
    },
    {
      "epoch": 596.15,
      "learning_rate": 3.112474670695008e-05,
      "loss": 0.0401,
      "step": 7750
    },
    {
      "epoch": 596.92,
      "learning_rate": 3.100631014500108e-05,
      "loss": 0.039,
      "step": 7760
    },
    {
      "epoch": 597.69,
      "learning_rate": 3.088799799607684e-05,
      "loss": 0.0378,
      "step": 7770
    },
    {
      "epoch": 598.46,
      "learning_rate": 3.0769811035149104e-05,
      "loss": 0.0419,
      "step": 7780
    },
    {
      "epoch": 599.23,
      "learning_rate": 3.065175003636956e-05,
      "loss": 0.0369,
      "step": 7790
    },
    {
      "epoch": 600.0,
      "learning_rate": 3.053381577306481e-05,
      "loss": 0.04,
      "step": 7800
    },
    {
      "epoch": 600.0,
      "eval_valid_eval_loss": 6.430954456329346,
      "eval_valid_eval_loss_<cls>": 5.700572490692139,
      "eval_valid_eval_perplexity_batch": 620.7661743164062,
      "eval_valid_eval_perplexity_res": 4724.818359375,
      "eval_valid_eval_perplexity_seq": 620.7661743164062,
      "eval_valid_eval_reconstruction": 0.07711651176214218,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.430954456329346,
      "eval_valid_runtime": 0.4331,
      "eval_valid_samples_per_second": 2.309,
      "eval_valid_steps_per_second": 2.309,
      "step": 7800
    },
    {
      "epoch": 600.0,
      "eval_train_eval_loss": 0.02003304846584797,
      "eval_train_eval_loss_<cls>": 0.9714280366897583,
      "eval_train_eval_perplexity_batch": 1.0202350616455078,
      "eval_train_eval_perplexity_res": 2.5819284915924072,
      "eval_train_eval_perplexity_seq": 1.0204784870147705,
      "eval_train_eval_reconstruction": 0.21919427812099457,
      "eval_train_eval_reconstruction_<cls>": 0.8166666626930237,
      "eval_train_loss": 0.0440618060529232,
      "eval_train_runtime": 0.7522,
      "eval_train_samples_per_second": 131.606,
      "eval_train_steps_per_second": 17.282,
      "step": 7800
    },
    {
      "epoch": 600.77,
      "learning_rate": 3.0416009017731333e-05,
      "loss": 0.0419,
      "step": 7810
    },
    {
      "epoch": 601.54,
      "learning_rate": 3.0298330542030406e-05,
      "loss": 0.0391,
      "step": 7820
    },
    {
      "epoch": 602.31,
      "learning_rate": 3.018078111678302e-05,
      "loss": 0.036,
      "step": 7830
    },
    {
      "epoch": 603.08,
      "learning_rate": 3.006336151196488e-05,
      "loss": 0.0418,
      "step": 7840
    },
    {
      "epoch": 603.85,
      "learning_rate": 2.9946072496701334e-05,
      "loss": 0.0352,
      "step": 7850
    },
    {
      "epoch": 604.62,
      "learning_rate": 2.9828914839262325e-05,
      "loss": 0.0405,
      "step": 7860
    },
    {
      "epoch": 605.38,
      "learning_rate": 2.9711889307057384e-05,
      "loss": 0.0379,
      "step": 7870
    },
    {
      "epoch": 606.15,
      "learning_rate": 2.959499666663057e-05,
      "loss": 0.0447,
      "step": 7880
    },
    {
      "epoch": 606.92,
      "learning_rate": 2.9478237683655507e-05,
      "loss": 0.0409,
      "step": 7890
    },
    {
      "epoch": 607.69,
      "learning_rate": 2.9361613122930304e-05,
      "loss": 0.0371,
      "step": 7900
    },
    {
      "epoch": 607.69,
      "eval_valid_eval_loss": 6.4809980392456055,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 652.6219482421875,
      "eval_valid_eval_perplexity_res": 4954.61474609375,
      "eval_valid_eval_perplexity_seq": 652.6219482421875,
      "eval_valid_eval_reconstruction": 0.0782608687877655,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.480998992919922,
      "eval_valid_runtime": 0.3763,
      "eval_valid_samples_per_second": 2.657,
      "eval_valid_steps_per_second": 2.657,
      "step": 7900
    },
    {
      "epoch": 607.69,
      "eval_train_eval_loss": 0.012568811886012554,
      "eval_train_eval_loss_<cls>": 1.0241457223892212,
      "eval_train_eval_perplexity_batch": 1.012648105621338,
      "eval_train_eval_perplexity_res": 1.9556968212127686,
      "eval_train_eval_perplexity_seq": 1.0128029584884644,
      "eval_train_eval_reconstruction": 0.143289253115654,
      "eval_train_eval_reconstruction_<cls>": 0.8019323945045471,
      "eval_train_loss": 0.034171219915151596,
      "eval_train_runtime": 0.7756,
      "eval_train_samples_per_second": 127.644,
      "eval_train_steps_per_second": 16.761,
      "step": 7900
    },
    {
      "epoch": 608.46,
      "learning_rate": 2.9245123748372584e-05,
      "loss": 0.0368,
      "step": 7910
    },
    {
      "epoch": 609.23,
      "learning_rate": 2.9128770323014476e-05,
      "loss": 0.038,
      "step": 7920
    },
    {
      "epoch": 610.0,
      "learning_rate": 2.901255360899758e-05,
      "loss": 0.0425,
      "step": 7930
    },
    {
      "epoch": 610.77,
      "learning_rate": 2.889647436756804e-05,
      "loss": 0.037,
      "step": 7940
    },
    {
      "epoch": 611.54,
      "learning_rate": 2.8780533359071504e-05,
      "loss": 0.0384,
      "step": 7950
    },
    {
      "epoch": 612.31,
      "learning_rate": 2.8664731342948182e-05,
      "loss": 0.0424,
      "step": 7960
    },
    {
      "epoch": 613.08,
      "learning_rate": 2.854906907772783e-05,
      "loss": 0.0351,
      "step": 7970
    },
    {
      "epoch": 613.85,
      "learning_rate": 2.8433547321024816e-05,
      "loss": 0.0395,
      "step": 7980
    },
    {
      "epoch": 614.62,
      "learning_rate": 2.8318166829533138e-05,
      "loss": 0.0367,
      "step": 7990
    },
    {
      "epoch": 615.38,
      "learning_rate": 2.820292835902148e-05,
      "loss": 0.0358,
      "step": 8000
    },
    {
      "epoch": 615.38,
      "eval_valid_eval_loss": 6.540444374084473,
      "eval_valid_eval_loss_<cls>": 7.389999866485596,
      "eval_valid_eval_perplexity_batch": 692.5942993164062,
      "eval_valid_eval_perplexity_res": 5186.74755859375,
      "eval_valid_eval_perplexity_seq": 692.5942993164062,
      "eval_valid_eval_reconstruction": 0.07628866285085678,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.540445804595947,
      "eval_valid_runtime": 0.5344,
      "eval_valid_samples_per_second": 1.871,
      "eval_valid_steps_per_second": 1.871,
      "step": 8000
    },
    {
      "epoch": 615.38,
      "eval_train_eval_loss": 0.024587878957390785,
      "eval_train_eval_loss_<cls>": 1.0784152746200562,
      "eval_train_eval_perplexity_batch": 1.0248926877975464,
      "eval_train_eval_perplexity_res": 2.8401923179626465,
      "eval_train_eval_perplexity_seq": 1.0251468420028687,
      "eval_train_eval_reconstruction": 0.29400551319122314,
      "eval_train_eval_reconstruction_<cls>": 0.7299578189849854,
      "eval_train_loss": 0.03665895760059357,
      "eval_train_runtime": 0.8529,
      "eval_train_samples_per_second": 116.077,
      "eval_train_steps_per_second": 15.242,
      "step": 8000
    },
    {
      "epoch": 616.15,
      "learning_rate": 2.8087832664328252e-05,
      "loss": 0.0406,
      "step": 8010
    },
    {
      "epoch": 616.92,
      "learning_rate": 2.7972880499356656e-05,
      "loss": 0.0361,
      "step": 8020
    },
    {
      "epoch": 617.69,
      "learning_rate": 2.785807261706973e-05,
      "loss": 0.0361,
      "step": 8030
    },
    {
      "epoch": 618.46,
      "learning_rate": 2.774340976948544e-05,
      "loss": 0.0399,
      "step": 8040
    },
    {
      "epoch": 619.23,
      "learning_rate": 2.762889270767175e-05,
      "loss": 0.0402,
      "step": 8050
    },
    {
      "epoch": 620.0,
      "learning_rate": 2.7514522181741648e-05,
      "loss": 0.0349,
      "step": 8060
    },
    {
      "epoch": 620.77,
      "learning_rate": 2.7400298940848325e-05,
      "loss": 0.0372,
      "step": 8070
    },
    {
      "epoch": 621.54,
      "learning_rate": 2.7286223733180206e-05,
      "loss": 0.0384,
      "step": 8080
    },
    {
      "epoch": 622.31,
      "learning_rate": 2.7172297305956062e-05,
      "loss": 0.0359,
      "step": 8090
    },
    {
      "epoch": 623.08,
      "learning_rate": 2.7058520405420123e-05,
      "loss": 0.0403,
      "step": 8100
    },
    {
      "epoch": 623.08,
      "eval_valid_eval_loss": 6.494941711425781,
      "eval_valid_eval_loss_<cls>": 6.256099700927734,
      "eval_valid_eval_perplexity_batch": 661.78564453125,
      "eval_valid_eval_perplexity_res": 5076.24169921875,
      "eval_valid_eval_perplexity_seq": 661.78564453125,
      "eval_valid_eval_reconstruction": 0.08455114811658859,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.4949421882629395,
      "eval_valid_runtime": 0.3744,
      "eval_valid_samples_per_second": 2.671,
      "eval_valid_steps_per_second": 2.671,
      "step": 8100
    },
    {
      "epoch": 623.08,
      "eval_train_eval_loss": 0.02391669526696205,
      "eval_train_eval_loss_<cls>": 1.0334235429763794,
      "eval_train_eval_perplexity_batch": 1.024204969406128,
      "eval_train_eval_perplexity_res": 3.5571322441101074,
      "eval_train_eval_perplexity_seq": 1.024455189704895,
      "eval_train_eval_reconstruction": 0.2935931980609894,
      "eval_train_eval_reconstruction_<cls>": 0.7629310488700867,
      "eval_train_loss": 0.03842999413609505,
      "eval_train_runtime": 0.8534,
      "eval_train_samples_per_second": 116.01,
      "eval_train_steps_per_second": 15.234,
      "step": 8100
    },
    {
      "epoch": 623.85,
      "learning_rate": 2.6944893776837187e-05,
      "loss": 0.0335,
      "step": 8110
    },
    {
      "epoch": 624.62,
      "learning_rate": 2.683141816448772e-05,
      "loss": 0.0331,
      "step": 8120
    },
    {
      "epoch": 625.38,
      "learning_rate": 2.6718094311663043e-05,
      "loss": 0.0366,
      "step": 8130
    },
    {
      "epoch": 626.15,
      "learning_rate": 2.660492296066034e-05,
      "loss": 0.033,
      "step": 8140
    },
    {
      "epoch": 626.92,
      "learning_rate": 2.649190485277792e-05,
      "loss": 0.0349,
      "step": 8150
    },
    {
      "epoch": 627.69,
      "learning_rate": 2.6379040728310328e-05,
      "loss": 0.0337,
      "step": 8160
    },
    {
      "epoch": 628.46,
      "learning_rate": 2.6266331326543457e-05,
      "loss": 0.0366,
      "step": 8170
    },
    {
      "epoch": 629.23,
      "learning_rate": 2.6153777385749745e-05,
      "loss": 0.0378,
      "step": 8180
    },
    {
      "epoch": 630.0,
      "learning_rate": 2.604137964318332e-05,
      "loss": 0.037,
      "step": 8190
    },
    {
      "epoch": 630.77,
      "learning_rate": 2.5929138835075152e-05,
      "loss": 0.0353,
      "step": 8200
    },
    {
      "epoch": 630.77,
      "eval_valid_eval_loss": 6.546078205108643,
      "eval_valid_eval_loss_<cls>": 6.405113697052002,
      "eval_valid_eval_perplexity_batch": 696.5072631835938,
      "eval_valid_eval_perplexity_res": 5222.099609375,
      "eval_valid_eval_perplexity_seq": 696.5072631835938,
      "eval_valid_eval_reconstruction": 0.07982739806175232,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.546078681945801,
      "eval_valid_runtime": 0.4117,
      "eval_valid_samples_per_second": 2.429,
      "eval_valid_steps_per_second": 2.429,
      "step": 8200
    },
    {
      "epoch": 630.77,
      "eval_train_eval_loss": 0.017823584377765656,
      "eval_train_eval_loss_<cls>": 0.9711154699325562,
      "eval_train_eval_perplexity_batch": 1.017983317375183,
      "eval_train_eval_perplexity_res": 2.511093854904175,
      "eval_train_eval_perplexity_seq": 1.0182170867919922,
      "eval_train_eval_reconstruction": 0.22361434996128082,
      "eval_train_eval_reconstruction_<cls>": 0.7767441868782043,
      "eval_train_loss": 0.03189940005540848,
      "eval_train_runtime": 0.8513,
      "eval_train_samples_per_second": 116.287,
      "eval_train_steps_per_second": 15.27,
      "step": 8200
    },
    {
      "epoch": 631.54,
      "learning_rate": 2.58170556966283e-05,
      "loss": 0.0343,
      "step": 8210
    },
    {
      "epoch": 632.31,
      "learning_rate": 2.5705130962013046e-05,
      "loss": 0.0357,
      "step": 8220
    },
    {
      "epoch": 633.08,
      "learning_rate": 2.5593365364362022e-05,
      "loss": 0.0362,
      "step": 8230
    },
    {
      "epoch": 633.85,
      "learning_rate": 2.5481759635765558e-05,
      "loss": 0.0334,
      "step": 8240
    },
    {
      "epoch": 634.62,
      "learning_rate": 2.5370314507266756e-05,
      "loss": 0.039,
      "step": 8250
    },
    {
      "epoch": 635.38,
      "learning_rate": 2.525903070885678e-05,
      "loss": 0.0354,
      "step": 8260
    },
    {
      "epoch": 636.15,
      "learning_rate": 2.514790896947003e-05,
      "loss": 0.0348,
      "step": 8270
    },
    {
      "epoch": 636.92,
      "learning_rate": 2.503695001697939e-05,
      "loss": 0.0365,
      "step": 8280
    },
    {
      "epoch": 637.69,
      "learning_rate": 2.4926154578191435e-05,
      "loss": 0.0413,
      "step": 8290
    },
    {
      "epoch": 638.46,
      "learning_rate": 2.4815523378841726e-05,
      "loss": 0.0358,
      "step": 8300
    },
    {
      "epoch": 638.46,
      "eval_valid_eval_loss": 6.583761215209961,
      "eval_valid_eval_loss_<cls>": 5.766593933105469,
      "eval_valid_eval_perplexity_batch": 723.2545166015625,
      "eval_valid_eval_perplexity_res": 6216.57275390625,
      "eval_valid_eval_perplexity_seq": 723.2545166015625,
      "eval_valid_eval_reconstruction": 0.07891156524419785,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.5837602615356445,
      "eval_valid_runtime": 0.3782,
      "eval_valid_samples_per_second": 2.644,
      "eval_valid_steps_per_second": 2.644,
      "step": 8300
    },
    {
      "epoch": 638.46,
      "eval_train_eval_loss": 0.012797066941857338,
      "eval_train_eval_loss_<cls>": 0.8727076053619385,
      "eval_train_eval_perplexity_batch": 1.0128792524337769,
      "eval_train_eval_perplexity_res": 2.0029914379119873,
      "eval_train_eval_perplexity_seq": 1.0130184888839722,
      "eval_train_eval_reconstruction": 0.16617527604103088,
      "eval_train_eval_reconstruction_<cls>": 0.8037382960319519,
      "eval_train_loss": 0.030312998220324516,
      "eval_train_runtime": 0.9425,
      "eval_train_samples_per_second": 105.036,
      "eval_train_steps_per_second": 13.793,
      "step": 8300
    },
    {
      "epoch": 639.23,
      "learning_rate": 2.470505714358996e-05,
      "loss": 0.0409,
      "step": 8310
    },
    {
      "epoch": 640.0,
      "learning_rate": 2.459475659601533e-05,
      "loss": 0.0361,
      "step": 8320
    },
    {
      "epoch": 640.77,
      "learning_rate": 2.448462245861171e-05,
      "loss": 0.0334,
      "step": 8330
    },
    {
      "epoch": 641.54,
      "learning_rate": 2.437465545278297e-05,
      "loss": 0.0349,
      "step": 8340
    },
    {
      "epoch": 642.31,
      "learning_rate": 2.4264856298838213e-05,
      "loss": 0.0414,
      "step": 8350
    },
    {
      "epoch": 643.08,
      "learning_rate": 2.4155225715987085e-05,
      "loss": 0.0338,
      "step": 8360
    },
    {
      "epoch": 643.85,
      "learning_rate": 2.4045764422335043e-05,
      "loss": 0.0335,
      "step": 8370
    },
    {
      "epoch": 644.62,
      "learning_rate": 2.3936473134878673e-05,
      "loss": 0.0355,
      "step": 8380
    },
    {
      "epoch": 645.38,
      "learning_rate": 2.3827352569500966e-05,
      "loss": 0.0358,
      "step": 8390
    },
    {
      "epoch": 646.15,
      "learning_rate": 2.371840344096665e-05,
      "loss": 0.0362,
      "step": 8400
    },
    {
      "epoch": 646.15,
      "eval_valid_eval_loss": 6.55246114730835,
      "eval_valid_eval_loss_<cls>": 6.452251434326172,
      "eval_valid_eval_perplexity_batch": 700.9672241210938,
      "eval_valid_eval_perplexity_res": 5938.2255859375,
      "eval_valid_eval_perplexity_seq": 700.9672241210938,
      "eval_valid_eval_reconstruction": 0.0779547393321991,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.55246114730835,
      "eval_valid_runtime": 0.3841,
      "eval_valid_samples_per_second": 2.603,
      "eval_valid_steps_per_second": 2.603,
      "step": 8400
    },
    {
      "epoch": 646.15,
      "eval_train_eval_loss": 0.016883475705981255,
      "eval_train_eval_loss_<cls>": 0.9930781722068787,
      "eval_train_eval_perplexity_batch": 1.0170267820358276,
      "eval_train_eval_perplexity_res": 2.313149929046631,
      "eval_train_eval_perplexity_seq": 1.017216682434082,
      "eval_train_eval_reconstruction": 0.2200077325105667,
      "eval_train_eval_reconstruction_<cls>": 0.8125,
      "eval_train_loss": 0.03309067338705063,
      "eval_train_runtime": 0.9386,
      "eval_train_samples_per_second": 105.479,
      "eval_train_steps_per_second": 13.851,
      "step": 8400
    },
    {
      "epoch": 646.92,
      "learning_rate": 2.3609626462917515e-05,
      "loss": 0.0306,
      "step": 8410
    },
    {
      "epoch": 647.69,
      "learning_rate": 2.350102234786771e-05,
      "loss": 0.0336,
      "step": 8420
    },
    {
      "epoch": 648.46,
      "learning_rate": 2.3392591807199098e-05,
      "loss": 0.0362,
      "step": 8430
    },
    {
      "epoch": 649.23,
      "learning_rate": 2.3284335551156582e-05,
      "loss": 0.0352,
      "step": 8440
    },
    {
      "epoch": 650.0,
      "learning_rate": 2.317625428884348e-05,
      "loss": 0.0318,
      "step": 8450
    },
    {
      "epoch": 650.77,
      "learning_rate": 2.306834872821684e-05,
      "loss": 0.0336,
      "step": 8460
    },
    {
      "epoch": 651.54,
      "learning_rate": 2.2960619576082852e-05,
      "loss": 0.0346,
      "step": 8470
    },
    {
      "epoch": 652.31,
      "learning_rate": 2.2853067538092133e-05,
      "loss": 0.0377,
      "step": 8480
    },
    {
      "epoch": 653.08,
      "learning_rate": 2.2745693318735224e-05,
      "loss": 0.0294,
      "step": 8490
    },
    {
      "epoch": 653.85,
      "learning_rate": 2.263849762133788e-05,
      "loss": 0.0356,
      "step": 8500
    },
    {
      "epoch": 653.85,
      "eval_valid_eval_loss": 6.625594615936279,
      "eval_valid_eval_loss_<cls>": 4.294053077697754,
      "eval_valid_eval_perplexity_batch": 754.1525268554688,
      "eval_valid_eval_perplexity_res": 6656.80419921875,
      "eval_valid_eval_perplexity_seq": 754.1525268554688,
      "eval_valid_eval_reconstruction": 0.0810234546661377,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.625594615936279,
      "eval_valid_runtime": 0.292,
      "eval_valid_samples_per_second": 3.425,
      "eval_valid_steps_per_second": 3.425,
      "step": 8500
    },
    {
      "epoch": 653.85,
      "eval_train_eval_loss": 0.015406019054353237,
      "eval_train_eval_loss_<cls>": 0.9235057830810547,
      "eval_train_eval_perplexity_batch": 1.0155253410339355,
      "eval_train_eval_perplexity_res": 2.2368736267089844,
      "eval_train_eval_perplexity_seq": 1.0157005786895752,
      "eval_train_eval_reconstruction": 0.21067485213279724,
      "eval_train_eval_reconstruction_<cls>": 0.8509615659713745,
      "eval_train_loss": 0.0326874740421772,
      "eval_train_runtime": 0.7661,
      "eval_train_samples_per_second": 129.218,
      "eval_train_steps_per_second": 16.968,
      "step": 8500
    },
    {
      "epoch": 654.62,
      "learning_rate": 2.2531481148056505e-05,
      "loss": 0.0315,
      "step": 8510
    },
    {
      "epoch": 655.38,
      "learning_rate": 2.242464459987355e-05,
      "loss": 0.0338,
      "step": 8520
    },
    {
      "epoch": 656.15,
      "learning_rate": 2.2317988676592906e-05,
      "loss": 0.0338,
      "step": 8530
    },
    {
      "epoch": 656.92,
      "learning_rate": 2.221151407683533e-05,
      "loss": 0.0356,
      "step": 8540
    },
    {
      "epoch": 657.69,
      "learning_rate": 2.2105221498033862e-05,
      "loss": 0.0328,
      "step": 8550
    },
    {
      "epoch": 658.46,
      "learning_rate": 2.199911163642931e-05,
      "loss": 0.0368,
      "step": 8560
    },
    {
      "epoch": 659.23,
      "learning_rate": 2.189318518706555e-05,
      "loss": 0.0362,
      "step": 8570
    },
    {
      "epoch": 660.0,
      "learning_rate": 2.178744284378515e-05,
      "loss": 0.0305,
      "step": 8580
    },
    {
      "epoch": 660.77,
      "learning_rate": 2.1681885299224698e-05,
      "loss": 0.0306,
      "step": 8590
    },
    {
      "epoch": 661.54,
      "learning_rate": 2.157651324481033e-05,
      "loss": 0.0311,
      "step": 8600
    },
    {
      "epoch": 661.54,
      "eval_valid_eval_loss": 6.605250358581543,
      "eval_valid_eval_loss_<cls>": 6.5748748779296875,
      "eval_valid_eval_perplexity_batch": 738.96484375,
      "eval_valid_eval_perplexity_res": 6121.4853515625,
      "eval_valid_eval_perplexity_seq": 738.96484375,
      "eval_valid_eval_reconstruction": 0.07963117957115173,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.605250835418701,
      "eval_valid_runtime": 0.3503,
      "eval_valid_samples_per_second": 2.855,
      "eval_valid_steps_per_second": 2.855,
      "step": 8600
    },
    {
      "epoch": 661.54,
      "eval_train_eval_loss": 0.01210664864629507,
      "eval_train_eval_loss_<cls>": 0.8790566325187683,
      "eval_train_eval_perplexity_batch": 1.012180209159851,
      "eval_train_eval_perplexity_res": 1.7609262466430664,
      "eval_train_eval_perplexity_seq": 1.0122785568237305,
      "eval_train_eval_reconstruction": 0.16755037009716034,
      "eval_train_eval_reconstruction_<cls>": 0.8541666865348816,
      "eval_train_loss": 0.03057742863893509,
      "eval_train_runtime": 0.7227,
      "eval_train_samples_per_second": 136.982,
      "eval_train_steps_per_second": 17.988,
      "step": 8600
    },
    {
      "epoch": 662.31,
      "learning_rate": 2.1471327370753168e-05,
      "loss": 0.033,
      "step": 8610
    },
    {
      "epoch": 663.08,
      "learning_rate": 2.1366328366044813e-05,
      "loss": 0.03,
      "step": 8620
    },
    {
      "epoch": 663.85,
      "learning_rate": 2.1261516918452824e-05,
      "loss": 0.0318,
      "step": 8630
    },
    {
      "epoch": 664.62,
      "learning_rate": 2.115689371451622e-05,
      "loss": 0.0376,
      "step": 8640
    },
    {
      "epoch": 665.38,
      "learning_rate": 2.1052459439541005e-05,
      "loss": 0.0267,
      "step": 8650
    },
    {
      "epoch": 666.15,
      "learning_rate": 2.0948214777595594e-05,
      "loss": 0.0356,
      "step": 8660
    },
    {
      "epoch": 666.92,
      "learning_rate": 2.0844160411506448e-05,
      "loss": 0.0292,
      "step": 8670
    },
    {
      "epoch": 667.69,
      "learning_rate": 2.0740297022853532e-05,
      "loss": 0.0334,
      "step": 8680
    },
    {
      "epoch": 668.46,
      "learning_rate": 2.063662529196586e-05,
      "loss": 0.0312,
      "step": 8690
    },
    {
      "epoch": 669.23,
      "learning_rate": 2.0533145897917057e-05,
      "loss": 0.0365,
      "step": 8700
    },
    {
      "epoch": 669.23,
      "eval_valid_eval_loss": 6.586404800415039,
      "eval_valid_eval_loss_<cls>": 4.287359237670898,
      "eval_valid_eval_perplexity_batch": 725.1690673828125,
      "eval_valid_eval_perplexity_res": 6265.9951171875,
      "eval_valid_eval_perplexity_seq": 725.1690673828125,
      "eval_valid_eval_reconstruction": 0.08803301304578781,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.5864057540893555,
      "eval_valid_runtime": 0.3404,
      "eval_valid_samples_per_second": 2.938,
      "eval_valid_steps_per_second": 2.938,
      "step": 8700
    },
    {
      "epoch": 669.23,
      "eval_train_eval_loss": 0.016768675297498703,
      "eval_train_eval_loss_<cls>": 0.8097524642944336,
      "eval_train_eval_perplexity_batch": 1.0169100761413574,
      "eval_train_eval_perplexity_res": 2.5606985092163086,
      "eval_train_eval_perplexity_seq": 1.0170398950576782,
      "eval_train_eval_reconstruction": 0.24914932250976562,
      "eval_train_eval_reconstruction_<cls>": 0.9050279259681702,
      "eval_train_loss": 0.031870532780885696,
      "eval_train_runtime": 0.6384,
      "eval_train_samples_per_second": 155.066,
      "eval_train_steps_per_second": 20.362,
      "step": 8700
    },
    {
      "epoch": 670.0,
      "learning_rate": 2.042985951852087e-05,
      "loss": 0.0314,
      "step": 8710
    },
    {
      "epoch": 670.77,
      "learning_rate": 2.032676683032678e-05,
      "loss": 0.0308,
      "step": 8720
    },
    {
      "epoch": 671.54,
      "learning_rate": 2.022386850861555e-05,
      "loss": 0.0308,
      "step": 8730
    },
    {
      "epoch": 672.31,
      "learning_rate": 2.0121165227394774e-05,
      "loss": 0.0345,
      "step": 8740
    },
    {
      "epoch": 673.08,
      "learning_rate": 2.0018657659394496e-05,
      "loss": 0.0332,
      "step": 8750
    },
    {
      "epoch": 673.85,
      "learning_rate": 1.9916346476062813e-05,
      "loss": 0.028,
      "step": 8760
    },
    {
      "epoch": 674.62,
      "learning_rate": 1.981423234756143e-05,
      "loss": 0.0318,
      "step": 8770
    },
    {
      "epoch": 675.38,
      "learning_rate": 1.9712315942761306e-05,
      "loss": 0.03,
      "step": 8780
    },
    {
      "epoch": 676.15,
      "learning_rate": 1.9610597929238274e-05,
      "loss": 0.0335,
      "step": 8790
    },
    {
      "epoch": 676.92,
      "learning_rate": 1.9509078973268645e-05,
      "loss": 0.0313,
      "step": 8800
    },
    {
      "epoch": 676.92,
      "eval_valid_eval_loss": 6.650909423828125,
      "eval_valid_eval_loss_<cls>": 6.523036003112793,
      "eval_valid_eval_perplexity_batch": 773.4874267578125,
      "eval_valid_eval_perplexity_res": 6646.6416015625,
      "eval_valid_eval_perplexity_seq": 773.4874267578125,
      "eval_valid_eval_reconstruction": 0.08046940714120865,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.650909900665283,
      "eval_valid_runtime": 0.2995,
      "eval_valid_samples_per_second": 3.339,
      "eval_valid_steps_per_second": 3.339,
      "step": 8800
    },
    {
      "epoch": 676.92,
      "eval_train_eval_loss": 0.01351111475378275,
      "eval_train_eval_loss_<cls>": 0.9288144111633301,
      "eval_train_eval_perplexity_batch": 1.01360285282135,
      "eval_train_eval_perplexity_res": 2.1771938800811768,
      "eval_train_eval_perplexity_seq": 1.013761281967163,
      "eval_train_eval_reconstruction": 0.19570773839950562,
      "eval_train_eval_reconstruction_<cls>": 0.8415841460227966,
      "eval_train_loss": 0.028010137379169464,
      "eval_train_runtime": 0.674,
      "eval_train_samples_per_second": 146.889,
      "eval_train_steps_per_second": 19.288,
      "step": 8800
    },
    {
      "epoch": 677.69,
      "learning_rate": 1.940775973982487e-05,
      "loss": 0.0295,
      "step": 8810
    },
    {
      "epoch": 678.46,
      "learning_rate": 1.930664089257119e-05,
      "loss": 0.0315,
      "step": 8820
    },
    {
      "epoch": 679.23,
      "learning_rate": 1.920572309385919e-05,
      "loss": 0.0271,
      "step": 8830
    },
    {
      "epoch": 680.0,
      "learning_rate": 1.9105007004723635e-05,
      "loss": 0.0349,
      "step": 8840
    },
    {
      "epoch": 680.77,
      "learning_rate": 1.9004493284877995e-05,
      "loss": 0.0301,
      "step": 8850
    },
    {
      "epoch": 681.54,
      "learning_rate": 1.8904182592710195e-05,
      "loss": 0.0347,
      "step": 8860
    },
    {
      "epoch": 682.31,
      "learning_rate": 1.880407558527828e-05,
      "loss": 0.0338,
      "step": 8870
    },
    {
      "epoch": 683.08,
      "learning_rate": 1.8704172918306094e-05,
      "loss": 0.0299,
      "step": 8880
    },
    {
      "epoch": 683.85,
      "learning_rate": 1.8604475246179033e-05,
      "loss": 0.028,
      "step": 8890
    },
    {
      "epoch": 684.62,
      "learning_rate": 1.850498322193972e-05,
      "loss": 0.0268,
      "step": 8900
    },
    {
      "epoch": 684.62,
      "eval_valid_eval_loss": 6.687021732330322,
      "eval_valid_eval_loss_<cls>": 6.005221366882324,
      "eval_valid_eval_perplexity_batch": 801.9303588867188,
      "eval_valid_eval_perplexity_res": 7046.12890625,
      "eval_valid_eval_perplexity_seq": 801.9303588867188,
      "eval_valid_eval_reconstruction": 0.08214584738016129,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.6870222091674805,
      "eval_valid_runtime": 0.3382,
      "eval_valid_samples_per_second": 2.957,
      "eval_valid_steps_per_second": 2.957,
      "step": 8900
    },
    {
      "epoch": 684.62,
      "eval_train_eval_loss": 0.011238628067076206,
      "eval_train_eval_loss_<cls>": 0.8991841673851013,
      "eval_train_eval_perplexity_batch": 1.0113019943237305,
      "eval_train_eval_perplexity_res": 1.8867607116699219,
      "eval_train_eval_perplexity_seq": 1.0113688707351685,
      "eval_train_eval_reconstruction": 0.17311003804206848,
      "eval_train_eval_reconstruction_<cls>": 0.8396226167678833,
      "eval_train_loss": 0.027385039255023003,
      "eval_train_runtime": 0.8239,
      "eval_train_samples_per_second": 120.16,
      "eval_train_steps_per_second": 15.779,
      "step": 8900
    },
    {
      "epoch": 685.38,
      "learning_rate": 1.8405697497283703e-05,
      "loss": 0.0266,
      "step": 8910
    },
    {
      "epoch": 686.15,
      "learning_rate": 1.830661872255527e-05,
      "loss": 0.0299,
      "step": 8920
    },
    {
      "epoch": 686.92,
      "learning_rate": 1.8207747546743115e-05,
      "loss": 0.0263,
      "step": 8930
    },
    {
      "epoch": 687.69,
      "learning_rate": 1.8109084617476118e-05,
      "loss": 0.0295,
      "step": 8940
    },
    {
      "epoch": 688.46,
      "learning_rate": 1.8010630581019095e-05,
      "loss": 0.0294,
      "step": 8950
    },
    {
      "epoch": 689.23,
      "learning_rate": 1.7912386082268566e-05,
      "loss": 0.0304,
      "step": 8960
    },
    {
      "epoch": 690.0,
      "learning_rate": 1.7814351764748527e-05,
      "loss": 0.0308,
      "step": 8970
    },
    {
      "epoch": 690.77,
      "learning_rate": 1.7716528270606243e-05,
      "loss": 0.0261,
      "step": 8980
    },
    {
      "epoch": 691.54,
      "learning_rate": 1.7618916240608052e-05,
      "loss": 0.0325,
      "step": 8990
    },
    {
      "epoch": 692.31,
      "learning_rate": 1.752151631413511e-05,
      "loss": 0.0301,
      "step": 9000
    },
    {
      "epoch": 692.31,
      "eval_valid_eval_loss": 6.722228527069092,
      "eval_valid_eval_loss_<cls>": 6.865252494812012,
      "eval_valid_eval_perplexity_batch": 830.6666259765625,
      "eval_valid_eval_perplexity_res": 7610.84326171875,
      "eval_valid_eval_perplexity_seq": 830.6666259765625,
      "eval_valid_eval_reconstruction": 0.08955223858356476,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.72222900390625,
      "eval_valid_runtime": 0.3026,
      "eval_valid_samples_per_second": 3.304,
      "eval_valid_steps_per_second": 3.304,
      "step": 9000
    },
    {
      "epoch": 692.31,
      "eval_train_eval_loss": 0.012829795479774475,
      "eval_train_eval_loss_<cls>": 0.9089764356613159,
      "eval_train_eval_perplexity_batch": 1.012912392616272,
      "eval_train_eval_perplexity_res": 2.2564520835876465,
      "eval_train_eval_perplexity_seq": 1.0130138397216797,
      "eval_train_eval_reconstruction": 0.19336166977882385,
      "eval_train_eval_reconstruction_<cls>": 0.8299999833106995,
      "eval_train_loss": 0.026684729382395744,
      "eval_train_runtime": 0.6842,
      "eval_train_samples_per_second": 144.685,
      "eval_train_steps_per_second": 18.999,
      "step": 9000
    },
    {
      "epoch": 693.08,
      "learning_rate": 1.7424329129179285e-05,
      "loss": 0.0312,
      "step": 9010
    },
    {
      "epoch": 693.85,
      "learning_rate": 1.7327355322338933e-05,
      "loss": 0.0301,
      "step": 9020
    },
    {
      "epoch": 694.62,
      "learning_rate": 1.7230595528814724e-05,
      "loss": 0.0265,
      "step": 9030
    },
    {
      "epoch": 695.38,
      "learning_rate": 1.71340503824055e-05,
      "loss": 0.0297,
      "step": 9040
    },
    {
      "epoch": 696.15,
      "learning_rate": 1.703772051550412e-05,
      "loss": 0.0247,
      "step": 9050
    },
    {
      "epoch": 696.92,
      "learning_rate": 1.6941606559093293e-05,
      "loss": 0.0291,
      "step": 9060
    },
    {
      "epoch": 697.69,
      "learning_rate": 1.684570914274149e-05,
      "loss": 0.029,
      "step": 9070
    },
    {
      "epoch": 698.46,
      "learning_rate": 1.6750028894598784e-05,
      "loss": 0.0278,
      "step": 9080
    },
    {
      "epoch": 699.23,
      "learning_rate": 1.6654566441392726e-05,
      "loss": 0.0277,
      "step": 9090
    },
    {
      "epoch": 700.0,
      "learning_rate": 1.6559322408424287e-05,
      "loss": 0.0288,
      "step": 9100
    },
    {
      "epoch": 700.0,
      "eval_valid_eval_loss": 6.771044731140137,
      "eval_valid_eval_loss_<cls>": 7.315093040466309,
      "eval_valid_eval_perplexity_batch": 872.22265625,
      "eval_valid_eval_perplexity_res": 7640.30078125,
      "eval_valid_eval_perplexity_seq": 872.22265625,
      "eval_valid_eval_reconstruction": 0.07000000029802322,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.7710442543029785,
      "eval_valid_runtime": 0.2952,
      "eval_valid_samples_per_second": 3.388,
      "eval_valid_steps_per_second": 3.388,
      "step": 9100
    },
    {
      "epoch": 700.0,
      "eval_train_eval_loss": 0.010117064230144024,
      "eval_train_eval_loss_<cls>": 0.8551220893859863,
      "eval_train_eval_perplexity_batch": 1.010168433189392,
      "eval_train_eval_perplexity_res": 1.9597883224487305,
      "eval_train_eval_perplexity_seq": 1.0102757215499878,
      "eval_train_eval_reconstruction": 0.15287697315216064,
      "eval_train_eval_reconstruction_<cls>": 0.8377193212509155,
      "eval_train_loss": 0.028046470135450363,
      "eval_train_runtime": 0.7125,
      "eval_train_samples_per_second": 138.952,
      "eval_train_steps_per_second": 18.246,
      "step": 9100
    },
    {
      "epoch": 700.77,
      "learning_rate": 1.6464297419563736e-05,
      "loss": 0.0243,
      "step": 9110
    },
    {
      "epoch": 701.54,
      "learning_rate": 1.636949209724653e-05,
      "loss": 0.0307,
      "step": 9120
    },
    {
      "epoch": 702.31,
      "learning_rate": 1.6274907062469284e-05,
      "loss": 0.0279,
      "step": 9130
    },
    {
      "epoch": 703.08,
      "learning_rate": 1.618054293478567e-05,
      "loss": 0.0291,
      "step": 9140
    },
    {
      "epoch": 703.85,
      "learning_rate": 1.608640033230236e-05,
      "loss": 0.0243,
      "step": 9150
    },
    {
      "epoch": 704.62,
      "learning_rate": 1.5992479871674998e-05,
      "loss": 0.0298,
      "step": 9160
    },
    {
      "epoch": 705.38,
      "learning_rate": 1.5898782168104136e-05,
      "loss": 0.0311,
      "step": 9170
    },
    {
      "epoch": 706.15,
      "learning_rate": 1.580530783533123e-05,
      "loss": 0.0301,
      "step": 9180
    },
    {
      "epoch": 706.92,
      "learning_rate": 1.571205748563459e-05,
      "loss": 0.0283,
      "step": 9190
    },
    {
      "epoch": 707.69,
      "learning_rate": 1.5619031729825402e-05,
      "loss": 0.0328,
      "step": 9200
    },
    {
      "epoch": 707.69,
      "eval_valid_eval_loss": 6.767180442810059,
      "eval_valid_eval_loss_<cls>": 6.960301876068115,
      "eval_valid_eval_perplexity_batch": 868.858642578125,
      "eval_valid_eval_perplexity_res": 7296.8154296875,
      "eval_valid_eval_perplexity_seq": 868.858642578125,
      "eval_valid_eval_reconstruction": 0.07659115642309189,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.7671799659729,
      "eval_valid_runtime": 0.3232,
      "eval_valid_samples_per_second": 3.094,
      "eval_valid_steps_per_second": 3.094,
      "step": 9200
    },
    {
      "epoch": 707.69,
      "eval_train_eval_loss": 0.01234171912074089,
      "eval_train_eval_loss_<cls>": 0.7632714509963989,
      "eval_train_eval_perplexity_batch": 1.0124181509017944,
      "eval_train_eval_perplexity_res": 2.013550281524658,
      "eval_train_eval_perplexity_seq": 1.0125021934509277,
      "eval_train_eval_reconstruction": 0.19897252321243286,
      "eval_train_eval_reconstruction_<cls>": 0.8552631735801697,
      "eval_train_loss": 0.027071230113506317,
      "eval_train_runtime": 0.6754,
      "eval_train_samples_per_second": 146.574,
      "eval_train_steps_per_second": 19.247,
      "step": 9200
    },
    {
      "epoch": 708.46,
      "learning_rate": 1.5526231177243693e-05,
      "loss": 0.0278,
      "step": 9210
    },
    {
      "epoch": 709.23,
      "learning_rate": 1.5433656435754372e-05,
      "loss": 0.0275,
      "step": 9220
    },
    {
      "epoch": 710.0,
      "learning_rate": 1.534130811174323e-05,
      "loss": 0.0283,
      "step": 9230
    },
    {
      "epoch": 710.77,
      "learning_rate": 1.524918681011296e-05,
      "loss": 0.0227,
      "step": 9240
    },
    {
      "epoch": 711.54,
      "learning_rate": 1.5157293134279244e-05,
      "loss": 0.0303,
      "step": 9250
    },
    {
      "epoch": 712.31,
      "learning_rate": 1.5065627686166689e-05,
      "loss": 0.03,
      "step": 9260
    },
    {
      "epoch": 713.08,
      "learning_rate": 1.4974191066205034e-05,
      "loss": 0.0283,
      "step": 9270
    },
    {
      "epoch": 713.85,
      "learning_rate": 1.4882983873325102e-05,
      "loss": 0.0286,
      "step": 9280
    },
    {
      "epoch": 714.62,
      "learning_rate": 1.4792006704954925e-05,
      "loss": 0.0275,
      "step": 9290
    },
    {
      "epoch": 715.38,
      "learning_rate": 1.4701260157015822e-05,
      "loss": 0.0286,
      "step": 9300
    },
    {
      "epoch": 715.38,
      "eval_valid_eval_loss": 6.6875386238098145,
      "eval_valid_eval_loss_<cls>": 4.325540542602539,
      "eval_valid_eval_perplexity_batch": 802.344970703125,
      "eval_valid_eval_perplexity_res": 7472.23486328125,
      "eval_valid_eval_perplexity_seq": 802.344970703125,
      "eval_valid_eval_reconstruction": 0.08459869772195816,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.6875386238098145,
      "eval_valid_runtime": 0.3633,
      "eval_valid_samples_per_second": 2.753,
      "eval_valid_steps_per_second": 2.753,
      "step": 9300
    },
    {
      "epoch": 715.38,
      "eval_train_eval_loss": 0.009590023197233677,
      "eval_train_eval_loss_<cls>": 0.7974693179130554,
      "eval_train_eval_perplexity_batch": 1.0096361637115479,
      "eval_train_eval_perplexity_res": 1.7613314390182495,
      "eval_train_eval_perplexity_seq": 1.0097073316574097,
      "eval_train_eval_reconstruction": 0.15748488903045654,
      "eval_train_eval_reconstruction_<cls>": 0.8659793734550476,
      "eval_train_loss": 0.025226563215255737,
      "eval_train_runtime": 0.8036,
      "eval_train_samples_per_second": 123.188,
      "eval_train_steps_per_second": 16.176,
      "step": 9300
    },
    {
      "epoch": 716.15,
      "learning_rate": 1.4610744823918498e-05,
      "loss": 0.0258,
      "step": 9310
    },
    {
      "epoch": 716.92,
      "learning_rate": 1.4520461298559145e-05,
      "loss": 0.0225,
      "step": 9320
    },
    {
      "epoch": 717.69,
      "learning_rate": 1.4430410172315572e-05,
      "loss": 0.0249,
      "step": 9330
    },
    {
      "epoch": 718.46,
      "learning_rate": 1.4340592035043282e-05,
      "loss": 0.0274,
      "step": 9340
    },
    {
      "epoch": 719.23,
      "learning_rate": 1.4251007475071687e-05,
      "loss": 0.0281,
      "step": 9350
    },
    {
      "epoch": 720.0,
      "learning_rate": 1.4161657079200202e-05,
      "loss": 0.025,
      "step": 9360
    },
    {
      "epoch": 720.77,
      "learning_rate": 1.4072541432694408e-05,
      "loss": 0.024,
      "step": 9370
    },
    {
      "epoch": 721.54,
      "learning_rate": 1.3983661119282237e-05,
      "loss": 0.0255,
      "step": 9380
    },
    {
      "epoch": 722.31,
      "learning_rate": 1.3895016721150123e-05,
      "loss": 0.0294,
      "step": 9390
    },
    {
      "epoch": 723.08,
      "learning_rate": 1.3806608818939203e-05,
      "loss": 0.0258,
      "step": 9400
    },
    {
      "epoch": 723.08,
      "eval_valid_eval_loss": 6.7946367263793945,
      "eval_valid_eval_loss_<cls>": 5.777524471282959,
      "eval_valid_eval_perplexity_batch": 893.0447998046875,
      "eval_valid_eval_perplexity_res": 8140.6083984375,
      "eval_valid_eval_perplexity_seq": 893.0447998046875,
      "eval_valid_eval_reconstruction": 0.08130762726068497,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.794635772705078,
      "eval_valid_runtime": 0.4125,
      "eval_valid_samples_per_second": 2.424,
      "eval_valid_steps_per_second": 2.424,
      "step": 9400
    },
    {
      "epoch": 723.08,
      "eval_train_eval_loss": 0.010588185861706734,
      "eval_train_eval_loss_<cls>": 0.8354957699775696,
      "eval_train_eval_perplexity_batch": 1.0106444358825684,
      "eval_train_eval_perplexity_res": 2.0663652420043945,
      "eval_train_eval_perplexity_seq": 1.0107030868530273,
      "eval_train_eval_reconstruction": 0.17107871174812317,
      "eval_train_eval_reconstruction_<cls>": 0.8280543088912964,
      "eval_train_loss": 0.029771557077765465,
      "eval_train_runtime": 0.7196,
      "eval_train_samples_per_second": 137.574,
      "eval_train_steps_per_second": 18.065,
      "step": 9400
    },
    {
      "epoch": 723.85,
      "learning_rate": 1.3718437991741518e-05,
      "loss": 0.0252,
      "step": 9410
    },
    {
      "epoch": 724.62,
      "learning_rate": 1.3630504817096213e-05,
      "loss": 0.0329,
      "step": 9420
    },
    {
      "epoch": 725.38,
      "learning_rate": 1.354280987098573e-05,
      "loss": 0.0279,
      "step": 9430
    },
    {
      "epoch": 726.15,
      "learning_rate": 1.3455353727832087e-05,
      "loss": 0.0278,
      "step": 9440
    },
    {
      "epoch": 726.92,
      "learning_rate": 1.3368136960493083e-05,
      "loss": 0.0242,
      "step": 9450
    },
    {
      "epoch": 727.69,
      "learning_rate": 1.3281160140258547e-05,
      "loss": 0.0237,
      "step": 9460
    },
    {
      "epoch": 728.46,
      "learning_rate": 1.3194423836846598e-05,
      "loss": 0.0255,
      "step": 9470
    },
    {
      "epoch": 729.23,
      "learning_rate": 1.3107928618399917e-05,
      "loss": 0.0243,
      "step": 9480
    },
    {
      "epoch": 730.0,
      "learning_rate": 1.302167505148203e-05,
      "loss": 0.0264,
      "step": 9490
    },
    {
      "epoch": 730.77,
      "learning_rate": 1.2935663701073586e-05,
      "loss": 0.0251,
      "step": 9500
    },
    {
      "epoch": 730.77,
      "eval_valid_eval_loss": 6.775576591491699,
      "eval_valid_eval_loss_<cls>": 5.837193012237549,
      "eval_valid_eval_perplexity_batch": 876.1844482421875,
      "eval_valid_eval_perplexity_res": 7927.3740234375,
      "eval_valid_eval_perplexity_seq": 876.1844482421875,
      "eval_valid_eval_reconstruction": 0.08414239436388016,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.775576591491699,
      "eval_valid_runtime": 0.4267,
      "eval_valid_samples_per_second": 2.344,
      "eval_valid_steps_per_second": 2.344,
      "step": 9500
    },
    {
      "epoch": 730.77,
      "eval_train_eval_loss": 0.015152014791965485,
      "eval_train_eval_loss_<cls>": 0.8169143795967102,
      "eval_train_eval_perplexity_batch": 1.0152673721313477,
      "eval_train_eval_perplexity_res": 2.2544147968292236,
      "eval_train_eval_perplexity_seq": 1.0153931379318237,
      "eval_train_eval_reconstruction": 0.25620001554489136,
      "eval_train_eval_reconstruction_<cls>": 0.8722466826438904,
      "eval_train_loss": 0.02643924579024315,
      "eval_train_runtime": 0.7387,
      "eval_train_samples_per_second": 134.025,
      "eval_train_steps_per_second": 17.599,
      "step": 9500
    },
    {
      "epoch": 731.54,
      "learning_rate": 1.2849895130568635e-05,
      "loss": 0.0251,
      "step": 9510
    },
    {
      "epoch": 732.31,
      "learning_rate": 1.2764369901770989e-05,
      "loss": 0.028,
      "step": 9520
    },
    {
      "epoch": 733.08,
      "learning_rate": 1.267908857489053e-05,
      "loss": 0.0274,
      "step": 9530
    },
    {
      "epoch": 733.85,
      "learning_rate": 1.2594051708539496e-05,
      "loss": 0.0244,
      "step": 9540
    },
    {
      "epoch": 734.62,
      "learning_rate": 1.2509259859728862e-05,
      "loss": 0.0288,
      "step": 9550
    },
    {
      "epoch": 735.38,
      "learning_rate": 1.2424713583864695e-05,
      "loss": 0.0259,
      "step": 9560
    },
    {
      "epoch": 736.15,
      "learning_rate": 1.2340413434744485e-05,
      "loss": 0.0262,
      "step": 9570
    },
    {
      "epoch": 736.92,
      "learning_rate": 1.2256359964553549e-05,
      "loss": 0.0239,
      "step": 9580
    },
    {
      "epoch": 737.69,
      "learning_rate": 1.2172553723861391e-05,
      "loss": 0.0249,
      "step": 9590
    },
    {
      "epoch": 738.46,
      "learning_rate": 1.2088995261618108e-05,
      "loss": 0.0241,
      "step": 9600
    },
    {
      "epoch": 738.46,
      "eval_valid_eval_loss": 6.889394760131836,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 981.8070068359375,
      "eval_valid_eval_perplexity_res": 8343.7236328125,
      "eval_valid_eval_perplexity_seq": 981.8070068359375,
      "eval_valid_eval_reconstruction": 0.0555555559694767,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.889395713806152,
      "eval_valid_runtime": 0.3819,
      "eval_valid_samples_per_second": 2.619,
      "eval_valid_steps_per_second": 2.619,
      "step": 9600
    },
    {
      "epoch": 738.46,
      "eval_train_eval_loss": 0.00927885714918375,
      "eval_train_eval_loss_<cls>": 0.6711149215698242,
      "eval_train_eval_perplexity_batch": 1.0093220472335815,
      "eval_train_eval_perplexity_res": 2.0556700229644775,
      "eval_train_eval_perplexity_seq": 1.009387731552124,
      "eval_train_eval_reconstruction": 0.15895237028598785,
      "eval_train_eval_reconstruction_<cls>": 0.9117646813392639,
      "eval_train_loss": 0.02501882240176201,
      "eval_train_runtime": 0.7815,
      "eval_train_samples_per_second": 126.684,
      "eval_train_steps_per_second": 16.635,
      "step": 9600
    },
    {
      "epoch": 739.23,
      "learning_rate": 1.2005685125150795e-05,
      "loss": 0.0267,
      "step": 9610
    },
    {
      "epoch": 740.0,
      "learning_rate": 1.1922623860159948e-05,
      "loss": 0.0212,
      "step": 9620
    },
    {
      "epoch": 740.77,
      "learning_rate": 1.1839812010715911e-05,
      "loss": 0.0247,
      "step": 9630
    },
    {
      "epoch": 741.54,
      "learning_rate": 1.1757250119255281e-05,
      "loss": 0.0231,
      "step": 9640
    },
    {
      "epoch": 742.31,
      "learning_rate": 1.167493872657739e-05,
      "loss": 0.0282,
      "step": 9650
    },
    {
      "epoch": 743.08,
      "learning_rate": 1.1592878371840732e-05,
      "loss": 0.0245,
      "step": 9660
    },
    {
      "epoch": 743.85,
      "learning_rate": 1.151106959255946e-05,
      "loss": 0.0233,
      "step": 9670
    },
    {
      "epoch": 744.62,
      "learning_rate": 1.142951292459985e-05,
      "loss": 0.0269,
      "step": 9680
    },
    {
      "epoch": 745.38,
      "learning_rate": 1.134820890217675e-05,
      "loss": 0.0249,
      "step": 9690
    },
    {
      "epoch": 746.15,
      "learning_rate": 1.1267158057850175e-05,
      "loss": 0.0278,
      "step": 9700
    },
    {
      "epoch": 746.15,
      "eval_valid_eval_loss": 6.8835129737854,
      "eval_valid_eval_loss_<cls>": 9.171708106994629,
      "eval_valid_eval_perplexity_batch": 976.0491943359375,
      "eval_valid_eval_perplexity_res": 8959.0390625,
      "eval_valid_eval_perplexity_seq": 976.0491943359375,
      "eval_valid_eval_reconstruction": 0.07188160717487335,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.8835129737854,
      "eval_valid_runtime": 0.3726,
      "eval_valid_samples_per_second": 2.684,
      "eval_valid_steps_per_second": 2.684,
      "step": 9700
    },
    {
      "epoch": 746.15,
      "eval_train_eval_loss": 0.012740452773869038,
      "eval_train_eval_loss_<cls>": 0.7661535739898682,
      "eval_train_eval_perplexity_batch": 1.012821912765503,
      "eval_train_eval_perplexity_res": 2.4235360622406006,
      "eval_train_eval_perplexity_seq": 1.0129035711288452,
      "eval_train_eval_reconstruction": 0.2269882708787918,
      "eval_train_eval_reconstruction_<cls>": 0.8629441857337952,
      "eval_train_loss": 0.02336050197482109,
      "eval_train_runtime": 0.7092,
      "eval_train_samples_per_second": 139.591,
      "eval_train_steps_per_second": 18.33,
      "step": 9700
    },
    {
      "epoch": 746.92,
      "learning_rate": 1.1186360922521744e-05,
      "loss": 0.0254,
      "step": 9710
    },
    {
      "epoch": 747.69,
      "learning_rate": 1.110581802543122e-05,
      "loss": 0.0294,
      "step": 9720
    },
    {
      "epoch": 748.46,
      "learning_rate": 1.1025529894153052e-05,
      "loss": 0.0261,
      "step": 9730
    },
    {
      "epoch": 749.23,
      "learning_rate": 1.0945497054592917e-05,
      "loss": 0.0245,
      "step": 9740
    },
    {
      "epoch": 750.0,
      "learning_rate": 1.0865720030984267e-05,
      "loss": 0.0299,
      "step": 9750
    },
    {
      "epoch": 750.77,
      "learning_rate": 1.0786199345884924e-05,
      "loss": 0.0259,
      "step": 9760
    },
    {
      "epoch": 751.54,
      "learning_rate": 1.0706935520173578e-05,
      "loss": 0.0231,
      "step": 9770
    },
    {
      "epoch": 752.31,
      "learning_rate": 1.0627929073046483e-05,
      "loss": 0.0273,
      "step": 9780
    },
    {
      "epoch": 753.08,
      "learning_rate": 1.054918052201399e-05,
      "loss": 0.027,
      "step": 9790
    },
    {
      "epoch": 753.85,
      "learning_rate": 1.0470690382897163e-05,
      "loss": 0.0232,
      "step": 9800
    },
    {
      "epoch": 753.85,
      "eval_valid_eval_loss": 6.8023505210876465,
      "eval_valid_eval_loss_<cls>": 6.8473663330078125,
      "eval_valid_eval_perplexity_batch": 899.960205078125,
      "eval_valid_eval_perplexity_res": 8496.498046875,
      "eval_valid_eval_perplexity_seq": 899.960205078125,
      "eval_valid_eval_reconstruction": 0.07588357478380203,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.8023505210876465,
      "eval_valid_runtime": 0.3484,
      "eval_valid_samples_per_second": 2.87,
      "eval_valid_steps_per_second": 2.87,
      "step": 9800
    },
    {
      "epoch": 753.85,
      "eval_train_eval_loss": 0.015851540490984917,
      "eval_train_eval_loss_<cls>": 0.6710284352302551,
      "eval_train_eval_perplexity_batch": 1.0159778594970703,
      "eval_train_eval_perplexity_res": 2.968815803527832,
      "eval_train_eval_perplexity_seq": 1.0160844326019287,
      "eval_train_eval_reconstruction": 0.2885178327560425,
      "eval_train_eval_reconstruction_<cls>": 0.89449542760849,
      "eval_train_loss": 0.02618546225130558,
      "eval_train_runtime": 0.6715,
      "eval_train_samples_per_second": 147.44,
      "eval_train_steps_per_second": 19.361,
      "step": 9800
    },
    {
      "epoch": 754.62,
      "learning_rate": 1.0392459169824426e-05,
      "loss": 0.0267,
      "step": 9810
    },
    {
      "epoch": 755.38,
      "learning_rate": 1.0314487395228157e-05,
      "loss": 0.0281,
      "step": 9820
    },
    {
      "epoch": 756.15,
      "learning_rate": 1.0236775569841368e-05,
      "loss": 0.0225,
      "step": 9830
    },
    {
      "epoch": 756.92,
      "learning_rate": 1.0159324202694337e-05,
      "loss": 0.0221,
      "step": 9840
    },
    {
      "epoch": 757.69,
      "learning_rate": 1.0082133801111293e-05,
      "loss": 0.022,
      "step": 9850
    },
    {
      "epoch": 758.46,
      "learning_rate": 1.0005204870707047e-05,
      "loss": 0.026,
      "step": 9860
    },
    {
      "epoch": 759.23,
      "learning_rate": 9.928537915383745e-06,
      "loss": 0.0253,
      "step": 9870
    },
    {
      "epoch": 760.0,
      "learning_rate": 9.852133437327538e-06,
      "loss": 0.0245,
      "step": 9880
    },
    {
      "epoch": 760.77,
      "learning_rate": 9.775991937005274e-06,
      "loss": 0.023,
      "step": 9890
    },
    {
      "epoch": 761.54,
      "learning_rate": 9.70011391316124e-06,
      "loss": 0.0267,
      "step": 9900
    },
    {
      "epoch": 761.54,
      "eval_valid_eval_loss": 6.803272724151611,
      "eval_valid_eval_loss_<cls>": 6.266879558563232,
      "eval_valid_eval_perplexity_batch": 900.79052734375,
      "eval_valid_eval_perplexity_res": 8381.53125,
      "eval_valid_eval_perplexity_seq": 900.79052734375,
      "eval_valid_eval_reconstruction": 0.08385743945837021,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.8032732009887695,
      "eval_valid_runtime": 0.3725,
      "eval_valid_samples_per_second": 2.685,
      "eval_valid_steps_per_second": 2.685,
      "step": 9900
    },
    {
      "epoch": 761.54,
      "eval_train_eval_loss": 0.011176512576639652,
      "eval_train_eval_loss_<cls>": 0.7866771221160889,
      "eval_train_eval_perplexity_batch": 1.0112391710281372,
      "eval_train_eval_perplexity_res": 2.373685836791992,
      "eval_train_eval_perplexity_seq": 1.011297583580017,
      "eval_train_eval_reconstruction": 0.2037060558795929,
      "eval_train_eval_reconstruction_<cls>": 0.8656716346740723,
      "eval_train_loss": 0.02575542964041233,
      "eval_train_runtime": 0.7105,
      "eval_train_samples_per_second": 139.345,
      "eval_train_steps_per_second": 18.298,
      "step": 9900
    },
    {
      "epoch": 762.31,
      "learning_rate": 9.624499862813908e-06,
      "loss": 0.0226,
      "step": 9910
    },
    {
      "epoch": 763.08,
      "learning_rate": 9.549150281252633e-06,
      "loss": 0.0242,
      "step": 9920
    },
    {
      "epoch": 763.85,
      "learning_rate": 9.474065662034466e-06,
      "loss": 0.0233,
      "step": 9930
    },
    {
      "epoch": 764.62,
      "learning_rate": 9.399246496980878e-06,
      "loss": 0.028,
      "step": 9940
    },
    {
      "epoch": 765.38,
      "learning_rate": 9.324693276174567e-06,
      "loss": 0.0261,
      "step": 9950
    },
    {
      "epoch": 766.15,
      "learning_rate": 9.250406487956226e-06,
      "loss": 0.0239,
      "step": 9960
    },
    {
      "epoch": 766.92,
      "learning_rate": 9.176386618921351e-06,
      "loss": 0.022,
      "step": 9970
    },
    {
      "epoch": 767.69,
      "learning_rate": 9.102634153917062e-06,
      "loss": 0.0259,
      "step": 9980
    },
    {
      "epoch": 768.46,
      "learning_rate": 9.029149576038925e-06,
      "loss": 0.0233,
      "step": 9990
    },
    {
      "epoch": 769.23,
      "learning_rate": 8.955933366627778e-06,
      "loss": 0.0262,
      "step": 10000
    },
    {
      "epoch": 769.23,
      "eval_valid_eval_loss": 6.828697681427002,
      "eval_valid_eval_loss_<cls>": 6.987382888793945,
      "eval_valid_eval_perplexity_batch": 923.9866943359375,
      "eval_valid_eval_perplexity_res": 8421.5625,
      "eval_valid_eval_perplexity_seq": 923.9866943359375,
      "eval_valid_eval_reconstruction": 0.07966457307338715,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.828697204589844,
      "eval_valid_runtime": 0.3908,
      "eval_valid_samples_per_second": 2.559,
      "eval_valid_steps_per_second": 2.559,
      "step": 10000
    },
    {
      "epoch": 769.23,
      "eval_train_eval_loss": 0.01385403424501419,
      "eval_train_eval_loss_<cls>": 0.6893555521965027,
      "eval_train_eval_perplexity_batch": 1.0139504671096802,
      "eval_train_eval_perplexity_res": 2.527881622314453,
      "eval_train_eval_perplexity_seq": 1.0140231847763062,
      "eval_train_eval_reconstruction": 0.25673577189445496,
      "eval_train_eval_reconstruction_<cls>": 0.9217391014099121,
      "eval_train_loss": 0.025682998821139336,
      "eval_train_runtime": 0.7057,
      "eval_train_samples_per_second": 140.278,
      "eval_train_steps_per_second": 18.42,
      "step": 10000
    },
    {
      "epoch": 770.0,
      "learning_rate": 8.882986005266574e-06,
      "loss": 0.0279,
      "step": 10010
    },
    {
      "epoch": 770.77,
      "learning_rate": 8.810307969777288e-06,
      "loss": 0.0226,
      "step": 10020
    },
    {
      "epoch": 771.54,
      "learning_rate": 8.737899736217685e-06,
      "loss": 0.0243,
      "step": 10030
    },
    {
      "epoch": 772.31,
      "learning_rate": 8.665761778878323e-06,
      "loss": 0.0225,
      "step": 10040
    },
    {
      "epoch": 773.08,
      "learning_rate": 8.593894570279365e-06,
      "loss": 0.0276,
      "step": 10050
    },
    {
      "epoch": 773.85,
      "learning_rate": 8.522298581167504e-06,
      "loss": 0.0262,
      "step": 10060
    },
    {
      "epoch": 774.62,
      "learning_rate": 8.450974280512897e-06,
      "loss": 0.0265,
      "step": 10070
    },
    {
      "epoch": 775.38,
      "learning_rate": 8.379922135506069e-06,
      "loss": 0.0253,
      "step": 10080
    },
    {
      "epoch": 776.15,
      "learning_rate": 8.30914261155486e-06,
      "loss": 0.0242,
      "step": 10090
    },
    {
      "epoch": 776.92,
      "learning_rate": 8.238636172281395e-06,
      "loss": 0.0254,
      "step": 10100
    },
    {
      "epoch": 776.92,
      "eval_valid_eval_loss": 6.7959136962890625,
      "eval_valid_eval_loss_<cls>": 9.527302742004395,
      "eval_valid_eval_perplexity_batch": 894.1859130859375,
      "eval_valid_eval_perplexity_res": 7405.51318359375,
      "eval_valid_eval_perplexity_seq": 894.1859130859375,
      "eval_valid_eval_reconstruction": 0.06564551591873169,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.7959136962890625,
      "eval_valid_runtime": 0.3818,
      "eval_valid_samples_per_second": 2.619,
      "eval_valid_steps_per_second": 2.619,
      "step": 10100
    },
    {
      "epoch": 776.92,
      "eval_train_eval_loss": 0.010061243548989296,
      "eval_train_eval_loss_<cls>": 0.7801128625869751,
      "eval_train_eval_perplexity_batch": 1.0101120471954346,
      "eval_train_eval_perplexity_res": 2.0746541023254395,
      "eval_train_eval_perplexity_seq": 1.0101861953735352,
      "eval_train_eval_reconstruction": 0.18182118237018585,
      "eval_train_eval_reconstruction_<cls>": 0.8893616795539856,
      "eval_train_loss": 0.02180519700050354,
      "eval_train_runtime": 0.994,
      "eval_train_samples_per_second": 99.602,
      "eval_train_steps_per_second": 13.079,
      "step": 10100
    },
    {
      "epoch": 777.69,
      "learning_rate": 8.168403279519027e-06,
      "loss": 0.021,
      "step": 10110
    },
    {
      "epoch": 778.46,
      "learning_rate": 8.09844439330929e-06,
      "loss": 0.0267,
      "step": 10120
    },
    {
      "epoch": 779.23,
      "learning_rate": 8.028759971898947e-06,
      "loss": 0.0223,
      "step": 10130
    },
    {
      "epoch": 780.0,
      "learning_rate": 7.959350471736936e-06,
      "loss": 0.0219,
      "step": 10140
    },
    {
      "epoch": 780.77,
      "learning_rate": 7.890216347471407e-06,
      "loss": 0.0235,
      "step": 10150
    },
    {
      "epoch": 781.54,
      "learning_rate": 7.821358051946731e-06,
      "loss": 0.0231,
      "step": 10160
    },
    {
      "epoch": 782.31,
      "learning_rate": 7.752776036200532e-06,
      "loss": 0.0229,
      "step": 10170
    },
    {
      "epoch": 783.08,
      "learning_rate": 7.684470749460742e-06,
      "loss": 0.0251,
      "step": 10180
    },
    {
      "epoch": 783.85,
      "learning_rate": 7.616442639142673e-06,
      "loss": 0.0224,
      "step": 10190
    },
    {
      "epoch": 784.62,
      "learning_rate": 7.548692150846021e-06,
      "loss": 0.0222,
      "step": 10200
    },
    {
      "epoch": 784.62,
      "eval_valid_eval_loss": 6.88859748840332,
      "eval_valid_eval_loss_<cls>": 5.881895065307617,
      "eval_valid_eval_perplexity_batch": 981.0245361328125,
      "eval_valid_eval_perplexity_res": 9925.322265625,
      "eval_valid_eval_perplexity_seq": 981.0245361328125,
      "eval_valid_eval_reconstruction": 0.08843537420034409,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.88859748840332,
      "eval_valid_runtime": 0.3528,
      "eval_valid_samples_per_second": 2.834,
      "eval_valid_steps_per_second": 2.834,
      "step": 10200
    },
    {
      "epoch": 784.62,
      "eval_train_eval_loss": 0.009820813313126564,
      "eval_train_eval_loss_<cls>": 0.7148778438568115,
      "eval_train_eval_perplexity_batch": 1.0098692178726196,
      "eval_train_eval_perplexity_res": 2.1857588291168213,
      "eval_train_eval_perplexity_seq": 1.0099399089813232,
      "eval_train_eval_reconstruction": 0.18185153603553772,
      "eval_train_eval_reconstruction_<cls>": 0.8571428656578064,
      "eval_train_loss": 0.02356754243373871,
      "eval_train_runtime": 0.7434,
      "eval_train_samples_per_second": 133.167,
      "eval_train_steps_per_second": 17.487,
      "step": 10200
    },
    {
      "epoch": 785.38,
      "learning_rate": 7.4812197283520346e-06,
      "loss": 0.0268,
      "step": 10210
    },
    {
      "epoch": 786.15,
      "learning_rate": 7.414025813620562e-06,
      "loss": 0.0206,
      "step": 10220
    },
    {
      "epoch": 786.92,
      "learning_rate": 7.347110846787148e-06,
      "loss": 0.023,
      "step": 10230
    },
    {
      "epoch": 787.69,
      "learning_rate": 7.280475266160175e-06,
      "loss": 0.0255,
      "step": 10240
    },
    {
      "epoch": 788.46,
      "learning_rate": 7.214119508217976e-06,
      "loss": 0.0249,
      "step": 10250
    },
    {
      "epoch": 789.23,
      "learning_rate": 7.148044007605992e-06,
      "loss": 0.0265,
      "step": 10260
    },
    {
      "epoch": 790.0,
      "learning_rate": 7.082249197133906e-06,
      "loss": 0.0227,
      "step": 10270
    },
    {
      "epoch": 790.77,
      "learning_rate": 7.01673550777282e-06,
      "loss": 0.0235,
      "step": 10280
    },
    {
      "epoch": 791.54,
      "learning_rate": 6.951503368652407e-06,
      "loss": 0.0227,
      "step": 10290
    },
    {
      "epoch": 792.31,
      "learning_rate": 6.886553207058149e-06,
      "loss": 0.0227,
      "step": 10300
    },
    {
      "epoch": 792.31,
      "eval_valid_eval_loss": 6.847240447998047,
      "eval_valid_eval_loss_<cls>": 5.803927898406982,
      "eval_valid_eval_perplexity_batch": 941.27978515625,
      "eval_valid_eval_perplexity_res": 9051.388671875,
      "eval_valid_eval_perplexity_seq": 941.27978515625,
      "eval_valid_eval_reconstruction": 0.08130762726068497,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.847240924835205,
      "eval_valid_runtime": 0.3697,
      "eval_valid_samples_per_second": 2.705,
      "eval_valid_steps_per_second": 2.705,
      "step": 10300
    },
    {
      "epoch": 792.31,
      "eval_train_eval_loss": 0.010743041522800922,
      "eval_train_eval_loss_<cls>": 0.7032456994056702,
      "eval_train_eval_perplexity_batch": 1.0108009576797485,
      "eval_train_eval_perplexity_res": 2.4287607669830322,
      "eval_train_eval_perplexity_seq": 1.0108833312988281,
      "eval_train_eval_reconstruction": 0.19968761503696442,
      "eval_train_eval_reconstruction_<cls>": 0.9115044474601746,
      "eval_train_loss": 0.021518083289265633,
      "eval_train_runtime": 0.8643,
      "eval_train_samples_per_second": 114.543,
      "eval_train_steps_per_second": 15.041,
      "step": 10300
    },
    {
      "epoch": 793.08,
      "learning_rate": 6.821885448428506e-06,
      "loss": 0.024,
      "step": 10310
    },
    {
      "epoch": 793.85,
      "learning_rate": 6.7575005163521225e-06,
      "loss": 0.0249,
      "step": 10320
    },
    {
      "epoch": 794.62,
      "learning_rate": 6.693398832565073e-06,
      "loss": 0.026,
      "step": 10330
    },
    {
      "epoch": 795.38,
      "learning_rate": 6.629580816948089e-06,
      "loss": 0.024,
      "step": 10340
    },
    {
      "epoch": 796.15,
      "learning_rate": 6.5660468875238076e-06,
      "loss": 0.02,
      "step": 10350
    },
    {
      "epoch": 796.92,
      "learning_rate": 6.502797460454041e-06,
      "loss": 0.0222,
      "step": 10360
    },
    {
      "epoch": 797.69,
      "learning_rate": 6.439832950037039e-06,
      "loss": 0.0208,
      "step": 10370
    },
    {
      "epoch": 798.46,
      "learning_rate": 6.377153768704785e-06,
      "loss": 0.0247,
      "step": 10380
    },
    {
      "epoch": 799.23,
      "learning_rate": 6.314760327020297e-06,
      "loss": 0.0247,
      "step": 10390
    },
    {
      "epoch": 800.0,
      "learning_rate": 6.2526530336749265e-06,
      "loss": 0.0264,
      "step": 10400
    },
    {
      "epoch": 800.0,
      "eval_valid_eval_loss": 6.946436882019043,
      "eval_valid_eval_loss_<cls>": 4.256481647491455,
      "eval_valid_eval_perplexity_batch": 1039.439453125,
      "eval_valid_eval_perplexity_res": 10106.857421875,
      "eval_valid_eval_perplexity_seq": 1039.439453125,
      "eval_valid_eval_reconstruction": 0.07399576902389526,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.946437835693359,
      "eval_valid_runtime": 0.3686,
      "eval_valid_samples_per_second": 2.713,
      "eval_valid_steps_per_second": 2.713,
      "step": 10400
    },
    {
      "epoch": 800.0,
      "eval_train_eval_loss": 0.009680314920842648,
      "eval_train_eval_loss_<cls>": 0.6755750775337219,
      "eval_train_eval_perplexity_batch": 1.0097273588180542,
      "eval_train_eval_perplexity_res": 1.7974724769592285,
      "eval_train_eval_perplexity_seq": 1.0097864866256714,
      "eval_train_eval_reconstruction": 0.1880849152803421,
      "eval_train_eval_reconstruction_<cls>": 0.9090909361839294,
      "eval_train_loss": 0.022059837356209755,
      "eval_train_runtime": 0.9508,
      "eval_train_samples_per_second": 104.118,
      "eval_train_steps_per_second": 13.672,
      "step": 10400
    },
    {
      "epoch": 800.77,
      "learning_rate": 6.190832295485688e-06,
      "loss": 0.0199,
      "step": 10410
    },
    {
      "epoch": 801.54,
      "learning_rate": 6.129298517392601e-06,
      "loss": 0.0196,
      "step": 10420
    },
    {
      "epoch": 802.31,
      "learning_rate": 6.0680521024560125e-06,
      "loss": 0.0226,
      "step": 10430
    },
    {
      "epoch": 803.08,
      "learning_rate": 6.007093451853995e-06,
      "loss": 0.022,
      "step": 10440
    },
    {
      "epoch": 803.85,
      "learning_rate": 5.946422964879706e-06,
      "loss": 0.0251,
      "step": 10450
    },
    {
      "epoch": 804.62,
      "learning_rate": 5.886041038938722e-06,
      "loss": 0.0213,
      "step": 10460
    },
    {
      "epoch": 805.38,
      "learning_rate": 5.825948069546516e-06,
      "loss": 0.0228,
      "step": 10470
    },
    {
      "epoch": 806.15,
      "learning_rate": 5.766144450325828e-06,
      "loss": 0.0216,
      "step": 10480
    },
    {
      "epoch": 806.92,
      "learning_rate": 5.706630573004068e-06,
      "loss": 0.0236,
      "step": 10490
    },
    {
      "epoch": 807.69,
      "learning_rate": 5.647406827410795e-06,
      "loss": 0.0222,
      "step": 10500
    },
    {
      "epoch": 807.69,
      "eval_valid_eval_loss": 6.862207889556885,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 955.4743041992188,
      "eval_valid_eval_perplexity_res": 10533.484375,
      "eval_valid_eval_perplexity_seq": 955.4743041992188,
      "eval_valid_eval_reconstruction": 0.09056603908538818,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.862207412719727,
      "eval_valid_runtime": 0.3441,
      "eval_valid_samples_per_second": 2.906,
      "eval_valid_steps_per_second": 2.906,
      "step": 10500
    },
    {
      "epoch": 807.69,
      "eval_train_eval_loss": 0.009160429239273071,
      "eval_train_eval_loss_<cls>": 0.7475155591964722,
      "eval_train_eval_perplexity_batch": 1.009202480316162,
      "eval_train_eval_perplexity_res": 2.2422587871551514,
      "eval_train_eval_perplexity_seq": 1.009257197380066,
      "eval_train_eval_reconstruction": 0.1783241182565689,
      "eval_train_eval_reconstruction_<cls>": 0.9038461446762085,
      "eval_train_loss": 0.022420547902584076,
      "eval_train_runtime": 0.7182,
      "eval_train_samples_per_second": 137.837,
      "eval_train_steps_per_second": 18.1,
      "step": 10500
    },
    {
      "epoch": 808.46,
      "learning_rate": 5.588473601475114e-06,
      "loss": 0.023,
      "step": 10510
    },
    {
      "epoch": 809.23,
      "learning_rate": 5.52983128122318e-06,
      "loss": 0.0236,
      "step": 10520
    },
    {
      "epoch": 810.0,
      "learning_rate": 5.471480250775651e-06,
      "loss": 0.026,
      "step": 10530
    },
    {
      "epoch": 810.77,
      "learning_rate": 5.413420892345128e-06,
      "loss": 0.0228,
      "step": 10540
    },
    {
      "epoch": 811.54,
      "learning_rate": 5.35565358623375e-06,
      "loss": 0.0211,
      "step": 10550
    },
    {
      "epoch": 812.31,
      "learning_rate": 5.298178710830609e-06,
      "loss": 0.0222,
      "step": 10560
    },
    {
      "epoch": 813.08,
      "learning_rate": 5.240996642609325e-06,
      "loss": 0.0234,
      "step": 10570
    },
    {
      "epoch": 813.85,
      "learning_rate": 5.1841077561255614e-06,
      "loss": 0.0223,
      "step": 10580
    },
    {
      "epoch": 814.62,
      "learning_rate": 5.127512424014569e-06,
      "loss": 0.022,
      "step": 10590
    },
    {
      "epoch": 815.38,
      "learning_rate": 5.0712110169887525e-06,
      "loss": 0.0202,
      "step": 10600
    },
    {
      "epoch": 815.38,
      "eval_valid_eval_loss": 6.839648723602295,
      "eval_valid_eval_loss_<cls>": 4.576301574707031,
      "eval_valid_eval_perplexity_batch": 934.1609497070312,
      "eval_valid_eval_perplexity_res": 7973.22119140625,
      "eval_valid_eval_perplexity_seq": 934.1609497070312,
      "eval_valid_eval_reconstruction": 0.07658643275499344,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.839648723602295,
      "eval_valid_runtime": 0.371,
      "eval_valid_samples_per_second": 2.696,
      "eval_valid_steps_per_second": 2.696,
      "step": 10600
    },
    {
      "epoch": 815.38,
      "eval_train_eval_loss": 0.009840264916419983,
      "eval_train_eval_loss_<cls>": 0.6344262957572937,
      "eval_train_eval_perplexity_batch": 1.0098888874053955,
      "eval_train_eval_perplexity_res": 2.1800851821899414,
      "eval_train_eval_perplexity_seq": 1.0099565982818604,
      "eval_train_eval_reconstruction": 0.18433105945587158,
      "eval_train_eval_reconstruction_<cls>": 0.9189189076423645,
      "eval_train_loss": 0.02283162996172905,
      "eval_train_runtime": 0.8206,
      "eval_train_samples_per_second": 120.64,
      "eval_train_steps_per_second": 15.842,
      "step": 10600
    },
    {
      "epoch": 816.15,
      "learning_rate": 5.0152039038352475e-06,
      "loss": 0.023,
      "step": 10610
    },
    {
      "epoch": 816.92,
      "learning_rate": 4.9594914514134894e-06,
      "loss": 0.021,
      "step": 10620
    },
    {
      "epoch": 817.69,
      "learning_rate": 4.904074024652811e-06,
      "loss": 0.0258,
      "step": 10630
    },
    {
      "epoch": 818.46,
      "learning_rate": 4.848951986550071e-06,
      "loss": 0.0252,
      "step": 10640
    },
    {
      "epoch": 819.23,
      "learning_rate": 4.794125698167262e-06,
      "loss": 0.0239,
      "step": 10650
    },
    {
      "epoch": 820.0,
      "learning_rate": 4.739595518629153e-06,
      "loss": 0.0236,
      "step": 10660
    },
    {
      "epoch": 820.77,
      "learning_rate": 4.6853618051209134e-06,
      "loss": 0.0208,
      "step": 10670
    },
    {
      "epoch": 821.54,
      "learning_rate": 4.631424912885824e-06,
      "loss": 0.0244,
      "step": 10680
    },
    {
      "epoch": 822.31,
      "learning_rate": 4.5777851952228775e-06,
      "loss": 0.0222,
      "step": 10690
    },
    {
      "epoch": 823.08,
      "learning_rate": 4.5244430034845465e-06,
      "loss": 0.0232,
      "step": 10700
    },
    {
      "epoch": 823.08,
      "eval_valid_eval_loss": 6.961528301239014,
      "eval_valid_eval_loss_<cls>": 7.916636943817139,
      "eval_valid_eval_perplexity_batch": 1055.2451171875,
      "eval_valid_eval_perplexity_res": 10283.0224609375,
      "eval_valid_eval_perplexity_seq": 1055.2451171875,
      "eval_valid_eval_reconstruction": 0.07731958478689194,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.96152925491333,
      "eval_valid_runtime": 0.3672,
      "eval_valid_samples_per_second": 2.723,
      "eval_valid_steps_per_second": 2.723,
      "step": 10700
    },
    {
      "epoch": 823.08,
      "eval_train_eval_loss": 0.010453755035996437,
      "eval_train_eval_loss_<cls>": 0.7180119752883911,
      "eval_train_eval_perplexity_batch": 1.0105085372924805,
      "eval_train_eval_perplexity_res": 2.6419310569763184,
      "eval_train_eval_perplexity_seq": 1.0105654001235962,
      "eval_train_eval_reconstruction": 0.19849061965942383,
      "eval_train_eval_reconstruction_<cls>": 0.9126213788986206,
      "eval_train_loss": 0.023634497076272964,
      "eval_train_runtime": 0.8974,
      "eval_train_samples_per_second": 110.32,
      "eval_train_steps_per_second": 14.487,
      "step": 10700
    },
    {
      "epoch": 823.85,
      "learning_rate": 4.471398687074419e-06,
      "loss": 0.0239,
      "step": 10710
    },
    {
      "epoch": 824.62,
      "learning_rate": 4.418652593444916e-06,
      "loss": 0.019,
      "step": 10720
    },
    {
      "epoch": 825.38,
      "learning_rate": 4.366205068095064e-06,
      "loss": 0.023,
      "step": 10730
    },
    {
      "epoch": 826.15,
      "learning_rate": 4.3140564545681776e-06,
      "loss": 0.0216,
      "step": 10740
    },
    {
      "epoch": 826.92,
      "learning_rate": 4.262207094449633e-06,
      "loss": 0.025,
      "step": 10750
    },
    {
      "epoch": 827.69,
      "learning_rate": 4.210657327364631e-06,
      "loss": 0.0241,
      "step": 10760
    },
    {
      "epoch": 828.46,
      "learning_rate": 4.15940749097598e-06,
      "loss": 0.0252,
      "step": 10770
    },
    {
      "epoch": 829.23,
      "learning_rate": 4.1084579209818505e-06,
      "loss": 0.0219,
      "step": 10780
    },
    {
      "epoch": 830.0,
      "learning_rate": 4.057808951113617e-06,
      "loss": 0.0225,
      "step": 10790
    },
    {
      "epoch": 830.77,
      "learning_rate": 4.0074609131336425e-06,
      "loss": 0.0222,
      "step": 10800
    },
    {
      "epoch": 830.77,
      "eval_valid_eval_loss": 6.947589874267578,
      "eval_valid_eval_loss_<cls>": 4.270355701446533,
      "eval_valid_eval_perplexity_batch": 1040.638671875,
      "eval_valid_eval_perplexity_res": 10212.3212890625,
      "eval_valid_eval_perplexity_seq": 1040.638671875,
      "eval_valid_eval_reconstruction": 0.08742004632949829,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.947589874267578,
      "eval_valid_runtime": 0.402,
      "eval_valid_samples_per_second": 2.487,
      "eval_valid_steps_per_second": 2.487,
      "step": 10800
    },
    {
      "epoch": 830.77,
      "eval_train_eval_loss": 0.010687319561839104,
      "eval_train_eval_loss_<cls>": 0.731237530708313,
      "eval_train_eval_perplexity_batch": 1.0107446908950806,
      "eval_train_eval_perplexity_res": 2.1861259937286377,
      "eval_train_eval_perplexity_seq": 1.0108072757720947,
      "eval_train_eval_reconstruction": 0.21153277158737183,
      "eval_train_eval_reconstruction_<cls>": 0.8936170339584351,
      "eval_train_loss": 0.02377024106681347,
      "eval_train_runtime": 0.9157,
      "eval_train_samples_per_second": 108.116,
      "eval_train_steps_per_second": 14.197,
      "step": 10800
    },
    {
      "epoch": 831.54,
      "learning_rate": 3.9574141368331315e-06,
      "loss": 0.0219,
      "step": 10810
    },
    {
      "epoch": 832.31,
      "learning_rate": 3.907668950029941e-06,
      "loss": 0.0218,
      "step": 10820
    },
    {
      "epoch": 833.08,
      "learning_rate": 3.8582256785664575e-06,
      "loss": 0.0219,
      "step": 10830
    },
    {
      "epoch": 833.85,
      "learning_rate": 3.809084646307448e-06,
      "loss": 0.0199,
      "step": 10840
    },
    {
      "epoch": 834.62,
      "learning_rate": 3.760246175137938e-06,
      "loss": 0.0253,
      "step": 10850
    },
    {
      "epoch": 835.38,
      "learning_rate": 3.7117105849611256e-06,
      "loss": 0.0246,
      "step": 10860
    },
    {
      "epoch": 836.15,
      "learning_rate": 3.6634781936962515e-06,
      "loss": 0.0229,
      "step": 10870
    },
    {
      "epoch": 836.92,
      "learning_rate": 3.6155493172765508e-06,
      "loss": 0.0217,
      "step": 10880
    },
    {
      "epoch": 837.69,
      "learning_rate": 3.5679242696471438e-06,
      "loss": 0.0261,
      "step": 10890
    },
    {
      "epoch": 838.46,
      "learning_rate": 3.520603362763014e-06,
      "loss": 0.0205,
      "step": 10900
    },
    {
      "epoch": 838.46,
      "eval_valid_eval_loss": 6.958984851837158,
      "eval_valid_eval_loss_<cls>": 6.929849147796631,
      "eval_valid_eval_perplexity_batch": 1052.564453125,
      "eval_valid_eval_perplexity_res": 9546.943359375,
      "eval_valid_eval_perplexity_seq": 1052.564453125,
      "eval_valid_eval_reconstruction": 0.07526881992816925,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.958985805511475,
      "eval_valid_runtime": 0.356,
      "eval_valid_samples_per_second": 2.809,
      "eval_valid_steps_per_second": 2.809,
      "step": 10900
    },
    {
      "epoch": 838.46,
      "eval_train_eval_loss": 0.013702702708542347,
      "eval_train_eval_loss_<cls>": 0.6720725893974304,
      "eval_train_eval_perplexity_batch": 1.0137970447540283,
      "eval_train_eval_perplexity_res": 2.2467989921569824,
      "eval_train_eval_perplexity_seq": 1.0138778686523438,
      "eval_train_eval_reconstruction": 0.2792002260684967,
      "eval_train_eval_reconstruction_<cls>": 0.9058296084403992,
      "eval_train_loss": 0.023150993511080742,
      "eval_train_runtime": 0.613,
      "eval_train_samples_per_second": 161.49,
      "eval_train_steps_per_second": 21.206,
      "step": 10900
    },
    {
      "epoch": 839.23,
      "learning_rate": 3.473586906586962e-06,
      "loss": 0.0229,
      "step": 10910
    },
    {
      "epoch": 840.0,
      "learning_rate": 3.4268752090875543e-06,
      "loss": 0.0221,
      "step": 10920
    },
    {
      "epoch": 840.77,
      "learning_rate": 3.380468576237128e-06,
      "loss": 0.0231,
      "step": 10930
    },
    {
      "epoch": 841.54,
      "learning_rate": 3.3343673120097674e-06,
      "loss": 0.027,
      "step": 10940
    },
    {
      "epoch": 842.31,
      "learning_rate": 3.2885717183793296e-06,
      "loss": 0.0238,
      "step": 10950
    },
    {
      "epoch": 843.08,
      "learning_rate": 3.2430820953174623e-06,
      "loss": 0.0246,
      "step": 10960
    },
    {
      "epoch": 843.85,
      "learning_rate": 3.1978987407916162e-06,
      "loss": 0.0213,
      "step": 10970
    },
    {
      "epoch": 844.62,
      "learning_rate": 3.1530219507631296e-06,
      "loss": 0.0216,
      "step": 10980
    },
    {
      "epoch": 845.38,
      "learning_rate": 3.1084520191852763e-06,
      "loss": 0.0191,
      "step": 10990
    },
    {
      "epoch": 846.15,
      "learning_rate": 3.0641892380013247e-06,
      "loss": 0.0217,
      "step": 11000
    },
    {
      "epoch": 846.15,
      "eval_valid_eval_loss": 6.898919582366943,
      "eval_valid_eval_loss_<cls>": 7.204011917114258,
      "eval_valid_eval_perplexity_batch": 991.2032470703125,
      "eval_valid_eval_perplexity_res": 9693.5361328125,
      "eval_valid_eval_perplexity_seq": 991.2032470703125,
      "eval_valid_eval_reconstruction": 0.08004157990217209,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.898919105529785,
      "eval_valid_runtime": 0.405,
      "eval_valid_samples_per_second": 2.469,
      "eval_valid_steps_per_second": 2.469,
      "step": 11000
    },
    {
      "epoch": 846.15,
      "eval_train_eval_loss": 0.010717160068452358,
      "eval_train_eval_loss_<cls>": 0.6188697814941406,
      "eval_train_eval_perplexity_batch": 1.010774850845337,
      "eval_train_eval_perplexity_res": 2.1382739543914795,
      "eval_train_eval_perplexity_seq": 1.0108332633972168,
      "eval_train_eval_reconstruction": 0.22411754727363586,
      "eval_train_eval_reconstruction_<cls>": 0.9027777910232544,
      "eval_train_loss": 0.021648472175002098,
      "eval_train_runtime": 0.7907,
      "eval_train_samples_per_second": 125.213,
      "eval_train_steps_per_second": 16.442,
      "step": 11000
    },
    {
      "epoch": 846.92,
      "learning_rate": 3.0202338971426437e-06,
      "loss": 0.0207,
      "step": 11010
    },
    {
      "epoch": 847.69,
      "learning_rate": 2.9765862845268e-06,
      "loss": 0.0197,
      "step": 11020
    },
    {
      "epoch": 848.46,
      "learning_rate": 2.9332466860556686e-06,
      "loss": 0.0245,
      "step": 11030
    },
    {
      "epoch": 849.23,
      "learning_rate": 2.8902153856135484e-06,
      "loss": 0.0201,
      "step": 11040
    },
    {
      "epoch": 850.0,
      "learning_rate": 2.847492665065349e-06,
      "loss": 0.0247,
      "step": 11050
    },
    {
      "epoch": 850.77,
      "learning_rate": 2.805078804254663e-06,
      "loss": 0.0229,
      "step": 11060
    },
    {
      "epoch": 851.54,
      "learning_rate": 2.7629740810020176e-06,
      "loss": 0.0224,
      "step": 11070
    },
    {
      "epoch": 852.31,
      "learning_rate": 2.721178771103011e-06,
      "loss": 0.0206,
      "step": 11080
    },
    {
      "epoch": 853.08,
      "learning_rate": 2.6796931483265008e-06,
      "loss": 0.0209,
      "step": 11090
    },
    {
      "epoch": 853.85,
      "learning_rate": 2.6385174844128334e-06,
      "loss": 0.0217,
      "step": 11100
    },
    {
      "epoch": 853.85,
      "eval_valid_eval_loss": 6.890743732452393,
      "eval_valid_eval_loss_<cls>": 7.152748107910156,
      "eval_valid_eval_perplexity_batch": 983.13232421875,
      "eval_valid_eval_perplexity_res": 9649.2724609375,
      "eval_valid_eval_perplexity_seq": 983.13232421875,
      "eval_valid_eval_reconstruction": 0.07692307978868484,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.890743732452393,
      "eval_valid_runtime": 0.3995,
      "eval_valid_samples_per_second": 2.503,
      "eval_valid_steps_per_second": 2.503,
      "step": 11100
    },
    {
      "epoch": 853.85,
      "eval_train_eval_loss": 0.008153509348630905,
      "eval_train_eval_loss_<cls>": 0.6054012775421143,
      "eval_train_eval_perplexity_batch": 1.0081868171691895,
      "eval_train_eval_perplexity_res": 2.0005786418914795,
      "eval_train_eval_perplexity_seq": 1.0082240104675293,
      "eval_train_eval_reconstruction": 0.16663967072963715,
      "eval_train_eval_reconstruction_<cls>": 0.9383886456489563,
      "eval_train_loss": 0.02123594842851162,
      "eval_train_runtime": 0.7952,
      "eval_train_samples_per_second": 124.496,
      "eval_train_steps_per_second": 16.348,
      "step": 11100
    },
    {
      "epoch": 854.62,
      "learning_rate": 2.5976520490720526e-06,
      "loss": 0.0213,
      "step": 11110
    },
    {
      "epoch": 855.38,
      "learning_rate": 2.5570971099821373e-06,
      "loss": 0.0246,
      "step": 11120
    },
    {
      "epoch": 856.15,
      "learning_rate": 2.516852932787228e-06,
      "loss": 0.0231,
      "step": 11130
    },
    {
      "epoch": 856.92,
      "learning_rate": 2.4769197810959265e-06,
      "loss": 0.0208,
      "step": 11140
    },
    {
      "epoch": 857.69,
      "learning_rate": 2.4372979164795308e-06,
      "loss": 0.0206,
      "step": 11150
    },
    {
      "epoch": 858.46,
      "learning_rate": 2.397987598470336e-06,
      "loss": 0.0236,
      "step": 11160
    },
    {
      "epoch": 859.23,
      "learning_rate": 2.358989084559954e-06,
      "loss": 0.0241,
      "step": 11170
    },
    {
      "epoch": 860.0,
      "learning_rate": 2.320302630197585e-06,
      "loss": 0.0227,
      "step": 11180
    },
    {
      "epoch": 860.77,
      "learning_rate": 2.2819284887883765e-06,
      "loss": 0.0208,
      "step": 11190
    },
    {
      "epoch": 861.54,
      "learning_rate": 2.243866911691761e-06,
      "loss": 0.0219,
      "step": 11200
    },
    {
      "epoch": 861.54,
      "eval_valid_eval_loss": 6.676845073699951,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 793.8107299804688,
      "eval_valid_eval_perplexity_res": 6288.45263671875,
      "eval_valid_eval_perplexity_seq": 793.8107299804688,
      "eval_valid_eval_reconstruction": 0.0810810774564743,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.676844596862793,
      "eval_valid_runtime": 0.3875,
      "eval_valid_samples_per_second": 2.58,
      "eval_valid_steps_per_second": 2.58,
      "step": 11200
    },
    {
      "epoch": 861.54,
      "eval_train_eval_loss": 0.007687866687774658,
      "eval_train_eval_loss_<cls>": 0.67119961977005,
      "eval_train_eval_perplexity_batch": 1.007717490196228,
      "eval_train_eval_perplexity_res": 2.4383323192596436,
      "eval_train_eval_perplexity_seq": 1.007771611213684,
      "eval_train_eval_reconstruction": 0.15181632339954376,
      "eval_train_eval_reconstruction_<cls>": 0.907489001750946,
      "eval_train_loss": 0.020011892542243004,
      "eval_train_runtime": 0.785,
      "eval_train_samples_per_second": 126.118,
      "eval_train_steps_per_second": 16.561,
      "step": 11200
    },
    {
      "epoch": 862.31,
      "learning_rate": 2.206118148219799e-06,
      "loss": 0.0265,
      "step": 11210
    },
    {
      "epoch": 863.08,
      "learning_rate": 2.1686824456355558e-06,
      "loss": 0.022,
      "step": 11220
    },
    {
      "epoch": 863.85,
      "learning_rate": 2.1315600491514485e-06,
      "loss": 0.022,
      "step": 11230
    },
    {
      "epoch": 864.62,
      "learning_rate": 2.094751201927703e-06,
      "loss": 0.0221,
      "step": 11240
    },
    {
      "epoch": 865.38,
      "learning_rate": 2.0582561450707026e-06,
      "loss": 0.0225,
      "step": 11250
    },
    {
      "epoch": 866.15,
      "learning_rate": 2.0220751176314503e-06,
      "loss": 0.0227,
      "step": 11260
    },
    {
      "epoch": 866.92,
      "learning_rate": 1.9862083566039637e-06,
      "loss": 0.0232,
      "step": 11270
    },
    {
      "epoch": 867.69,
      "learning_rate": 1.950656096923764e-06,
      "loss": 0.0224,
      "step": 11280
    },
    {
      "epoch": 868.46,
      "learning_rate": 1.9154185714662985e-06,
      "loss": 0.0223,
      "step": 11290
    },
    {
      "epoch": 869.23,
      "learning_rate": 1.8804960110454406e-06,
      "loss": 0.02,
      "step": 11300
    },
    {
      "epoch": 869.23,
      "eval_valid_eval_loss": 6.906391143798828,
      "eval_valid_eval_loss_<cls>": 6.092331409454346,
      "eval_valid_eval_perplexity_batch": 998.6367797851562,
      "eval_valid_eval_perplexity_res": 9328.8291015625,
      "eval_valid_eval_perplexity_seq": 998.6367797851562,
      "eval_valid_eval_reconstruction": 0.07982739806175232,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.906391143798828,
      "eval_valid_runtime": 0.3842,
      "eval_valid_samples_per_second": 2.603,
      "eval_valid_steps_per_second": 2.603,
      "step": 11300
    },
    {
      "epoch": 869.23,
      "eval_train_eval_loss": 0.01159481331706047,
      "eval_train_eval_loss_<cls>": 0.7114319205284119,
      "eval_train_eval_perplexity_batch": 1.011662244796753,
      "eval_train_eval_perplexity_res": 2.2178473472595215,
      "eval_train_eval_perplexity_seq": 1.0117323398590088,
      "eval_train_eval_reconstruction": 0.2354690581560135,
      "eval_train_eval_reconstruction_<cls>": 0.9343434572219849,
      "eval_train_loss": 0.020665399730205536,
      "eval_train_runtime": 0.6516,
      "eval_train_samples_per_second": 151.929,
      "eval_train_steps_per_second": 19.95,
      "step": 11300
    },
    {
      "epoch": 870.0,
      "learning_rate": 1.8458886444119806e-06,
      "loss": 0.0187,
      "step": 11310
    },
    {
      "epoch": 870.77,
      "learning_rate": 1.8115966982520938e-06,
      "loss": 0.0214,
      "step": 11320
    },
    {
      "epoch": 871.54,
      "learning_rate": 1.7776203971858974e-06,
      "loss": 0.0211,
      "step": 11330
    },
    {
      "epoch": 872.31,
      "learning_rate": 1.743959963765951e-06,
      "loss": 0.0229,
      "step": 11340
    },
    {
      "epoch": 873.08,
      "learning_rate": 1.7106156184758249e-06,
      "loss": 0.0216,
      "step": 11350
    },
    {
      "epoch": 873.85,
      "learning_rate": 1.6775875797286234e-06,
      "loss": 0.0202,
      "step": 11360
    },
    {
      "epoch": 874.62,
      "learning_rate": 1.6448760638655692e-06,
      "loss": 0.0208,
      "step": 11370
    },
    {
      "epoch": 875.38,
      "learning_rate": 1.6124812851546045e-06,
      "loss": 0.023,
      "step": 11380
    },
    {
      "epoch": 876.15,
      "learning_rate": 1.5804034557889535e-06,
      "loss": 0.0218,
      "step": 11390
    },
    {
      "epoch": 876.92,
      "learning_rate": 1.5486427858857567e-06,
      "loss": 0.0215,
      "step": 11400
    },
    {
      "epoch": 876.92,
      "eval_valid_eval_loss": 6.6875834465026855,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 802.3809204101562,
      "eval_valid_eval_perplexity_res": 6263.6328125,
      "eval_valid_eval_perplexity_seq": 802.3809204101562,
      "eval_valid_eval_reconstruction": 0.0855855867266655,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.6875834465026855,
      "eval_valid_runtime": 0.3525,
      "eval_valid_samples_per_second": 2.837,
      "eval_valid_steps_per_second": 2.837,
      "step": 11400
    },
    {
      "epoch": 876.92,
      "eval_train_eval_loss": 0.010999321937561035,
      "eval_train_eval_loss_<cls>": 0.7235427498817444,
      "eval_train_eval_perplexity_batch": 1.0110599994659424,
      "eval_train_eval_perplexity_res": 2.3359272480010986,
      "eval_train_eval_perplexity_seq": 1.011117935180664,
      "eval_train_eval_reconstruction": 0.2274261713027954,
      "eval_train_eval_reconstruction_<cls>": 0.9230769276618958,
      "eval_train_loss": 0.02097875252366066,
      "eval_train_runtime": 0.6446,
      "eval_train_samples_per_second": 153.588,
      "eval_train_steps_per_second": 20.168,
      "step": 11400
    },
    {
      "epoch": 877.69,
      "learning_rate": 1.5171994834846826e-06,
      "loss": 0.0224,
      "step": 11410
    },
    {
      "epoch": 878.46,
      "learning_rate": 1.4860737545465742e-06,
      "loss": 0.0224,
      "step": 11420
    },
    {
      "epoch": 879.23,
      "learning_rate": 1.4552658029520994e-06,
      "loss": 0.0197,
      "step": 11430
    },
    {
      "epoch": 880.0,
      "learning_rate": 1.4247758305004077e-06,
      "loss": 0.0223,
      "step": 11440
    },
    {
      "epoch": 880.77,
      "learning_rate": 1.394604036907804e-06,
      "loss": 0.0232,
      "step": 11450
    },
    {
      "epoch": 881.54,
      "learning_rate": 1.3647506198064596e-06,
      "loss": 0.0248,
      "step": 11460
    },
    {
      "epoch": 882.31,
      "learning_rate": 1.3352157747431094e-06,
      "loss": 0.0216,
      "step": 11470
    },
    {
      "epoch": 883.08,
      "learning_rate": 1.305999695177762e-06,
      "loss": 0.0207,
      "step": 11480
    },
    {
      "epoch": 883.85,
      "learning_rate": 1.277102572482425e-06,
      "loss": 0.0223,
      "step": 11490
    },
    {
      "epoch": 884.62,
      "learning_rate": 1.2485245959398927e-06,
      "loss": 0.0275,
      "step": 11500
    },
    {
      "epoch": 884.62,
      "eval_valid_eval_loss": 6.871699810028076,
      "eval_valid_eval_loss_<cls>": 4.409453868865967,
      "eval_valid_eval_perplexity_batch": 964.5867919921875,
      "eval_valid_eval_perplexity_res": 9294.0068359375,
      "eval_valid_eval_perplexity_seq": 964.5867919921875,
      "eval_valid_eval_reconstruction": 0.08959537744522095,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.871700763702393,
      "eval_valid_runtime": 0.3713,
      "eval_valid_samples_per_second": 2.694,
      "eval_valid_steps_per_second": 2.694,
      "step": 11500
    },
    {
      "epoch": 884.62,
      "eval_train_eval_loss": 0.010697917081415653,
      "eval_train_eval_loss_<cls>": 0.5805320739746094,
      "eval_train_eval_perplexity_batch": 1.0107553005218506,
      "eval_train_eval_perplexity_res": 2.411928176879883,
      "eval_train_eval_perplexity_seq": 1.0108165740966797,
      "eval_train_eval_reconstruction": 0.2141442447900772,
      "eval_train_eval_reconstruction_<cls>": 0.9241706132888794,
      "eval_train_loss": 0.022208109498023987,
      "eval_train_runtime": 0.8968,
      "eval_train_samples_per_second": 110.397,
      "eval_train_steps_per_second": 14.497,
      "step": 11500
    },
    {
      "epoch": 885.38,
      "learning_rate": 1.220265952742461e-06,
      "loss": 0.0222,
      "step": 11510
    },
    {
      "epoch": 886.15,
      "learning_rate": 1.1923268279907306e-06,
      "loss": 0.0206,
      "step": 11520
    },
    {
      "epoch": 886.92,
      "learning_rate": 1.1647074046923666e-06,
      "loss": 0.0183,
      "step": 11530
    },
    {
      "epoch": 887.69,
      "learning_rate": 1.137407863760931e-06,
      "loss": 0.0222,
      "step": 11540
    },
    {
      "epoch": 888.46,
      "learning_rate": 1.1104283840146834e-06,
      "loss": 0.021,
      "step": 11550
    },
    {
      "epoch": 889.23,
      "learning_rate": 1.0837691421753949e-06,
      "loss": 0.0204,
      "step": 11560
    },
    {
      "epoch": 890.0,
      "learning_rate": 1.057430312867219e-06,
      "loss": 0.02,
      "step": 11570
    },
    {
      "epoch": 890.77,
      "learning_rate": 1.0314120686155282e-06,
      "loss": 0.0211,
      "step": 11580
    },
    {
      "epoch": 891.54,
      "learning_rate": 1.0057145798457846e-06,
      "loss": 0.0221,
      "step": 11590
    },
    {
      "epoch": 892.31,
      "learning_rate": 9.80338014882437e-07,
      "loss": 0.0196,
      "step": 11600
    },
    {
      "epoch": 892.31,
      "eval_valid_eval_loss": 6.915256023406982,
      "eval_valid_eval_loss_<cls>": 6.842366695404053,
      "eval_valid_eval_perplexity_batch": 1007.5289306640625,
      "eval_valid_eval_perplexity_res": 9317.6494140625,
      "eval_valid_eval_perplexity_seq": 1007.5289306640625,
      "eval_valid_eval_reconstruction": 0.07874865084886551,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.915256500244141,
      "eval_valid_runtime": 0.3614,
      "eval_valid_samples_per_second": 2.767,
      "eval_valid_steps_per_second": 2.767,
      "step": 11600
    },
    {
      "epoch": 892.31,
      "eval_train_eval_loss": 0.010006732307374477,
      "eval_train_eval_loss_<cls>": 0.5690823793411255,
      "eval_train_eval_perplexity_batch": 1.010056972503662,
      "eval_train_eval_perplexity_res": 2.3411388397216797,
      "eval_train_eval_perplexity_seq": 1.010114312171936,
      "eval_train_eval_reconstruction": 0.20512790977954865,
      "eval_train_eval_reconstruction_<cls>": 0.9276018142700195,
      "eval_train_loss": 0.02137184701859951,
      "eval_train_runtime": 0.6709,
      "eval_train_samples_per_second": 147.56,
      "eval_train_steps_per_second": 19.377,
      "step": 11600
    },
    {
      "epoch": 893.08,
      "learning_rate": 9.552825399478105e-07,
      "loss": 0.0223,
      "step": 11610
    },
    {
      "epoch": 893.85,
      "learning_rate": 9.305483191610065e-07,
      "loss": 0.0207,
      "step": 11620
    },
    {
      "epoch": 894.62,
      "learning_rate": 9.061355145368488e-07,
      "loss": 0.0245,
      "step": 11630
    },
    {
      "epoch": 895.38,
      "learning_rate": 8.820442859848011e-07,
      "loss": 0.0253,
      "step": 11640
    },
    {
      "epoch": 896.15,
      "learning_rate": 8.582747913079448e-07,
      "loss": 0.0228,
      "step": 11650
    },
    {
      "epoch": 896.92,
      "learning_rate": 8.348271862019086e-07,
      "loss": 0.0218,
      "step": 11660
    },
    {
      "epoch": 897.69,
      "learning_rate": 8.117016242538911e-07,
      "loss": 0.0227,
      "step": 11670
    },
    {
      "epoch": 898.46,
      "learning_rate": 7.888982569416281e-07,
      "loss": 0.0202,
      "step": 11680
    },
    {
      "epoch": 899.23,
      "learning_rate": 7.664172336323994e-07,
      "loss": 0.0203,
      "step": 11690
    },
    {
      "epoch": 900.0,
      "learning_rate": 7.442587015820734e-07,
      "loss": 0.0222,
      "step": 11700
    },
    {
      "epoch": 900.0,
      "eval_valid_eval_loss": 6.866573810577393,
      "eval_valid_eval_loss_<cls>": 9.581642150878906,
      "eval_valid_eval_perplexity_batch": 959.6549682617188,
      "eval_valid_eval_perplexity_res": 8101.447265625,
      "eval_valid_eval_perplexity_seq": 959.6549682617188,
      "eval_valid_eval_reconstruction": 0.07002188265323639,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.866573810577393,
      "eval_valid_runtime": 0.3315,
      "eval_valid_samples_per_second": 3.016,
      "eval_valid_steps_per_second": 3.016,
      "step": 11700
    },
    {
      "epoch": 900.0,
      "eval_train_eval_loss": 0.01174135971814394,
      "eval_train_eval_loss_<cls>": 0.5718596577644348,
      "eval_train_eval_perplexity_batch": 1.011810541152954,
      "eval_train_eval_perplexity_res": 3.3392529487609863,
      "eval_train_eval_perplexity_seq": 1.011875033378601,
      "eval_train_eval_reconstruction": 0.24137485027313232,
      "eval_train_eval_reconstruction_<cls>": 0.9512194991111755,
      "eval_train_loss": 0.02128995768725872,
      "eval_train_runtime": 0.9354,
      "eval_train_samples_per_second": 105.836,
      "eval_train_steps_per_second": 13.898,
      "step": 11700
    },
    {
      "epoch": 900.77,
      "learning_rate": 7.224228059341087e-07,
      "loss": 0.0208,
      "step": 11710
    },
    {
      "epoch": 901.54,
      "learning_rate": 7.009096897186318e-07,
      "loss": 0.0207,
      "step": 11720
    },
    {
      "epoch": 902.31,
      "learning_rate": 6.79719493851494e-07,
      "loss": 0.0226,
      "step": 11730
    },
    {
      "epoch": 903.08,
      "learning_rate": 6.588523571333328e-07,
      "loss": 0.0203,
      "step": 11740
    },
    {
      "epoch": 903.85,
      "learning_rate": 6.38308416248673e-07,
      "loss": 0.0183,
      "step": 11750
    },
    {
      "epoch": 904.62,
      "learning_rate": 6.180878057650552e-07,
      "loss": 0.0227,
      "step": 11760
    },
    {
      "epoch": 905.38,
      "learning_rate": 5.981906581320973e-07,
      "loss": 0.0206,
      "step": 11770
    },
    {
      "epoch": 906.15,
      "learning_rate": 5.786171036806953e-07,
      "loss": 0.0223,
      "step": 11780
    },
    {
      "epoch": 906.92,
      "learning_rate": 5.593672706221132e-07,
      "loss": 0.0223,
      "step": 11790
    },
    {
      "epoch": 907.69,
      "learning_rate": 5.404412850471719e-07,
      "loss": 0.0215,
      "step": 11800
    },
    {
      "epoch": 907.69,
      "eval_valid_eval_loss": 7.032078266143799,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 1132.381591796875,
      "eval_valid_eval_perplexity_res": 10104.216796875,
      "eval_valid_eval_perplexity_seq": 1132.381591796875,
      "eval_valid_eval_reconstruction": 0.05982905998826027,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 7.032077789306641,
      "eval_valid_runtime": 0.3761,
      "eval_valid_samples_per_second": 2.659,
      "eval_valid_steps_per_second": 2.659,
      "step": 11800
    },
    {
      "epoch": 907.69,
      "eval_train_eval_loss": 0.01180767361074686,
      "eval_train_eval_loss_<cls>": 0.6629999876022339,
      "eval_train_eval_perplexity_batch": 1.0118776559829712,
      "eval_train_eval_perplexity_res": 2.6215898990631104,
      "eval_train_eval_perplexity_seq": 1.0119553804397583,
      "eval_train_eval_reconstruction": 0.24453958868980408,
      "eval_train_eval_reconstruction_<cls>": 0.9301310181617737,
      "eval_train_loss": 0.02157609723508358,
      "eval_train_runtime": 0.9209,
      "eval_train_samples_per_second": 107.509,
      "eval_train_steps_per_second": 14.117,
      "step": 11800
    },
    {
      "epoch": 908.46,
      "learning_rate": 5.218392709254171e-07,
      "loss": 0.0239,
      "step": 11810
    },
    {
      "epoch": 909.23,
      "learning_rate": 5.035613501043146e-07,
      "loss": 0.0223,
      "step": 11820
    },
    {
      "epoch": 910.0,
      "learning_rate": 4.856076423084332e-07,
      "loss": 0.0209,
      "step": 11830
    },
    {
      "epoch": 910.77,
      "learning_rate": 4.6797826513867437e-07,
      "loss": 0.0189,
      "step": 11840
    },
    {
      "epoch": 911.54,
      "learning_rate": 4.506733340714997e-07,
      "loss": 0.0199,
      "step": 11850
    },
    {
      "epoch": 912.31,
      "learning_rate": 4.3369296245818205e-07,
      "loss": 0.0212,
      "step": 11860
    },
    {
      "epoch": 913.08,
      "learning_rate": 4.1703726152405034e-07,
      "loss": 0.0199,
      "step": 11870
    },
    {
      "epoch": 913.85,
      "learning_rate": 4.0070634036776244e-07,
      "loss": 0.0202,
      "step": 11880
    },
    {
      "epoch": 914.62,
      "learning_rate": 3.8470030596060026e-07,
      "loss": 0.0216,
      "step": 11890
    },
    {
      "epoch": 915.38,
      "learning_rate": 3.6901926314575894e-07,
      "loss": 0.0244,
      "step": 11900
    },
    {
      "epoch": 915.38,
      "eval_valid_eval_loss": 6.871036052703857,
      "eval_valid_eval_loss_<cls>": 5.681734085083008,
      "eval_valid_eval_perplexity_batch": 963.94677734375,
      "eval_valid_eval_perplexity_res": 9157.7373046875,
      "eval_valid_eval_perplexity_seq": 963.94677734375,
      "eval_valid_eval_reconstruction": 0.08670520037412643,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.871036052703857,
      "eval_valid_runtime": 0.3424,
      "eval_valid_samples_per_second": 2.921,
      "eval_valid_steps_per_second": 2.921,
      "step": 11900
    },
    {
      "epoch": 915.38,
      "eval_train_eval_loss": 0.010236835107207298,
      "eval_train_eval_loss_<cls>": 0.6443630456924438,
      "eval_train_eval_perplexity_batch": 1.0102894306182861,
      "eval_train_eval_perplexity_res": 1.9677406549453735,
      "eval_train_eval_perplexity_seq": 1.0103548765182495,
      "eval_train_eval_reconstruction": 0.2108142226934433,
      "eval_train_eval_reconstruction_<cls>": 0.9473684430122375,
      "eval_train_loss": 0.020746562629938126,
      "eval_train_runtime": 0.8735,
      "eval_train_samples_per_second": 113.342,
      "eval_train_steps_per_second": 14.883,
      "step": 11900
    },
    {
      "epoch": 916.15,
      "learning_rate": 3.5366331463766997e-07,
      "loss": 0.0186,
      "step": 11910
    },
    {
      "epoch": 916.92,
      "learning_rate": 3.3863256102130704e-07,
      "loss": 0.022,
      "step": 11920
    },
    {
      "epoch": 917.69,
      "learning_rate": 3.2392710075155876e-07,
      "loss": 0.0215,
      "step": 11930
    },
    {
      "epoch": 918.46,
      "learning_rate": 3.0954703015256825e-07,
      "loss": 0.0216,
      "step": 11940
    },
    {
      "epoch": 919.23,
      "learning_rate": 2.9549244341708916e-07,
      "loss": 0.0209,
      "step": 11950
    },
    {
      "epoch": 920.0,
      "learning_rate": 2.8176343260589157e-07,
      "loss": 0.0227,
      "step": 11960
    },
    {
      "epoch": 920.77,
      "learning_rate": 2.6836008764714037e-07,
      "loss": 0.0241,
      "step": 11970
    },
    {
      "epoch": 921.54,
      "learning_rate": 2.5528249633582357e-07,
      "loss": 0.0236,
      "step": 11980
    },
    {
      "epoch": 922.31,
      "learning_rate": 2.425307443331637e-07,
      "loss": 0.021,
      "step": 11990
    },
    {
      "epoch": 923.08,
      "learning_rate": 2.301049151660628e-07,
      "loss": 0.0208,
      "step": 12000
    },
    {
      "epoch": 923.08,
      "eval_valid_eval_loss": 6.9727396965026855,
      "eval_valid_eval_loss_<cls>": 7.573707580566406,
      "eval_valid_eval_perplexity_batch": 1067.1424560546875,
      "eval_valid_eval_perplexity_res": 10757.4287109375,
      "eval_valid_eval_perplexity_seq": 1067.1424560546875,
      "eval_valid_eval_reconstruction": 0.07800000160932541,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.972739219665527,
      "eval_valid_runtime": 0.3658,
      "eval_valid_samples_per_second": 2.734,
      "eval_valid_steps_per_second": 2.734,
      "step": 12000
    },
    {
      "epoch": 923.08,
      "eval_train_eval_loss": 0.012886582873761654,
      "eval_train_eval_loss_<cls>": 0.6950795650482178,
      "eval_train_eval_perplexity_batch": 1.012969970703125,
      "eval_train_eval_perplexity_res": 2.9909048080444336,
      "eval_train_eval_perplexity_seq": 1.0130571126937866,
      "eval_train_eval_reconstruction": 0.26865118741989136,
      "eval_train_eval_reconstruction_<cls>": 0.9154228568077087,
      "eval_train_loss": 0.01989542692899704,
      "eval_train_runtime": 0.8739,
      "eval_train_samples_per_second": 113.286,
      "eval_train_steps_per_second": 14.876,
      "step": 12000
    },
    {
      "epoch": 923.85,
      "learning_rate": 2.1800509022654736e-07,
      "loss": 0.0222,
      "step": 12010
    },
    {
      "epoch": 924.62,
      "learning_rate": 2.062313487712464e-07,
      "loss": 0.0212,
      "step": 12020
    },
    {
      "epoch": 925.38,
      "learning_rate": 1.9478376792086416e-07,
      "loss": 0.0207,
      "step": 12030
    },
    {
      "epoch": 926.15,
      "learning_rate": 1.836624226596917e-07,
      "loss": 0.0217,
      "step": 12040
    },
    {
      "epoch": 926.92,
      "learning_rate": 1.7286738583507933e-07,
      "loss": 0.0215,
      "step": 12050
    },
    {
      "epoch": 927.69,
      "learning_rate": 1.6239872815699831e-07,
      "loss": 0.02,
      "step": 12060
    },
    {
      "epoch": 928.46,
      "learning_rate": 1.522565181975577e-07,
      "loss": 0.0215,
      "step": 12070
    },
    {
      "epoch": 929.23,
      "learning_rate": 1.4244082239056045e-07,
      "loss": 0.0226,
      "step": 12080
    },
    {
      "epoch": 930.0,
      "learning_rate": 1.329517050310647e-07,
      "loss": 0.0232,
      "step": 12090
    },
    {
      "epoch": 930.77,
      "learning_rate": 1.2378922827496199e-07,
      "loss": 0.0227,
      "step": 12100
    },
    {
      "epoch": 930.77,
      "eval_valid_eval_loss": 6.953578472137451,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 1046.8892822265625,
      "eval_valid_eval_perplexity_res": 9743.6142578125,
      "eval_valid_eval_perplexity_seq": 1046.8892822265625,
      "eval_valid_eval_reconstruction": 0.08260869234800339,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.953579425811768,
      "eval_valid_runtime": 0.3447,
      "eval_valid_samples_per_second": 2.901,
      "eval_valid_steps_per_second": 2.901,
      "step": 12100
    },
    {
      "epoch": 930.77,
      "eval_train_eval_loss": 0.010517541319131851,
      "eval_train_eval_loss_<cls>": 0.7325906753540039,
      "eval_train_eval_perplexity_batch": 1.0105730295181274,
      "eval_train_eval_perplexity_res": 2.3503191471099854,
      "eval_train_eval_perplexity_seq": 1.0106277465820312,
      "eval_train_eval_reconstruction": 0.22131912410259247,
      "eval_train_eval_reconstruction_<cls>": 0.9075630307197571,
      "eval_train_loss": 0.021875431761145592,
      "eval_train_runtime": 0.8938,
      "eval_train_samples_per_second": 110.764,
      "eval_train_steps_per_second": 14.545,
      "step": 12100
    },
    {
      "epoch": 931.54,
      "learning_rate": 1.1495345213858311e-07,
      "loss": 0.0219,
      "step": 12110
    },
    {
      "epoch": 932.31,
      "learning_rate": 1.0644443449828179e-07,
      "loss": 0.0206,
      "step": 12120
    },
    {
      "epoch": 933.08,
      "learning_rate": 9.826223109007937e-08,
      "loss": 0.0228,
      "step": 12130
    },
    {
      "epoch": 933.85,
      "learning_rate": 9.040689550927628e-08,
      "loss": 0.0211,
      "step": 12140
    },
    {
      "epoch": 934.62,
      "learning_rate": 8.287847921013559e-08,
      "loss": 0.0198,
      "step": 12150
    },
    {
      "epoch": 935.38,
      "learning_rate": 7.567703150550553e-08,
      "loss": 0.0234,
      "step": 12160
    },
    {
      "epoch": 936.15,
      "learning_rate": 6.880259956651424e-08,
      "loss": 0.024,
      "step": 12170
    },
    {
      "epoch": 936.92,
      "learning_rate": 6.225522842226994e-08,
      "loss": 0.02,
      "step": 12180
    },
    {
      "epoch": 937.69,
      "learning_rate": 5.603496095956118e-08,
      "loss": 0.0207,
      "step": 12190
    },
    {
      "epoch": 938.46,
      "learning_rate": 5.014183792256266e-08,
      "loss": 0.0226,
      "step": 12200
    },
    {
      "epoch": 938.46,
      "eval_valid_eval_loss": 6.955082893371582,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 1048.4654541015625,
      "eval_valid_eval_perplexity_res": 9719.736328125,
      "eval_valid_eval_perplexity_seq": 1048.4654541015625,
      "eval_valid_eval_reconstruction": 0.08260869234800339,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.955081939697266,
      "eval_valid_runtime": 0.3676,
      "eval_valid_samples_per_second": 2.72,
      "eval_valid_steps_per_second": 2.72,
      "step": 12200
    },
    {
      "epoch": 938.46,
      "eval_train_eval_loss": 0.0071123563684523106,
      "eval_train_eval_loss_<cls>": 0.6328437924385071,
      "eval_train_eval_perplexity_batch": 1.007137656211853,
      "eval_train_eval_perplexity_res": 1.8795498609542847,
      "eval_train_eval_perplexity_seq": 1.0071818828582764,
      "eval_train_eval_reconstruction": 0.1447519063949585,
      "eval_train_eval_reconstruction_<cls>": 0.9186602830886841,
      "eval_train_loss": 0.02310245856642723,
      "eval_train_runtime": 0.9071,
      "eval_train_samples_per_second": 109.142,
      "eval_train_steps_per_second": 14.332,
      "step": 12200
    },
    {
      "epoch": 939.23,
      "learning_rate": 4.4575897912579876e-08,
      "loss": 0.0213,
      "step": 12210
    },
    {
      "epoch": 940.0,
      "learning_rate": 3.933717738780485e-08,
      "loss": 0.0215,
      "step": 12220
    },
    {
      "epoch": 940.77,
      "learning_rate": 3.442571066304412e-08,
      "loss": 0.0204,
      "step": 12230
    },
    {
      "epoch": 941.54,
      "learning_rate": 2.984152990954114e-08,
      "loss": 0.021,
      "step": 12240
    },
    {
      "epoch": 942.31,
      "learning_rate": 2.558466515473201e-08,
      "loss": 0.0233,
      "step": 12250
    },
    {
      "epoch": 943.08,
      "learning_rate": 2.1655144282051176e-08,
      "loss": 0.0215,
      "step": 12260
    },
    {
      "epoch": 943.85,
      "learning_rate": 1.805299303077046e-08,
      "loss": 0.0211,
      "step": 12270
    },
    {
      "epoch": 944.62,
      "learning_rate": 1.4778234995793671e-08,
      "loss": 0.0198,
      "step": 12280
    },
    {
      "epoch": 945.38,
      "learning_rate": 1.1830891627551133e-08,
      "loss": 0.0206,
      "step": 12290
    },
    {
      "epoch": 946.15,
      "learning_rate": 9.210982231805387e-09,
      "loss": 0.0229,
      "step": 12300
    },
    {
      "epoch": 946.15,
      "eval_valid_eval_loss": 6.961234092712402,
      "eval_valid_eval_loss_<cls>": 6.200901031494141,
      "eval_valid_eval_perplexity_batch": 1054.9346923828125,
      "eval_valid_eval_perplexity_res": 10679.7333984375,
      "eval_valid_eval_perplexity_seq": 1054.9346923828125,
      "eval_valid_eval_reconstruction": 0.08144330233335495,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.961233615875244,
      "eval_valid_runtime": 0.3793,
      "eval_valid_samples_per_second": 2.636,
      "eval_valid_steps_per_second": 2.636,
      "step": 12300
    },
    {
      "epoch": 946.15,
      "eval_train_eval_loss": 0.00920071266591549,
      "eval_train_eval_loss_<cls>": 0.6699367761611938,
      "eval_train_eval_perplexity_batch": 1.009243130683899,
      "eval_train_eval_perplexity_res": 2.219935655593872,
      "eval_train_eval_perplexity_seq": 1.0093036890029907,
      "eval_train_eval_reconstruction": 0.1907922923564911,
      "eval_train_eval_reconstruction_<cls>": 0.9264705777168274,
      "eval_train_loss": 0.019737664610147476,
      "eval_train_runtime": 0.8908,
      "eval_train_samples_per_second": 111.138,
      "eval_train_steps_per_second": 14.594,
      "step": 12300
    },
    {
      "epoch": 946.92,
      "learning_rate": 6.918523969573487e-09,
      "loss": 0.0197,
      "step": 12310
    },
    {
      "epoch": 947.69,
      "learning_rate": 4.953531856977112e-09,
      "loss": 0.0221,
      "step": 12320
    },
    {
      "epoch": 948.46,
      "learning_rate": 3.3160187651704034e-09,
      "loss": 0.02,
      "step": 12330
    },
    {
      "epoch": 949.23,
      "learning_rate": 2.0059954202289456e-09,
      "loss": 0.0196,
      "step": 12340
    },
    {
      "epoch": 950.0,
      "learning_rate": 1.0234704031220066e-09,
      "loss": 0.0229,
      "step": 12350
    },
    {
      "epoch": 950.77,
      "learning_rate": 3.6845014960151804e-10,
      "loss": 0.0204,
      "step": 12360
    },
    {
      "epoch": 951.54,
      "learning_rate": 4.0938950202074056e-11,
      "loss": 0.0236,
      "step": 12370
    },
    {
      "epoch": 952.31,
      "learning_rate": 4.0938950202074056e-11,
      "loss": 0.0219,
      "step": 12380
    },
    {
      "epoch": 953.08,
      "learning_rate": 3.6845014960151804e-10,
      "loss": 0.024,
      "step": 12390
    },
    {
      "epoch": 953.85,
      "learning_rate": 1.0234704031220066e-09,
      "loss": 0.0198,
      "step": 12400
    },
    {
      "epoch": 953.85,
      "eval_valid_eval_loss": 6.924297332763672,
      "eval_valid_eval_loss_<cls>": 6.32051944732666,
      "eval_valid_eval_perplexity_batch": 1016.6796264648438,
      "eval_valid_eval_perplexity_res": 9970.369140625,
      "eval_valid_eval_perplexity_seq": 1016.6796264648438,
      "eval_valid_eval_reconstruction": 0.08382229506969452,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.924297332763672,
      "eval_valid_runtime": 0.3745,
      "eval_valid_samples_per_second": 2.671,
      "eval_valid_steps_per_second": 2.671,
      "step": 12400
    },
    {
      "epoch": 953.85,
      "eval_train_eval_loss": 0.010168945416808128,
      "eval_train_eval_loss_<cls>": 0.6105339527130127,
      "eval_train_eval_perplexity_batch": 1.0102207660675049,
      "eval_train_eval_perplexity_res": 2.1547725200653076,
      "eval_train_eval_perplexity_seq": 1.010284662246704,
      "eval_train_eval_reconstruction": 0.2084512561559677,
      "eval_train_eval_reconstruction_<cls>": 0.9408602118492126,
      "eval_train_loss": 0.02201378159224987,
      "eval_train_runtime": 0.8582,
      "eval_train_samples_per_second": 115.36,
      "eval_train_steps_per_second": 15.148,
      "step": 12400
    },
    {
      "epoch": 954.62,
      "learning_rate": 2.0059954202289456e-09,
      "loss": 0.0221,
      "step": 12410
    },
    {
      "epoch": 955.38,
      "learning_rate": 3.3160187651704034e-09,
      "loss": 0.0218,
      "step": 12420
    },
    {
      "epoch": 956.15,
      "learning_rate": 4.953531856977112e-09,
      "loss": 0.0219,
      "step": 12430
    },
    {
      "epoch": 956.92,
      "learning_rate": 6.918523969573487e-09,
      "loss": 0.018,
      "step": 12440
    },
    {
      "epoch": 957.69,
      "learning_rate": 9.210982231805387e-09,
      "loss": 0.0235,
      "step": 12450
    },
    {
      "epoch": 958.46,
      "learning_rate": 1.1830891627551133e-08,
      "loss": 0.0213,
      "step": 12460
    },
    {
      "epoch": 959.23,
      "learning_rate": 1.4778234995793671e-08,
      "loss": 0.0231,
      "step": 12470
    },
    {
      "epoch": 960.0,
      "learning_rate": 1.805299303077046e-08,
      "loss": 0.0198,
      "step": 12480
    },
    {
      "epoch": 960.77,
      "learning_rate": 2.1655144282051176e-08,
      "loss": 0.0208,
      "step": 12490
    },
    {
      "epoch": 961.54,
      "learning_rate": 2.558466515473201e-08,
      "loss": 0.0212,
      "step": 12500
    },
    {
      "epoch": 961.54,
      "eval_valid_eval_loss": 6.897107124328613,
      "eval_valid_eval_loss_<cls>": 7.016357898712158,
      "eval_valid_eval_perplexity_batch": 989.4083251953125,
      "eval_valid_eval_perplexity_res": 9522.927734375,
      "eval_valid_eval_perplexity_seq": 989.4083251953125,
      "eval_valid_eval_reconstruction": 0.08280922472476959,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.897106647491455,
      "eval_valid_runtime": 0.4362,
      "eval_valid_samples_per_second": 2.292,
      "eval_valid_steps_per_second": 2.292,
      "step": 12500
    },
    {
      "epoch": 961.54,
      "eval_train_eval_loss": 0.013946357183158398,
      "eval_train_eval_loss_<cls>": 0.5930605530738831,
      "eval_train_eval_perplexity_batch": 1.0140440464019775,
      "eval_train_eval_perplexity_res": 3.3974409103393555,
      "eval_train_eval_perplexity_seq": 1.0141150951385498,
      "eval_train_eval_reconstruction": 0.303459495306015,
      "eval_train_eval_reconstruction_<cls>": 0.9455445408821106,
      "eval_train_loss": 0.02192719280719757,
      "eval_train_runtime": 0.8315,
      "eval_train_samples_per_second": 119.064,
      "eval_train_steps_per_second": 15.635,
      "step": 12500
    },
    {
      "epoch": 962.31,
      "learning_rate": 2.984152990954114e-08,
      "loss": 0.0224,
      "step": 12510
    },
    {
      "epoch": 963.08,
      "learning_rate": 3.442571066304412e-08,
      "loss": 0.0216,
      "step": 12520
    },
    {
      "epoch": 963.85,
      "learning_rate": 3.933717738780485e-08,
      "loss": 0.0212,
      "step": 12530
    },
    {
      "epoch": 964.62,
      "learning_rate": 4.457589791258543e-08,
      "loss": 0.0213,
      "step": 12540
    },
    {
      "epoch": 965.38,
      "learning_rate": 5.014183792256266e-08,
      "loss": 0.0234,
      "step": 12550
    },
    {
      "epoch": 966.15,
      "learning_rate": 5.603496095956118e-08,
      "loss": 0.0226,
      "step": 12560
    },
    {
      "epoch": 966.92,
      "learning_rate": 6.225522842226994e-08,
      "loss": 0.0183,
      "step": 12570
    },
    {
      "epoch": 967.69,
      "learning_rate": 6.880259956651424e-08,
      "loss": 0.0215,
      "step": 12580
    },
    {
      "epoch": 968.46,
      "learning_rate": 7.567703150549999e-08,
      "loss": 0.0255,
      "step": 12590
    },
    {
      "epoch": 969.23,
      "learning_rate": 8.287847921013559e-08,
      "loss": 0.02,
      "step": 12600
    },
    {
      "epoch": 969.23,
      "eval_valid_eval_loss": 6.904047966003418,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 996.299560546875,
      "eval_valid_eval_perplexity_res": 11262.041015625,
      "eval_valid_eval_perplexity_seq": 996.299560546875,
      "eval_valid_eval_reconstruction": 0.08301886916160583,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.904048442840576,
      "eval_valid_runtime": 0.315,
      "eval_valid_samples_per_second": 3.174,
      "eval_valid_steps_per_second": 3.174,
      "step": 12600
    },
    {
      "epoch": 969.23,
      "eval_train_eval_loss": 0.007513290736824274,
      "eval_train_eval_loss_<cls>": 0.6963664889335632,
      "eval_train_eval_perplexity_batch": 1.007541537284851,
      "eval_train_eval_perplexity_res": 2.1001203060150146,
      "eval_train_eval_perplexity_seq": 1.007591724395752,
      "eval_train_eval_reconstruction": 0.15207558870315552,
      "eval_train_eval_reconstruction_<cls>": 0.9285714030265808,
      "eval_train_loss": 0.020154161378741264,
      "eval_train_runtime": 0.9525,
      "eval_train_samples_per_second": 103.94,
      "eval_train_steps_per_second": 13.649,
      "step": 12600
    },
    {
      "epoch": 970.0,
      "learning_rate": 9.040689550927628e-08,
      "loss": 0.0229,
      "step": 12610
    },
    {
      "epoch": 970.77,
      "learning_rate": 9.826223109007382e-08,
      "loss": 0.0211,
      "step": 12620
    },
    {
      "epoch": 971.54,
      "learning_rate": 1.0644443449828179e-07,
      "loss": 0.0222,
      "step": 12630
    },
    {
      "epoch": 972.31,
      "learning_rate": 1.1495345213858311e-07,
      "loss": 0.0207,
      "step": 12640
    },
    {
      "epoch": 973.08,
      "learning_rate": 1.2378922827496199e-07,
      "loss": 0.0236,
      "step": 12650
    },
    {
      "epoch": 973.85,
      "learning_rate": 1.329517050310647e-07,
      "loss": 0.0211,
      "step": 12660
    },
    {
      "epoch": 974.62,
      "learning_rate": 1.4244082239056045e-07,
      "loss": 0.0234,
      "step": 12670
    },
    {
      "epoch": 975.38,
      "learning_rate": 1.522565181975577e-07,
      "loss": 0.0231,
      "step": 12680
    },
    {
      "epoch": 976.15,
      "learning_rate": 1.6239872815699831e-07,
      "loss": 0.0209,
      "step": 12690
    },
    {
      "epoch": 976.92,
      "learning_rate": 1.7286738583507933e-07,
      "loss": 0.0214,
      "step": 12700
    },
    {
      "epoch": 976.92,
      "eval_valid_eval_loss": 6.971138000488281,
      "eval_valid_eval_loss_<cls>": 7.579351425170898,
      "eval_valid_eval_perplexity_batch": 1065.4345703125,
      "eval_valid_eval_perplexity_res": 11441.1025390625,
      "eval_valid_eval_perplexity_seq": 1065.4345703125,
      "eval_valid_eval_reconstruction": 0.0793650820851326,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.971138000488281,
      "eval_valid_runtime": 0.3467,
      "eval_valid_samples_per_second": 2.884,
      "eval_valid_steps_per_second": 2.884,
      "step": 12700
    },
    {
      "epoch": 976.92,
      "eval_train_eval_loss": 0.009522379375994205,
      "eval_train_eval_loss_<cls>": 0.7343506217002869,
      "eval_train_eval_perplexity_batch": 1.0095678567886353,
      "eval_train_eval_perplexity_res": 2.4408955574035645,
      "eval_train_eval_perplexity_seq": 1.0096399784088135,
      "eval_train_eval_reconstruction": 0.19472788274288177,
      "eval_train_eval_reconstruction_<cls>": 0.8990825414657593,
      "eval_train_loss": 0.022220101207494736,
      "eval_train_runtime": 0.8108,
      "eval_train_samples_per_second": 122.098,
      "eval_train_steps_per_second": 16.033,
      "step": 12700
    },
    {
      "epoch": 977.69,
      "learning_rate": 1.836624226596917e-07,
      "loss": 0.0192,
      "step": 12710
    },
    {
      "epoch": 978.46,
      "learning_rate": 1.9478376792086416e-07,
      "loss": 0.0205,
      "step": 12720
    },
    {
      "epoch": 979.23,
      "learning_rate": 2.0623134877124084e-07,
      "loss": 0.0236,
      "step": 12730
    },
    {
      "epoch": 980.0,
      "learning_rate": 2.180050902265418e-07,
      "loss": 0.0262,
      "step": 12740
    },
    {
      "epoch": 980.77,
      "learning_rate": 2.301049151660628e-07,
      "loss": 0.0203,
      "step": 12750
    },
    {
      "epoch": 981.54,
      "learning_rate": 2.425307443331637e-07,
      "loss": 0.0209,
      "step": 12760
    },
    {
      "epoch": 982.31,
      "learning_rate": 2.5528249633582357e-07,
      "loss": 0.0217,
      "step": 12770
    },
    {
      "epoch": 983.08,
      "learning_rate": 2.6836008764714037e-07,
      "loss": 0.023,
      "step": 12780
    },
    {
      "epoch": 983.85,
      "learning_rate": 2.81763432605886e-07,
      "loss": 0.0195,
      "step": 12790
    },
    {
      "epoch": 984.62,
      "learning_rate": 2.9549244341708916e-07,
      "loss": 0.0239,
      "step": 12800
    },
    {
      "epoch": 984.62,
      "eval_valid_eval_loss": 6.924684524536133,
      "eval_valid_eval_loss_<cls>": 5.7720046043396,
      "eval_valid_eval_perplexity_batch": 1017.0733642578125,
      "eval_valid_eval_perplexity_res": 10165.2744140625,
      "eval_valid_eval_perplexity_seq": 1017.0733642578125,
      "eval_valid_eval_reconstruction": 0.0829840749502182,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.924684524536133,
      "eval_valid_runtime": 0.3564,
      "eval_valid_samples_per_second": 2.806,
      "eval_valid_steps_per_second": 2.806,
      "step": 12800
    },
    {
      "epoch": 984.62,
      "eval_train_eval_loss": 0.009996984153985977,
      "eval_train_eval_loss_<cls>": 0.6062405705451965,
      "eval_train_eval_perplexity_batch": 1.0100470781326294,
      "eval_train_eval_perplexity_res": 2.120746374130249,
      "eval_train_eval_perplexity_seq": 1.0101032257080078,
      "eval_train_eval_reconstruction": 0.21091262996196747,
      "eval_train_eval_reconstruction_<cls>": 0.9395161271095276,
      "eval_train_loss": 0.02001449652016163,
      "eval_train_runtime": 0.6767,
      "eval_train_samples_per_second": 146.306,
      "eval_train_steps_per_second": 19.212,
      "step": 12800
    },
    {
      "epoch": 985.38,
      "learning_rate": 3.0954703015256825e-07,
      "loss": 0.0201,
      "step": 12810
    },
    {
      "epoch": 986.15,
      "learning_rate": 3.2392710075155876e-07,
      "loss": 0.0239,
      "step": 12820
    },
    {
      "epoch": 986.92,
      "learning_rate": 3.3863256102130704e-07,
      "loss": 0.0236,
      "step": 12830
    },
    {
      "epoch": 987.69,
      "learning_rate": 3.5366331463766446e-07,
      "loss": 0.0221,
      "step": 12840
    },
    {
      "epoch": 988.46,
      "learning_rate": 3.6901926314575894e-07,
      "loss": 0.0229,
      "step": 12850
    },
    {
      "epoch": 989.23,
      "learning_rate": 3.847003059606058e-07,
      "loss": 0.0162,
      "step": 12860
    },
    {
      "epoch": 990.0,
      "learning_rate": 4.00706340367768e-07,
      "loss": 0.0257,
      "step": 12870
    },
    {
      "epoch": 990.77,
      "learning_rate": 4.1703726152405034e-07,
      "loss": 0.0224,
      "step": 12880
    },
    {
      "epoch": 991.54,
      "learning_rate": 4.3369296245818756e-07,
      "loss": 0.0209,
      "step": 12890
    },
    {
      "epoch": 992.31,
      "learning_rate": 4.5067333407149416e-07,
      "loss": 0.0211,
      "step": 12900
    },
    {
      "epoch": 992.31,
      "eval_valid_eval_loss": 6.924152851104736,
      "eval_valid_eval_loss_<cls>": 7.586577892303467,
      "eval_valid_eval_perplexity_batch": 1016.5327758789062,
      "eval_valid_eval_perplexity_res": 10675.1494140625,
      "eval_valid_eval_perplexity_seq": 1016.5327758789062,
      "eval_valid_eval_reconstruction": 0.08467742055654526,
      "eval_valid_eval_reconstruction_<cls>": 0.0,
      "eval_valid_loss": 6.924153804779053,
      "eval_valid_runtime": 0.3725,
      "eval_valid_samples_per_second": 2.684,
      "eval_valid_steps_per_second": 2.684,
      "step": 12900
    },
    {
      "epoch": 992.31,
      "eval_train_eval_loss": 0.010251599363982677,
      "eval_train_eval_loss_<cls>": 0.6630845665931702,
      "eval_train_eval_perplexity_batch": 1.01030433177948,
      "eval_train_eval_perplexity_res": 2.2841360569000244,
      "eval_train_eval_perplexity_seq": 1.0103672742843628,
      "eval_train_eval_reconstruction": 0.2122843712568283,
      "eval_train_eval_reconstruction_<cls>": 0.904347836971283,
      "eval_train_loss": 0.021173033863306046,
      "eval_train_runtime": 0.883,
      "eval_train_samples_per_second": 112.124,
      "eval_train_steps_per_second": 14.723,
      "step": 12900
    },
    {
      "epoch": 993.08,
      "learning_rate": 4.679782651386688e-07,
      "loss": 0.0221,
      "step": 12910
    },
    {
      "epoch": 993.85,
      "learning_rate": 4.856076423084277e-07,
      "loss": 0.019,
      "step": 12920
    },
    {
      "epoch": 994.62,
      "learning_rate": 5.035613501043146e-07,
      "loss": 0.0214,
      "step": 12930
    },
    {
      "epoch": 995.38,
      "learning_rate": 5.218392709254171e-07,
      "loss": 0.0194,
      "step": 12940
    },
    {
      "epoch": 996.15,
      "learning_rate": 5.404412850471719e-07,
      "loss": 0.0222,
      "step": 12950
    },
    {
      "epoch": 996.92,
      "learning_rate": 5.593672706221132e-07,
      "loss": 0.0214,
      "step": 12960
    },
    {
      "epoch": 997.69,
      "learning_rate": 5.786171036806953e-07,
      "loss": 0.0232,
      "step": 12970
    },
    {
      "epoch": 998.46,
      "learning_rate": 5.981906581320973e-07,
      "loss": 0.0226,
      "step": 12980
    },
    {
      "epoch": 999.23,
      "learning_rate": 6.180878057650497e-07,
      "loss": 0.0181,
      "step": 12990
    },
    {
      "epoch": 1000.0,
      "learning_rate": 6.38308416248673e-07,
      "loss": 0.0208,
      "step": 13000
    },
    {
      "epoch": 1000.0,
      "eval_valid_eval_loss": 6.692291736602783,
      "eval_valid_eval_loss_<cls>": NaN,
      "eval_valid_eval_perplexity_batch": 806.1676635742188,
      "eval_valid_eval_perplexity_res": 6364.76513671875,
      "eval_valid_eval_perplexity_seq": 806.1676635742188,
      "eval_valid_eval_reconstruction": 0.0855855867266655,
      "eval_valid_eval_reconstruction_<cls>": NaN,
      "eval_valid_loss": 6.692291736602783,
      "eval_valid_runtime": 0.3041,
      "eval_valid_samples_per_second": 3.289,
      "eval_valid_steps_per_second": 3.289,
      "step": 13000
    },
    {
      "epoch": 1000.0,
      "eval_train_eval_loss": 0.014821738004684448,
      "eval_train_eval_loss_<cls>": 0.5959479808807373,
      "eval_train_eval_perplexity_batch": 1.0149321556091309,
      "eval_train_eval_perplexity_res": 2.717517137527466,
      "eval_train_eval_perplexity_seq": 1.0150054693222046,
      "eval_train_eval_reconstruction": 0.3200661540031433,
      "eval_train_eval_reconstruction_<cls>": 0.930232584476471,
      "eval_train_loss": 0.022915082052350044,
      "eval_train_runtime": 0.7934,
      "eval_train_samples_per_second": 124.779,
      "eval_train_steps_per_second": 16.385,
      "step": 13000
    }
  ],
  "logging_steps": 10,
  "max_steps": 13000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1000,
  "save_steps": 200,
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
