====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : False
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 1
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : False
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
----------------------------------------------------------------------------------------------------
Exiting from training early
====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : True
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 1
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : False
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : True
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 1
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : False
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : True
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 0
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : False
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : False
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 1
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : True
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
| epoch   1 step       50 |     50 batches | lr 5.83e-06 | ms/batch 323.01 | loss 10.21 | ppl 27199.055
| epoch   1 step      100 |    100 batches | lr 1.17e-05 | ms/batch 318.88 | loss  9.72 | ppl 16577.162
| epoch   1 step      150 |    150 batches | lr 1.75e-05 | ms/batch 320.25 | loss  9.28 | ppl 10749.949
| epoch   1 step      200 |    200 batches | lr 2.33e-05 | ms/batch 319.56 | loss  8.96 | ppl  7815.828
====================================================================================================
    - data : ../data/wikitext-2/
    - dataset : wt103
    - n_layer : 16
    - n_head : 10
    - d_head : 40
    - d_embed : 400
    - d_model : 400
    - d_inner : 900
    - dropout : 0.2
    - dropoute : 0.2
    - dropouto : 0.5
    - dropouti : 0.6
    - dropatt : 0.2
    - init : normal
    - emb_init : normal
    - init_range : 0.1
    - emb_init_range : 0.01
    - init_std : 0.02
    - proj_init_std : 0.01
    - optim : adam
    - lr : 0.00035
    - mom : 0.0
    - scheduler : cosine
    - warmup_step : 3000
    - decay_rate : 0.5
    - lr_min : 0.0
    - clip : 0.25
    - clip_nonemb : False
    - max_step : 125000
    - batch_size : 32
    - batch_chunk : 1
    - tgt_len : 150
    - eval_tgt_len : 150
    - ext_len : 0
    - mem_len : 150
    - not_tied : False
    - seed : 444
    - cuda : True
    - adaptive : True
    - div_val : 1
    - pre_lnorm : False
    - varlen : False
    - multi_gpu : False
    - log_interval : 50
    - eval_interval : 400
    - work_dir : LM-TFM-wt103-444
    - restart : False
    - restart_dir : 
    - debug : False
    - same_length : False
    - attn_type : 0
    - clamp_len : -1
    - eta_min : 0.0
    - gpu0_bsz : 1
    - max_eval_steps : -1
    - sample_softmax : -1
    - patience : 0
    - finetune_v2 : False
    - finetune_v3 : False
    - fp16 : True
    - static_loss_scale : 1
    - dynamic_loss_scale : True
    - wdecay : 1.2e-06
    - tied : True
    - n_token : 33278
    - n_all_param : 37712881
    - n_nonemb_param : 24366400
====================================================================================================
#params = 37712881
#non emb params = 24366400
| epoch   1 step      250 |    250 batches | lr 2.92e-05 | ms/batch 319.49 | loss  8.59 | ppl  5364.186
----------------------------------------------------------------------------------------------------
Exiting from training early
| epoch   1 step      300 |    300 batches | lr 3.5e-05 | ms/batch 317.92 | loss  8.19 | ppl  3587.865
| epoch   1 step      350 |    350 batches | lr 4.08e-05 | ms/batch 317.84 | loss  7.75 | ppl  2331.023
| epoch   1 step      400 |    400 batches | lr 4.67e-05 | ms/batch 317.35 | loss  7.42 | ppl  1662.397
----------------------------------------------------------------------------------------------------
| Eval   1 at step      400 | time: 132.68s | valid loss  6.86 | valid ppl   950.785
----------------------------------------------------------------------------------------------------
| epoch   2 step      450 |     14 batches | lr 5.25e-05 | ms/batch 440.87 | loss  7.17 | ppl  1298.322
| epoch   2 step      500 |     64 batches | lr 5.83e-05 | ms/batch 319.99 | loss  7.05 | ppl  1154.211
| epoch   2 step      550 |    114 batches | lr 6.42e-05 | ms/batch 319.50 | loss  7.01 | ppl  1102.647
| epoch   2 step      600 |    164 batches | lr 7e-05 | ms/batch 317.74 | loss  6.96 | ppl  1056.023
| epoch   2 step      650 |    214 batches | lr 7.58e-05 | ms/batch 319.21 | loss  6.95 | ppl  1041.440
| epoch   2 step      700 |    264 batches | lr 8.17e-05 | ms/batch 319.02 | loss  6.87 | ppl   960.919
| epoch   2 step      750 |    314 batches | lr 8.75e-05 | ms/batch 319.33 | loss  6.87 | ppl   960.619
| epoch   2 step      800 |    364 batches | lr 9.33e-05 | ms/batch 321.12 | loss  6.80 | ppl   894.557
----------------------------------------------------------------------------------------------------
| Eval   2 at step      800 | time: 132.41s | valid loss  6.31 | valid ppl   551.426
----------------------------------------------------------------------------------------------------
| epoch   2 step      850 |    414 batches | lr 9.92e-05 | ms/batch 452.46 | loss  6.78 | ppl   880.825
| epoch   3 step      900 |     28 batches | lr 0.000105 | ms/batch 312.58 | loss  6.72 | ppl   831.022
| epoch   3 step      950 |     78 batches | lr 0.000111 | ms/batch 320.73 | loss  6.67 | ppl   789.567
| epoch   3 step     1000 |    128 batches | lr 0.000117 | ms/batch 321.80 | loss  6.61 | ppl   741.961
| epoch   3 step     1050 |    178 batches | lr 0.000122 | ms/batch 321.86 | loss  6.60 | ppl   736.993
| epoch   3 step     1100 |    228 batches | lr 0.000128 | ms/batch 335.93 | loss  6.59 | ppl   730.230
| epoch   3 step     1150 |    278 batches | lr 0.000134 | ms/batch 322.57 | loss  6.53 | ppl   687.705
| epoch   3 step     1200 |    328 batches | lr 0.00014 | ms/batch 321.28 | loss  6.50 | ppl   663.481
----------------------------------------------------------------------------------------------------
| Eval   3 at step     1200 | time: 133.81s | valid loss  5.97 | valid ppl   391.502
----------------------------------------------------------------------------------------------------
| epoch   3 step     1250 |    378 batches | lr 0.000146 | ms/batch 456.58 | loss  6.46 | ppl   639.511
| epoch   3 step     1300 |    428 batches | lr 0.000152 | ms/batch 321.20 | loss  6.44 | ppl   625.429
| epoch   4 step     1350 |     42 batches | lr 0.000157 | ms/batch 315.01 | loss  6.39 | ppl   596.043
| epoch   4 step     1400 |     92 batches | lr 0.000163 | ms/batch 320.00 | loss  6.38 | ppl   590.250
| epoch   4 step     1450 |    142 batches | lr 0.000169 | ms/batch 320.65 | loss  6.31 | ppl   548.329
| epoch   4 step     1500 |    192 batches | lr 0.000175 | ms/batch 321.90 | loss  6.33 | ppl   558.532
| epoch   4 step     1550 |    242 batches | lr 0.000181 | ms/batch 319.81 | loss  6.30 | ppl   544.827
| epoch   4 step     1600 |    292 batches | lr 0.000187 | ms/batch 320.48 | loss  6.28 | ppl   532.747
----------------------------------------------------------------------------------------------------
| Eval   4 at step     1600 | time: 133.07s | valid loss  5.74 | valid ppl   310.701
----------------------------------------------------------------------------------------------------
| epoch   4 step     1650 |    342 batches | lr 0.000193 | ms/batch 456.48 | loss  6.21 | ppl   495.412
| epoch   4 step     1700 |    392 batches | lr 0.000198 | ms/batch 321.44 | loss  6.21 | ppl   496.614
| epoch   5 step     1750 |      6 batches | lr 0.000204 | ms/batch 314.57 | loss  6.18 | ppl   484.542
| epoch   5 step     1800 |     56 batches | lr 0.00021 | ms/batch 319.44 | loss  6.12 | ppl   457.073
| epoch   5 step     1850 |    106 batches | lr 0.000216 | ms/batch 320.96 | loss  6.15 | ppl   466.416
| epoch   5 step     1900 |    156 batches | lr 0.000222 | ms/batch 320.78 | loss  6.08 | ppl   437.986
| epoch   5 step     1950 |    206 batches | lr 0.000228 | ms/batch 319.77 | loss  6.09 | ppl   443.391
| epoch   5 step     2000 |    256 batches | lr 0.000233 | ms/batch 320.20 | loss  6.05 | ppl   425.839
----------------------------------------------------------------------------------------------------
| Eval   5 at step     2000 | time: 132.98s | valid loss  5.53 | valid ppl   253.210
----------------------------------------------------------------------------------------------------
| epoch   5 step     2050 |    306 batches | lr 0.000239 | ms/batch 453.89 | loss  6.05 | ppl   422.526
| epoch   5 step     2100 |    356 batches | lr 0.000245 | ms/batch 321.93 | loss  5.98 | ppl   395.471
| epoch   5 step     2150 |    406 batches | lr 0.000251 | ms/batch 320.42 | loss  5.98 | ppl   395.719
| epoch   6 step     2200 |     20 batches | lr 0.000257 | ms/batch 314.93 | loss  5.99 | ppl   398.231
| epoch   6 step     2250 |     70 batches | lr 0.000262 | ms/batch 321.28 | loss  5.91 | ppl   367.987
| epoch   6 step     2300 |    120 batches | lr 0.000268 | ms/batch 321.59 | loss  5.92 | ppl   372.732
| epoch   6 step     2350 |    170 batches | lr 0.000274 | ms/batch 322.14 | loss  5.88 | ppl   356.414
| epoch   6 step     2400 |    220 batches | lr 0.00028 | ms/batch 321.00 | loss  5.92 | ppl   372.383
----------------------------------------------------------------------------------------------------
| Eval   6 at step     2400 | time: 133.47s | valid loss  5.40 | valid ppl   222.401
----------------------------------------------------------------------------------------------------
| epoch   6 step     2450 |    270 batches | lr 0.000286 | ms/batch 452.59 | loss  5.85 | ppl   346.503
| epoch   6 step     2500 |    320 batches | lr 0.000292 | ms/batch 321.49 | loss  5.87 | ppl   355.469
| epoch   6 step     2550 |    370 batches | lr 0.000297 | ms/batch 320.12 | loss  5.80 | ppl   331.826
| epoch   6 step     2600 |    420 batches | lr 0.000303 | ms/batch 319.43 | loss  5.80 | ppl   330.480
| epoch   7 step     2650 |     34 batches | lr 0.000309 | ms/batch 314.57 | loss  5.78 | ppl   325.103
| epoch   7 step     2700 |     84 batches | lr 0.000315 | ms/batch 325.71 | loss  5.76 | ppl   316.457
| epoch   7 step     2750 |    134 batches | lr 0.000321 | ms/batch 321.19 | loss  5.74 | ppl   310.773
| epoch   7 step     2800 |    184 batches | lr 0.000327 | ms/batch 320.07 | loss  5.75 | ppl   315.741
----------------------------------------------------------------------------------------------------
| Eval   7 at step     2800 | time: 133.14s | valid loss  5.25 | valid ppl   191.406
----------------------------------------------------------------------------------------------------
| epoch   7 step     2850 |    234 batches | lr 0.000333 | ms/batch 451.86 | loss  5.73 | ppl   307.633
| epoch   7 step     2900 |    284 batches | lr 0.000338 | ms/batch 321.31 | loss  5.71 | ppl   303.147
| epoch   7 step     2950 |    334 batches | lr 0.000344 | ms/batch 320.77 | loss  5.65 | ppl   284.380
| epoch   7 step     3000 |    384 batches | lr 0.00035 | ms/batch 320.05 | loss  5.66 | ppl   287.935
| epoch   7 step     3050 |    434 batches | lr 0.000349 | ms/batch 320.90 | loss  5.70 | ppl   297.795
| epoch   8 step     3100 |     48 batches | lr 0.000349 | ms/batch 315.72 | loss  5.64 | ppl   282.167
| epoch   8 step     3150 |     98 batches | lr 0.000349 | ms/batch 320.72 | loss  5.62 | ppl   276.818
| epoch   8 step     3200 |    148 batches | lr 0.000349 | ms/batch 321.15 | loss  5.58 | ppl   264.761
----------------------------------------------------------------------------------------------------
| Eval   8 at step     3200 | time: 133.10s | valid loss  5.16 | valid ppl   174.996
----------------------------------------------------------------------------------------------------
| epoch   8 step     3250 |    198 batches | lr 0.000349 | ms/batch 455.64 | loss  5.61 | ppl   273.892
| epoch   8 step     3300 |    248 batches | lr 0.000349 | ms/batch 322.55 | loss  5.60 | ppl   271.464
| epoch   8 step     3350 |    298 batches | lr 0.000349 | ms/batch 321.34 | loss  5.60 | ppl   269.604
| epoch   8 step     3400 |    348 batches | lr 0.000349 | ms/batch 321.08 | loss  5.49 | ppl   242.845
| epoch   8 step     3450 |    398 batches | lr 0.000349 | ms/batch 319.53 | loss  5.56 | ppl   258.628
| epoch   9 step     3500 |     12 batches | lr 0.000349 | ms/batch 314.69 | loss  5.57 | ppl   262.947
| epoch   9 step     3550 |     62 batches | lr 0.000349 | ms/batch 321.21 | loss  5.47 | ppl   236.664
| epoch   9 step     3600 |    112 batches | lr 0.000349 | ms/batch 321.13 | loss  5.50 | ppl   245.842
----------------------------------------------------------------------------------------------------
| Eval   9 at step     3600 | time: 133.20s | valid loss  5.09 | valid ppl   162.479
----------------------------------------------------------------------------------------------------
| epoch   9 step     3650 |    162 batches | lr 0.000349 | ms/batch 456.97 | loss  5.48 | ppl   239.472
| epoch   9 step     3700 |    212 batches | lr 0.000349 | ms/batch 323.07 | loss  5.51 | ppl   247.654
| epoch   9 step     3750 |    262 batches | lr 0.000349 | ms/batch 324.39 | loss  5.46 | ppl   235.741
| epoch   9 step     3800 |    312 batches | lr 0.000349 | ms/batch 320.70 | loss  5.48 | ppl   238.874
| epoch   9 step     3850 |    362 batches | lr 0.000349 | ms/batch 320.62 | loss  5.42 | ppl   224.893
| epoch   9 step     3900 |    412 batches | lr 0.000349 | ms/batch 321.85 | loss  5.45 | ppl   232.213
| epoch  10 step     3950 |     26 batches | lr 0.000349 | ms/batch 314.45 | loss  5.46 | ppl   235.686
| epoch  10 step     4000 |     76 batches | lr 0.000349 | ms/batch 321.60 | loss  5.39 | ppl   220.061
----------------------------------------------------------------------------------------------------
| Eval  10 at step     4000 | time: 133.45s | valid loss  5.02 | valid ppl   151.694
----------------------------------------------------------------------------------------------------
| epoch  10 step     4050 |    126 batches | lr 0.000349 | ms/batch 455.71 | loss  5.40 | ppl   221.874
| epoch  10 step     4100 |    176 batches | lr 0.000349 | ms/batch 327.92 | loss  5.39 | ppl   219.769
| epoch  10 step     4150 |    226 batches | lr 0.000349 | ms/batch 328.27 | loss  5.42 | ppl   225.086
| epoch  10 step     4200 |    276 batches | lr 0.000349 | ms/batch 319.81 | loss  5.40 | ppl   222.395
| epoch  10 step     4250 |    326 batches | lr 0.000349 | ms/batch 321.04 | loss  5.35 | ppl   209.886
| epoch  10 step     4300 |    376 batches | lr 0.000349 | ms/batch 321.28 | loss  5.34 | ppl   207.489
| epoch  10 step     4350 |    426 batches | lr 0.000349 | ms/batch 321.06 | loss  5.35 | ppl   210.526
| epoch  11 step     4400 |     40 batches | lr 0.000349 | ms/batch 316.44 | loss  5.34 | ppl   208.317
----------------------------------------------------------------------------------------------------
| Eval  11 at step     4400 | time: 133.87s | valid loss  4.95 | valid ppl   141.564
----------------------------------------------------------------------------------------------------
| epoch  11 step     4450 |     90 batches | lr 0.000349 | ms/batch 481.78 | loss  5.33 | ppl   205.730
| epoch  11 step     4500 |    140 batches | lr 0.000349 | ms/batch 327.89 | loss  5.29 | ppl   197.416
| epoch  11 step     4550 |    190 batches | lr 0.000349 | ms/batch 320.94 | loss  5.35 | ppl   209.689
| epoch  11 step     4600 |    240 batches | lr 0.000349 | ms/batch 319.50 | loss  5.32 | ppl   205.040
| epoch  11 step     4650 |    290 batches | lr 0.000349 | ms/batch 321.33 | loss  5.35 | ppl   211.350
| epoch  11 step     4700 |    340 batches | lr 0.000349 | ms/batch 322.17 | loss  5.21 | ppl   183.137
| epoch  11 step     4750 |    390 batches | lr 0.000349 | ms/batch 320.95 | loss  5.29 | ppl   198.607
| epoch  12 step     4800 |      4 batches | lr 0.000349 | ms/batch 313.79 | loss  5.32 | ppl   203.365
----------------------------------------------------------------------------------------------------
| Eval  12 at step     4800 | time: 134.01s | valid loss  4.94 | valid ppl   139.420
----------------------------------------------------------------------------------------------------
| epoch  12 step     4850 |     54 batches | lr 0.000349 | ms/batch 454.99 | loss  5.25 | ppl   190.269
| epoch  12 step     4900 |    104 batches | lr 0.000349 | ms/batch 320.72 | loss  5.25 | ppl   189.808
| epoch  12 step     4950 |    154 batches | lr 0.000349 | ms/batch 319.74 | loss  5.23 | ppl   187.480
| epoch  12 step     5000 |    204 batches | lr 0.000349 | ms/batch 319.99 | loss  5.27 | ppl   195.040
| epoch  12 step     5050 |    254 batches | lr 0.000349 | ms/batch 324.27 | loss  5.24 | ppl   189.409
| epoch  12 step     5100 |    304 batches | lr 0.000349 | ms/batch 325.77 | loss  5.29 | ppl   197.508
| epoch  12 step     5150 |    354 batches | lr 0.000349 | ms/batch 335.67 | loss  5.16 | ppl   174.928
| epoch  12 step     5200 |    404 batches | lr 0.000349 | ms/batch 320.06 | loss  5.22 | ppl   184.573
----------------------------------------------------------------------------------------------------
| Eval  13 at step     5200 | time: 135.52s | valid loss  4.87 | valid ppl   130.947
----------------------------------------------------------------------------------------------------
| epoch  13 step     5250 |     18 batches | lr 0.000348 | ms/batch 447.83 | loss  5.27 | ppl   194.037
| epoch  13 step     5300 |     68 batches | lr 0.000348 | ms/batch 327.22 | loss  5.17 | ppl   175.448
| epoch  13 step     5350 |    118 batches | lr 0.000348 | ms/batch 320.56 | loss  5.20 | ppl   181.329
| epoch  13 step     5400 |    168 batches | lr 0.000348 | ms/batch 320.35 | loss  5.18 | ppl   177.975
| epoch  13 step     5450 |    218 batches | lr 0.000348 | ms/batch 323.86 | loss  5.24 | ppl   188.023
| epoch  13 step     5500 |    268 batches | lr 0.000348 | ms/batch 332.03 | loss  5.20 | ppl   180.806
| epoch  13 step     5550 |    318 batches | lr 0.000348 | ms/batch 320.86 | loss  5.15 | ppl   173.160
| epoch  13 step     5600 |    368 batches | lr 0.000348 | ms/batch 320.97 | loss  5.14 | ppl   170.876
----------------------------------------------------------------------------------------------------
| Eval  14 at step     5600 | time: 134.09s | valid loss  4.85 | valid ppl   127.389
----------------------------------------------------------------------------------------------------
| epoch  13 step     5650 |    418 batches | lr 0.000348 | ms/batch 459.31 | loss  5.17 | ppl   175.764
| epoch  14 step     5700 |     32 batches | lr 0.000348 | ms/batch 315.17 | loss  5.17 | ppl   176.783
| epoch  14 step     5750 |     82 batches | lr 0.000348 | ms/batch 320.67 | loss  5.11 | ppl   165.852
| epoch  14 step     5800 |    132 batches | lr 0.000348 | ms/batch 320.53 | loss  5.12 | ppl   167.335
| epoch  14 step     5850 |    182 batches | lr 0.000348 | ms/batch 322.17 | loss  5.14 | ppl   170.037
| epoch  14 step     5900 |    232 batches | lr 0.000348 | ms/batch 321.29 | loss  5.16 | ppl   173.418
| epoch  14 step     5950 |    282 batches | lr 0.000348 | ms/batch 321.54 | loss  5.17 | ppl   175.750
| epoch  14 step     6000 |    332 batches | lr 0.000348 | ms/batch 320.69 | loss  5.08 | ppl   160.623
----------------------------------------------------------------------------------------------------
| Eval  15 at step     6000 | time: 133.41s | valid loss  4.80 | valid ppl   122.112
----------------------------------------------------------------------------------------------------
| epoch  14 step     6050 |    382 batches | lr 0.000348 | ms/batch 468.43 | loss  5.11 | ppl   166.150
| epoch  14 step     6100 |    432 batches | lr 0.000348 | ms/batch 320.27 | loss  5.10 | ppl   164.497
| epoch  15 step     6150 |     46 batches | lr 0.000348 | ms/batch 313.71 | loss  5.11 | ppl   165.955
| epoch  15 step     6200 |     96 batches | lr 0.000348 | ms/batch 319.49 | loss  5.09 | ppl   162.733
| epoch  15 step     6250 |    146 batches | lr 0.000348 | ms/batch 319.97 | loss  5.06 | ppl   156.939
| epoch  15 step     6300 |    196 batches | lr 0.000348 | ms/batch 319.38 | loss  5.13 | ppl   169.017
| epoch  15 step     6350 |    246 batches | lr 0.000348 | ms/batch 319.25 | loss  5.09 | ppl   163.204
| epoch  15 step     6400 |    296 batches | lr 0.000348 | ms/batch 319.68 | loss  5.15 | ppl   171.800
----------------------------------------------------------------------------------------------------
| Eval  16 at step     6400 | time: 132.63s | valid loss  4.77 | valid ppl   117.592
----------------------------------------------------------------------------------------------------
| epoch  15 step     6450 |    346 batches | lr 0.000348 | ms/batch 470.49 | loss  5.01 | ppl   149.320
| epoch  15 step     6500 |    396 batches | lr 0.000348 | ms/batch 337.12 | loss  5.07 | ppl   158.393
| epoch  16 step     6550 |     10 batches | lr 0.000348 | ms/batch 318.82 | loss  5.10 | ppl   164.317
| epoch  16 step     6600 |     60 batches | lr 0.000348 | ms/batch 320.80 | loss  4.99 | ppl   147.546
| epoch  16 step     6650 |    110 batches | lr 0.000348 | ms/batch 320.25 | loss  5.04 | ppl   154.966
| epoch  16 step     6700 |    160 batches | lr 0.000348 | ms/batch 326.57 | loss  5.04 | ppl   154.349
| epoch  16 step     6750 |    210 batches | lr 0.000347 | ms/batch 325.34 | loss  5.06 | ppl   157.406
| epoch  16 step     6800 |    260 batches | lr 0.000347 | ms/batch 329.25 | loss  5.04 | ppl   154.651
----------------------------------------------------------------------------------------------------
| Eval  17 at step     6800 | time: 136.03s | valid loss  4.75 | valid ppl   115.109
----------------------------------------------------------------------------------------------------
| epoch  16 step     6850 |    310 batches | lr 0.000347 | ms/batch 460.82 | loss  5.07 | ppl   158.665
| epoch  16 step     6900 |    360 batches | lr 0.000347 | ms/batch 320.96 | loss  5.00 | ppl   149.087
| epoch  16 step     6950 |    410 batches | lr 0.000347 | ms/batch 320.74 | loss  5.01 | ppl   149.425
| epoch  17 step     7000 |     24 batches | lr 0.000347 | ms/batch 314.09 | loss  5.08 | ppl   160.172
| epoch  17 step     7050 |     74 batches | lr 0.000347 | ms/batch 329.07 | loss  4.97 | ppl   143.813
| epoch  17 step     7100 |    124 batches | lr 0.000347 | ms/batch 323.27 | loss  4.99 | ppl   147.615
| epoch  17 step     7150 |    174 batches | lr 0.000347 | ms/batch 321.72 | loss  5.01 | ppl   149.437
| epoch  17 step     7200 |    224 batches | lr 0.000347 | ms/batch 324.10 | loss  5.03 | ppl   152.551
----------------------------------------------------------------------------------------------------
| Eval  18 at step     7200 | time: 133.81s | valid loss  4.72 | valid ppl   111.902
----------------------------------------------------------------------------------------------------
| epoch  17 step     7250 |    274 batches | lr 0.000347 | ms/batch 453.17 | loss  5.01 | ppl   149.589
| epoch  17 step     7300 |    324 batches | lr 0.000347 | ms/batch 320.08 | loss  4.99 | ppl   146.352
| epoch  17 step     7350 |    374 batches | lr 0.000347 | ms/batch 320.08 | loss  4.97 | ppl   144.647
| epoch  17 step     7400 |    424 batches | lr 0.000347 | ms/batch 320.82 | loss  4.98 | ppl   145.032
| epoch  18 step     7450 |     38 batches | lr 0.000347 | ms/batch 315.28 | loss  5.02 | ppl   150.762
| epoch  18 step     7500 |     88 batches | lr 0.000347 | ms/batch 321.79 | loss  4.96 | ppl   142.661
| epoch  18 step     7550 |    138 batches | lr 0.000347 | ms/batch 321.93 | loss  4.97 | ppl   144.568
| epoch  18 step     7600 |    188 batches | lr 0.000347 | ms/batch 319.98 | loss  4.99 | ppl   147.592
----------------------------------------------------------------------------------------------------
| Eval  19 at step     7600 | time: 133.07s | valid loss  4.69 | valid ppl   109.363
----------------------------------------------------------------------------------------------------
| epoch  18 step     7650 |    238 batches | lr 0.000347 | ms/batch 468.70 | loss  4.99 | ppl   147.558
| epoch  18 step     7700 |    288 batches | lr 0.000347 | ms/batch 319.24 | loss  5.02 | ppl   151.933
| epoch  18 step     7750 |    338 batches | lr 0.000347 | ms/batch 322.31 | loss  4.88 | ppl   132.167
| epoch  18 step     7800 |    388 batches | lr 0.000347 | ms/batch 320.99 | loss  4.95 | ppl   140.767
| epoch  19 step     7850 |      2 batches | lr 0.000347 | ms/batch 314.15 | loss  4.98 | ppl   145.202
| epoch  19 step     7900 |     52 batches | lr 0.000347 | ms/batch 321.05 | loss  4.94 | ppl   139.214
| epoch  19 step     7950 |    102 batches | lr 0.000347 | ms/batch 320.37 | loss  4.94 | ppl   140.230
| epoch  19 step     8000 |    152 batches | lr 0.000346 | ms/batch 321.47 | loss  4.93 | ppl   137.872
----------------------------------------------------------------------------------------------------
| Eval  20 at step     8000 | time: 133.99s | valid loss  4.68 | valid ppl   108.147
----------------------------------------------------------------------------------------------------
| epoch  19 step     8050 |    202 batches | lr 0.000346 | ms/batch 457.36 | loss  4.97 | ppl   143.656
| epoch  19 step     8100 |    252 batches | lr 0.000346 | ms/batch 320.30 | loss  4.94 | ppl   140.317
| epoch  19 step     8150 |    302 batches | lr 0.000346 | ms/batch 319.72 | loss  4.99 | ppl   146.306
| epoch  19 step     8200 |    352 batches | lr 0.000346 | ms/batch 320.11 | loss  4.86 | ppl   128.783
| epoch  19 step     8250 |    402 batches | lr 0.000346 | ms/batch 321.25 | loss  4.94 | ppl   139.334
| epoch  20 step     8300 |     16 batches | lr 0.000346 | ms/batch 316.07 | loss  4.97 | ppl   143.353
| epoch  20 step     8350 |     66 batches | lr 0.000346 | ms/batch 320.96 | loss  4.88 | ppl   131.703
| epoch  20 step     8400 |    116 batches | lr 0.000346 | ms/batch 320.40 | loss  4.91 | ppl   135.905
----------------------------------------------------------------------------------------------------
| Eval  21 at step     8400 | time: 133.13s | valid loss  4.66 | valid ppl   105.756
----------------------------------------------------------------------------------------------------
| epoch  20 step     8450 |    166 batches | lr 0.000346 | ms/batch 453.70 | loss  4.92 | ppl   136.330
| epoch  20 step     8500 |    216 batches | lr 0.000346 | ms/batch 326.96 | loss  4.94 | ppl   139.836
| epoch  20 step     8550 |    266 batches | lr 0.000346 | ms/batch 334.90 | loss  4.93 | ppl   138.823
| epoch  20 step     8600 |    316 batches | lr 0.000346 | ms/batch 333.52 | loss  4.92 | ppl   136.981
| epoch  20 step     8650 |    366 batches | lr 0.000346 | ms/batch 320.07 | loss  4.86 | ppl   129.307
| epoch  20 step     8700 |    416 batches | lr 0.000346 | ms/batch 320.06 | loss  4.91 | ppl   135.639
| epoch  21 step     8750 |     30 batches | lr 0.000346 | ms/batch 315.22 | loss  4.92 | ppl   136.874
| epoch  21 step     8800 |     80 batches | lr 0.000346 | ms/batch 320.33 | loss  4.85 | ppl   128.351
----------------------------------------------------------------------------------------------------
| Eval  22 at step     8800 | time: 134.60s | valid loss  4.65 | valid ppl   104.752
----------------------------------------------------------------------------------------------------
| epoch  21 step     8850 |    130 batches | lr 0.000346 | ms/batch 465.51 | loss  4.88 | ppl   132.208
| epoch  21 step     8900 |    180 batches | lr 0.000346 | ms/batch 320.46 | loss  4.87 | ppl   130.811
| epoch  21 step     8950 |    230 batches | lr 0.000346 | ms/batch 319.75 | loss  4.93 | ppl   138.965
| epoch  21 step     9000 |    280 batches | lr 0.000346 | ms/batch 321.67 | loss  4.91 | ppl   136.075
| epoch  21 step     9050 |    330 batches | lr 0.000345 | ms/batch 320.71 | loss  4.84 | ppl   126.420
| epoch  21 step     9100 |    380 batches | lr 0.000345 | ms/batch 319.53 | loss  4.86 | ppl   129.054
| epoch  21 step     9150 |    430 batches | lr 0.000345 | ms/batch 319.58 | loss  4.89 | ppl   133.057
| epoch  22 step     9200 |     44 batches | lr 0.000345 | ms/batch 313.04 | loss  4.85 | ppl   127.661
----------------------------------------------------------------------------------------------------
| Eval  23 at step     9200 | time: 132.85s | valid loss  4.61 | valid ppl    99.984
----------------------------------------------------------------------------------------------------
| epoch  22 step     9250 |     94 batches | lr 0.000345 | ms/batch 486.88 | loss  4.82 | ppl   123.955
| epoch  22 step     9300 |    144 batches | lr 0.000345 | ms/batch 326.21 | loss  4.85 | ppl   127.352
| epoch  22 step     9350 |    194 batches | lr 0.000345 | ms/batch 322.80 | loss  4.86 | ppl   129.661
| epoch  22 step     9400 |    244 batches | lr 0.000345 | ms/batch 336.77 | loss  4.91 | ppl   135.745
| epoch  22 step     9450 |    294 batches | lr 0.000345 | ms/batch 331.36 | loss  4.90 | ppl   134.910
| epoch  22 step     9500 |    344 batches | lr 0.000345 | ms/batch 328.30 | loss  4.78 | ppl   119.281
| epoch  22 step     9550 |    394 batches | lr 0.000345 | ms/batch 319.84 | loss  4.85 | ppl   127.591
| epoch  23 step     9600 |      8 batches | lr 0.000345 | ms/batch 314.56 | loss  4.88 | ppl   132.053
----------------------------------------------------------------------------------------------------
| Eval  24 at step     9600 | time: 137.06s | valid loss  4.60 | valid ppl    99.317
----------------------------------------------------------------------------------------------------
| epoch  23 step     9650 |     58 batches | lr 0.000345 | ms/batch 479.86 | loss  4.81 | ppl   123.279
| epoch  23 step     9700 |    108 batches | lr 0.000345 | ms/batch 319.79 | loss  4.82 | ppl   123.984
| epoch  23 step     9750 |    158 batches | lr 0.000345 | ms/batch 320.79 | loss  4.82 | ppl   123.463
| epoch  23 step     9800 |    208 batches | lr 0.000345 | ms/batch 322.13 | loss  4.86 | ppl   129.075
| epoch  23 step     9850 |    258 batches | lr 0.000345 | ms/batch 321.00 | loss  4.87 | ppl   130.464
| epoch  23 step     9900 |    308 batches | lr 0.000345 | ms/batch 319.79 | loss  4.85 | ppl   127.920
| epoch  23 step     9950 |    358 batches | lr 0.000345 | ms/batch 320.67 | loss  4.77 | ppl   118.344
| epoch  23 step    10000 |    408 batches | lr 0.000345 | ms/batch 320.57 | loss  4.83 | ppl   124.869
----------------------------------------------------------------------------------------------------
| Eval  25 at step    10000 | time: 133.27s | valid loss  4.62 | valid ppl   101.610
----------------------------------------------------------------------------------------------------
| epoch  24 step    10050 |     22 batches | lr 0.000344 | ms/batch 412.89 | loss  4.87 | ppl   130.331
| epoch  24 step    10100 |     72 batches | lr 0.000344 | ms/batch 319.92 | loss  4.79 | ppl   119.992
| epoch  24 step    10150 |    122 batches | lr 0.000344 | ms/batch 318.61 | loss  4.82 | ppl   124.140
| epoch  24 step    10200 |    172 batches | lr 0.000344 | ms/batch 320.53 | loss  4.81 | ppl   122.895
| epoch  24 step    10250 |    222 batches | lr 0.000344 | ms/batch 320.15 | loss  4.84 | ppl   126.045
| epoch  24 step    10300 |    272 batches | lr 0.000344 | ms/batch 318.39 | loss  4.83 | ppl   125.231
| epoch  24 step    10350 |    322 batches | lr 0.000344 | ms/batch 318.97 | loss  4.81 | ppl   122.244
| epoch  24 step    10400 |    372 batches | lr 0.000344 | ms/batch 319.18 | loss  4.78 | ppl   118.779
----------------------------------------------------------------------------------------------------
| Eval  26 at step    10400 | time: 132.45s | valid loss  4.59 | valid ppl    98.254
----------------------------------------------------------------------------------------------------
| epoch  24 step    10450 |    422 batches | lr 0.000344 | ms/batch 477.10 | loss  4.81 | ppl   122.511
| epoch  25 step    10500 |     36 batches | lr 0.000344 | ms/batch 313.86 | loss  4.81 | ppl   122.483
| epoch  25 step    10550 |     86 batches | lr 0.000344 | ms/batch 319.80 | loss  4.77 | ppl   117.671
| epoch  25 step    10600 |    136 batches | lr 0.000344 | ms/batch 320.05 | loss  4.80 | ppl   120.914
| epoch  25 step    10650 |    186 batches | lr 0.000344 | ms/batch 322.20 | loss  4.82 | ppl   124.178
| epoch  25 step    10700 |    236 batches | lr 0.000344 | ms/batch 324.13 | loss  4.81 | ppl   122.396
| epoch  25 step    10750 |    286 batches | lr 0.000344 | ms/batch 326.53 | loss  4.86 | ppl   129.004
| epoch  25 step    10800 |    336 batches | lr 0.000344 | ms/batch 320.74 | loss  4.71 | ppl   111.130
----------------------------------------------------------------------------------------------------
| Eval  27 at step    10800 | time: 133.50s | valid loss  4.57 | valid ppl    96.857
----------------------------------------------------------------------------------------------------
| epoch  25 step    10850 |    386 batches | lr 0.000344 | ms/batch 479.23 | loss  4.80 | ppl   121.416
| epoch  25 step    10900 |    436 batches | lr 0.000343 | ms/batch 318.20 | loss  4.80 | ppl   120.952
| epoch  26 step    10950 |     50 batches | lr 0.000343 | ms/batch 318.46 | loss  4.76 | ppl   116.691
| epoch  26 step    11000 |    100 batches | lr 0.000343 | ms/batch 321.57 | loss  4.75 | ppl   115.323
| epoch  26 step    11050 |    150 batches | lr 0.000343 | ms/batch 320.45 | loss  4.78 | ppl   119.533
| epoch  26 step    11100 |    200 batches | lr 0.000343 | ms/batch 319.87 | loss  4.78 | ppl   119.151
| epoch  26 step    11150 |    250 batches | lr 0.000343 | ms/batch 319.30 | loss  4.80 | ppl   121.634
| epoch  26 step    11200 |    300 batches | lr 0.000343 | ms/batch 320.38 | loss  4.82 | ppl   123.801
----------------------------------------------------------------------------------------------------
| Eval  28 at step    11200 | time: 132.99s | valid loss  4.55 | valid ppl    94.898
----------------------------------------------------------------------------------------------------
| epoch  26 step    11250 |    350 batches | lr 0.000343 | ms/batch 497.15 | loss  4.69 | ppl   108.344
| epoch  26 step    11300 |    400 batches | lr 0.000343 | ms/batch 319.64 | loss  4.77 | ppl   117.892
| epoch  27 step    11350 |     14 batches | lr 0.000343 | ms/batch 313.78 | loss  4.81 | ppl   122.655
| epoch  27 step    11400 |     64 batches | lr 0.000343 | ms/batch 320.92 | loss  4.73 | ppl   113.340
| epoch  27 step    11450 |    114 batches | lr 0.000343 | ms/batch 320.39 | loss  4.74 | ppl   114.613
| epoch  27 step    11500 |    164 batches | lr 0.000343 | ms/batch 320.98 | loss  4.75 | ppl   115.341
| epoch  27 step    11550 |    214 batches | lr 0.000343 | ms/batch 321.50 | loss  4.78 | ppl   119.393
| epoch  27 step    11600 |    264 batches | lr 0.000343 | ms/batch 320.60 | loss  4.79 | ppl   120.001
----------------------------------------------------------------------------------------------------
| Eval  29 at step    11600 | time: 133.04s | valid loss  4.56 | valid ppl    95.812
----------------------------------------------------------------------------------------------------
| epoch  27 step    11650 |    314 batches | lr 0.000343 | ms/batch 424.80 | loss  4.76 | ppl   117.249
| epoch  27 step    11700 |    364 batches | lr 0.000342 | ms/batch 319.64 | loss  4.70 | ppl   109.638
| epoch  27 step    11750 |    414 batches | lr 0.000342 | ms/batch 321.15 | loss  4.75 | ppl   115.089
| epoch  28 step    11800 |     28 batches | lr 0.000342 | ms/batch 315.02 | loss  4.77 | ppl   118.011
| epoch  28 step    11850 |     78 batches | lr 0.000342 | ms/batch 320.69 | loss  4.69 | ppl   109.100
| epoch  28 step    11900 |    128 batches | lr 0.000342 | ms/batch 320.03 | loss  4.73 | ppl   113.048
| epoch  28 step    11950 |    178 batches | lr 0.000342 | ms/batch 318.37 | loss  4.74 | ppl   114.131
| epoch  28 step    12000 |    228 batches | lr 0.000342 | ms/batch 320.45 | loss  4.78 | ppl   119.151
----------------------------------------------------------------------------------------------------
| Eval  30 at step    12000 | time: 133.02s | valid loss  4.55 | valid ppl    94.741
----------------------------------------------------------------------------------------------------
| epoch  28 step    12050 |    278 batches | lr 0.000342 | ms/batch 455.95 | loss  4.78 | ppl   119.011
| epoch  28 step    12100 |    328 batches | lr 0.000342 | ms/batch 322.22 | loss  4.71 | ppl   111.557
| epoch  28 step    12150 |    378 batches | lr 0.000342 | ms/batch 321.50 | loss  4.71 | ppl   111.417
| epoch  28 step    12200 |    428 batches | lr 0.000342 | ms/batch 321.14 | loss  4.73 | ppl   113.846
| epoch  29 step    12250 |     42 batches | lr 0.000342 | ms/batch 314.68 | loss  4.73 | ppl   113.819
| epoch  29 step    12300 |     92 batches | lr 0.000342 | ms/batch 318.82 | loss  4.70 | ppl   109.613
| epoch  29 step    12350 |    142 batches | lr 0.000342 | ms/batch 320.09 | loss  4.70 | ppl   109.638
| epoch  29 step    12400 |    192 batches | lr 0.000342 | ms/batch 321.30 | loss  4.75 | ppl   115.792
----------------------------------------------------------------------------------------------------
| Eval  31 at step    12400 | time: 133.17s | valid loss  4.54 | valid ppl    94.160
----------------------------------------------------------------------------------------------------
| epoch  29 step    12450 |    242 batches | lr 0.000342 | ms/batch 454.63 | loss  4.76 | ppl   116.182
| epoch  29 step    12500 |    292 batches | lr 0.000341 | ms/batch 321.51 | loss  4.76 | ppl   117.276
| epoch  29 step    12550 |    342 batches | lr 0.000341 | ms/batch 321.99 | loss  4.65 | ppl   104.781
| epoch  29 step    12600 |    392 batches | lr 0.000341 | ms/batch 322.13 | loss  4.72 | ppl   112.511
| epoch  30 step    12650 |      6 batches | lr 0.000341 | ms/batch 323.16 | loss  4.72 | ppl   112.563
| epoch  30 step    12700 |     56 batches | lr 0.000341 | ms/batch 320.37 | loss  4.69 | ppl   108.480
| epoch  30 step    12750 |    106 batches | lr 0.000341 | ms/batch 320.26 | loss  4.69 | ppl   108.429
| epoch  30 step    12800 |    156 batches | lr 0.000341 | ms/batch 321.02 | loss  4.72 | ppl   112.300
----------------------------------------------------------------------------------------------------
| Eval  32 at step    12800 | time: 133.53s | valid loss  4.51 | valid ppl    91.312
----------------------------------------------------------------------------------------------------
| epoch  30 step    12850 |    206 batches | lr 0.000341 | ms/batch 455.30 | loss  4.73 | ppl   113.739
| epoch  30 step    12900 |    256 batches | lr 0.000341 | ms/batch 319.69 | loss  4.72 | ppl   112.159
| epoch  30 step    12950 |    306 batches | lr 0.000341 | ms/batch 323.63 | loss  4.75 | ppl   115.035
| epoch  30 step    13000 |    356 batches | lr 0.000341 | ms/batch 321.39 | loss  4.66 | ppl   105.851
| epoch  30 step    13050 |    406 batches | lr 0.000341 | ms/batch 320.06 | loss  4.67 | ppl   106.806
| epoch  31 step    13100 |     20 batches | lr 0.000341 | ms/batch 314.79 | loss  4.73 | ppl   113.837
| epoch  31 step    13150 |     70 batches | lr 0.000341 | ms/batch 320.95 | loss  4.65 | ppl   105.011
| epoch  31 step    13200 |    120 batches | lr 0.00034 | ms/batch 320.14 | loss  4.68 | ppl   107.762
----------------------------------------------------------------------------------------------------
| Eval  33 at step    13200 | time: 133.12s | valid loss  4.52 | valid ppl    92.280
----------------------------------------------------------------------------------------------------
| epoch  31 step    13250 |    170 batches | lr 0.00034 | ms/batch 420.61 | loss  4.68 | ppl   107.400
| epoch  31 step    13300 |    220 batches | lr 0.00034 | ms/batch 320.43 | loss  4.72 | ppl   112.142
| epoch  31 step    13350 |    270 batches | lr 0.00034 | ms/batch 320.67 | loss  4.71 | ppl   110.541
| epoch  31 step    13400 |    320 batches | lr 0.00034 | ms/batch 320.91 | loss  4.69 | ppl   109.194
| epoch  31 step    13450 |    370 batches | lr 0.00034 | ms/batch 319.97 | loss  4.67 | ppl   106.648
| epoch  31 step    13500 |    420 batches | lr 0.00034 | ms/batch 326.11 | loss  4.67 | ppl   106.423
| epoch  32 step    13550 |     34 batches | lr 0.00034 | ms/batch 314.19 | loss  4.71 | ppl   111.452
| epoch  32 step    13600 |     84 batches | lr 0.00034 | ms/batch 320.80 | loss  4.63 | ppl   102.482
----------------------------------------------------------------------------------------------------
| Eval  34 at step    13600 | time: 133.19s | valid loss  4.51 | valid ppl    90.473
----------------------------------------------------------------------------------------------------
| epoch  32 step    13650 |    134 batches | lr 0.00034 | ms/batch 454.81 | loss  4.68 | ppl   107.787
| epoch  32 step    13700 |    184 batches | lr 0.00034 | ms/batch 320.43 | loss  4.69 | ppl   108.998
| epoch  32 step    13750 |    234 batches | lr 0.00034 | ms/batch 321.40 | loss  4.68 | ppl   108.167
| epoch  32 step    13800 |    284 batches | lr 0.00034 | ms/batch 320.93 | loss  4.72 | ppl   112.238
| epoch  32 step    13850 |    334 batches | lr 0.00034 | ms/batch 317.81 | loss  4.60 | ppl    99.244
| epoch  32 step    13900 |    384 batches | lr 0.000339 | ms/batch 320.36 | loss  4.67 | ppl   107.082
| epoch  32 step    13950 |    434 batches | lr 0.000339 | ms/batch 320.25 | loss  4.68 | ppl   107.602
| epoch  33 step    14000 |     48 batches | lr 0.000339 | ms/batch 315.11 | loss  4.66 | ppl   105.669
----------------------------------------------------------------------------------------------------
| Eval  35 at step    14000 | time: 133.11s | valid loss  4.50 | valid ppl    90.374
----------------------------------------------------------------------------------------------------
| epoch  33 step    14050 |     98 batches | lr 0.000339 | ms/batch 461.27 | loss  4.64 | ppl   103.868
| epoch  33 step    14100 |    148 batches | lr 0.000339 | ms/batch 321.13 | loss  4.66 | ppl   105.909
| epoch  33 step    14150 |    198 batches | lr 0.000339 | ms/batch 320.73 | loss  4.68 | ppl   108.310
| epoch  33 step    14200 |    248 batches | lr 0.000339 | ms/batch 320.93 | loss  4.68 | ppl   107.602
| epoch  33 step    14250 |    298 batches | lr 0.000339 | ms/batch 320.32 | loss  4.72 | ppl   111.670
| epoch  33 step    14300 |    348 batches | lr 0.000339 | ms/batch 320.90 | loss  4.59 | ppl    98.371
| epoch  33 step    14350 |    398 batches | lr 0.000339 | ms/batch 320.81 | loss  4.65 | ppl   104.978
| epoch  34 step    14400 |     12 batches | lr 0.000339 | ms/batch 315.12 | loss  4.68 | ppl   108.192
----------------------------------------------------------------------------------------------------
| Eval  36 at step    14400 | time: 133.18s | valid loss  4.50 | valid ppl    89.690
----------------------------------------------------------------------------------------------------
| epoch  34 step    14450 |     62 batches | lr 0.000339 | ms/batch 459.53 | loss  4.61 | ppl   100.571
| epoch  34 step    14500 |    112 batches | lr 0.000339 | ms/batch 326.00 | loss  4.63 | ppl   102.322
| epoch  34 step    14550 |    162 batches | lr 0.000338 | ms/batch 321.91 | loss  4.65 | ppl   104.218
| epoch  34 step    14600 |    212 batches | lr 0.000338 | ms/batch 322.64 | loss  4.67 | ppl   106.798
| epoch  34 step    14650 |    262 batches | lr 0.000338 | ms/batch 321.50 | loss  4.66 | ppl   105.488
| epoch  34 step    14700 |    312 batches | lr 0.000338 | ms/batch 319.98 | loss  4.65 | ppl   104.781
| epoch  34 step    14750 |    362 batches | lr 0.000338 | ms/batch 321.05 | loss  4.60 | ppl    99.757
| epoch  34 step    14800 |    412 batches | lr 0.000338 | ms/batch 320.85 | loss  4.62 | ppl   101.772
----------------------------------------------------------------------------------------------------
| Eval  37 at step    14800 | time: 133.92s | valid loss  4.49 | valid ppl    89.372
----------------------------------------------------------------------------------------------------
| epoch  35 step    14850 |     26 batches | lr 0.000338 | ms/batch 489.52 | loss  4.68 | ppl   108.251
| epoch  35 step    14900 |     76 batches | lr 0.000338 | ms/batch 320.39 | loss  4.60 | ppl    99.166
| epoch  35 step    14950 |    126 batches | lr 0.000338 | ms/batch 319.63 | loss  4.64 | ppl   103.544
| epoch  35 step    15000 |    176 batches | lr 0.000338 | ms/batch 320.91 | loss  4.62 | ppl   101.320
| epoch  35 step    15050 |    226 batches | lr 0.000338 | ms/batch 323.77 | loss  4.66 | ppl   105.331
| epoch  35 step    15100 |    276 batches | lr 0.000338 | ms/batch 319.91 | loss  4.65 | ppl   104.462
| epoch  35 step    15150 |    326 batches | lr 0.000337 | ms/batch 326.86 | loss  4.62 | ppl   101.383
| epoch  35 step    15200 |    376 batches | lr 0.000337 | ms/batch 336.71 | loss  4.60 | ppl    99.913
----------------------------------------------------------------------------------------------------
| Eval  38 at step    15200 | time: 134.58s | valid loss  4.48 | valid ppl    87.932
----------------------------------------------------------------------------------------------------
| epoch  35 step    15250 |    426 batches | lr 0.000337 | ms/batch 488.34 | loss  4.65 | ppl   104.080
| epoch  36 step    15300 |     40 batches | lr 0.000337 | ms/batch 314.35 | loss  4.64 | ppl   103.415
| epoch  36 step    15350 |     90 batches | lr 0.000337 | ms/batch 322.67 | loss  4.57 | ppl    96.582
| epoch  36 step    15400 |    140 batches | lr 0.000337 | ms/batch 320.10 | loss  4.60 | ppl    99.788
| epoch  36 step    15450 |    190 batches | lr 0.000337 | ms/batch 320.55 | loss  4.63 | ppl   102.075
| epoch  36 step    15500 |    240 batches | lr 0.000337 | ms/batch 320.56 | loss  4.64 | ppl   103.787
| epoch  36 step    15550 |    290 batches | lr 0.000337 | ms/batch 321.33 | loss  4.68 | ppl   107.400
| epoch  36 step    15600 |    340 batches | lr 0.000337 | ms/batch 321.83 | loss  4.55 | ppl    94.988
----------------------------------------------------------------------------------------------------
| Eval  39 at step    15600 | time: 133.26s | valid loss  4.47 | valid ppl    87.480
----------------------------------------------------------------------------------------------------
| epoch  36 step    15650 |    390 batches | lr 0.000337 | ms/batch 470.65 | loss  4.61 | ppl   100.885
| epoch  37 step    15700 |      4 batches | lr 0.000337 | ms/batch 331.74 | loss  4.65 | ppl   104.659
| epoch  37 step    15750 |     54 batches | lr 0.000336 | ms/batch 335.22 | loss  4.58 | ppl    97.438
| epoch  37 step    15800 |    104 batches | lr 0.000336 | ms/batch 325.73 | loss  4.59 | ppl    98.741
| epoch  37 step    15850 |    154 batches | lr 0.000336 | ms/batch 320.94 | loss  4.61 | ppl   100.257
| epoch  37 step    15900 |    204 batches | lr 0.000336 | ms/batch 320.12 | loss  4.62 | ppl   101.955
| epoch  37 step    15950 |    254 batches | lr 0.000336 | ms/batch 320.84 | loss  4.61 | ppl   100.116
| epoch  37 step    16000 |    304 batches | lr 0.000336 | ms/batch 333.51 | loss  4.67 | ppl   106.940
----------------------------------------------------------------------------------------------------
| Eval  40 at step    16000 | time: 135.74s | valid loss  4.47 | valid ppl    87.345
----------------------------------------------------------------------------------------------------
| epoch  37 step    16050 |    354 batches | lr 0.000336 | ms/batch 473.62 | loss  4.54 | ppl    93.720
| epoch  37 step    16100 |    404 batches | lr 0.000336 | ms/batch 321.42 | loss  4.61 | ppl   100.045
| epoch  38 step    16150 |     18 batches | lr 0.000336 | ms/batch 324.95 | loss  4.65 | ppl   104.088
| epoch  38 step    16200 |     68 batches | lr 0.000336 | ms/batch 327.90 | loss  4.56 | ppl    95.531
| epoch  38 step    16250 |    118 batches | lr 0.000336 | ms/batch 321.31 | loss  4.58 | ppl    97.812
| epoch  38 step    16300 |    168 batches | lr 0.000336 | ms/batch 320.05 | loss  4.59 | ppl    98.942
| epoch  38 step    16350 |    218 batches | lr 0.000335 | ms/batch 320.84 | loss  4.64 | ppl   103.674
| epoch  38 step    16400 |    268 batches | lr 0.000335 | ms/batch 321.06 | loss  4.62 | ppl   101.059
----------------------------------------------------------------------------------------------------
| Eval  41 at step    16400 | time: 134.64s | valid loss  4.47 | valid ppl    87.433
----------------------------------------------------------------------------------------------------
| epoch  38 step    16450 |    318 batches | lr 0.000335 | ms/batch 420.74 | loss  4.61 | ppl   100.280
| epoch  38 step    16500 |    368 batches | lr 0.000335 | ms/batch 321.47 | loss  4.57 | ppl    96.567
| epoch  38 step    16550 |    418 batches | lr 0.000335 | ms/batch 322.04 | loss  4.57 | ppl    96.145
| epoch  39 step    16600 |     32 batches | lr 0.000335 | ms/batch 314.74 | loss  4.63 | ppl   102.907
| epoch  39 step    16650 |     82 batches | lr 0.000335 | ms/batch 322.17 | loss  4.55 | ppl    94.573
| epoch  39 step    16700 |    132 batches | lr 0.000335 | ms/batch 320.56 | loss  4.59 | ppl    98.602
| epoch  39 step    16750 |    182 batches | lr 0.000335 | ms/batch 320.57 | loss  4.58 | ppl    97.294
| epoch  39 step    16800 |    232 batches | lr 0.000335 | ms/batch 320.16 | loss  4.62 | ppl   101.740
----------------------------------------------------------------------------------------------------
| Eval  42 at step    16800 | time: 133.14s | valid loss  4.46 | valid ppl    86.428
----------------------------------------------------------------------------------------------------
| epoch  39 step    16850 |    282 batches | lr 0.000335 | ms/batch 470.45 | loss  4.61 | ppl   100.901
| epoch  39 step    16900 |    332 batches | lr 0.000334 | ms/batch 324.81 | loss  4.55 | ppl    94.684
| epoch  39 step    16950 |    382 batches | lr 0.000334 | ms/batch 327.93 | loss  4.58 | ppl    97.408
| epoch  39 step    17000 |    432 batches | lr 0.000334 | ms/batch 320.03 | loss  4.59 | ppl    98.487
| epoch  40 step    17050 |     46 batches | lr 0.000334 | ms/batch 313.90 | loss  4.57 | ppl    96.401
| epoch  40 step    17100 |     96 batches | lr 0.000334 | ms/batch 320.51 | loss  4.52 | ppl    91.778
| epoch  40 step    17150 |    146 batches | lr 0.000334 | ms/batch 320.52 | loss  4.56 | ppl    95.785
| epoch  40 step    17200 |    196 batches | lr 0.000334 | ms/batch 319.54 | loss  4.60 | ppl    99.749
----------------------------------------------------------------------------------------------------
| Eval  43 at step    17200 | time: 133.41s | valid loss  4.43 | valid ppl    84.326
----------------------------------------------------------------------------------------------------
| epoch  40 step    17250 |    246 batches | lr 0.000334 | ms/batch 469.09 | loss  4.61 | ppl   100.870
| epoch  40 step    17300 |    296 batches | lr 0.000334 | ms/batch 319.53 | loss  4.62 | ppl   101.138
| epoch  40 step    17350 |    346 batches | lr 0.000334 | ms/batch 318.17 | loss  4.49 | ppl    88.892
| epoch  40 step    17400 |    396 batches | lr 0.000334 | ms/batch 320.51 | loss  4.57 | ppl    96.680
| epoch  41 step    17450 |     10 batches | lr 0.000333 | ms/batch 314.40 | loss  4.63 | ppl   102.522
| epoch  41 step    17500 |     60 batches | lr 0.000333 | ms/batch 319.84 | loss  4.53 | ppl    92.744
| epoch  41 step    17550 |    110 batches | lr 0.000333 | ms/batch 320.75 | loss  4.55 | ppl    94.840
| epoch  41 step    17600 |    160 batches | lr 0.000333 | ms/batch 321.38 | loss  4.56 | ppl    96.055
----------------------------------------------------------------------------------------------------
| Eval  44 at step    17600 | time: 132.71s | valid loss  4.43 | valid ppl    83.899
----------------------------------------------------------------------------------------------------
| epoch  41 step    17650 |    210 batches | lr 0.000333 | ms/batch 453.16 | loss  4.57 | ppl    96.115
| epoch  41 step    17700 |    260 batches | lr 0.000333 | ms/batch 320.44 | loss  4.57 | ppl    96.514
| epoch  41 step    17750 |    310 batches | lr 0.000333 | ms/batch 320.73 | loss  4.58 | ppl    97.720
| epoch  41 step    17800 |    360 batches | lr 0.000333 | ms/batch 320.54 | loss  4.52 | ppl    92.065
| epoch  41 step    17850 |    410 batches | lr 0.000333 | ms/batch 319.04 | loss  4.56 | ppl    95.226
| epoch  42 step    17900 |     24 batches | lr 0.000333 | ms/batch 313.05 | loss  4.59 | ppl    98.394
| epoch  42 step    17950 |     74 batches | lr 0.000332 | ms/batch 320.21 | loss  4.52 | ppl    91.592
| epoch  42 step    18000 |    124 batches | lr 0.000332 | ms/batch 320.02 | loss  4.57 | ppl    96.243
----------------------------------------------------------------------------------------------------
| Eval  45 at step    18000 | time: 132.72s | valid loss  4.43 | valid ppl    84.156
----------------------------------------------------------------------------------------------------
| epoch  42 step    18050 |    174 batches | lr 0.000332 | ms/batch 420.73 | loss  4.56 | ppl    95.189
| epoch  42 step    18100 |    224 batches | lr 0.000332 | ms/batch 320.65 | loss  4.61 | ppl   100.108
| epoch  42 step    18150 |    274 batches | lr 0.000332 | ms/batch 321.08 | loss  4.58 | ppl    97.568
| epoch  42 step    18200 |    324 batches | lr 0.000332 | ms/batch 320.47 | loss  4.54 | ppl    93.749
| epoch  42 step    18250 |    374 batches | lr 0.000332 | ms/batch 320.88 | loss  4.56 | ppl    95.248
| epoch  42 step    18300 |    424 batches | lr 0.000332 | ms/batch 320.30 | loss  4.56 | ppl    95.360
| epoch  43 step    18350 |     38 batches | lr 0.000332 | ms/batch 314.54 | loss  4.55 | ppl    94.212
| epoch  43 step    18400 |     88 batches | lr 0.000332 | ms/batch 320.44 | loss  4.52 | ppl    91.757
----------------------------------------------------------------------------------------------------
| Eval  46 at step    18400 | time: 132.96s | valid loss  4.43 | valid ppl    83.929
----------------------------------------------------------------------------------------------------
| epoch  43 step    18450 |    138 batches | lr 0.000332 | ms/batch 419.03 | loss  4.56 | ppl    95.233
| epoch  43 step    18500 |    188 batches | lr 0.000331 | ms/batch 320.50 | loss  4.55 | ppl    94.322
| epoch  43 step    18550 |    238 batches | lr 0.000331 | ms/batch 320.40 | loss  4.57 | ppl    96.220
| epoch  43 step    18600 |    288 batches | lr 0.000331 | ms/batch 320.28 | loss  4.60 | ppl    99.251
| epoch  43 step    18650 |    338 batches | lr 0.000331 | ms/batch 319.98 | loss  4.49 | ppl    89.435
| epoch  43 step    18700 |    388 batches | lr 0.000331 | ms/batch 320.23 | loss  4.55 | ppl    94.492
| epoch  44 step    18750 |      2 batches | lr 0.000331 | ms/batch 314.83 | loss  4.56 | ppl    95.800
| epoch  44 step    18800 |     52 batches | lr 0.000331 | ms/batch 320.98 | loss  4.49 | ppl    89.254
----------------------------------------------------------------------------------------------------
| Eval  47 at step    18800 | time: 132.83s | valid loss  4.42 | valid ppl    83.165
----------------------------------------------------------------------------------------------------
| epoch  44 step    18850 |    102 batches | lr 0.000331 | ms/batch 450.48 | loss  4.50 | ppl    89.940
| epoch  44 step    18900 |    152 batches | lr 0.000331 | ms/batch 320.21 | loss  4.54 | ppl    93.326
| epoch  44 step    18950 |    202 batches | lr 0.000331 | ms/batch 319.86 | loss  4.55 | ppl    94.929
| epoch  44 step    19000 |    252 batches | lr 0.00033 | ms/batch 320.94 | loss  4.55 | ppl    94.610
| epoch  44 step    19050 |    302 batches | lr 0.00033 | ms/batch 320.20 | loss  4.57 | ppl    96.756
| epoch  44 step    19100 |    352 batches | lr 0.00033 | ms/batch 321.27 | loss  4.46 | ppl    86.785
| epoch  44 step    19150 |    402 batches | lr 0.00033 | ms/batch 320.88 | loss  4.53 | ppl    92.599
| epoch  45 step    19200 |     16 batches | lr 0.00033 | ms/batch 315.79 | loss  4.57 | ppl    96.589
----------------------------------------------------------------------------------------------------
| Eval  48 at step    19200 | time: 132.95s | valid loss  4.42 | valid ppl    83.372
----------------------------------------------------------------------------------------------------
| epoch  45 step    19250 |     66 batches | lr 0.00033 | ms/batch 420.41 | loss  4.50 | ppl    90.109
| epoch  45 step    19300 |    116 batches | lr 0.00033 | ms/batch 320.10 | loss  4.50 | ppl    90.172
| epoch  45 step    19350 |    166 batches | lr 0.00033 | ms/batch 320.28 | loss  4.52 | ppl    91.435
| epoch  45 step    19400 |    216 batches | lr 0.00033 | ms/batch 319.92 | loss  4.53 | ppl    92.969
| epoch  45 step    19450 |    266 batches | lr 0.00033 | ms/batch 319.18 | loss  4.56 | ppl    96.018
| epoch  45 step    19500 |    316 batches | lr 0.000329 | ms/batch 321.11 | loss  4.56 | ppl    95.375
| epoch  45 step    19550 |    366 batches | lr 0.000329 | ms/batch 322.20 | loss  4.47 | ppl    87.473
| epoch  45 step    19600 |    416 batches | lr 0.000329 | ms/batch 320.23 | loss  4.53 | ppl    92.831
----------------------------------------------------------------------------------------------------
| Eval  49 at step    19600 | time: 133.18s | valid loss  4.43 | valid ppl    83.603
----------------------------------------------------------------------------------------------------
| epoch  46 step    19650 |     30 batches | lr 0.000329 | ms/batch 414.38 | loss  4.55 | ppl    94.981
| epoch  46 step    19700 |     80 batches | lr 0.000329 | ms/batch 319.71 | loss  4.47 | ppl    87.057
| epoch  46 step    19750 |    130 batches | lr 0.000329 | ms/batch 320.66 | loss  4.53 | ppl    92.925
| epoch  46 step    19800 |    180 batches | lr 0.000329 | ms/batch 318.57 | loss  4.52 | ppl    91.785
| epoch  46 step    19850 |    230 batches | lr 0.000329 | ms/batch 319.53 | loss  4.55 | ppl    95.107
| epoch  46 step    19900 |    280 batches | lr 0.000329 | ms/batch 319.78 | loss  4.56 | ppl    95.226
| epoch  46 step    19950 |    330 batches | lr 0.000328 | ms/batch 319.42 | loss  4.47 | ppl    87.486
| epoch  46 step    20000 |    380 batches | lr 0.000328 | ms/batch 318.91 | loss  4.50 | ppl    89.849
----------------------------------------------------------------------------------------------------
| Eval  50 at step    20000 | time: 132.51s | valid loss  4.40 | valid ppl    81.533
----------------------------------------------------------------------------------------------------
| epoch  46 step    20050 |    430 batches | lr 0.000328 | ms/batch 452.41 | loss  4.55 | ppl    94.884
| epoch  47 step    20100 |     44 batches | lr 0.000328 | ms/batch 314.07 | loss  4.53 | ppl    92.788
| epoch  47 step    20150 |     94 batches | lr 0.000328 | ms/batch 321.02 | loss  4.47 | ppl    87.091
| epoch  47 step    20200 |    144 batches | lr 0.000328 | ms/batch 320.05 | loss  4.52 | ppl    92.188
| epoch  47 step    20250 |    194 batches | lr 0.000328 | ms/batch 320.85 | loss  4.52 | ppl    91.678
| epoch  47 step    20300 |    244 batches | lr 0.000328 | ms/batch 320.20 | loss  4.53 | ppl    92.831
| epoch  47 step    20350 |    294 batches | lr 0.000328 | ms/batch 320.33 | loss  4.56 | ppl    95.263
| epoch  47 step    20400 |    344 batches | lr 0.000327 | ms/batch 321.13 | loss  4.42 | ppl    82.992
----------------------------------------------------------------------------------------------------
| Eval  51 at step    20400 | time: 132.93s | valid loss  4.41 | valid ppl    81.964
----------------------------------------------------------------------------------------------------
| epoch  47 step    20450 |    394 batches | lr 0.000327 | ms/batch 420.69 | loss  4.51 | ppl    90.950
| epoch  48 step    20500 |      8 batches | lr 0.000327 | ms/batch 313.99 | loss  4.53 | ppl    92.860
| epoch  48 step    20550 |     58 batches | lr 0.000327 | ms/batch 320.92 | loss  4.47 | ppl    87.534
| epoch  48 step    20600 |    108 batches | lr 0.000327 | ms/batch 320.84 | loss  4.48 | ppl    87.952
| epoch  48 step    20650 |    158 batches | lr 0.000327 | ms/batch 319.99 | loss  4.50 | ppl    89.624
| epoch  48 step    20700 |    208 batches | lr 0.000327 | ms/batch 319.82 | loss  4.51 | ppl    90.759
| epoch  48 step    20750 |    258 batches | lr 0.000327 | ms/batch 318.87 | loss  4.53 | ppl    92.635
| epoch  48 step    20800 |    308 batches | lr 0.000327 | ms/batch 320.91 | loss  4.54 | ppl    93.545
----------------------------------------------------------------------------------------------------
| Eval  52 at step    20800 | time: 132.77s | valid loss  4.42 | valid ppl    82.982
----------------------------------------------------------------------------------------------------
| epoch  48 step    20850 |    358 batches | lr 0.000327 | ms/batch 419.02 | loss  4.44 | ppl    84.782
| epoch  48 step    20900 |    408 batches | lr 0.000326 | ms/batch 319.34 | loss  4.50 | ppl    89.891
| epoch  49 step    20950 |     22 batches | lr 0.000326 | ms/batch 315.18 | loss  4.53 | ppl    92.469
| epoch  49 step    21000 |     72 batches | lr 0.000326 | ms/batch 321.29 | loss  4.45 | ppl    85.527
| epoch  49 step    21050 |    122 batches | lr 0.000326 | ms/batch 320.91 | loss  4.50 | ppl    89.933
| epoch  49 step    21100 |    172 batches | lr 0.000326 | ms/batch 322.04 | loss  4.48 | ppl    88.317
| epoch  49 step    21150 |    222 batches | lr 0.000326 | ms/batch 318.90 | loss  4.53 | ppl    92.346
| epoch  49 step    21200 |    272 batches | lr 0.000326 | ms/batch 319.39 | loss  4.51 | ppl    90.638
----------------------------------------------------------------------------------------------------
| Eval  53 at step    21200 | time: 132.81s | valid loss  4.40 | valid ppl    81.725
----------------------------------------------------------------------------------------------------
| epoch  49 step    21250 |    322 batches | lr 0.000326 | ms/batch 420.55 | loss  4.48 | ppl    87.925
| epoch  49 step    21300 |    372 batches | lr 0.000326 | ms/batch 321.04 | loss  4.46 | ppl    86.704
| epoch  49 step    21350 |    422 batches | lr 0.000325 | ms/batch 328.78 | loss  4.50 | ppl    90.433
| epoch  50 step    21400 |     36 batches | lr 0.000325 | ms/batch 313.20 | loss  4.53 | ppl    92.635
| epoch  50 step    21450 |     86 batches | lr 0.000325 | ms/batch 320.01 | loss  4.46 | ppl    86.224
| epoch  50 step    21500 |    136 batches | lr 0.000325 | ms/batch 320.33 | loss  4.48 | ppl    88.290
| epoch  50 step    21550 |    186 batches | lr 0.000325 | ms/batch 319.95 | loss  4.50 | ppl    89.799
| epoch  50 step    21600 |    236 batches | lr 0.000325 | ms/batch 320.93 | loss  4.51 | ppl    91.263
----------------------------------------------------------------------------------------------------
| Eval  54 at step    21600 | time: 133.46s | valid loss  4.39 | valid ppl    80.662
----------------------------------------------------------------------------------------------------
| epoch  50 step    21650 |    286 batches | lr 0.000325 | ms/batch 475.32 | loss  4.53 | ppl    93.020
| epoch  50 step    21700 |    336 batches | lr 0.000325 | ms/batch 319.58 | loss  4.41 | ppl    82.083
| epoch  50 step    21750 |    386 batches | lr 0.000324 | ms/batch 319.84 | loss  4.49 | ppl    88.934
| epoch  50 step    21800 |    436 batches | lr 0.000324 | ms/batch 314.79 | loss  4.50 | ppl    89.624
| epoch  51 step    21850 |     50 batches | lr 0.000324 | ms/batch 318.22 | loss  4.47 | ppl    86.955
| epoch  51 step    21900 |    100 batches | lr 0.000324 | ms/batch 319.71 | loss  4.43 | ppl    84.273
| epoch  51 step    21950 |    150 batches | lr 0.000324 | ms/batch 319.39 | loss  4.47 | ppl    87.016
| epoch  51 step    22000 |    200 batches | lr 0.000324 | ms/batch 321.49 | loss  4.49 | ppl    88.705
----------------------------------------------------------------------------------------------------
| Eval  55 at step    22000 | time: 132.71s | valid loss  4.39 | valid ppl    80.370
----------------------------------------------------------------------------------------------------
| epoch  51 step    22050 |    250 batches | lr 0.000324 | ms/batch 453.89 | loss  4.50 | ppl    90.320
| epoch  51 step    22100 |    300 batches | lr 0.000324 | ms/batch 319.27 | loss  4.52 | ppl    91.606
| epoch  51 step    22150 |    350 batches | lr 0.000324 | ms/batch 320.14 | loss  4.40 | ppl    81.260
| epoch  51 step    22200 |    400 batches | lr 0.000323 | ms/batch 318.86 | loss  4.46 | ppl    86.359
| epoch  52 step    22250 |     14 batches | lr 0.000323 | ms/batch 314.00 | loss  4.49 | ppl    88.836
| epoch  52 step    22300 |     64 batches | lr 0.000323 | ms/batch 321.30 | loss  4.45 | ppl    85.687
| epoch  52 step    22350 |    114 batches | lr 0.000323 | ms/batch 320.38 | loss  4.46 | ppl    86.144
| epoch  52 step    22400 |    164 batches | lr 0.000323 | ms/batch 320.90 | loss  4.47 | ppl    87.569
----------------------------------------------------------------------------------------------------
| Eval  56 at step    22400 | time: 132.77s | valid loss  4.38 | valid ppl    79.762
----------------------------------------------------------------------------------------------------
| epoch  52 step    22450 |    214 batches | lr 0.000323 | ms/batch 452.73 | loss  4.49 | ppl    88.788
| epoch  52 step    22500 |    264 batches | lr 0.000323 | ms/batch 319.38 | loss  4.48 | ppl    87.877
| epoch  52 step    22550 |    314 batches | lr 0.000323 | ms/batch 319.22 | loss  4.48 | ppl    88.186
| epoch  52 step    22600 |    364 batches | lr 0.000323 | ms/batch 319.50 | loss  4.40 | ppl    81.527
| epoch  52 step    22650 |    414 batches | lr 0.000322 | ms/batch 319.60 | loss  4.46 | ppl    86.346
| epoch  53 step    22700 |     28 batches | lr 0.000322 | ms/batch 315.08 | loss  4.49 | ppl    88.698
| epoch  53 step    22750 |     78 batches | lr 0.000322 | ms/batch 321.90 | loss  4.43 | ppl    83.774
| epoch  53 step    22800 |    128 batches | lr 0.000322 | ms/batch 320.75 | loss  4.46 | ppl    86.157
----------------------------------------------------------------------------------------------------
| Eval  57 at step    22800 | time: 132.78s | valid loss  4.38 | valid ppl    79.941
----------------------------------------------------------------------------------------------------
| epoch  53 step    22850 |    178 batches | lr 0.000322 | ms/batch 421.70 | loss  4.45 | ppl    85.902
| epoch  53 step    22900 |    228 batches | lr 0.000322 | ms/batch 320.13 | loss  4.49 | ppl    88.934
| epoch  53 step    22950 |    278 batches | lr 0.000322 | ms/batch 321.26 | loss  4.52 | ppl    91.728
| epoch  53 step    23000 |    328 batches | lr 0.000322 | ms/batch 320.26 | loss  4.43 | ppl    83.741
| epoch  53 step    23050 |    378 batches | lr 0.000321 | ms/batch 320.28 | loss  4.43 | ppl    84.109
| epoch  53 step    23100 |    428 batches | lr 0.000321 | ms/batch 320.20 | loss  4.45 | ppl    85.908
| epoch  54 step    23150 |     42 batches | lr 0.000321 | ms/batch 312.79 | loss  4.46 | ppl    86.670
| epoch  54 step    23200 |     92 batches | lr 0.000321 | ms/batch 320.95 | loss  4.40 | ppl    81.763
----------------------------------------------------------------------------------------------------
| Eval  58 at step    23200 | time: 132.86s | valid loss  4.39 | valid ppl    80.427
----------------------------------------------------------------------------------------------------
| epoch  54 step    23250 |    142 batches | lr 0.000321 | ms/batch 419.71 | loss  4.44 | ppl    84.596
| epoch  54 step    23300 |    192 batches | lr 0.000321 | ms/batch 318.76 | loss  4.48 | ppl    88.097
| epoch  54 step    23350 |    242 batches | lr 0.000321 | ms/batch 319.59 | loss  4.47 | ppl    87.003
| epoch  54 step    23400 |    292 batches | lr 0.000321 | ms/batch 320.38 | loss  4.50 | ppl    90.264
| epoch  54 step    23450 |    342 batches | lr 0.00032 | ms/batch 318.96 | loss  4.35 | ppl    77.327
| epoch  54 step    23500 |    392 batches | lr 0.00032 | ms/batch 319.19 | loss  4.47 | ppl    87.411
| epoch  55 step    23550 |      6 batches | lr 0.00032 | ms/batch 314.22 | loss  4.48 | ppl    88.615
| epoch  55 step    23600 |     56 batches | lr 0.00032 | ms/batch 318.75 | loss  4.41 | ppl    82.572
----------------------------------------------------------------------------------------------------
| Eval  59 at step    23600 | time: 132.47s | valid loss  4.36 | valid ppl    78.290
----------------------------------------------------------------------------------------------------
| epoch  55 step    23650 |    106 batches | lr 0.00032 | ms/batch 467.08 | loss  4.41 | ppl    82.224
| epoch  55 step    23700 |    156 batches | lr 0.00032 | ms/batch 319.64 | loss  4.42 | ppl    83.487
| epoch  55 step    23750 |    206 batches | lr 0.00032 | ms/batch 321.11 | loss  4.46 | ppl    86.569
| epoch  55 step    23800 |    256 batches | lr 0.00032 | ms/batch 320.29 | loss  4.46 | ppl    86.751
| epoch  55 step    23850 |    306 batches | lr 0.000319 | ms/batch 319.73 | loss  4.48 | ppl    88.476
| epoch  55 step    23900 |    356 batches | lr 0.000319 | ms/batch 320.00 | loss  4.41 | ppl    82.160
| epoch  55 step    23950 |    406 batches | lr 0.000319 | ms/batch 320.24 | loss  4.47 | ppl    86.941
| epoch  56 step    24000 |     20 batches | lr 0.000319 | ms/batch 313.47 | loss  4.49 | ppl    88.805
----------------------------------------------------------------------------------------------------
| Eval  60 at step    24000 | time: 133.44s | valid loss  4.37 | valid ppl    78.984
----------------------------------------------------------------------------------------------------
| epoch  56 step    24050 |     70 batches | lr 0.000319 | ms/batch 419.31 | loss  4.39 | ppl    80.697
| epoch  56 step    24100 |    120 batches | lr 0.000319 | ms/batch 319.24 | loss  4.43 | ppl    83.774
| epoch  56 step    24150 |    170 batches | lr 0.000319 | ms/batch 319.58 | loss  4.44 | ppl    84.954
| epoch  56 step    24200 |    220 batches | lr 0.000319 | ms/batch 319.85 | loss  4.48 | ppl    88.400
| epoch  56 step    24250 |    270 batches | lr 0.000318 | ms/batch 317.91 | loss  4.44 | ppl    85.020
| epoch  56 step    24300 |    320 batches | lr 0.000318 | ms/batch 318.75 | loss  4.43 | ppl    83.853
| epoch  56 step    24350 |    370 batches | lr 0.000318 | ms/batch 318.84 | loss  4.42 | ppl    83.239
| epoch  56 step    24400 |    420 batches | lr 0.000318 | ms/batch 318.78 | loss  4.44 | ppl    84.530
----------------------------------------------------------------------------------------------------
| Eval  61 at step    24400 | time: 132.60s | valid loss  4.37 | valid ppl    79.395
----------------------------------------------------------------------------------------------------
| epoch  57 step    24450 |     34 batches | lr 0.000318 | ms/batch 412.13 | loss  4.47 | ppl    86.941
| epoch  57 step    24500 |     84 batches | lr 0.000318 | ms/batch 318.09 | loss  4.40 | ppl    81.699
| epoch  57 step    24550 |    134 batches | lr 0.000318 | ms/batch 319.71 | loss  4.44 | ppl    84.629
| epoch  57 step    24600 |    184 batches | lr 0.000318 | ms/batch 318.84 | loss  4.42 | ppl    83.461
| epoch  57 step    24650 |    234 batches | lr 0.000317 | ms/batch 317.74 | loss  4.44 | ppl    84.974
| epoch  57 step    24700 |    284 batches | lr 0.000317 | ms/batch 318.61 | loss  4.46 | ppl    86.137
| epoch  57 step    24750 |    334 batches | lr 0.000317 | ms/batch 319.96 | loss  4.37 | ppl    78.926
| epoch  57 step    24800 |    384 batches | lr 0.000317 | ms/batch 320.08 | loss  4.43 | ppl    83.813
----------------------------------------------------------------------------------------------------
| Eval  62 at step    24800 | time: 132.28s | valid loss  4.37 | valid ppl    79.107
----------------------------------------------------------------------------------------------------
| epoch  57 step    24850 |    434 batches | lr 0.000317 | ms/batch 421.36 | loss  4.43 | ppl    84.142
| epoch  58 step    24900 |     48 batches | lr 0.000317 | ms/batch 317.17 | loss  4.40 | ppl    81.505
| epoch  58 step    24950 |     98 batches | lr 0.000317 | ms/batch 326.98 | loss  4.39 | ppl    80.414
| epoch  58 step    25000 |    148 batches | lr 0.000317 | ms/batch 326.63 | loss  4.41 | ppl    82.553
| epoch  58 step    25050 |    198 batches | lr 0.000316 | ms/batch 325.62 | loss  4.44 | ppl    84.696
| epoch  58 step    25100 |    248 batches | lr 0.000316 | ms/batch 320.99 | loss  4.43 | ppl    83.578
| epoch  58 step    25150 |    298 batches | lr 0.000316 | ms/batch 318.80 | loss  4.49 | ppl    89.191
| epoch  58 step    25200 |    348 batches | lr 0.000316 | ms/batch 318.47 | loss  4.35 | ppl    77.672
----------------------------------------------------------------------------------------------------
| Eval  63 at step    25200 | time: 133.75s | valid loss  4.35 | valid ppl    77.836
----------------------------------------------------------------------------------------------------
| epoch  58 step    25250 |    398 batches | lr 0.000316 | ms/batch 455.04 | loss  4.41 | ppl    82.656
| epoch  59 step    25300 |     12 batches | lr 0.000316 | ms/batch 314.82 | loss  4.46 | ppl    86.582
| epoch  59 step    25350 |     62 batches | lr 0.000316 | ms/batch 321.08 | loss  4.37 | ppl    79.328
| epoch  59 step    25400 |    112 batches | lr 0.000316 | ms/batch 320.65 | loss  4.39 | ppl    80.754
| epoch  59 step    25450 |    162 batches | lr 0.000315 | ms/batch 319.38 | loss  4.43 | ppl    84.260
| epoch  59 step    25500 |    212 batches | lr 0.000315 | ms/batch 330.20 | loss  4.42 | ppl    82.818
| epoch  59 step    25550 |    262 batches | lr 0.000315 | ms/batch 335.97 | loss  4.44 | ppl    84.464
| epoch  59 step    25600 |    312 batches | lr 0.000315 | ms/batch 335.32 | loss  4.45 | ppl    85.346
----------------------------------------------------------------------------------------------------
| Eval  64 at step    25600 | time: 135.13s | valid loss  4.37 | valid ppl    79.015
----------------------------------------------------------------------------------------------------
| epoch  59 step    25650 |    362 batches | lr 0.000315 | ms/batch 438.33 | loss  4.37 | ppl    79.180
| epoch  59 step    25700 |    412 batches | lr 0.000315 | ms/batch 319.26 | loss  4.41 | ppl    82.566
| epoch  60 step    25750 |     26 batches | lr 0.000315 | ms/batch 313.98 | loss  4.46 | ppl    86.758
| epoch  60 step    25800 |     76 batches | lr 0.000314 | ms/batch 320.68 | loss  4.39 | ppl    80.842
| epoch  60 step    25850 |    126 batches | lr 0.000314 | ms/batch 320.85 | loss  4.41 | ppl    81.987
| epoch  60 step    25900 |    176 batches | lr 0.000314 | ms/batch 320.56 | loss  4.41 | ppl    82.559
| epoch  60 step    25950 |    226 batches | lr 0.000314 | ms/batch 321.79 | loss  4.42 | ppl    83.435
| epoch  60 step    26000 |    276 batches | lr 0.000314 | ms/batch 320.61 | loss  4.43 | ppl    83.715
----------------------------------------------------------------------------------------------------
| Eval  65 at step    26000 | time: 133.58s | valid loss  4.35 | valid ppl    77.660
----------------------------------------------------------------------------------------------------
| epoch  60 step    26050 |    326 batches | lr 0.000314 | ms/batch 468.64 | loss  4.39 | ppl    81.007
| epoch  60 step    26100 |    376 batches | lr 0.000314 | ms/batch 318.53 | loss  4.38 | ppl    79.844
| epoch  60 step    26150 |    426 batches | lr 0.000314 | ms/batch 319.44 | loss  4.42 | ppl    83.291
| epoch  61 step    26200 |     40 batches | lr 0.000313 | ms/batch 313.44 | loss  4.41 | ppl    82.327
| epoch  61 step    26250 |     90 batches | lr 0.000313 | ms/batch 318.68 | loss  4.37 | ppl    78.896
| epoch  61 step    26300 |    140 batches | lr 0.000313 | ms/batch 319.18 | loss  4.41 | ppl    82.475
| epoch  61 step    26350 |    190 batches | lr 0.000313 | ms/batch 318.60 | loss  4.40 | ppl    81.744
| epoch  61 step    26400 |    240 batches | lr 0.000313 | ms/batch 318.99 | loss  4.41 | ppl    81.936
----------------------------------------------------------------------------------------------------
| Eval  66 at step    26400 | time: 132.37s | valid loss  4.35 | valid ppl    77.119
----------------------------------------------------------------------------------------------------
| epoch  61 step    26450 |    290 batches | lr 0.000313 | ms/batch 453.51 | loss  4.47 | ppl    87.616
| epoch  61 step    26500 |    340 batches | lr 0.000313 | ms/batch 319.10 | loss  4.30 | ppl    73.458
| epoch  61 step    26550 |    390 batches | lr 0.000312 | ms/batch 320.45 | loss  4.40 | ppl    81.527
| epoch  62 step    26600 |      4 batches | lr 0.000312 | ms/batch 314.24 | loss  4.44 | ppl    84.649
| epoch  62 step    26650 |     54 batches | lr 0.000312 | ms/batch 320.13 | loss  4.38 | ppl    79.726
| epoch  62 step    26700 |    104 batches | lr 0.000312 | ms/batch 319.84 | loss  4.35 | ppl    77.545
| epoch  62 step    26750 |    154 batches | lr 0.000312 | ms/batch 320.36 | loss  4.39 | ppl    80.975
| epoch  62 step    26800 |    204 batches | lr 0.000312 | ms/batch 318.63 | loss  4.45 | ppl    85.333
----------------------------------------------------------------------------------------------------
| Eval  67 at step    26800 | time: 132.64s | valid loss  4.35 | valid ppl    77.531
----------------------------------------------------------------------------------------------------
| epoch  62 step    26850 |    254 batches | lr 0.000312 | ms/batch 419.42 | loss  4.41 | ppl    82.424
| epoch  62 step    26900 |    304 batches | lr 0.000312 | ms/batch 320.28 | loss  4.45 | ppl    85.540
| epoch  62 step    26950 |    354 batches | lr 0.000311 | ms/batch 318.77 | loss  4.34 | ppl    76.564
| epoch  62 step    27000 |    404 batches | lr 0.000311 | ms/batch 319.83 | loss  4.38 | ppl    79.801
| epoch  63 step    27050 |     18 batches | lr 0.000311 | ms/batch 313.79 | loss  4.43 | ppl    83.679
| epoch  63 step    27100 |     68 batches | lr 0.000311 | ms/batch 320.60 | loss  4.35 | ppl    77.855
| epoch  63 step    27150 |    118 batches | lr 0.000311 | ms/batch 319.92 | loss  4.39 | ppl    80.760
| epoch  63 step    27200 |    168 batches | lr 0.000311 | ms/batch 320.13 | loss  4.38 | ppl    80.194
----------------------------------------------------------------------------------------------------
| Eval  68 at step    27200 | time: 132.65s | valid loss  4.35 | valid ppl    77.106
----------------------------------------------------------------------------------------------------
| epoch  63 step    27250 |    218 batches | lr 0.000311 | ms/batch 467.24 | loss  4.40 | ppl    81.731
| epoch  63 step    27300 |    268 batches | lr 0.00031 | ms/batch 320.17 | loss  4.41 | ppl    82.051
| epoch  63 step    27350 |    318 batches | lr 0.00031 | ms/batch 320.78 | loss  4.42 | ppl    82.902
| epoch  63 step    27400 |    368 batches | lr 0.00031 | ms/batch 321.47 | loss  4.36 | ppl    78.165
| epoch  63 step    27450 |    418 batches | lr 0.00031 | ms/batch 320.52 | loss  4.38 | ppl    79.944
| epoch  64 step    27500 |     32 batches | lr 0.00031 | ms/batch 313.35 | loss  4.42 | ppl    82.737
| epoch  64 step    27550 |     82 batches | lr 0.00031 | ms/batch 319.05 | loss  4.32 | ppl    75.224
| epoch  64 step    27600 |    132 batches | lr 0.00031 | ms/batch 319.59 | loss  4.38 | ppl    79.465
----------------------------------------------------------------------------------------------------
| Eval  69 at step    27600 | time: 132.77s | valid loss  4.34 | valid ppl    76.498
----------------------------------------------------------------------------------------------------
| epoch  64 step    27650 |    182 batches | lr 0.000309 | ms/batch 465.32 | loss  4.38 | ppl    79.745
| epoch  64 step    27700 |    232 batches | lr 0.000309 | ms/batch 320.87 | loss  4.41 | ppl    82.553
| epoch  64 step    27750 |    282 batches | lr 0.000309 | ms/batch 319.08 | loss  4.43 | ppl    84.056
| epoch  64 step    27800 |    332 batches | lr 0.000309 | ms/batch 332.36 | loss  4.35 | ppl    77.836
| epoch  64 step    27850 |    382 batches | lr 0.000309 | ms/batch 335.40 | loss  4.38 | ppl    79.975
| epoch  64 step    27900 |    432 batches | lr 0.000309 | ms/batch 322.05 | loss  4.41 | ppl    82.347
| epoch  65 step    27950 |     46 batches | lr 0.000309 | ms/batch 315.54 | loss  4.38 | ppl    79.496
| epoch  65 step    28000 |     96 batches | lr 0.000308 | ms/batch 319.10 | loss  4.34 | ppl    76.456
----------------------------------------------------------------------------------------------------
| Eval  70 at step    28000 | time: 134.17s | valid loss  4.34 | valid ppl    76.894
----------------------------------------------------------------------------------------------------
| epoch  65 step    28050 |    146 batches | lr 0.000308 | ms/batch 420.39 | loss  4.39 | ppl    80.678
| epoch  65 step    28100 |    196 batches | lr 0.000308 | ms/batch 320.06 | loss  4.41 | ppl    82.058
| epoch  65 step    28150 |    246 batches | lr 0.000308 | ms/batch 319.67 | loss  4.41 | ppl    82.675
| epoch  65 step    28200 |    296 batches | lr 0.000308 | ms/batch 319.83 | loss  4.42 | ppl    83.422
| epoch  65 step    28250 |    346 batches | lr 0.000308 | ms/batch 319.49 | loss  4.29 | ppl    73.018
| epoch  65 step    28300 |    396 batches | lr 0.000308 | ms/batch 320.56 | loss  4.37 | ppl    79.427
| epoch  66 step    28350 |     10 batches | lr 0.000307 | ms/batch 314.97 | loss  4.41 | ppl    82.093
| epoch  66 step    28400 |     60 batches | lr 0.000307 | ms/batch 320.35 | loss  4.35 | ppl    77.164
----------------------------------------------------------------------------------------------------
| Eval  71 at step    28400 | time: 132.78s | valid loss  4.33 | valid ppl    76.038
----------------------------------------------------------------------------------------------------
| epoch  66 step    28450 |    110 batches | lr 0.000307 | ms/batch 453.97 | loss  4.34 | ppl    76.852
| epoch  66 step    28500 |    160 batches | lr 0.000307 | ms/batch 321.41 | loss  4.38 | ppl    79.944
| epoch  66 step    28550 |    210 batches | lr 0.000307 | ms/batch 320.75 | loss  4.38 | ppl    79.832
| epoch  66 step    28600 |    260 batches | lr 0.000307 | ms/batch 320.95 | loss  4.40 | ppl    81.419
| epoch  66 step    28650 |    310 batches | lr 0.000307 | ms/batch 320.79 | loss  4.39 | ppl    80.988
| epoch  66 step    28700 |    360 batches | lr 0.000306 | ms/batch 320.27 | loss  4.31 | ppl    74.464
| epoch  66 step    28750 |    410 batches | lr 0.000306 | ms/batch 320.62 | loss  4.38 | ppl    79.676
| epoch  67 step    28800 |     24 batches | lr 0.000306 | ms/batch 314.64 | loss  4.41 | ppl    82.359
----------------------------------------------------------------------------------------------------
| Eval  72 at step    28800 | time: 133.06s | valid loss  4.36 | valid ppl    77.924
----------------------------------------------------------------------------------------------------
| epoch  67 step    28850 |     74 batches | lr 0.000306 | ms/batch 421.68 | loss  4.34 | ppl    76.408
| epoch  67 step    28900 |    124 batches | lr 0.000306 | ms/batch 320.79 | loss  4.38 | ppl    79.932
| epoch  67 step    28950 |    174 batches | lr 0.000306 | ms/batch 320.38 | loss  4.37 | ppl    78.963
| epoch  67 step    29000 |    224 batches | lr 0.000306 | ms/batch 321.61 | loss  4.40 | ppl    81.470
| epoch  67 step    29050 |    274 batches | lr 0.000305 | ms/batch 320.82 | loss  4.39 | ppl    80.464
| epoch  67 step    29100 |    324 batches | lr 0.000305 | ms/batch 320.75 | loss  4.34 | ppl    76.990
| epoch  67 step    29150 |    374 batches | lr 0.000305 | ms/batch 321.90 | loss  4.36 | ppl    78.208
| epoch  67 step    29200 |    424 batches | lr 0.000305 | ms/batch 322.27 | loss  4.37 | ppl    79.241
----------------------------------------------------------------------------------------------------
| Eval  73 at step    29200 | time: 133.52s | valid loss  4.34 | valid ppl    76.359
----------------------------------------------------------------------------------------------------
| epoch  68 step    29250 |     38 batches | lr 0.000305 | ms/batch 416.28 | loss  4.37 | ppl    79.424
| epoch  68 step    29300 |     88 batches | lr 0.000305 | ms/batch 322.02 | loss  4.33 | ppl    76.230
| epoch  68 step    29350 |    138 batches | lr 0.000305 | ms/batch 321.08 | loss  4.35 | ppl    77.581
| epoch  68 step    29400 |    188 batches | lr 0.000304 | ms/batch 321.31 | loss  4.37 | ppl    79.396
| epoch  68 step    29450 |    238 batches | lr 0.000304 | ms/batch 320.95 | loss  4.37 | ppl    78.705
| epoch  68 step    29500 |    288 batches | lr 0.000304 | ms/batch 321.01 | loss  4.42 | ppl    82.824
| epoch  68 step    29550 |    338 batches | lr 0.000304 | ms/batch 320.92 | loss  4.28 | ppl    72.303
| epoch  68 step    29600 |    388 batches | lr 0.000304 | ms/batch 327.82 | loss  4.36 | ppl    77.982
----------------------------------------------------------------------------------------------------
| Eval  74 at step    29600 | time: 133.59s | valid loss  4.34 | valid ppl    76.441
----------------------------------------------------------------------------------------------------
| epoch  69 step    29650 |      2 batches | lr 0.000304 | ms/batch 415.63 | loss  4.40 | ppl    81.311
| epoch  69 step    29700 |     52 batches | lr 0.000303 | ms/batch 321.09 | loss  4.32 | ppl    75.306
| epoch  69 step    29750 |    102 batches | lr 0.000303 | ms/batch 320.03 | loss  4.32 | ppl    75.177
| epoch  69 step    29800 |    152 batches | lr 0.000303 | ms/batch 321.85 | loss  4.38 | ppl    79.490
| epoch  69 step    29850 |    202 batches | lr 0.000303 | ms/batch 321.18 | loss  4.37 | ppl    78.846
| epoch  69 step    29900 |    252 batches | lr 0.000303 | ms/batch 320.73 | loss  4.37 | ppl    79.427
| epoch  69 step    29950 |    302 batches | lr 0.000303 | ms/batch 321.57 | loss  4.39 | ppl    80.969
| epoch  69 step    30000 |    352 batches | lr 0.000303 | ms/batch 322.24 | loss  4.27 | ppl    71.544
----------------------------------------------------------------------------------------------------
| Eval  75 at step    30000 | time: 133.22s | valid loss  4.32 | valid ppl    75.512
----------------------------------------------------------------------------------------------------
| epoch  69 step    30050 |    402 batches | lr 0.000302 | ms/batch 457.60 | loss  4.37 | ppl    79.365
| epoch  70 step    30100 |     16 batches | lr 0.000302 | ms/batch 315.92 | loss  4.38 | ppl    79.745
| epoch  70 step    30150 |     66 batches | lr 0.000302 | ms/batch 321.87 | loss  4.32 | ppl    74.831
| epoch  70 step    30200 |    116 batches | lr 0.000302 | ms/batch 322.32 | loss  4.35 | ppl    77.460
| epoch  70 step    30250 |    166 batches | lr 0.000302 | ms/batch 323.29 | loss  4.35 | ppl    77.612
| epoch  70 step    30300 |    216 batches | lr 0.000302 | ms/batch 319.34 | loss  4.40 | ppl    81.292
| epoch  70 step    30350 |    266 batches | lr 0.000302 | ms/batch 320.54 | loss  4.37 | ppl    79.291
| epoch  70 step    30400 |    316 batches | lr 0.000301 | ms/batch 320.64 | loss  4.34 | ppl    76.696
----------------------------------------------------------------------------------------------------
| Eval  76 at step    30400 | time: 133.29s | valid loss  4.34 | valid ppl    76.499
----------------------------------------------------------------------------------------------------
| epoch  70 step    30450 |    366 batches | lr 0.000301 | ms/batch 420.85 | loss  4.30 | ppl    73.729
| epoch  70 step    30500 |    416 batches | lr 0.000301 | ms/batch 321.15 | loss  4.35 | ppl    77.727
| epoch  71 step    30550 |     30 batches | lr 0.000301 | ms/batch 315.70 | loss  4.38 | ppl    79.561
| epoch  71 step    30600 |     80 batches | lr 0.000301 | ms/batch 322.34 | loss  4.30 | ppl    73.688
| epoch  71 step    30650 |    130 batches | lr 0.000301 | ms/batch 321.33 | loss  4.34 | ppl    76.744
| epoch  71 step    30700 |    180 batches | lr 0.0003 | ms/batch 321.61 | loss  4.34 | ppl    76.797
| epoch  71 step    30750 |    230 batches | lr 0.0003 | ms/batch 320.63 | loss  4.38 | ppl    80.075
| epoch  71 step    30800 |    280 batches | lr 0.0003 | ms/batch 319.54 | loss  4.40 | ppl    81.610
----------------------------------------------------------------------------------------------------
| Eval  77 at step    30800 | time: 133.14s | valid loss  4.32 | valid ppl    75.483
----------------------------------------------------------------------------------------------------
| epoch  71 step    30850 |    330 batches | lr 0.0003 | ms/batch 450.68 | loss  4.28 | ppl    72.308
| epoch  71 step    30900 |    380 batches | lr 0.0003 | ms/batch 320.05 | loss  4.34 | ppl    76.690
| epoch  71 step    30950 |    430 batches | lr 0.0003 | ms/batch 319.29 | loss  4.36 | ppl    78.318
| epoch  72 step    31000 |     44 batches | lr 0.0003 | ms/batch 313.55 | loss  4.37 | ppl    79.136
| epoch  72 step    31050 |     94 batches | lr 0.000299 | ms/batch 319.32 | loss  4.30 | ppl    73.487
| epoch  72 step    31100 |    144 batches | lr 0.000299 | ms/batch 319.57 | loss  4.37 | ppl    79.093
| epoch  72 step    31150 |    194 batches | lr 0.000299 | ms/batch 319.79 | loss  4.35 | ppl    77.261
| epoch  72 step    31200 |    244 batches | lr 0.000299 | ms/batch 320.09 | loss  4.39 | ppl    80.489
----------------------------------------------------------------------------------------------------
| Eval  78 at step    31200 | time: 132.55s | valid loss  4.33 | valid ppl    75.684
----------------------------------------------------------------------------------------------------
| epoch  72 step    31250 |    294 batches | lr 0.000299 | ms/batch 421.07 | loss  4.40 | ppl    81.502
| epoch  72 step    31300 |    344 batches | lr 0.000299 | ms/batch 321.01 | loss  4.26 | ppl    70.926
| epoch  72 step    31350 |    394 batches | lr 0.000298 | ms/batch 321.49 | loss  4.34 | ppl    77.050
| epoch  73 step    31400 |      8 batches | lr 0.000298 | ms/batch 313.80 | loss  4.38 | ppl    79.754
| epoch  73 step    31450 |     58 batches | lr 0.000298 | ms/batch 320.52 | loss  4.28 | ppl    72.447
| epoch  73 step    31500 |    108 batches | lr 0.000298 | ms/batch 320.40 | loss  4.31 | ppl    74.714
| epoch  73 step    31550 |    158 batches | lr 0.000298 | ms/batch 323.57 | loss  4.34 | ppl    76.672
| epoch  73 step    31600 |    208 batches | lr 0.000298 | ms/batch 330.34 | loss  4.34 | ppl    76.522
----------------------------------------------------------------------------------------------------
| Eval  79 at step    31600 | time: 133.62s | valid loss  4.32 | valid ppl    75.537
----------------------------------------------------------------------------------------------------
| epoch  73 step    31650 |    258 batches | lr 0.000297 | ms/batch 420.38 | loss  4.35 | ppl    77.855
| epoch  73 step    31700 |    308 batches | lr 0.000297 | ms/batch 319.71 | loss  4.35 | ppl    77.818
| epoch  73 step    31750 |    358 batches | lr 0.000297 | ms/batch 319.97 | loss  4.28 | ppl    72.342
| epoch  73 step    31800 |    408 batches | lr 0.000297 | ms/batch 319.96 | loss  4.35 | ppl    77.763
| epoch  74 step    31850 |     22 batches | lr 0.000297 | ms/batch 313.62 | loss  4.38 | ppl    79.751
| epoch  74 step    31900 |     72 batches | lr 0.000297 | ms/batch 319.53 | loss  4.29 | ppl    72.972
| epoch  74 step    31950 |    122 batches | lr 0.000297 | ms/batch 319.94 | loss  4.32 | ppl    75.083
| epoch  74 step    32000 |    172 batches | lr 0.000296 | ms/batch 320.32 | loss  4.33 | ppl    75.891
----------------------------------------------------------------------------------------------------
| Eval  80 at step    32000 | time: 132.67s | valid loss  4.32 | valid ppl    75.030
----------------------------------------------------------------------------------------------------
| epoch  74 step    32050 |    222 batches | lr 0.000296 | ms/batch 454.39 | loss  4.36 | ppl    77.995
| epoch  74 step    32100 |    272 batches | lr 0.000296 | ms/batch 320.31 | loss  4.37 | ppl    79.285
| epoch  74 step    32150 |    322 batches | lr 0.000296 | ms/batch 320.59 | loss  4.33 | ppl    75.595
| epoch  74 step    32200 |    372 batches | lr 0.000296 | ms/batch 319.52 | loss  4.29 | ppl    72.941
| epoch  74 step    32250 |    422 batches | lr 0.000296 | ms/batch 319.47 | loss  4.33 | ppl    75.566
| epoch  75 step    32300 |     36 batches | lr 0.000295 | ms/batch 315.42 | loss  4.35 | ppl    77.503
| epoch  75 step    32350 |     86 batches | lr 0.000295 | ms/batch 319.72 | loss  4.29 | ppl    73.218
| epoch  75 step    32400 |    136 batches | lr 0.000295 | ms/batch 320.23 | loss  4.32 | ppl    75.312
----------------------------------------------------------------------------------------------------
| Eval  81 at step    32400 | time: 132.79s | valid loss  4.32 | valid ppl    75.108
----------------------------------------------------------------------------------------------------
| epoch  75 step    32450 |    186 batches | lr 0.000295 | ms/batch 419.64 | loss  4.34 | ppl    76.373
| epoch  75 step    32500 |    236 batches | lr 0.000295 | ms/batch 319.76 | loss  4.35 | ppl    77.104
| epoch  75 step    32550 |    286 batches | lr 0.000295 | ms/batch 319.58 | loss  4.38 | ppl    79.477
| epoch  75 step    32600 |    336 batches | lr 0.000294 | ms/batch 320.19 | loss  4.26 | ppl    70.589
| epoch  75 step    32650 |    386 batches | lr 0.000294 | ms/batch 319.55 | loss  4.32 | ppl    75.077
| epoch  75 step    32700 |    436 batches | lr 0.000294 | ms/batch 315.73 | loss  4.36 | ppl    78.233
| epoch  76 step    32750 |     50 batches | lr 0.000294 | ms/batch 317.98 | loss  4.29 | ppl    72.622
| epoch  76 step    32800 |    100 batches | lr 0.000294 | ms/batch 319.69 | loss  4.28 | ppl    72.190
----------------------------------------------------------------------------------------------------
| Eval  82 at step    32800 | time: 132.58s | valid loss  4.31 | valid ppl    74.544
----------------------------------------------------------------------------------------------------
| epoch  76 step    32850 |    150 batches | lr 0.000294 | ms/batch 451.54 | loss  4.31 | ppl    74.173
| epoch  76 step    32900 |    200 batches | lr 0.000294 | ms/batch 330.73 | loss  4.35 | ppl    77.261
| epoch  76 step    32950 |    250 batches | lr 0.000293 | ms/batch 321.79 | loss  4.33 | ppl    75.956
| epoch  76 step    33000 |    300 batches | lr 0.000293 | ms/batch 320.45 | loss  4.38 | ppl    79.751
| epoch  76 step    33050 |    350 batches | lr 0.000293 | ms/batch 319.75 | loss  4.24 | ppl    69.375
| epoch  76 step    33100 |    400 batches | lr 0.000293 | ms/batch 320.84 | loss  4.33 | ppl    75.932
| epoch  77 step    33150 |     14 batches | lr 0.000293 | ms/batch 314.20 | loss  4.39 | ppl    80.493
| epoch  77 step    33200 |     64 batches | lr 0.000293 | ms/batch 321.76 | loss  4.26 | ppl    71.093
----------------------------------------------------------------------------------------------------
| Eval  83 at step    33200 | time: 133.49s | valid loss  4.29 | valid ppl    73.166
----------------------------------------------------------------------------------------------------
| epoch  77 step    33250 |    114 batches | lr 0.000292 | ms/batch 457.19 | loss  4.32 | ppl    75.065
| epoch  77 step    33300 |    164 batches | lr 0.000292 | ms/batch 320.61 | loss  4.32 | ppl    74.819
| epoch  77 step    33350 |    214 batches | lr 0.000292 | ms/batch 321.19 | loss  4.34 | ppl    76.761
| epoch  77 step    33400 |    264 batches | lr 0.000292 | ms/batch 322.13 | loss  4.34 | ppl    76.570
| epoch  77 step    33450 |    314 batches | lr 0.000292 | ms/batch 320.85 | loss  4.34 | ppl    77.014
| epoch  77 step    33500 |    364 batches | lr 0.000292 | ms/batch 320.10 | loss  4.27 | ppl    71.483
| epoch  77 step    33550 |    414 batches | lr 0.000291 | ms/batch 320.06 | loss  4.31 | ppl    74.298
| epoch  78 step    33600 |     28 batches | lr 0.000291 | ms/batch 314.82 | loss  4.36 | ppl    77.912
----------------------------------------------------------------------------------------------------
| Eval  84 at step    33600 | time: 133.05s | valid loss  4.30 | valid ppl    73.988
----------------------------------------------------------------------------------------------------
| epoch  78 step    33650 |     78 batches | lr 0.000291 | ms/batch 420.64 | loss  4.26 | ppl    70.534
| epoch  78 step    33700 |    128 batches | lr 0.000291 | ms/batch 320.44 | loss  4.32 | ppl    75.483
| epoch  78 step    33750 |    178 batches | lr 0.000291 | ms/batch 321.49 | loss  4.30 | ppl    73.971
| epoch  78 step    33800 |    228 batches | lr 0.000291 | ms/batch 321.06 | loss  4.32 | ppl    74.942
| epoch  78 step    33850 |    278 batches | lr 0.00029 | ms/batch 320.55 | loss  4.35 | ppl    77.551
| epoch  78 step    33900 |    328 batches | lr 0.00029 | ms/batch 320.64 | loss  4.27 | ppl    71.768
| epoch  78 step    33950 |    378 batches | lr 0.00029 | ms/batch 319.67 | loss  4.27 | ppl    71.639
| epoch  78 step    34000 |    428 batches | lr 0.00029 | ms/batch 320.86 | loss  4.33 | ppl    75.666
----------------------------------------------------------------------------------------------------
| Eval  85 at step    34000 | time: 133.24s | valid loss  4.31 | valid ppl    74.699
----------------------------------------------------------------------------------------------------
| epoch  79 step    34050 |     42 batches | lr 0.00029 | ms/batch 414.31 | loss  4.32 | ppl    75.468
| epoch  79 step    34100 |     92 batches | lr 0.00029 | ms/batch 319.33 | loss  4.24 | ppl    69.283
| epoch  79 step    34150 |    142 batches | lr 0.000289 | ms/batch 319.86 | loss  4.30 | ppl    73.971
| epoch  79 step    34200 |    192 batches | lr 0.000289 | ms/batch 321.05 | loss  4.33 | ppl    75.690
| epoch  79 step    34250 |    242 batches | lr 0.000289 | ms/batch 320.50 | loss  4.33 | ppl    76.170
| epoch  79 step    34300 |    292 batches | lr 0.000289 | ms/batch 320.37 | loss  4.37 | ppl    78.859
| epoch  79 step    34350 |    342 batches | lr 0.000289 | ms/batch 319.71 | loss  4.24 | ppl    69.121
| epoch  79 step    34400 |    392 batches | lr 0.000289 | ms/batch 322.50 | loss  4.33 | ppl    75.713
----------------------------------------------------------------------------------------------------
| Eval  86 at step    34400 | time: 132.90s | valid loss  4.32 | valid ppl    74.880
----------------------------------------------------------------------------------------------------
| epoch  80 step    34450 |      6 batches | lr 0.000288 | ms/batch 414.77 | loss  4.34 | ppl    76.382
| epoch  80 step    34500 |     56 batches | lr 0.000288 | ms/batch 319.81 | loss  4.27 | ppl    71.388
| epoch  80 step    34550 |    106 batches | lr 0.000288 | ms/batch 320.48 | loss  4.28 | ppl    72.308
| epoch  80 step    34600 |    156 batches | lr 0.000288 | ms/batch 321.31 | loss  4.28 | ppl    72.359
| epoch  80 step    34650 |    206 batches | lr 0.000288 | ms/batch 320.19 | loss  4.32 | ppl    75.306
| epoch  80 step    34700 |    256 batches | lr 0.000288 | ms/batch 319.79 | loss  4.31 | ppl    74.185
| epoch  80 step    34750 |    306 batches | lr 0.000287 | ms/batch 321.04 | loss  4.34 | ppl    76.981
| epoch  80 step    34800 |    356 batches | lr 0.000287 | ms/batch 330.09 | loss  4.24 | ppl    69.105
----------------------------------------------------------------------------------------------------
| Eval  87 at step    34800 | time: 133.49s | valid loss  4.31 | valid ppl    74.202
----------------------------------------------------------------------------------------------------
| epoch  80 step    34850 |    406 batches | lr 0.000287 | ms/batch 422.62 | loss  4.32 | ppl    75.007
| epoch  81 step    34900 |     20 batches | lr 0.000287 | ms/batch 313.15 | loss  4.33 | ppl    76.245
| epoch  81 step    34950 |     70 batches | lr 0.000287 | ms/batch 319.45 | loss  4.25 | ppl    70.347
| epoch  81 step    35000 |    120 batches | lr 0.000287 | ms/batch 320.64 | loss  4.28 | ppl    72.523
| epoch  81 step    35050 |    170 batches | lr 0.000286 | ms/batch 320.64 | loss  4.30 | ppl    73.919
| epoch  81 step    35100 |    220 batches | lr 0.000286 | ms/batch 320.11 | loss  4.32 | ppl    74.843
| epoch  81 step    35150 |    270 batches | lr 0.000286 | ms/batch 319.49 | loss  4.33 | ppl    75.690
| epoch  81 step    35200 |    320 batches | lr 0.000286 | ms/batch 319.60 | loss  4.30 | ppl    73.717
----------------------------------------------------------------------------------------------------
| Eval  88 at step    35200 | time: 132.69s | valid loss  4.32 | valid ppl    75.118
----------------------------------------------------------------------------------------------------
| epoch  81 step    35250 |    370 batches | lr 0.000286 | ms/batch 421.30 | loss  4.26 | ppl    70.592
| epoch  81 step    35300 |    420 batches | lr 0.000286 | ms/batch 320.75 | loss  4.32 | ppl    75.289
| epoch  82 step    35350 |     34 batches | lr 0.000285 | ms/batch 315.67 | loss  4.33 | ppl    75.654
| epoch  82 step    35400 |     84 batches | lr 0.000285 | ms/batch 320.95 | loss  4.23 | ppl    68.449
| epoch  82 step    35450 |    134 batches | lr 0.000285 | ms/batch 336.08 | loss  4.29 | ppl    73.046
| epoch  82 step    35500 |    184 batches | lr 0.000285 | ms/batch 335.68 | loss  4.27 | ppl    71.855
| epoch  82 step    35550 |    234 batches | lr 0.000285 | ms/batch 335.43 | loss  4.30 | ppl    73.734
| epoch  82 step    35600 |    284 batches | lr 0.000285 | ms/batch 335.76 | loss  4.34 | ppl    76.414
----------------------------------------------------------------------------------------------------
| Eval  89 at step    35600 | time: 137.50s | valid loss  4.29 | valid ppl    73.264
----------------------------------------------------------------------------------------------------
| epoch  82 step    35650 |    334 batches | lr 0.000284 | ms/batch 423.10 | loss  4.22 | ppl    68.060
| epoch  82 step    35700 |    384 batches | lr 0.000284 | ms/batch 319.96 | loss  4.28 | ppl    72.540
| epoch  82 step    35750 |    434 batches | lr 0.000284 | ms/batch 319.58 | loss  4.30 | ppl    74.069
| epoch  83 step    35800 |     48 batches | lr 0.000284 | ms/batch 314.01 | loss  4.26 | ppl    70.996
| epoch  83 step    35850 |     98 batches | lr 0.000284 | ms/batch 321.75 | loss  4.25 | ppl    69.868
| epoch  83 step    35900 |    148 batches | lr 0.000283 | ms/batch 321.14 | loss  4.29 | ppl    72.654
| epoch  83 step    35950 |    198 batches | lr 0.000283 | ms/batch 320.80 | loss  4.30 | ppl    73.654
| epoch  83 step    36000 |    248 batches | lr 0.000283 | ms/batch 322.19 | loss  4.30 | ppl    73.613
----------------------------------------------------------------------------------------------------
| Eval  90 at step    36000 | time: 133.01s | valid loss  4.29 | valid ppl    73.253
----------------------------------------------------------------------------------------------------
| epoch  83 step    36050 |    298 batches | lr 0.000283 | ms/batch 421.27 | loss  4.34 | ppl    76.708
| epoch  83 step    36100 |    348 batches | lr 0.000283 | ms/batch 320.79 | loss  4.19 | ppl    66.279
| epoch  83 step    36150 |    398 batches | lr 0.000283 | ms/batch 321.49 | loss  4.30 | ppl    73.579
| epoch  84 step    36200 |     12 batches | lr 0.000282 | ms/batch 314.86 | loss  4.31 | ppl    74.298
| epoch  84 step    36250 |     62 batches | lr 0.000282 | ms/batch 320.86 | loss  4.23 | ppl    68.897
| epoch  84 step    36300 |    112 batches | lr 0.000282 | ms/batch 319.71 | loss  4.24 | ppl    69.734
| epoch  84 step    36350 |    162 batches | lr 0.000282 | ms/batch 320.50 | loss  4.28 | ppl    72.043
| epoch  84 step    36400 |    212 batches | lr 0.000282 | ms/batch 321.65 | loss  4.26 | ppl    70.899
----------------------------------------------------------------------------------------------------
| Eval  91 at step    36400 | time: 133.07s | valid loss  4.31 | valid ppl    74.219
----------------------------------------------------------------------------------------------------
| epoch  84 step    36450 |    262 batches | lr 0.000282 | ms/batch 421.98 | loss  4.27 | ppl    71.661
| epoch  84 step    36500 |    312 batches | lr 0.000281 | ms/batch 319.84 | loss  4.30 | ppl    73.590
| epoch  84 step    36550 |    362 batches | lr 0.000281 | ms/batch 319.94 | loss  4.23 | ppl    68.605
| epoch  84 step    36600 |    412 batches | lr 0.000281 | ms/batch 320.13 | loss  4.27 | ppl    71.416
| epoch  85 step    36650 |     26 batches | lr 0.000281 | ms/batch 314.26 | loss  4.33 | ppl    75.873
| epoch  85 step    36700 |     76 batches | lr 0.000281 | ms/batch 320.37 | loss  4.24 | ppl    69.251
| epoch  85 step    36750 |    126 batches | lr 0.000281 | ms/batch 320.58 | loss  4.27 | ppl    71.377
| epoch  85 step    36800 |    176 batches | lr 0.00028 | ms/batch 321.33 | loss  4.28 | ppl    72.257
----------------------------------------------------------------------------------------------------
| Eval  92 at step    36800 | time: 132.91s | valid loss  4.29 | valid ppl    72.639
----------------------------------------------------------------------------------------------------
| epoch  85 step    36850 |    226 batches | lr 0.00028 | ms/batch 454.06 | loss  4.29 | ppl    72.944
| epoch  85 step    36900 |    276 batches | lr 0.00028 | ms/batch 319.71 | loss  4.31 | ppl    74.266
| epoch  85 step    36950 |    326 batches | lr 0.00028 | ms/batch 320.17 | loss  4.23 | ppl    68.543
| epoch  85 step    37000 |    376 batches | lr 0.00028 | ms/batch 319.65 | loss  4.26 | ppl    70.600
| epoch  85 step    37050 |    426 batches | lr 0.000279 | ms/batch 319.28 | loss  4.27 | ppl    71.193
| epoch  86 step    37100 |     40 batches | lr 0.000279 | ms/batch 313.46 | loss  4.27 | ppl    71.418
| epoch  86 step    37150 |     90 batches | lr 0.000279 | ms/batch 320.05 | loss  4.23 | ppl    68.744
| epoch  86 step    37200 |    140 batches | lr 0.000279 | ms/batch 320.06 | loss  4.27 | ppl    71.416
----------------------------------------------------------------------------------------------------
| Eval  93 at step    37200 | time: 132.70s | valid loss  4.29 | valid ppl    72.998
----------------------------------------------------------------------------------------------------
| epoch  86 step    37250 |    190 batches | lr 0.000279 | ms/batch 420.70 | loss  4.30 | ppl    73.412
| epoch  86 step    37300 |    240 batches | lr 0.000279 | ms/batch 320.27 | loss  4.27 | ppl    71.315
| epoch  86 step    37350 |    290 batches | lr 0.000278 | ms/batch 320.96 | loss  4.31 | ppl    74.773
| epoch  86 step    37400 |    340 batches | lr 0.000278 | ms/batch 321.61 | loss  4.19 | ppl    66.054
| epoch  86 step    37450 |    390 batches | lr 0.000278 | ms/batch 320.81 | loss  4.28 | ppl    71.959
| epoch  87 step    37500 |      4 batches | lr 0.000278 | ms/batch 315.09 | loss  4.32 | ppl    75.418
| epoch  87 step    37550 |     54 batches | lr 0.000278 | ms/batch 321.47 | loss  4.24 | ppl    69.478
| epoch  87 step    37600 |    104 batches | lr 0.000278 | ms/batch 321.67 | loss  4.24 | ppl    69.392
----------------------------------------------------------------------------------------------------
| Eval  94 at step    37600 | time: 133.16s | valid loss  4.30 | valid ppl    73.663
----------------------------------------------------------------------------------------------------
| epoch  87 step    37650 |    154 batches | lr 0.000277 | ms/batch 421.50 | loss  4.30 | ppl    73.522
| epoch  87 step    37700 |    204 batches | lr 0.000277 | ms/batch 320.47 | loss  4.28 | ppl    72.263
| epoch  87 step    37750 |    254 batches | lr 0.000277 | ms/batch 321.01 | loss  4.27 | ppl    71.751
| epoch  87 step    37800 |    304 batches | lr 0.000277 | ms/batch 321.91 | loss  4.30 | ppl    73.504
| epoch  87 step    37850 |    354 batches | lr 0.000277 | ms/batch 330.24 | loss  4.19 | ppl    65.827
| epoch  87 step    37900 |    404 batches | lr 0.000276 | ms/batch 335.51 | loss  4.25 | ppl    69.832
| epoch  88 step    37950 |     18 batches | lr 0.000276 | ms/batch 329.07 | loss  4.32 | ppl    74.878
| epoch  88 step    38000 |     68 batches | lr 0.000276 | ms/batch 336.20 | loss  4.21 | ppl    67.520
----------------------------------------------------------------------------------------------------
| Eval  95 at step    38000 | time: 135.98s | valid loss  4.27 | valid ppl    71.170
----------------------------------------------------------------------------------------------------
| epoch  88 step    38050 |    118 batches | lr 0.000276 | ms/batch 473.46 | loss  4.25 | ppl    70.253
| epoch  88 step    38100 |    168 batches | lr 0.000276 | ms/batch 322.61 | loss  4.26 | ppl    70.683
| epoch  88 step    38150 |    218 batches | lr 0.000276 | ms/batch 321.55 | loss  4.29 | ppl    73.149
| epoch  88 step    38200 |    268 batches | lr 0.000275 | ms/batch 321.86 | loss  4.29 | ppl    72.702
| epoch  88 step    38250 |    318 batches | lr 0.000275 | ms/batch 322.78 | loss  4.25 | ppl    69.969
| epoch  88 step    38300 |    368 batches | lr 0.000275 | ms/batch 320.35 | loss  4.20 | ppl    66.723
| epoch  88 step    38350 |    418 batches | lr 0.000275 | ms/batch 319.43 | loss  4.25 | ppl    70.215
| epoch  89 step    38400 |     32 batches | lr 0.000275 | ms/batch 313.99 | loss  4.29 | ppl    72.941
----------------------------------------------------------------------------------------------------
| Eval  96 at step    38400 | time: 133.96s | valid loss  4.29 | valid ppl    73.014
----------------------------------------------------------------------------------------------------
| epoch  89 step    38450 |     82 batches | lr 0.000274 | ms/batch 419.99 | loss  4.20 | ppl    66.408
| epoch  89 step    38500 |    132 batches | lr 0.000274 | ms/batch 319.53 | loss  4.25 | ppl    70.341
| epoch  89 step    38550 |    182 batches | lr 0.000274 | ms/batch 319.93 | loss  4.25 | ppl    70.407
| epoch  89 step    38600 |    232 batches | lr 0.000274 | ms/batch 319.96 | loss  4.27 | ppl    71.743
| epoch  89 step    38650 |    282 batches | lr 0.000274 | ms/batch 319.65 | loss  4.31 | ppl    74.539
| epoch  89 step    38700 |    332 batches | lr 0.000274 | ms/batch 320.05 | loss  4.22 | ppl    67.946
| epoch  89 step    38750 |    382 batches | lr 0.000273 | ms/batch 335.05 | loss  4.25 | ppl    69.865
| epoch  89 step    38800 |    432 batches | lr 0.000273 | ms/batch 336.08 | loss  4.27 | ppl    71.633
----------------------------------------------------------------------------------------------------
| Eval  97 at step    38800 | time: 134.67s | valid loss  4.28 | valid ppl    71.960
----------------------------------------------------------------------------------------------------
| epoch  90 step    38850 |     46 batches | lr 0.000273 | ms/batch 416.74 | loss  4.25 | ppl    70.295
| epoch  90 step    38900 |     96 batches | lr 0.000273 | ms/batch 319.92 | loss  4.21 | ppl    67.325
| epoch  90 step    38950 |    146 batches | lr 0.000273 | ms/batch 320.27 | loss  4.25 | ppl    70.292
| epoch  90 step    39000 |    196 batches | lr 0.000272 | ms/batch 320.19 | loss  4.26 | ppl    71.079
| epoch  90 step    39050 |    246 batches | lr 0.000272 | ms/batch 319.88 | loss  4.27 | ppl    71.449
| epoch  90 step    39100 |    296 batches | lr 0.000272 | ms/batch 320.27 | loss  4.32 | ppl    75.412
| epoch  90 step    39150 |    346 batches | lr 0.000272 | ms/batch 319.64 | loss  4.15 | ppl    63.556
| epoch  90 step    39200 |    396 batches | lr 0.000272 | ms/batch 320.96 | loss  4.25 | ppl    69.892
----------------------------------------------------------------------------------------------------
| Eval  98 at step    39200 | time: 132.72s | valid loss  4.29 | valid ppl    72.782
----------------------------------------------------------------------------------------------------
| epoch  91 step    39250 |     10 batches | lr 0.000272 | ms/batch 413.23 | loss  4.32 | ppl    75.227
| epoch  91 step    39300 |     60 batches | lr 0.000271 | ms/batch 319.28 | loss  4.21 | ppl    67.383
| epoch  91 step    39350 |    110 batches | lr 0.000271 | ms/batch 321.14 | loss  4.22 | ppl    68.129
| epoch  91 step    39400 |    160 batches | lr 0.000271 | ms/batch 320.23 | loss  4.26 | ppl    70.771
| epoch  91 step    39450 |    210 batches | lr 0.000271 | ms/batch 320.27 | loss  4.25 | ppl    70.138
| epoch  91 step    39500 |    260 batches | lr 0.000271 | ms/batch 320.03 | loss  4.25 | ppl    70.062
| epoch  91 step    39550 |    310 batches | lr 0.00027 | ms/batch 319.84 | loss  4.26 | ppl    70.697
| epoch  91 step    39600 |    360 batches | lr 0.00027 | ms/batch 320.19 | loss  4.21 | ppl    67.136
----------------------------------------------------------------------------------------------------
| Eval  99 at step    39600 | time: 132.75s | valid loss  4.28 | valid ppl    71.996
----------------------------------------------------------------------------------------------------
| epoch  91 step    39650 |    410 batches | lr 0.00027 | ms/batch 434.31 | loss  4.24 | ppl    69.078
| epoch  92 step    39700 |     24 batches | lr 0.00027 | ms/batch 328.99 | loss  4.28 | ppl    72.105
| epoch  92 step    39750 |     74 batches | lr 0.00027 | ms/batch 335.77 | loss  4.19 | ppl    66.325
| epoch  92 step    39800 |    124 batches | lr 0.00027 | ms/batch 336.08 | loss  4.24 | ppl    69.392
| epoch  92 step    39850 |    174 batches | lr 0.000269 | ms/batch 321.43 | loss  4.26 | ppl    70.960
| epoch  92 step    39900 |    224 batches | lr 0.000269 | ms/batch 319.92 | loss  4.25 | ppl    69.980
| epoch  92 step    39950 |    274 batches | lr 0.000269 | ms/batch 319.96 | loss  4.27 | ppl    71.625
| epoch  92 step    40000 |    324 batches | lr 0.000269 | ms/batch 320.03 | loss  4.20 | ppl    66.585
----------------------------------------------------------------------------------------------------
| Eval 100 at step    40000 | time: 135.83s | valid loss  4.29 | valid ppl    72.611
----------------------------------------------------------------------------------------------------
| epoch  92 step    40050 |    374 batches | lr 0.000269 | ms/batch 420.81 | loss  4.23 | ppl    68.889
| epoch  92 step    40100 |    424 batches | lr 0.000268 | ms/batch 319.26 | loss  4.25 | ppl    69.974
| epoch  93 step    40150 |     38 batches | lr 0.000268 | ms/batch 313.90 | loss  4.25 | ppl    69.903
| epoch  93 step    40200 |     88 batches | lr 0.000268 | ms/batch 320.51 | loss  4.19 | ppl    65.696
| epoch  93 step    40250 |    138 batches | lr 0.000268 | ms/batch 319.89 | loss  4.24 | ppl    69.647
| epoch  93 step    40300 |    188 batches | lr 0.000268 | ms/batch 320.85 | loss  4.23 | ppl    69.008
| epoch  93 step    40350 |    238 batches | lr 0.000267 | ms/batch 320.97 | loss  4.26 | ppl    70.857
| epoch  93 step    40400 |    288 batches | lr 0.000267 | ms/batch 320.99 | loss  4.30 | ppl    73.550
----------------------------------------------------------------------------------------------------
| Eval 101 at step    40400 | time: 132.88s | valid loss  4.28 | valid ppl    72.531
----------------------------------------------------------------------------------------------------
| epoch  93 step    40450 |    338 batches | lr 0.000267 | ms/batch 422.14 | loss  4.16 | ppl    63.939
| epoch  93 step    40500 |    388 batches | lr 0.000267 | ms/batch 326.92 | loss  4.25 | ppl    70.034
| epoch  94 step    40550 |      2 batches | lr 0.000267 | ms/batch 329.78 | loss  4.28 | ppl    71.934
| epoch  94 step    40600 |     52 batches | lr 0.000267 | ms/batch 336.06 | loss  4.21 | ppl    67.430
| epoch  94 step    40650 |    102 batches | lr 0.000266 | ms/batch 319.75 | loss  4.18 | ppl    65.269
| epoch  94 step    40700 |    152 batches | lr 0.000266 | ms/batch 320.23 | loss  4.23 | ppl    68.428
| epoch  94 step    40750 |    202 batches | lr 0.000266 | ms/batch 320.26 | loss  4.24 | ppl    69.631
| epoch  94 step    40800 |    252 batches | lr 0.000266 | ms/batch 321.81 | loss  4.28 | ppl    72.060
----------------------------------------------------------------------------------------------------
| Eval 102 at step    40800 | time: 134.84s | valid loss  4.29 | valid ppl    72.632
----------------------------------------------------------------------------------------------------
| epoch  94 step    40850 |    302 batches | lr 0.000266 | ms/batch 421.15 | loss  4.28 | ppl    72.430
| epoch  94 step    40900 |    352 batches | lr 0.000265 | ms/batch 319.71 | loss  4.14 | ppl    62.521
| epoch  94 step    40950 |    402 batches | lr 0.000265 | ms/batch 320.22 | loss  4.25 | ppl    70.051
| epoch  95 step    41000 |     16 batches | lr 0.000265 | ms/batch 313.30 | loss  4.27 | ppl    71.360
| epoch  95 step    41050 |     66 batches | lr 0.000265 | ms/batch 319.15 | loss  4.19 | ppl    66.018
| epoch  95 step    41100 |    116 batches | lr 0.000265 | ms/batch 319.89 | loss  4.22 | ppl    68.188
| epoch  95 step    41150 |    166 batches | lr 0.000264 | ms/batch 319.53 | loss  4.23 | ppl    68.578
| epoch  95 step    41200 |    216 batches | lr 0.000264 | ms/batch 320.02 | loss  4.24 | ppl    69.175
----------------------------------------------------------------------------------------------------
| Eval 103 at step    41200 | time: 133.47s | valid loss  4.28 | valid ppl    72.180
----------------------------------------------------------------------------------------------------
| epoch  95 step    41250 |    266 batches | lr 0.000264 | ms/batch 437.30 | loss  4.24 | ppl    69.346
| epoch  95 step    41300 |    316 batches | lr 0.000264 | ms/batch 320.37 | loss  4.25 | ppl    70.374
| epoch  95 step    41350 |    366 batches | lr 0.000264 | ms/batch 320.80 | loss  4.16 | ppl    64.162
| epoch  95 step    41400 |    416 batches | lr 0.000264 | ms/batch 321.62 | loss  4.22 | ppl    68.148
| epoch  96 step    41450 |     30 batches | lr 0.000263 | ms/batch 314.85 | loss  4.25 | ppl    69.832
| epoch  96 step    41500 |     80 batches | lr 0.000263 | ms/batch 319.47 | loss  4.17 | ppl    64.539
| epoch  96 step    41550 |    130 batches | lr 0.000263 | ms/batch 319.25 | loss  4.21 | ppl    67.451
| epoch  96 step    41600 |    180 batches | lr 0.000263 | ms/batch 320.11 | loss  4.23 | ppl    68.465
----------------------------------------------------------------------------------------------------
| Eval 104 at step    41600 | time: 132.85s | valid loss  4.27 | valid ppl    71.555
----------------------------------------------------------------------------------------------------
| epoch  96 step    41650 |    230 batches | lr 0.000263 | ms/batch 420.51 | loss  4.26 | ppl    70.863
| epoch  96 step    41700 |    280 batches | lr 0.000262 | ms/batch 318.12 | loss  4.28 | ppl    72.195
| epoch  96 step    41750 |    330 batches | lr 0.000262 | ms/batch 319.15 | loss  4.20 | ppl    66.442
| epoch  96 step    41800 |    380 batches | lr 0.000262 | ms/batch 320.46 | loss  4.22 | ppl    68.334
| epoch  96 step    41850 |    430 batches | lr 0.000262 | ms/batch 319.39 | loss  4.25 | ppl    70.100
| epoch  97 step    41900 |     44 batches | lr 0.000262 | ms/batch 313.06 | loss  4.22 | ppl    67.744
| epoch  97 step    41950 |     94 batches | lr 0.000261 | ms/batch 318.87 | loss  4.19 | ppl    65.704
| epoch  97 step    42000 |    144 batches | lr 0.000261 | ms/batch 318.38 | loss  4.22 | ppl    68.228
----------------------------------------------------------------------------------------------------
| Eval 105 at step    42000 | time: 132.39s | valid loss  4.28 | valid ppl    72.245
----------------------------------------------------------------------------------------------------
| epoch  97 step    42050 |    194 batches | lr 0.000261 | ms/batch 419.70 | loss  4.24 | ppl    69.750
| epoch  97 step    42100 |    244 batches | lr 0.000261 | ms/batch 318.08 | loss  4.24 | ppl    69.224
| epoch  97 step    42150 |    294 batches | lr 0.000261 | ms/batch 319.26 | loss  4.30 | ppl    73.925
| epoch  97 step    42200 |    344 batches | lr 0.00026 | ms/batch 321.12 | loss  4.11 | ppl    61.192
| epoch  97 step    42250 |    394 batches | lr 0.00026 | ms/batch 318.87 | loss  4.24 | ppl    69.688
| epoch  98 step    42300 |      8 batches | lr 0.00026 | ms/batch 314.25 | loss  4.24 | ppl    69.658
| epoch  98 step    42350 |     58 batches | lr 0.00026 | ms/batch 319.40 | loss  4.19 | ppl    66.276
| epoch  98 step    42400 |    108 batches | lr 0.00026 | ms/batch 320.31 | loss  4.17 | ppl    64.544
----------------------------------------------------------------------------------------------------
| Eval 106 at step    42400 | time: 132.57s | valid loss  4.27 | valid ppl    71.203
----------------------------------------------------------------------------------------------------
| epoch  98 step    42450 |    158 batches | lr 0.000259 | ms/batch 421.47 | loss  4.22 | ppl    67.736
| epoch  98 step    42500 |    208 batches | lr 0.000259 | ms/batch 321.36 | loss  4.23 | ppl    68.647
| epoch  98 step    42550 |    258 batches | lr 0.000259 | ms/batch 321.37 | loss  4.24 | ppl    69.316
| epoch  98 step    42600 |    308 batches | lr 0.000259 | ms/batch 321.32 | loss  4.23 | ppl    69.054
| epoch  98 step    42650 |    358 batches | lr 0.000259 | ms/batch 321.30 | loss  4.17 | ppl    64.398
| epoch  98 step    42700 |    408 batches | lr 0.000259 | ms/batch 321.70 | loss  4.22 | ppl    68.345
| epoch  99 step    42750 |     22 batches | lr 0.000258 | ms/batch 314.51 | loss  4.26 | ppl    70.733
| epoch  99 step    42800 |     72 batches | lr 0.000258 | ms/batch 322.08 | loss  4.17 | ppl    65.030
----------------------------------------------------------------------------------------------------
| Eval 107 at step    42800 | time: 133.26s | valid loss  4.28 | valid ppl    71.920
----------------------------------------------------------------------------------------------------
| epoch  99 step    42850 |    122 batches | lr 0.000258 | ms/batch 422.07 | loss  4.20 | ppl    66.564
| epoch  99 step    42900 |    172 batches | lr 0.000258 | ms/batch 321.19 | loss  4.22 | ppl    68.308
| epoch  99 step    42950 |    222 batches | lr 0.000258 | ms/batch 320.43 | loss  4.25 | ppl    69.914
| epoch  99 step    43000 |    272 batches | lr 0.000257 | ms/batch 326.38 | loss  4.23 | ppl    68.986
| epoch  99 step    43050 |    322 batches | lr 0.000257 | ms/batch 320.26 | loss  4.20 | ppl    66.895
| epoch  99 step    43100 |    372 batches | lr 0.000257 | ms/batch 320.50 | loss  4.18 | ppl    65.637
| epoch  99 step    43150 |    422 batches | lr 0.000257 | ms/batch 319.91 | loss  4.21 | ppl    67.554
| epoch 100 step    43200 |     36 batches | lr 0.000257 | ms/batch 314.46 | loss  4.23 | ppl    69.048
----------------------------------------------------------------------------------------------------
| Eval 108 at step    43200 | time: 133.24s | valid loss  4.27 | valid ppl    71.850
----------------------------------------------------------------------------------------------------
| epoch 100 step    43250 |     86 batches | lr 0.000256 | ms/batch 421.74 | loss  4.14 | ppl    62.901
| epoch 100 step    43300 |    136 batches | lr 0.000256 | ms/batch 321.20 | loss  4.22 | ppl    68.358
| epoch 100 step    43350 |    186 batches | lr 0.000256 | ms/batch 321.29 | loss  4.22 | ppl    68.174
| epoch 100 step    43400 |    236 batches | lr 0.000256 | ms/batch 320.27 | loss  4.23 | ppl    68.749
| epoch 100 step    43450 |    286 batches | lr 0.000256 | ms/batch 321.06 | loss  4.25 | ppl    69.971
| epoch 100 step    43500 |    336 batches | lr 0.000255 | ms/batch 320.11 | loss  4.15 | ppl    63.192
| epoch 100 step    43550 |    386 batches | lr 0.000255 | ms/batch 320.66 | loss  4.20 | ppl    66.926
| epoch 100 step    43600 |    436 batches | lr 0.000255 | ms/batch 317.61 | loss  4.23 | ppl    68.623
----------------------------------------------------------------------------------------------------
| Eval 109 at step    43600 | time: 133.20s | valid loss  4.27 | valid ppl    71.340
----------------------------------------------------------------------------------------------------
| epoch 101 step    43650 |     50 batches | lr 0.000255 | ms/batch 419.59 | loss  4.19 | ppl    65.755
| epoch 101 step    43700 |    100 batches | lr 0.000255 | ms/batch 320.69 | loss  4.17 | ppl    64.789
| epoch 101 step    43750 |    150 batches | lr 0.000254 | ms/batch 319.77 | loss  4.20 | ppl    66.463
| epoch 101 step    43800 |    200 batches | lr 0.000254 | ms/batch 320.65 | loss  4.22 | ppl    67.858
| epoch 101 step    43850 |    250 batches | lr 0.000254 | ms/batch 321.64 | loss  4.22 | ppl    67.736
| epoch 101 step    43900 |    300 batches | lr 0.000254 | ms/batch 329.09 | loss  4.25 | ppl    70.185
| epoch 101 step    43950 |    350 batches | lr 0.000254 | ms/batch 321.18 | loss  4.13 | ppl    62.037
| epoch 101 step    44000 |    400 batches | lr 0.000253 | ms/batch 322.00 | loss  4.21 | ppl    67.430
----------------------------------------------------------------------------------------------------
| Eval 110 at step    44000 | time: 133.76s | valid loss  4.27 | valid ppl    71.277
----------------------------------------------------------------------------------------------------
| epoch 102 step    44050 |     14 batches | lr 0.000253 | ms/batch 415.34 | loss  4.26 | ppl    70.608
| epoch 102 step    44100 |     64 batches | lr 0.000253 | ms/batch 320.27 | loss  4.13 | ppl    62.234
| epoch 102 step    44150 |    114 batches | lr 0.000253 | ms/batch 319.81 | loss  4.19 | ppl    65.807
| epoch 102 step    44200 |    164 batches | lr 0.000253 | ms/batch 320.62 | loss  4.19 | ppl    66.087
| epoch 102 step    44250 |    214 batches | lr 0.000252 | ms/batch 328.94 | loss  4.23 | ppl    68.752
| epoch 102 step    44300 |    264 batches | lr 0.000252 | ms/batch 329.37 | loss  4.21 | ppl    67.081
| epoch 102 step    44350 |    314 batches | lr 0.000252 | ms/batch 321.51 | loss  4.23 | ppl    68.388
| epoch 102 step    44400 |    364 batches | lr 0.000252 | ms/batch 321.02 | loss  4.15 | ppl    63.231
----------------------------------------------------------------------------------------------------
| Eval 111 at step    44400 | time: 134.23s | valid loss  4.25 | valid ppl    70.426
----------------------------------------------------------------------------------------------------
| epoch 102 step    44450 |    414 batches | lr 0.000252 | ms/batch 471.00 | loss  4.18 | ppl    65.606
| epoch 103 step    44500 |     28 batches | lr 0.000251 | ms/batch 314.85 | loss  4.24 | ppl    69.666
| epoch 103 step    44550 |     78 batches | lr 0.000251 | ms/batch 320.85 | loss  4.17 | ppl    64.463
| epoch 103 step    44600 |    128 batches | lr 0.000251 | ms/batch 320.91 | loss  4.19 | ppl    66.025
| epoch 103 step    44650 |    178 batches | lr 0.000251 | ms/batch 321.29 | loss  4.20 | ppl    66.624
| epoch 103 step    44700 |    228 batches | lr 0.000251 | ms/batch 319.66 | loss  4.22 | ppl    68.273
| epoch 103 step    44750 |    278 batches | lr 0.000251 | ms/batch 320.37 | loss  4.23 | ppl    68.919
| epoch 103 step    44800 |    328 batches | lr 0.00025 | ms/batch 321.40 | loss  4.15 | ppl    63.712
----------------------------------------------------------------------------------------------------
| Eval 112 at step    44800 | time: 133.07s | valid loss  4.26 | valid ppl    71.121
----------------------------------------------------------------------------------------------------
| epoch 103 step    44850 |    378 batches | lr 0.00025 | ms/batch 419.59 | loss  4.18 | ppl    65.238
| epoch 103 step    44900 |    428 batches | lr 0.00025 | ms/batch 321.24 | loss  4.22 | ppl    68.348
| epoch 104 step    44950 |     42 batches | lr 0.00025 | ms/batch 314.08 | loss  4.19 | ppl    65.840
| epoch 104 step    45000 |     92 batches | lr 0.00025 | ms/batch 320.29 | loss  4.13 | ppl    62.446
| epoch 104 step    45050 |    142 batches | lr 0.000249 | ms/batch 320.67 | loss  4.22 | ppl    67.956
| epoch 104 step    45100 |    192 batches | lr 0.000249 | ms/batch 321.36 | loss  4.22 | ppl    68.076
| epoch 104 step    45150 |    242 batches | lr 0.000249 | ms/batch 319.41 | loss  4.22 | ppl    67.917
| epoch 104 step    45200 |    292 batches | lr 0.000249 | ms/batch 320.10 | loss  4.27 | ppl    71.187
----------------------------------------------------------------------------------------------------
| Eval 113 at step    45200 | time: 132.83s | valid loss  4.26 | valid ppl    71.077
----------------------------------------------------------------------------------------------------
| epoch 104 step    45250 |    342 batches | lr 0.000249 | ms/batch 419.73 | loss  4.11 | ppl    60.723
| epoch 104 step    45300 |    392 batches | lr 0.000248 | ms/batch 319.33 | loss  4.20 | ppl    66.416
| epoch 105 step    45350 |      6 batches | lr 0.000248 | ms/batch 312.87 | loss  4.24 | ppl    69.554
| epoch 105 step    45400 |     56 batches | lr 0.000248 | ms/batch 319.25 | loss  4.15 | ppl    63.419
| epoch 105 step    45450 |    106 batches | lr 0.000248 | ms/batch 320.35 | loss  4.17 | ppl    64.710
| epoch 105 step    45500 |    156 batches | lr 0.000248 | ms/batch 320.79 | loss  4.19 | ppl    65.971
| epoch 105 step    45550 |    206 batches | lr 0.000247 | ms/batch 320.79 | loss  4.20 | ppl    66.736
| epoch 105 step    45600 |    256 batches | lr 0.000247 | ms/batch 320.62 | loss  4.22 | ppl    67.792
----------------------------------------------------------------------------------------------------
| Eval 114 at step    45600 | time: 132.73s | valid loss  4.27 | valid ppl    71.880
----------------------------------------------------------------------------------------------------
| epoch 105 step    45650 |    306 batches | lr 0.000247 | ms/batch 421.89 | loss  4.22 | ppl    67.789
| epoch 105 step    45700 |    356 batches | lr 0.000247 | ms/batch 320.60 | loss  4.10 | ppl    60.359
| epoch 105 step    45750 |    406 batches | lr 0.000247 | ms/batch 320.56 | loss  4.18 | ppl    65.361
| epoch 106 step    45800 |     20 batches | lr 0.000246 | ms/batch 313.21 | loss  4.23 | ppl    68.522
| epoch 106 step    45850 |     70 batches | lr 0.000246 | ms/batch 319.42 | loss  4.15 | ppl    63.202
| epoch 106 step    45900 |    120 batches | lr 0.000246 | ms/batch 319.03 | loss  4.18 | ppl    65.243
| epoch 106 step    45950 |    170 batches | lr 0.000246 | ms/batch 318.60 | loss  4.17 | ppl    64.809
| epoch 106 step    46000 |    220 batches | lr 0.000246 | ms/batch 319.69 | loss  4.22 | ppl    68.135
----------------------------------------------------------------------------------------------------
| Eval 115 at step    46000 | time: 132.61s | valid loss  4.27 | valid ppl    71.348
----------------------------------------------------------------------------------------------------
| epoch 106 step    46050 |    270 batches | lr 0.000245 | ms/batch 419.53 | loss  4.21 | ppl    67.175
| epoch 106 step    46100 |    320 batches | lr 0.000245 | ms/batch 319.10 | loss  4.19 | ppl    65.979
| epoch 106 step    46150 |    370 batches | lr 0.000245 | ms/batch 319.38 | loss  4.13 | ppl    62.115
| epoch 106 step    46200 |    420 batches | lr 0.000245 | ms/batch 318.45 | loss  4.19 | ppl    66.196
| epoch 107 step    46250 |     34 batches | lr 0.000245 | ms/batch 313.70 | loss  4.22 | ppl    67.821
| epoch 107 step    46300 |     84 batches | lr 0.000244 | ms/batch 319.44 | loss  4.14 | ppl    62.514
| epoch 107 step    46350 |    134 batches | lr 0.000244 | ms/batch 320.07 | loss  4.16 | ppl    63.996
| epoch 107 step    46400 |    184 batches | lr 0.000244 | ms/batch 335.39 | loss  4.18 | ppl    65.379
----------------------------------------------------------------------------------------------------
| Eval 116 at step    46400 | time: 133.30s | valid loss  4.25 | valid ppl    70.341
----------------------------------------------------------------------------------------------------
| epoch 107 step    46450 |    234 batches | lr 0.000244 | ms/batch 456.72 | loss  4.20 | ppl    66.848
| epoch 107 step    46500 |    284 batches | lr 0.000243 | ms/batch 319.33 | loss  4.23 | ppl    68.800
| epoch 107 step    46550 |    334 batches | lr 0.000243 | ms/batch 320.00 | loss  4.13 | ppl    62.355
| epoch 107 step    46600 |    384 batches | lr 0.000243 | ms/batch 320.21 | loss  4.17 | ppl    64.425
| epoch 107 step    46650 |    434 batches | lr 0.000243 | ms/batch 320.19 | loss  4.20 | ppl    66.551
| epoch 108 step    46700 |     48 batches | lr 0.000243 | ms/batch 313.41 | loss  4.16 | ppl    64.310
| epoch 108 step    46750 |     98 batches | lr 0.000242 | ms/batch 318.27 | loss  4.11 | ppl    61.221
| epoch 108 step    46800 |    148 batches | lr 0.000242 | ms/batch 319.72 | loss  4.16 | ppl    64.177
----------------------------------------------------------------------------------------------------
| Eval 117 at step    46800 | time: 132.61s | valid loss  4.26 | valid ppl    70.764
----------------------------------------------------------------------------------------------------
| epoch 108 step    46850 |    198 batches | lr 0.000242 | ms/batch 427.04 | loss  4.22 | ppl    67.715
| epoch 108 step    46900 |    248 batches | lr 0.000242 | ms/batch 320.69 | loss  4.18 | ppl    65.668
| epoch 108 step    46950 |    298 batches | lr 0.000242 | ms/batch 319.98 | loss  4.24 | ppl    69.346
| epoch 108 step    47000 |    348 batches | lr 0.000241 | ms/batch 320.78 | loss  4.10 | ppl    60.513
| epoch 108 step    47050 |    398 batches | lr 0.000241 | ms/batch 321.11 | loss  4.18 | ppl    65.668
| epoch 109 step    47100 |     12 batches | lr 0.000241 | ms/batch 315.06 | loss  4.23 | ppl    69.051
| epoch 109 step    47150 |     62 batches | lr 0.000241 | ms/batch 325.86 | loss  4.12 | ppl    61.728
| epoch 109 step    47200 |    112 batches | lr 0.000241 | ms/batch 325.85 | loss  4.15 | ppl    63.172
----------------------------------------------------------------------------------------------------
| Eval 118 at step    47200 | time: 133.84s | valid loss  4.25 | valid ppl    69.977
----------------------------------------------------------------------------------------------------
| epoch 109 step    47250 |    162 batches | lr 0.00024 | ms/batch 468.35 | loss  4.17 | ppl    64.926
| epoch 109 step    47300 |    212 batches | lr 0.00024 | ms/batch 319.74 | loss  4.17 | ppl    64.880
| epoch 109 step    47350 |    262 batches | lr 0.00024 | ms/batch 319.96 | loss  4.18 | ppl    65.203
| epoch 109 step    47400 |    312 batches | lr 0.00024 | ms/batch 320.27 | loss  4.20 | ppl    66.548
| epoch 109 step    47450 |    362 batches | lr 0.00024 | ms/batch 320.63 | loss  4.11 | ppl    60.849
| epoch 109 step    47500 |    412 batches | lr 0.000239 | ms/batch 319.97 | loss  4.15 | ppl    63.355
| epoch 110 step    47550 |     26 batches | lr 0.000239 | ms/batch 313.13 | loss  4.21 | ppl    67.186
| epoch 110 step    47600 |     76 batches | lr 0.000239 | ms/batch 319.90 | loss  4.13 | ppl    61.981
----------------------------------------------------------------------------------------------------
| Eval 119 at step    47600 | time: 132.72s | valid loss  4.27 | valid ppl    71.192
----------------------------------------------------------------------------------------------------
| epoch 110 step    47650 |    126 batches | lr 0.000239 | ms/batch 420.96 | loss  4.17 | ppl    64.564
| epoch 110 step    47700 |    176 batches | lr 0.000239 | ms/batch 320.32 | loss  4.18 | ppl    65.269
| epoch 110 step    47750 |    226 batches | lr 0.000238 | ms/batch 320.05 | loss  4.19 | ppl    65.750
| epoch 110 step    47800 |    276 batches | lr 0.000238 | ms/batch 320.31 | loss  4.19 | ppl    65.951
| epoch 110 step    47850 |    326 batches | lr 0.000238 | ms/batch 320.37 | loss  4.14 | ppl    62.516
| epoch 110 step    47900 |    376 batches | lr 0.000238 | ms/batch 320.19 | loss  4.15 | ppl    63.563
| epoch 110 step    47950 |    426 batches | lr 0.000238 | ms/batch 319.58 | loss  4.19 | ppl    65.827
| epoch 111 step    48000 |     40 batches | lr 0.000237 | ms/batch 314.19 | loss  4.16 | ppl    64.280
----------------------------------------------------------------------------------------------------
| Eval 120 at step    48000 | time: 132.81s | valid loss  4.26 | valid ppl    70.477
----------------------------------------------------------------------------------------------------
| epoch 111 step    48050 |     90 batches | lr 0.000237 | ms/batch 420.92 | loss  4.09 | ppl    59.957
| epoch 111 step    48100 |    140 batches | lr 0.000237 | ms/batch 320.24 | loss  4.14 | ppl    62.931
| epoch 111 step    48150 |    190 batches | lr 0.000237 | ms/batch 320.71 | loss  4.18 | ppl    65.366
| epoch 111 step    48200 |    240 batches | lr 0.000237 | ms/batch 320.15 | loss  4.18 | ppl    65.139
| epoch 111 step    48250 |    290 batches | lr 0.000236 | ms/batch 321.16 | loss  4.23 | ppl    68.666
| epoch 111 step    48300 |    340 batches | lr 0.000236 | ms/batch 320.37 | loss  4.08 | ppl    58.901
| epoch 111 step    48350 |    390 batches | lr 0.000236 | ms/batch 320.03 | loss  4.16 | ppl    64.092
| epoch 112 step    48400 |      4 batches | lr 0.000236 | ms/batch 314.54 | loss  4.19 | ppl    65.696
----------------------------------------------------------------------------------------------------
| Eval 121 at step    48400 | time: 132.88s | valid loss  4.25 | valid ppl    70.267
----------------------------------------------------------------------------------------------------
| epoch 112 step    48450 |     54 batches | lr 0.000236 | ms/batch 419.81 | loss  4.11 | ppl    60.951
| epoch 112 step    48500 |    104 batches | lr 0.000235 | ms/batch 320.09 | loss  4.13 | ppl    62.163
| epoch 112 step    48550 |    154 batches | lr 0.000235 | ms/batch 319.99 | loss  4.16 | ppl    64.004
| epoch 112 step    48600 |    204 batches | lr 0.000235 | ms/batch 320.73 | loss  4.19 | ppl    65.938
| epoch 112 step    48650 |    254 batches | lr 0.000235 | ms/batch 320.48 | loss  4.22 | ppl    67.710
| epoch 112 step    48700 |    304 batches | lr 0.000234 | ms/batch 320.01 | loss  4.20 | ppl    66.958
| epoch 112 step    48750 |    354 batches | lr 0.000234 | ms/batch 319.06 | loss  4.06 | ppl    58.242
| epoch 112 step    48800 |    404 batches | lr 0.000234 | ms/batch 319.40 | loss  4.16 | ppl    64.147
----------------------------------------------------------------------------------------------------
| Eval 122 at step    48800 | time: 133.00s | valid loss  4.26 | valid ppl    70.561
----------------------------------------------------------------------------------------------------
| epoch 113 step    48850 |     18 batches | lr 0.000234 | ms/batch 415.31 | loss  4.20 | ppl    66.767
| epoch 113 step    48900 |     68 batches | lr 0.000234 | ms/batch 320.34 | loss  4.10 | ppl    60.213
| epoch 113 step    48950 |    118 batches | lr 0.000233 | ms/batch 321.17 | loss  4.15 | ppl    63.295
| epoch 113 step    49000 |    168 batches | lr 0.000233 | ms/batch 321.49 | loss  4.18 | ppl    65.391
| epoch 113 step    49050 |    218 batches | lr 0.000233 | ms/batch 322.17 | loss  4.18 | ppl    65.053
| epoch 113 step    49100 |    268 batches | lr 0.000233 | ms/batch 321.41 | loss  4.16 | ppl    64.049
| epoch 113 step    49150 |    318 batches | lr 0.000233 | ms/batch 319.66 | loss  4.16 | ppl    64.112
| epoch 113 step    49200 |    368 batches | lr 0.000232 | ms/batch 320.26 | loss  4.12 | ppl    61.281
----------------------------------------------------------------------------------------------------
| Eval 123 at step    49200 | time: 133.10s | valid loss  4.24 | valid ppl    69.276
----------------------------------------------------------------------------------------------------
| epoch 113 step    49250 |    418 batches | lr 0.000232 | ms/batch 454.26 | loss  4.16 | ppl    64.094
| epoch 114 step    49300 |     32 batches | lr 0.000232 | ms/batch 313.11 | loss  4.19 | ppl    66.160
| epoch 114 step    49350 |     82 batches | lr 0.000232 | ms/batch 320.78 | loss  4.09 | ppl    59.910
| epoch 114 step    49400 |    132 batches | lr 0.000232 | ms/batch 319.19 | loss  4.17 | ppl    64.408
| epoch 114 step    49450 |    182 batches | lr 0.000231 | ms/batch 318.29 | loss  4.15 | ppl    63.481
| epoch 114 step    49500 |    232 batches | lr 0.000231 | ms/batch 319.50 | loss  4.17 | ppl    64.561
| epoch 114 step    49550 |    282 batches | lr 0.000231 | ms/batch 319.85 | loss  4.20 | ppl    66.489
| epoch 114 step    49600 |    332 batches | lr 0.000231 | ms/batch 319.26 | loss  4.09 | ppl    60.009
----------------------------------------------------------------------------------------------------
| Eval 124 at step    49600 | time: 132.57s | valid loss  4.26 | valid ppl    70.705
----------------------------------------------------------------------------------------------------
| epoch 114 step    49650 |    382 batches | lr 0.000231 | ms/batch 419.26 | loss  4.14 | ppl    62.815
| epoch 114 step    49700 |    432 batches | lr 0.00023 | ms/batch 319.78 | loss  4.20 | ppl    66.450
| epoch 115 step    49750 |     46 batches | lr 0.00023 | ms/batch 313.44 | loss  4.15 | ppl    63.142
| epoch 115 step    49800 |     96 batches | lr 0.00023 | ms/batch 320.41 | loss  4.08 | ppl    59.409
| epoch 115 step    49850 |    146 batches | lr 0.00023 | ms/batch 321.09 | loss  4.17 | ppl    64.503
| epoch 115 step    49900 |    196 batches | lr 0.000229 | ms/batch 318.70 | loss  4.17 | ppl    64.769
| epoch 115 step    49950 |    246 batches | lr 0.000229 | ms/batch 319.11 | loss  4.18 | ppl    65.678
| epoch 115 step    50000 |    296 batches | lr 0.000229 | ms/batch 319.69 | loss  4.23 | ppl    68.857
----------------------------------------------------------------------------------------------------
| Eval 125 at step    50000 | time: 132.59s | valid loss  4.25 | valid ppl    70.353
----------------------------------------------------------------------------------------------------
| epoch 115 step    50050 |    346 batches | lr 0.000229 | ms/batch 419.85 | loss  4.09 | ppl    59.761
| epoch 115 step    50100 |    396 batches | lr 0.000229 | ms/batch 318.22 | loss  4.14 | ppl    62.970
| epoch 116 step    50150 |     10 batches | lr 0.000228 | ms/batch 315.68 | loss  4.19 | ppl    65.758
| epoch 116 step    50200 |     60 batches | lr 0.000228 | ms/batch 319.44 | loss  4.10 | ppl    60.354
| epoch 116 step    50250 |    110 batches | lr 0.000228 | ms/batch 319.01 | loss  4.11 | ppl    61.185
| epoch 116 step    50300 |    160 batches | lr 0.000228 | ms/batch 319.46 | loss  4.15 | ppl    63.578
| epoch 116 step    50350 |    210 batches | lr 0.000228 | ms/batch 319.23 | loss  4.14 | ppl    62.953
| epoch 116 step    50400 |    260 batches | lr 0.000227 | ms/batch 318.49 | loss  4.18 | ppl    65.215
----------------------------------------------------------------------------------------------------
| Eval 126 at step    50400 | time: 132.47s | valid loss  4.26 | valid ppl    70.724
----------------------------------------------------------------------------------------------------
| epoch 116 step    50450 |    310 batches | lr 0.000227 | ms/batch 419.84 | loss  4.17 | ppl    64.670
| epoch 116 step    50500 |    360 batches | lr 0.000227 | ms/batch 318.79 | loss  4.07 | ppl    58.736
| epoch 116 step    50550 |    410 batches | lr 0.000227 | ms/batch 319.22 | loss  4.13 | ppl    62.458
| epoch 117 step    50600 |     24 batches | lr 0.000227 | ms/batch 312.87 | loss  4.19 | ppl    66.049
| epoch 117 step    50650 |     74 batches | lr 0.000226 | ms/batch 318.91 | loss  4.09 | ppl    59.908
| epoch 117 step    50700 |    124 batches | lr 0.000226 | ms/batch 319.68 | loss  4.13 | ppl    62.032
| epoch 117 step    50750 |    174 batches | lr 0.000226 | ms/batch 319.92 | loss  4.17 | ppl    64.516
| epoch 117 step    50800 |    224 batches | lr 0.000226 | ms/batch 320.28 | loss  4.16 | ppl    64.305
----------------------------------------------------------------------------------------------------
| Eval 127 at step    50800 | time: 132.50s | valid loss  4.25 | valid ppl    70.111
----------------------------------------------------------------------------------------------------
| epoch 117 step    50850 |    274 batches | lr 0.000226 | ms/batch 421.61 | loss  4.19 | ppl    66.323
| epoch 117 step    50900 |    324 batches | lr 0.000225 | ms/batch 322.30 | loss  4.09 | ppl    60.021
| epoch 117 step    50950 |    374 batches | lr 0.000225 | ms/batch 326.59 | loss  4.13 | ppl    62.326
| epoch 117 step    51000 |    424 batches | lr 0.000225 | ms/batch 330.22 | loss  4.13 | ppl    62.265
| epoch 118 step    51050 |     38 batches | lr 0.000225 | ms/batch 313.71 | loss  4.14 | ppl    62.646
| epoch 118 step    51100 |     88 batches | lr 0.000224 | ms/batch 319.48 | loss  4.09 | ppl    59.971
| epoch 118 step    51150 |    138 batches | lr 0.000224 | ms/batch 320.10 | loss  4.12 | ppl    61.752
| epoch 118 step    51200 |    188 batches | lr 0.000224 | ms/batch 318.87 | loss  4.15 | ppl    63.394
----------------------------------------------------------------------------------------------------
| Eval 128 at step    51200 | time: 133.61s | valid loss  4.25 | valid ppl    70.141
----------------------------------------------------------------------------------------------------
| epoch 118 step    51250 |    238 batches | lr 0.000224 | ms/batch 420.54 | loss  4.16 | ppl    64.360
| epoch 118 step    51300 |    288 batches | lr 0.000224 | ms/batch 319.78 | loss  4.21 | ppl    67.162
| epoch 118 step    51350 |    338 batches | lr 0.000223 | ms/batch 320.14 | loss  4.05 | ppl    57.454
| epoch 118 step    51400 |    388 batches | lr 0.000223 | ms/batch 318.41 | loss  4.14 | ppl    62.803
| epoch 119 step    51450 |      2 batches | lr 0.000223 | ms/batch 313.64 | loss  4.15 | ppl    63.565
| epoch 119 step    51500 |     52 batches | lr 0.000223 | ms/batch 319.24 | loss  4.11 | ppl    60.747
| epoch 119 step    51550 |    102 batches | lr 0.000223 | ms/batch 319.41 | loss  4.09 | ppl    59.766
| epoch 119 step    51600 |    152 batches | lr 0.000222 | ms/batch 319.65 | loss  4.12 | ppl    61.598
----------------------------------------------------------------------------------------------------
| Eval 129 at step    51600 | time: 132.57s | valid loss  4.24 | valid ppl    69.407
----------------------------------------------------------------------------------------------------
| epoch 119 step    51650 |    202 batches | lr 0.000222 | ms/batch 421.66 | loss  4.16 | ppl    63.782
| epoch 119 step    51700 |    252 batches | lr 0.000222 | ms/batch 321.10 | loss  4.15 | ppl    63.633
| epoch 119 step    51750 |    302 batches | lr 0.000222 | ms/batch 320.83 | loss  4.18 | ppl    65.637
| epoch 119 step    51800 |    352 batches | lr 0.000221 | ms/batch 320.04 | loss  4.05 | ppl    57.492
| epoch 119 step    51850 |    402 batches | lr 0.000221 | ms/batch 320.72 | loss  4.14 | ppl    62.546
| epoch 120 step    51900 |     16 batches | lr 0.000221 | ms/batch 315.29 | loss  4.18 | ppl    65.547
| epoch 120 step    51950 |     66 batches | lr 0.000221 | ms/batch 318.59 | loss  4.07 | ppl    58.720
| epoch 120 step    52000 |    116 batches | lr 0.000221 | ms/batch 318.76 | loss  4.11 | ppl    60.833
----------------------------------------------------------------------------------------------------
| Eval 130 at step    52000 | time: 132.82s | valid loss  4.24 | valid ppl    69.642
----------------------------------------------------------------------------------------------------
| epoch 120 step    52050 |    166 batches | lr 0.00022 | ms/batch 419.66 | loss  4.12 | ppl    61.317
| epoch 120 step    52100 |    216 batches | lr 0.00022 | ms/batch 319.14 | loss  4.14 | ppl    62.916
| epoch 120 step    52150 |    266 batches | lr 0.00022 | ms/batch 320.74 | loss  4.15 | ppl    63.672
| epoch 120 step    52200 |    316 batches | lr 0.00022 | ms/batch 321.59 | loss  4.13 | ppl    61.989
| epoch 120 step    52250 |    366 batches | lr 0.00022 | ms/batch 320.75 | loss  4.10 | ppl    60.154
| epoch 120 step    52300 |    416 batches | lr 0.000219 | ms/batch 321.21 | loss  4.10 | ppl    60.204
| epoch 121 step    52350 |     30 batches | lr 0.000219 | ms/batch 315.81 | loss  4.14 | ppl    62.548
| epoch 121 step    52400 |     80 batches | lr 0.000219 | ms/batch 321.09 | loss  4.10 | ppl    60.432
----------------------------------------------------------------------------------------------------
| Eval 131 at step    52400 | time: 133.02s | valid loss  4.25 | valid ppl    70.307
----------------------------------------------------------------------------------------------------
| epoch 121 step    52450 |    130 batches | lr 0.000219 | ms/batch 421.28 | loss  4.12 | ppl    61.696
| epoch 121 step    52500 |    180 batches | lr 0.000219 | ms/batch 321.88 | loss  4.14 | ppl    62.840
| epoch 121 step    52550 |    230 batches | lr 0.000218 | ms/batch 319.90 | loss  4.16 | ppl    63.844
| epoch 121 step    52600 |    280 batches | lr 0.000218 | ms/batch 320.72 | loss  4.18 | ppl    65.376
| epoch 121 step    52650 |    330 batches | lr 0.000218 | ms/batch 320.56 | loss  4.08 | ppl    59.326
| epoch 121 step    52700 |    380 batches | lr 0.000218 | ms/batch 320.50 | loss  4.11 | ppl    60.650
| epoch 121 step    52750 |    430 batches | lr 0.000217 | ms/batch 321.01 | loss  4.14 | ppl    63.081
| epoch 122 step    52800 |     44 batches | lr 0.000217 | ms/batch 314.22 | loss  4.11 | ppl    61.190
----------------------------------------------------------------------------------------------------
| Eval 132 at step    52800 | time: 133.00s | valid loss  4.24 | valid ppl    69.741
----------------------------------------------------------------------------------------------------
| epoch 122 step    52850 |     94 batches | lr 0.000217 | ms/batch 422.02 | loss  4.06 | ppl    58.244
| epoch 122 step    52900 |    144 batches | lr 0.000217 | ms/batch 320.42 | loss  4.10 | ppl    60.626
| epoch 122 step    52950 |    194 batches | lr 0.000217 | ms/batch 320.14 | loss  4.14 | ppl    62.911
| epoch 122 step    53000 |    244 batches | lr 0.000216 | ms/batch 321.16 | loss  4.13 | ppl    62.463
| epoch 122 step    53050 |    294 batches | lr 0.000216 | ms/batch 321.23 | loss  4.18 | ppl    65.307
| epoch 122 step    53100 |    344 batches | lr 0.000216 | ms/batch 321.07 | loss  4.04 | ppl    56.929
| epoch 122 step    53150 |    394 batches | lr 0.000216 | ms/batch 320.07 | loss  4.13 | ppl    62.317
| epoch 123 step    53200 |      8 batches | lr 0.000216 | ms/batch 314.60 | loss  4.13 | ppl    62.192
----------------------------------------------------------------------------------------------------
| Eval 133 at step    53200 | time: 133.02s | valid loss  4.23 | valid ppl    69.004
----------------------------------------------------------------------------------------------------
| epoch 123 step    53250 |     58 batches | lr 0.000215 | ms/batch 457.05 | loss  4.07 | ppl    58.571
| epoch 123 step    53300 |    108 batches | lr 0.000215 | ms/batch 318.89 | loss  4.09 | ppl    59.974
| epoch 123 step    53350 |    158 batches | lr 0.000215 | ms/batch 319.11 | loss  4.13 | ppl    61.899
| epoch 123 step    53400 |    208 batches | lr 0.000215 | ms/batch 319.27 | loss  4.14 | ppl    63.078
| epoch 123 step    53450 |    258 batches | lr 0.000214 | ms/batch 319.91 | loss  4.15 | ppl    63.147
| epoch 123 step    53500 |    308 batches | lr 0.000214 | ms/batch 319.00 | loss  4.11 | ppl    61.035
| epoch 123 step    53550 |    358 batches | lr 0.000214 | ms/batch 319.31 | loss  4.07 | ppl    58.374
| epoch 123 step    53600 |    408 batches | lr 0.000214 | ms/batch 319.84 | loss  4.11 | ppl    61.152
----------------------------------------------------------------------------------------------------
| Eval 134 at step    53600 | time: 132.77s | valid loss  4.24 | valid ppl    69.642
----------------------------------------------------------------------------------------------------
| epoch 124 step    53650 |     22 batches | lr 0.000214 | ms/batch 412.83 | loss  4.15 | ppl    63.682
| epoch 124 step    53700 |     72 batches | lr 0.000213 | ms/batch 319.49 | loss  4.06 | ppl    58.074
| epoch 124 step    53750 |    122 batches | lr 0.000213 | ms/batch 318.78 | loss  4.09 | ppl    60.021
| epoch 124 step    53800 |    172 batches | lr 0.000213 | ms/batch 320.01 | loss  4.11 | ppl    60.697
| epoch 124 step    53850 |    222 batches | lr 0.000213 | ms/batch 320.92 | loss  4.17 | ppl    64.622
| epoch 124 step    53900 |    272 batches | lr 0.000213 | ms/batch 320.73 | loss  4.14 | ppl    63.078
| epoch 124 step    53950 |    322 batches | lr 0.000212 | ms/batch 319.90 | loss  4.09 | ppl    59.721
| epoch 124 step    54000 |    372 batches | lr 0.000212 | ms/batch 320.00 | loss  4.08 | ppl    58.871
----------------------------------------------------------------------------------------------------
| Eval 135 at step    54000 | time: 132.64s | valid loss  4.24 | valid ppl    69.409
----------------------------------------------------------------------------------------------------
| epoch 124 step    54050 |    422 batches | lr 0.000212 | ms/batch 420.60 | loss  4.11 | ppl    61.090
| epoch 125 step    54100 |     36 batches | lr 0.000212 | ms/batch 313.84 | loss  4.14 | ppl    62.543
| epoch 125 step    54150 |     86 batches | lr 0.000211 | ms/batch 320.00 | loss  4.08 | ppl    59.076
| epoch 125 step    54200 |    136 batches | lr 0.000211 | ms/batch 319.87 | loss  4.12 | ppl    61.377
| epoch 125 step    54250 |    186 batches | lr 0.000211 | ms/batch 320.20 | loss  4.11 | ppl    60.802
| epoch 125 step    54300 |    236 batches | lr 0.000211 | ms/batch 320.82 | loss  4.13 | ppl    61.991
| epoch 125 step    54350 |    286 batches | lr 0.000211 | ms/batch 321.20 | loss  4.15 | ppl    63.628
| epoch 125 step    54400 |    336 batches | lr 0.00021 | ms/batch 320.78 | loss  4.04 | ppl    56.656
----------------------------------------------------------------------------------------------------
| Eval 136 at step    54400 | time: 132.90s | valid loss  4.25 | valid ppl    69.892
----------------------------------------------------------------------------------------------------
| epoch 125 step    54450 |    386 batches | lr 0.00021 | ms/batch 420.62 | loss  4.11 | ppl    60.890
| epoch 125 step    54500 |    436 batches | lr 0.00021 | ms/batch 316.85 | loss  4.13 | ppl    62.166
| epoch 126 step    54550 |     50 batches | lr 0.00021 | ms/batch 317.21 | loss  4.10 | ppl    60.411
| epoch 126 step    54600 |    100 batches | lr 0.00021 | ms/batch 320.40 | loss  4.08 | ppl    58.954
| epoch 126 step    54650 |    150 batches | lr 0.000209 | ms/batch 320.70 | loss  4.09 | ppl    59.726
| epoch 126 step    54700 |    200 batches | lr 0.000209 | ms/batch 320.69 | loss  4.10 | ppl    60.506
| epoch 126 step    54750 |    250 batches | lr 0.000209 | ms/batch 321.72 | loss  4.15 | ppl    63.370
| epoch 126 step    54800 |    300 batches | lr 0.000209 | ms/batch 319.34 | loss  4.16 | ppl    64.021
----------------------------------------------------------------------------------------------------
| Eval 137 at step    54800 | time: 132.86s | valid loss  4.25 | valid ppl    69.807
----------------------------------------------------------------------------------------------------
| epoch 126 step    54850 |    350 batches | lr 0.000208 | ms/batch 420.15 | loss  4.02 | ppl    55.703
| epoch 126 step    54900 |    400 batches | lr 0.000208 | ms/batch 320.52 | loss  4.11 | ppl    60.752
| epoch 127 step    54950 |     14 batches | lr 0.000208 | ms/batch 313.19 | loss  4.17 | ppl    64.599
| epoch 127 step    55000 |     64 batches | lr 0.000208 | ms/batch 318.89 | loss  4.05 | ppl    57.613
| epoch 127 step    55050 |    114 batches | lr 0.000208 | ms/batch 319.82 | loss  4.07 | ppl    58.754
| epoch 127 step    55100 |    164 batches | lr 0.000207 | ms/batch 319.96 | loss  4.09 | ppl    59.586
| epoch 127 step    55150 |    214 batches | lr 0.000207 | ms/batch 318.67 | loss  4.10 | ppl    60.517
| epoch 127 step    55200 |    264 batches | lr 0.000207 | ms/batch 319.20 | loss  4.12 | ppl    61.504
----------------------------------------------------------------------------------------------------
| Eval 138 at step    55200 | time: 132.52s | valid loss  4.25 | valid ppl    70.268
----------------------------------------------------------------------------------------------------
| epoch 127 step    55250 |    314 batches | lr 0.000207 | ms/batch 419.11 | loss  4.11 | ppl    61.135
| epoch 127 step    55300 |    364 batches | lr 0.000206 | ms/batch 320.05 | loss  4.04 | ppl    56.873
| epoch 127 step    55350 |    414 batches | lr 0.000206 | ms/batch 319.10 | loss  4.11 | ppl    60.966
| epoch 128 step    55400 |     28 batches | lr 0.000206 | ms/batch 312.40 | loss  4.14 | ppl    62.935
| epoch 128 step    55450 |     78 batches | lr 0.000206 | ms/batch 320.48 | loss  4.07 | ppl    58.358
| epoch 128 step    55500 |    128 batches | lr 0.000206 | ms/batch 320.09 | loss  4.09 | ppl    59.635
| epoch 128 step    55550 |    178 batches | lr 0.000205 | ms/batch 320.75 | loss  4.09 | ppl    60.023
| epoch 128 step    55600 |    228 batches | lr 0.000205 | ms/batch 320.60 | loss  4.12 | ppl    61.540
----------------------------------------------------------------------------------------------------
| Eval 139 at step    55600 | time: 132.65s | valid loss  4.25 | valid ppl    70.082
----------------------------------------------------------------------------------------------------
| epoch 128 step    55650 |    278 batches | lr 0.000205 | ms/batch 420.42 | loss  4.14 | ppl    62.857
| epoch 128 step    55700 |    328 batches | lr 0.000205 | ms/batch 321.02 | loss  4.05 | ppl    57.638
| epoch 128 step    55750 |    378 batches | lr 0.000205 | ms/batch 318.61 | loss  4.08 | ppl    59.127
| epoch 128 step    55800 |    428 batches | lr 0.000204 | ms/batch 318.28 | loss  4.10 | ppl    60.406
| epoch 129 step    55850 |     42 batches | lr 0.000204 | ms/batch 315.27 | loss  4.09 | ppl    59.983
| epoch 129 step    55900 |     92 batches | lr 0.000204 | ms/batch 320.43 | loss  4.03 | ppl    56.479
| epoch 129 step    55950 |    142 batches | lr 0.000204 | ms/batch 320.08 | loss  4.08 | ppl    59.192
| epoch 129 step    56000 |    192 batches | lr 0.000203 | ms/batch 320.42 | loss  4.10 | ppl    60.044
----------------------------------------------------------------------------------------------------
| Eval 140 at step    56000 | time: 132.71s | valid loss  4.25 | valid ppl    69.946
----------------------------------------------------------------------------------------------------
| epoch 129 step    56050 |    242 batches | lr 0.000203 | ms/batch 420.67 | loss  4.12 | ppl    61.745
| epoch 129 step    56100 |    292 batches | lr 0.000203 | ms/batch 321.10 | loss  4.14 | ppl    62.688
| epoch 129 step    56150 |    342 batches | lr 0.000203 | ms/batch 320.47 | loss  4.00 | ppl    54.326
| epoch 129 step    56200 |    392 batches | lr 0.000203 | ms/batch 319.58 | loss  4.10 | ppl    60.131
| epoch 130 step    56250 |      6 batches | lr 0.000202 | ms/batch 313.64 | loss  4.15 | ppl    63.568
| epoch 130 step    56300 |     56 batches | lr 0.000202 | ms/batch 320.53 | loss  4.05 | ppl    57.427
| epoch 130 step    56350 |    106 batches | lr 0.000202 | ms/batch 320.00 | loss  4.04 | ppl    56.937
| epoch 130 step    56400 |    156 batches | lr 0.000202 | ms/batch 319.45 | loss  4.09 | ppl    59.901
----------------------------------------------------------------------------------------------------
| Eval 141 at step    56400 | time: 132.76s | valid loss  4.23 | valid ppl    68.564
----------------------------------------------------------------------------------------------------
| epoch 130 step    56450 |    206 batches | lr 0.000202 | ms/batch 454.09 | loss  4.10 | ppl    60.350
| epoch 130 step    56500 |    256 batches | lr 0.000201 | ms/batch 319.44 | loss  4.13 | ppl    62.025
| epoch 130 step    56550 |    306 batches | lr 0.000201 | ms/batch 320.40 | loss  4.13 | ppl    61.938
| epoch 130 step    56600 |    356 batches | lr 0.000201 | ms/batch 319.29 | loss  4.04 | ppl    56.598
| epoch 130 step    56650 |    406 batches | lr 0.000201 | ms/batch 320.10 | loss  4.05 | ppl    57.476
| epoch 131 step    56700 |     20 batches | lr 0.0002 | ms/batch 312.50 | loss  4.13 | ppl    61.916
| epoch 131 step    56750 |     70 batches | lr 0.0002 | ms/batch 317.99 | loss  4.01 | ppl    55.367
| epoch 131 step    56800 |    120 batches | lr 0.0002 | ms/batch 319.50 | loss  4.08 | ppl    59.088
----------------------------------------------------------------------------------------------------
| Eval 142 at step    56800 | time: 132.44s | valid loss  4.24 | valid ppl    69.170
----------------------------------------------------------------------------------------------------
| epoch 131 step    56850 |    170 batches | lr 0.0002 | ms/batch 419.37 | loss  4.10 | ppl    60.145
| epoch 131 step    56900 |    220 batches | lr 0.0002 | ms/batch 319.77 | loss  4.11 | ppl    60.947
| epoch 131 step    56950 |    270 batches | lr 0.000199 | ms/batch 319.69 | loss  4.09 | ppl    59.845
| epoch 131 step    57000 |    320 batches | lr 0.000199 | ms/batch 320.25 | loss  4.08 | ppl    59.206
| epoch 131 step    57050 |    370 batches | lr 0.000199 | ms/batch 320.49 | loss  4.05 | ppl    57.651
| epoch 131 step    57100 |    420 batches | lr 0.000199 | ms/batch 320.53 | loss  4.09 | ppl    59.950
| epoch 132 step    57150 |     34 batches | lr 0.000198 | ms/batch 312.81 | loss  4.10 | ppl    60.270
| epoch 132 step    57200 |     84 batches | lr 0.000198 | ms/batch 319.27 | loss  4.02 | ppl    55.651
----------------------------------------------------------------------------------------------------
| Eval 143 at step    57200 | time: 132.63s | valid loss  4.25 | valid ppl    69.896
----------------------------------------------------------------------------------------------------
| epoch 132 step    57250 |    134 batches | lr 0.000198 | ms/batch 419.41 | loss  4.09 | ppl    59.990
| epoch 132 step    57300 |    184 batches | lr 0.000198 | ms/batch 320.47 | loss  4.09 | ppl    59.465
| epoch 132 step    57350 |    234 batches | lr 0.000198 | ms/batch 320.64 | loss  4.11 | ppl    61.016
| epoch 132 step    57400 |    284 batches | lr 0.000197 | ms/batch 320.22 | loss  4.12 | ppl    61.355
| epoch 132 step    57450 |    334 batches | lr 0.000197 | ms/batch 320.38 | loss  4.03 | ppl    56.349
| epoch 132 step    57500 |    384 batches | lr 0.000197 | ms/batch 321.94 | loss  4.08 | ppl    59.090
| epoch 132 step    57550 |    434 batches | lr 0.000197 | ms/batch 321.19 | loss  4.12 | ppl    61.803
| epoch 133 step    57600 |     48 batches | lr 0.000196 | ms/batch 315.48 | loss  4.06 | ppl    58.110
----------------------------------------------------------------------------------------------------
| Eval 144 at step    57600 | time: 133.03s | valid loss  4.25 | valid ppl    70.045
----------------------------------------------------------------------------------------------------
| epoch 133 step    57650 |     98 batches | lr 0.000196 | ms/batch 422.19 | loss  4.04 | ppl    57.078
| epoch 133 step    57700 |    148 batches | lr 0.000196 | ms/batch 320.84 | loss  4.07 | ppl    58.440
| epoch 133 step    57750 |    198 batches | lr 0.000196 | ms/batch 321.59 | loss  4.10 | ppl    60.352
| epoch 133 step    57800 |    248 batches | lr 0.000196 | ms/batch 321.00 | loss  4.10 | ppl    60.567
| epoch 133 step    57850 |    298 batches | lr 0.000195 | ms/batch 319.92 | loss  4.14 | ppl    62.835
| epoch 133 step    57900 |    348 batches | lr 0.000195 | ms/batch 320.35 | loss  3.99 | ppl    54.203
| epoch 133 step    57950 |    398 batches | lr 0.000195 | ms/batch 322.20 | loss  4.09 | ppl    59.691
| epoch 134 step    58000 |     12 batches | lr 0.000195 | ms/batch 315.09 | loss  4.12 | ppl    61.312
----------------------------------------------------------------------------------------------------
| Eval 145 at step    58000 | time: 133.14s | valid loss  4.23 | valid ppl    68.760
----------------------------------------------------------------------------------------------------
| epoch 134 step    58050 |     62 batches | lr 0.000195 | ms/batch 420.30 | loss  4.03 | ppl    56.314
| epoch 134 step    58100 |    112 batches | lr 0.000194 | ms/batch 320.18 | loss  4.07 | ppl    58.685
| epoch 134 step    58150 |    162 batches | lr 0.000194 | ms/batch 319.50 | loss  4.08 | ppl    59.028
| epoch 134 step    58200 |    212 batches | lr 0.000194 | ms/batch 319.10 | loss  4.06 | ppl    57.909
| epoch 134 step    58250 |    262 batches | lr 0.000194 | ms/batch 318.80 | loss  4.09 | ppl    59.971
| epoch 134 step    58300 |    312 batches | lr 0.000193 | ms/batch 321.28 | loss  4.08 | ppl    59.370
| epoch 134 step    58350 |    362 batches | lr 0.000193 | ms/batch 319.75 | loss  4.03 | ppl    56.419
| epoch 134 step    58400 |    412 batches | lr 0.000193 | ms/batch 320.18 | loss  4.06 | ppl    57.721
----------------------------------------------------------------------------------------------------
| Eval 146 at step    58400 | time: 132.96s | valid loss  4.24 | valid ppl    69.351
----------------------------------------------------------------------------------------------------
| epoch 135 step    58450 |     26 batches | lr 0.000193 | ms/batch 414.03 | loss  4.12 | ppl    61.591
| epoch 135 step    58500 |     76 batches | lr 0.000193 | ms/batch 321.04 | loss  4.01 | ppl    54.990
| epoch 135 step    58550 |    126 batches | lr 0.000192 | ms/batch 320.82 | loss  4.06 | ppl    58.113
| epoch 135 step    58600 |    176 batches | lr 0.000192 | ms/batch 320.66 | loss  4.09 | ppl    59.644
| epoch 135 step    58650 |    226 batches | lr 0.000192 | ms/batch 322.18 | loss  4.09 | ppl    59.600
| epoch 135 step    58700 |    276 batches | lr 0.000192 | ms/batch 321.90 | loss  4.09 | ppl    59.899
| epoch 135 step    58750 |    326 batches | lr 0.000191 | ms/batch 321.88 | loss  4.03 | ppl    56.042
| epoch 135 step    58800 |    376 batches | lr 0.000191 | ms/batch 321.46 | loss  4.05 | ppl    57.221
----------------------------------------------------------------------------------------------------
| Eval 147 at step    58800 | time: 133.23s | valid loss  4.23 | valid ppl    68.867
----------------------------------------------------------------------------------------------------
| epoch 135 step    58850 |    426 batches | lr 0.000191 | ms/batch 422.12 | loss  4.09 | ppl    59.658
| epoch 136 step    58900 |     40 batches | lr 0.000191 | ms/batch 315.15 | loss  4.06 | ppl    58.047
| epoch 136 step    58950 |     90 batches | lr 0.000191 | ms/batch 322.22 | loss  4.02 | ppl    55.976
| epoch 136 step    59000 |    140 batches | lr 0.00019 | ms/batch 321.27 | loss  4.05 | ppl    57.627
| epoch 136 step    59050 |    190 batches | lr 0.00019 | ms/batch 320.54 | loss  4.06 | ppl    57.784
| epoch 136 step    59100 |    240 batches | lr 0.00019 | ms/batch 321.04 | loss  4.09 | ppl    59.882
| epoch 136 step    59150 |    290 batches | lr 0.00019 | ms/batch 335.72 | loss  4.15 | ppl    63.145
| epoch 136 step    59200 |    340 batches | lr 0.000189 | ms/batch 335.78 | loss  3.98 | ppl    53.354
----------------------------------------------------------------------------------------------------
| Eval 148 at step    59200 | time: 134.90s | valid loss  4.25 | valid ppl    70.232
----------------------------------------------------------------------------------------------------
| epoch 136 step    59250 |    390 batches | lr 0.000189 | ms/batch 424.87 | loss  4.08 | ppl    59.282
| epoch 137 step    59300 |      4 batches | lr 0.000189 | ms/batch 314.29 | loss  4.10 | ppl    60.110
| epoch 137 step    59350 |     54 batches | lr 0.000189 | ms/batch 319.61 | loss  4.02 | ppl    55.917
| epoch 137 step    59400 |    104 batches | lr 0.000189 | ms/batch 321.39 | loss  4.03 | ppl    56.166
| epoch 137 step    59450 |    154 batches | lr 0.000188 | ms/batch 319.79 | loss  4.09 | ppl    59.463
| epoch 137 step    59500 |    204 batches | lr 0.000188 | ms/batch 321.72 | loss  4.06 | ppl    58.233
| epoch 137 step    59550 |    254 batches | lr 0.000188 | ms/batch 321.72 | loss  4.09 | ppl    59.932
| epoch 137 step    59600 |    304 batches | lr 0.000188 | ms/batch 320.08 | loss  4.12 | ppl    61.822
----------------------------------------------------------------------------------------------------
| Eval 149 at step    59600 | time: 132.96s | valid loss  4.23 | valid ppl    68.718
----------------------------------------------------------------------------------------------------
| epoch 137 step    59650 |    354 batches | lr 0.000188 | ms/batch 421.89 | loss  3.97 | ppl    52.786
| epoch 137 step    59700 |    404 batches | lr 0.000187 | ms/batch 320.20 | loss  4.06 | ppl    58.151
| epoch 138 step    59750 |     18 batches | lr 0.000187 | ms/batch 314.93 | loss  4.11 | ppl    61.042
| epoch 138 step    59800 |     68 batches | lr 0.000187 | ms/batch 320.24 | loss  4.01 | ppl    55.227
| epoch 138 step    59850 |    118 batches | lr 0.000187 | ms/batch 320.10 | loss  4.05 | ppl    57.310
| epoch 138 step    59900 |    168 batches | lr 0.000186 | ms/batch 333.43 | loss  4.06 | ppl    58.033
| epoch 138 step    59950 |    218 batches | lr 0.000186 | ms/batch 336.06 | loss  4.09 | ppl    59.742
| epoch 138 step    60000 |    268 batches | lr 0.000186 | ms/batch 334.84 | loss  4.08 | ppl    58.892
----------------------------------------------------------------------------------------------------
| Eval 150 at step    60000 | time: 135.08s | valid loss  4.24 | valid ppl    69.188
----------------------------------------------------------------------------------------------------
| epoch 138 step    60050 |    318 batches | lr 0.000186 | ms/batch 420.71 | loss  4.05 | ppl    57.629
| epoch 138 step    60100 |    368 batches | lr 0.000186 | ms/batch 320.04 | loss  4.00 | ppl    54.649
| epoch 138 step    60150 |    418 batches | lr 0.000185 | ms/batch 320.46 | loss  4.03 | ppl    56.193
| epoch 139 step    60200 |     32 batches | lr 0.000185 | ms/batch 314.45 | loss  4.10 | ppl    60.272
| epoch 139 step    60250 |     82 batches | lr 0.000185 | ms/batch 319.77 | loss  4.00 | ppl    54.799
| epoch 139 step    60300 |    132 batches | lr 0.000185 | ms/batch 320.94 | loss  4.06 | ppl    57.836
| epoch 139 step    60350 |    182 batches | lr 0.000184 | ms/batch 320.47 | loss  4.04 | ppl    56.620
| epoch 139 step    60400 |    232 batches | lr 0.000184 | ms/batch 319.59 | loss  4.08 | ppl    59.023
----------------------------------------------------------------------------------------------------
| Eval 151 at step    60400 | time: 132.83s | valid loss  4.24 | valid ppl    69.162
----------------------------------------------------------------------------------------------------
| epoch 139 step    60450 |    282 batches | lr 0.000184 | ms/batch 421.21 | loss  4.09 | ppl    60.021
| epoch 139 step    60500 |    332 batches | lr 0.000184 | ms/batch 320.79 | loss  4.02 | ppl    55.769
| epoch 139 step    60550 |    382 batches | lr 0.000184 | ms/batch 321.97 | loss  4.04 | ppl    57.011
| epoch 139 step    60600 |    432 batches | lr 0.000183 | ms/batch 321.53 | loss  4.07 | ppl    58.740
| epoch 140 step    60650 |     46 batches | lr 0.000183 | ms/batch 314.52 | loss  4.03 | ppl    56.523
| epoch 140 step    60700 |     96 batches | lr 0.000183 | ms/batch 321.03 | loss  3.99 | ppl    53.836
| epoch 140 step    60750 |    146 batches | lr 0.000183 | ms/batch 320.40 | loss  4.06 | ppl    57.934
| epoch 140 step    60800 |    196 batches | lr 0.000182 | ms/batch 318.59 | loss  4.07 | ppl    58.827
----------------------------------------------------------------------------------------------------
| Eval 152 at step    60800 | time: 132.99s | valid loss  4.23 | valid ppl    68.803
----------------------------------------------------------------------------------------------------
| epoch 140 step    60850 |    246 batches | lr 0.000182 | ms/batch 421.62 | loss  4.08 | ppl    58.954
| epoch 140 step    60900 |    296 batches | lr 0.000182 | ms/batch 321.19 | loss  4.10 | ppl    60.136
| epoch 140 step    60950 |    346 batches | lr 0.000182 | ms/batch 321.50 | loss  3.96 | ppl    52.566
| epoch 140 step    61000 |    396 batches | lr 0.000182 | ms/batch 320.77 | loss  4.05 | ppl    57.507
| epoch 141 step    61050 |     10 batches | lr 0.000181 | ms/batch 312.89 | loss  4.08 | ppl    59.319
| epoch 141 step    61100 |     60 batches | lr 0.000181 | ms/batch 319.48 | loss  4.01 | ppl    55.205
| epoch 141 step    61150 |    110 batches | lr 0.000181 | ms/batch 319.29 | loss  4.04 | ppl    56.875
| epoch 141 step    61200 |    160 batches | lr 0.000181 | ms/batch 319.02 | loss  4.05 | ppl    57.492
----------------------------------------------------------------------------------------------------
| Eval 153 at step    61200 | time: 132.80s | valid loss  4.22 | valid ppl    68.057
----------------------------------------------------------------------------------------------------
| epoch 141 step    61250 |    210 batches | lr 0.00018 | ms/batch 452.60 | loss  4.06 | ppl    58.094
| epoch 141 step    61300 |    260 batches | lr 0.00018 | ms/batch 320.95 | loss  4.08 | ppl    58.931
| epoch 141 step    61350 |    310 batches | lr 0.00018 | ms/batch 319.59 | loss  4.08 | ppl    59.266
| epoch 141 step    61400 |    360 batches | lr 0.00018 | ms/batch 319.37 | loss  4.01 | ppl    55.209
| epoch 141 step    61450 |    410 batches | lr 0.00018 | ms/batch 318.90 | loss  4.05 | ppl    57.147
| epoch 142 step    61500 |     24 batches | lr 0.000179 | ms/batch 312.78 | loss  4.07 | ppl    58.301
| epoch 142 step    61550 |     74 batches | lr 0.000179 | ms/batch 318.93 | loss  4.01 | ppl    55.358
| epoch 142 step    61600 |    124 batches | lr 0.000179 | ms/batch 319.06 | loss  4.03 | ppl    56.169
----------------------------------------------------------------------------------------------------
| Eval 154 at step    61600 | time: 132.41s | valid loss  4.23 | valid ppl    68.502
----------------------------------------------------------------------------------------------------
| epoch 142 step    61650 |    174 batches | lr 0.000179 | ms/batch 419.87 | loss  4.06 | ppl    57.900
| epoch 142 step    61700 |    224 batches | lr 0.000179 | ms/batch 319.77 | loss  4.08 | ppl    59.120
| epoch 142 step    61750 |    274 batches | lr 0.000178 | ms/batch 319.51 | loss  4.10 | ppl    60.534
| epoch 142 step    61800 |    324 batches | lr 0.000178 | ms/batch 319.19 | loss  4.00 | ppl    54.424
| epoch 142 step    61850 |    374 batches | lr 0.000178 | ms/batch 319.57 | loss  4.03 | ppl    56.109
| epoch 142 step    61900 |    424 batches | lr 0.000178 | ms/batch 318.78 | loss  4.06 | ppl    57.845
| epoch 143 step    61950 |     38 batches | lr 0.000177 | ms/batch 313.96 | loss  4.06 | ppl    57.721
| epoch 143 step    62000 |     88 batches | lr 0.000177 | ms/batch 319.62 | loss  4.00 | ppl    54.339
----------------------------------------------------------------------------------------------------
| Eval 155 at step    62000 | time: 132.52s | valid loss  4.24 | valid ppl    69.648
----------------------------------------------------------------------------------------------------
| epoch 143 step    62050 |    138 batches | lr 0.000177 | ms/batch 419.84 | loss  4.04 | ppl    56.966
| epoch 143 step    62100 |    188 batches | lr 0.000177 | ms/batch 320.16 | loss  4.04 | ppl    56.849
| epoch 143 step    62150 |    238 batches | lr 0.000177 | ms/batch 318.67 | loss  4.05 | ppl    57.604
| epoch 143 step    62200 |    288 batches | lr 0.000176 | ms/batch 319.32 | loss  4.08 | ppl    59.442
| epoch 143 step    62250 |    338 batches | lr 0.000176 | ms/batch 319.22 | loss  3.95 | ppl    52.098
| epoch 143 step    62300 |    388 batches | lr 0.000176 | ms/batch 318.34 | loss  4.03 | ppl    56.077
| epoch 144 step    62350 |      2 batches | lr 0.000176 | ms/batch 313.32 | loss  4.09 | ppl    59.719
| epoch 144 step    62400 |     52 batches | lr 0.000175 | ms/batch 318.89 | loss  3.99 | ppl    54.199
----------------------------------------------------------------------------------------------------
| Eval 156 at step    62400 | time: 132.37s | valid loss  4.24 | valid ppl    69.139
----------------------------------------------------------------------------------------------------
| epoch 144 step    62450 |    102 batches | lr 0.000175 | ms/batch 418.35 | loss  4.01 | ppl    55.026
| epoch 144 step    62500 |    152 batches | lr 0.000175 | ms/batch 318.79 | loss  4.05 | ppl    57.591
| epoch 144 step    62550 |    202 batches | lr 0.000175 | ms/batch 320.11 | loss  4.05 | ppl    57.555
| epoch 144 step    62600 |    252 batches | lr 0.000175 | ms/batch 322.52 | loss  4.08 | ppl    58.864
| epoch 144 step    62650 |    302 batches | lr 0.000174 | ms/batch 320.76 | loss  4.11 | ppl    60.818
| epoch 144 step    62700 |    352 batches | lr 0.000174 | ms/batch 319.02 | loss  3.96 | ppl    52.472
| epoch 144 step    62750 |    402 batches | lr 0.000174 | ms/batch 320.47 | loss  4.05 | ppl    57.409
| epoch 145 step    62800 |     16 batches | lr 0.000174 | ms/batch 314.33 | loss  4.06 | ppl    58.165
----------------------------------------------------------------------------------------------------
| Eval 157 at step    62800 | time: 132.72s | valid loss  4.23 | valid ppl    68.651
----------------------------------------------------------------------------------------------------
| epoch 145 step    62850 |     66 batches | lr 0.000173 | ms/batch 419.27 | loss  3.98 | ppl    53.408
| epoch 145 step    62900 |    116 batches | lr 0.000173 | ms/batch 319.54 | loss  4.03 | ppl    56.035
| epoch 145 step    62950 |    166 batches | lr 0.000173 | ms/batch 320.46 | loss  4.03 | ppl    56.492
| epoch 145 step    63000 |    216 batches | lr 0.000173 | ms/batch 318.82 | loss  4.02 | ppl    55.799
| epoch 145 step    63050 |    266 batches | lr 0.000173 | ms/batch 320.37 | loss  4.07 | ppl    58.498
| epoch 145 step    63100 |    316 batches | lr 0.000172 | ms/batch 319.51 | loss  4.03 | ppl    56.239
| epoch 145 step    63150 |    366 batches | lr 0.000172 | ms/batch 319.93 | loss  3.99 | ppl    53.871
| epoch 145 step    63200 |    416 batches | lr 0.000172 | ms/batch 319.45 | loss  4.03 | ppl    56.068
----------------------------------------------------------------------------------------------------
| Eval 158 at step    63200 | time: 132.87s | valid loss  4.23 | valid ppl    68.563
----------------------------------------------------------------------------------------------------
| epoch 146 step    63250 |     30 batches | lr 0.000172 | ms/batch 412.21 | loss  4.07 | ppl    58.726
| epoch 146 step    63300 |     80 batches | lr 0.000171 | ms/batch 319.06 | loss  3.99 | ppl    53.899
| epoch 146 step    63350 |    130 batches | lr 0.000171 | ms/batch 318.88 | loss  4.05 | ppl    57.537
| epoch 146 step    63400 |    180 batches | lr 0.000171 | ms/batch 319.22 | loss  4.02 | ppl    55.514
| epoch 146 step    63450 |    230 batches | lr 0.000171 | ms/batch 319.28 | loss  4.05 | ppl    57.270
| epoch 146 step    63500 |    280 batches | lr 0.000171 | ms/batch 319.33 | loss  4.06 | ppl    57.764
| epoch 146 step    63550 |    330 batches | lr 0.00017 | ms/batch 319.84 | loss  3.99 | ppl    54.053
| epoch 146 step    63600 |    380 batches | lr 0.00017 | ms/batch 320.41 | loss  4.04 | ppl    56.687
----------------------------------------------------------------------------------------------------
| Eval 159 at step    63600 | time: 132.45s | valid loss  4.23 | valid ppl    68.635
----------------------------------------------------------------------------------------------------
| epoch 146 step    63650 |    430 batches | lr 0.00017 | ms/batch 420.82 | loss  4.05 | ppl    57.355
| epoch 147 step    63700 |     44 batches | lr 0.00017 | ms/batch 314.30 | loss  4.02 | ppl    55.673
| epoch 147 step    63750 |     94 batches | lr 0.00017 | ms/batch 320.34 | loss  3.95 | ppl    51.903
| epoch 147 step    63800 |    144 batches | lr 0.000169 | ms/batch 322.46 | loss  4.03 | ppl    56.479
| epoch 147 step    63850 |    194 batches | lr 0.000169 | ms/batch 321.40 | loss  4.04 | ppl    56.851
| epoch 147 step    63900 |    244 batches | lr 0.000169 | ms/batch 319.94 | loss  4.06 | ppl    57.687
| epoch 147 step    63950 |    294 batches | lr 0.000169 | ms/batch 321.23 | loss  4.08 | ppl    59.217
| epoch 147 step    64000 |    344 batches | lr 0.000168 | ms/batch 320.48 | loss  3.94 | ppl    51.224
----------------------------------------------------------------------------------------------------
| Eval 160 at step    64000 | time: 133.04s | valid loss  4.24 | valid ppl    69.129
----------------------------------------------------------------------------------------------------
| epoch 147 step    64050 |    394 batches | lr 0.000168 | ms/batch 420.98 | loss  4.03 | ppl    56.048
| epoch 148 step    64100 |      8 batches | lr 0.000168 | ms/batch 314.76 | loss  4.07 | ppl    58.447
| epoch 148 step    64150 |     58 batches | lr 0.000168 | ms/batch 321.10 | loss  3.98 | ppl    53.714
| epoch 148 step    64200 |    108 batches | lr 0.000168 | ms/batch 320.89 | loss  3.99 | ppl    53.985
| epoch 148 step    64250 |    158 batches | lr 0.000167 | ms/batch 320.21 | loss  4.03 | ppl    56.055
| epoch 148 step    64300 |    208 batches | lr 0.000167 | ms/batch 319.39 | loss  4.03 | ppl    56.461
| epoch 148 step    64350 |    258 batches | lr 0.000167 | ms/batch 319.35 | loss  4.08 | ppl    59.012
| epoch 148 step    64400 |    308 batches | lr 0.000167 | ms/batch 319.58 | loss  4.04 | ppl    57.062
----------------------------------------------------------------------------------------------------
| Eval 161 at step    64400 | time: 132.81s | valid loss  4.23 | valid ppl    68.517
----------------------------------------------------------------------------------------------------
| epoch 148 step    64450 |    358 batches | lr 0.000166 | ms/batch 419.02 | loss  3.96 | ppl    52.400
| epoch 148 step    64500 |    408 batches | lr 0.000166 | ms/batch 319.07 | loss  4.02 | ppl    55.516
| epoch 149 step    64550 |     22 batches | lr 0.000166 | ms/batch 313.89 | loss  4.06 | ppl    58.115
| epoch 149 step    64600 |     72 batches | lr 0.000166 | ms/batch 320.20 | loss  3.96 | ppl    52.642
| epoch 149 step    64650 |    122 batches | lr 0.000166 | ms/batch 320.07 | loss  4.01 | ppl    55.005
| epoch 149 step    64700 |    172 batches | lr 0.000165 | ms/batch 319.95 | loss  4.02 | ppl    55.954
| epoch 149 step    64750 |    222 batches | lr 0.000165 | ms/batch 319.65 | loss  4.04 | ppl    56.769
| epoch 149 step    64800 |    272 batches | lr 0.000165 | ms/batch 319.87 | loss  4.04 | ppl    56.815
----------------------------------------------------------------------------------------------------
| Eval 162 at step    64800 | time: 132.59s | valid loss  4.23 | valid ppl    68.384
----------------------------------------------------------------------------------------------------
| epoch 149 step    64850 |    322 batches | lr 0.000165 | ms/batch 419.54 | loss  3.99 | ppl    54.131
| epoch 149 step    64900 |    372 batches | lr 0.000164 | ms/batch 319.68 | loss  4.00 | ppl    54.814
| epoch 149 step    64950 |    422 batches | lr 0.000164 | ms/batch 320.42 | loss  4.02 | ppl    55.897
| epoch 150 step    65000 |     36 batches | lr 0.000164 | ms/batch 313.01 | loss  4.02 | ppl    55.545
| epoch 150 step    65050 |     86 batches | lr 0.000164 | ms/batch 319.09 | loss  3.98 | ppl    53.682
| epoch 150 step    65100 |    136 batches | lr 0.000164 | ms/batch 319.52 | loss  4.02 | ppl    55.582
| epoch 150 step    65150 |    186 batches | lr 0.000163 | ms/batch 320.39 | loss  4.04 | ppl    56.696
| epoch 150 step    65200 |    236 batches | lr 0.000163 | ms/batch 320.28 | loss  4.04 | ppl    56.824
----------------------------------------------------------------------------------------------------
| Eval 163 at step    65200 | time: 132.59s | valid loss  4.23 | valid ppl    68.506
----------------------------------------------------------------------------------------------------
| epoch 150 step    65250 |    286 batches | lr 0.000163 | ms/batch 419.59 | loss  4.06 | ppl    57.791
| epoch 150 step    65300 |    336 batches | lr 0.000163 | ms/batch 318.96 | loss  3.95 | ppl    51.840
| epoch 150 step    65350 |    386 batches | lr 0.000162 | ms/batch 319.85 | loss  4.03 | ppl    56.147
| epoch 150 step    65400 |    436 batches | lr 0.000162 | ms/batch 315.17 | loss  4.03 | ppl    56.232
| epoch 151 step    65450 |     50 batches | lr 0.000162 | ms/batch 318.29 | loss  4.00 | ppl    54.343
| epoch 151 step    65500 |    100 batches | lr 0.000162 | ms/batch 320.57 | loss  3.98 | ppl    53.373
| epoch 151 step    65550 |    150 batches | lr 0.000162 | ms/batch 319.65 | loss  4.02 | ppl    55.642
| epoch 151 step    65600 |    200 batches | lr 0.000161 | ms/batch 320.55 | loss  4.03 | ppl    56.057
----------------------------------------------------------------------------------------------------
| Eval 164 at step    65600 | time: 132.67s | valid loss  4.22 | valid ppl    68.233
----------------------------------------------------------------------------------------------------
| epoch 151 step    65650 |    250 batches | lr 0.000161 | ms/batch 420.41 | loss  4.03 | ppl    56.514
| epoch 151 step    65700 |    300 batches | lr 0.000161 | ms/batch 320.38 | loss  4.05 | ppl    57.597
| epoch 151 step    65750 |    350 batches | lr 0.000161 | ms/batch 321.19 | loss  3.92 | ppl    50.397
| epoch 151 step    65800 |    400 batches | lr 0.000161 | ms/batch 321.18 | loss  4.01 | ppl    55.171
| epoch 152 step    65850 |     14 batches | lr 0.00016 | ms/batch 314.31 | loss  4.08 | ppl    59.012
| epoch 152 step    65900 |     64 batches | lr 0.00016 | ms/batch 321.78 | loss  3.94 | ppl    51.409
| epoch 152 step    65950 |    114 batches | lr 0.00016 | ms/batch 319.90 | loss  3.99 | ppl    54.258
| epoch 152 step    66000 |    164 batches | lr 0.00016 | ms/batch 321.27 | loss  4.03 | ppl    56.395
----------------------------------------------------------------------------------------------------
| Eval 165 at step    66000 | time: 133.02s | valid loss  4.21 | valid ppl    67.452
----------------------------------------------------------------------------------------------------
| epoch 152 step    66050 |    214 batches | lr 0.000159 | ms/batch 453.16 | loss  4.02 | ppl    55.703
| epoch 152 step    66100 |    264 batches | lr 0.000159 | ms/batch 319.24 | loss  4.04 | ppl    56.585
| epoch 152 step    66150 |    314 batches | lr 0.000159 | ms/batch 319.10 | loss  4.01 | ppl    54.902
| epoch 152 step    66200 |    364 batches | lr 0.000159 | ms/batch 317.87 | loss  3.96 | ppl    52.628
| epoch 152 step    66250 |    414 batches | lr 0.000159 | ms/batch 319.69 | loss  4.00 | ppl    54.438
| epoch 153 step    66300 |     28 batches | lr 0.000158 | ms/batch 314.73 | loss  4.04 | ppl    56.877
| epoch 153 step    66350 |     78 batches | lr 0.000158 | ms/batch 320.24 | loss  3.97 | ppl    52.840
| epoch 153 step    66400 |    128 batches | lr 0.000158 | ms/batch 322.31 | loss  4.00 | ppl    54.771
----------------------------------------------------------------------------------------------------
| Eval 166 at step    66400 | time: 132.63s | valid loss  4.22 | valid ppl    68.009
----------------------------------------------------------------------------------------------------
| epoch 153 step    66450 |    178 batches | lr 0.000158 | ms/batch 421.25 | loss  3.99 | ppl    54.032
| epoch 153 step    66500 |    228 batches | lr 0.000157 | ms/batch 320.70 | loss  4.03 | ppl    56.202
| epoch 153 step    66550 |    278 batches | lr 0.000157 | ms/batch 320.08 | loss  4.07 | ppl    58.706
| epoch 153 step    66600 |    328 batches | lr 0.000157 | ms/batch 320.41 | loss  3.96 | ppl    52.685
| epoch 153 step    66650 |    378 batches | lr 0.000157 | ms/batch 320.17 | loss  3.98 | ppl    53.390
| epoch 153 step    66700 |    428 batches | lr 0.000157 | ms/batch 320.04 | loss  4.00 | ppl    54.543
| epoch 154 step    66750 |     42 batches | lr 0.000156 | ms/batch 314.29 | loss  4.00 | ppl    54.532
| epoch 154 step    66800 |     92 batches | lr 0.000156 | ms/batch 320.47 | loss  3.94 | ppl    51.378
----------------------------------------------------------------------------------------------------
| Eval 167 at step    66800 | time: 132.84s | valid loss  4.24 | valid ppl    69.208
----------------------------------------------------------------------------------------------------
| epoch 154 step    66850 |    142 batches | lr 0.000156 | ms/batch 419.23 | loss  4.01 | ppl    55.029
| epoch 154 step    66900 |    192 batches | lr 0.000156 | ms/batch 319.88 | loss  4.00 | ppl    54.835
| epoch 154 step    66950 |    242 batches | lr 0.000155 | ms/batch 319.26 | loss  4.04 | ppl    56.837
| epoch 154 step    67000 |    292 batches | lr 0.000155 | ms/batch 320.36 | loss  4.05 | ppl    57.409
| epoch 154 step    67050 |    342 batches | lr 0.000155 | ms/batch 319.70 | loss  3.91 | ppl    49.747
| epoch 154 step    67100 |    392 batches | lr 0.000155 | ms/batch 319.81 | loss  4.01 | ppl    55.110
| epoch 155 step    67150 |      6 batches | lr 0.000155 | ms/batch 313.65 | loss  4.06 | ppl    57.927
| epoch 155 step    67200 |     56 batches | lr 0.000154 | ms/batch 319.29 | loss  3.95 | ppl    51.699
----------------------------------------------------------------------------------------------------
| Eval 168 at step    67200 | time: 132.55s | valid loss  4.22 | valid ppl    68.318
----------------------------------------------------------------------------------------------------
| epoch 155 step    67250 |    106 batches | lr 0.000154 | ms/batch 422.04 | loss  3.97 | ppl    52.997
| epoch 155 step    67300 |    156 batches | lr 0.000154 | ms/batch 321.53 | loss  4.00 | ppl    54.718
| epoch 155 step    67350 |    206 batches | lr 0.000154 | ms/batch 321.40 | loss  4.03 | ppl    56.300
| epoch 155 step    67400 |    256 batches | lr 0.000154 | ms/batch 319.92 | loss  4.03 | ppl    56.193
| epoch 155 step    67450 |    306 batches | lr 0.000153 | ms/batch 319.01 | loss  4.01 | ppl    55.192
| epoch 155 step    67500 |    356 batches | lr 0.000153 | ms/batch 320.46 | loss  3.94 | ppl    51.254
| epoch 155 step    67550 |    406 batches | lr 0.000153 | ms/batch 320.16 | loss  4.00 | ppl    54.392
| epoch 156 step    67600 |     20 batches | lr 0.000153 | ms/batch 314.73 | loss  4.04 | ppl    57.089
----------------------------------------------------------------------------------------------------
| Eval 169 at step    67600 | time: 132.97s | valid loss  4.23 | valid ppl    68.470
----------------------------------------------------------------------------------------------------
| epoch 156 step    67650 |     70 batches | lr 0.000152 | ms/batch 419.71 | loss  3.94 | ppl    51.334
| epoch 156 step    67700 |    120 batches | lr 0.000152 | ms/batch 319.32 | loss  4.00 | ppl    54.341
| epoch 156 step    67750 |    170 batches | lr 0.000152 | ms/batch 319.77 | loss  4.00 | ppl    54.829
| epoch 156 step    67800 |    220 batches | lr 0.000152 | ms/batch 321.08 | loss  4.01 | ppl    55.054
| epoch 156 step    67850 |    270 batches | lr 0.000152 | ms/batch 323.88 | loss  4.03 | ppl    56.272
| epoch 156 step    67900 |    320 batches | lr 0.000151 | ms/batch 320.82 | loss  3.98 | ppl    53.411
| epoch 156 step    67950 |    370 batches | lr 0.000151 | ms/batch 320.25 | loss  3.98 | ppl    53.724
| epoch 156 step    68000 |    420 batches | lr 0.000151 | ms/batch 320.52 | loss  3.98 | ppl    53.465
----------------------------------------------------------------------------------------------------
| Eval 170 at step    68000 | time: 133.29s | valid loss  4.22 | valid ppl    68.163
----------------------------------------------------------------------------------------------------
| epoch 157 step    68050 |     34 batches | lr 0.000151 | ms/batch 414.30 | loss  4.02 | ppl    55.690
| epoch 157 step    68100 |     84 batches | lr 0.00015 | ms/batch 320.18 | loss  3.93 | ppl    50.885
| epoch 157 step    68150 |    134 batches | lr 0.00015 | ms/batch 318.97 | loss  4.01 | ppl    55.046
| epoch 157 step    68200 |    184 batches | lr 0.00015 | ms/batch 320.50 | loss  3.99 | ppl    54.034
| epoch 157 step    68250 |    234 batches | lr 0.00015 | ms/batch 321.40 | loss  4.02 | ppl    55.978
| epoch 157 step    68300 |    284 batches | lr 0.00015 | ms/batch 320.85 | loss  4.03 | ppl    56.468
| epoch 157 step    68350 |    334 batches | lr 0.000149 | ms/batch 319.74 | loss  3.96 | ppl    52.375
| epoch 157 step    68400 |    384 batches | lr 0.000149 | ms/batch 320.96 | loss  3.99 | ppl    53.909
----------------------------------------------------------------------------------------------------
| Eval 171 at step    68400 | time: 132.88s | valid loss  4.23 | valid ppl    68.455
----------------------------------------------------------------------------------------------------
| epoch 157 step    68450 |    434 batches | lr 0.000149 | ms/batch 421.63 | loss  4.05 | ppl    57.353
| epoch 158 step    68500 |     48 batches | lr 0.000149 | ms/batch 314.02 | loss  3.99 | ppl    54.040
| epoch 158 step    68550 |     98 batches | lr 0.000148 | ms/batch 320.93 | loss  3.95 | ppl    52.061
| epoch 158 step    68600 |    148 batches | lr 0.000148 | ms/batch 321.23 | loss  3.99 | ppl    53.899
| epoch 158 step    68650 |    198 batches | lr 0.000148 | ms/batch 319.70 | loss  4.01 | ppl    55.121
| epoch 158 step    68700 |    248 batches | lr 0.000148 | ms/batch 320.74 | loss  4.03 | ppl    56.461
| epoch 158 step    68750 |    298 batches | lr 0.000148 | ms/batch 319.94 | loss  4.06 | ppl    57.945
| epoch 158 step    68800 |    348 batches | lr 0.000147 | ms/batch 319.95 | loss  3.89 | ppl    48.835
----------------------------------------------------------------------------------------------------
| Eval 172 at step    68800 | time: 132.90s | valid loss  4.22 | valid ppl    68.358
----------------------------------------------------------------------------------------------------
| epoch 158 step    68850 |    398 batches | lr 0.000147 | ms/batch 420.82 | loss  3.98 | ppl    53.594
| epoch 159 step    68900 |     12 batches | lr 0.000147 | ms/batch 312.54 | loss  4.04 | ppl    57.051
| epoch 159 step    68950 |     62 batches | lr 0.000147 | ms/batch 320.50 | loss  3.93 | ppl    51.156
| epoch 159 step    69000 |    112 batches | lr 0.000147 | ms/batch 320.65 | loss  3.98 | ppl    53.269
| epoch 159 step    69050 |    162 batches | lr 0.000146 | ms/batch 320.33 | loss  3.99 | ppl    53.939
| epoch 159 step    69100 |    212 batches | lr 0.000146 | ms/batch 320.07 | loss  3.99 | ppl    54.019
| epoch 159 step    69150 |    262 batches | lr 0.000146 | ms/batch 320.38 | loss  4.02 | ppl    55.873
| epoch 159 step    69200 |    312 batches | lr 0.000146 | ms/batch 319.63 | loss  4.00 | ppl    54.624
----------------------------------------------------------------------------------------------------
| Eval 173 at step    69200 | time: 132.74s | valid loss  4.22 | valid ppl    67.830
----------------------------------------------------------------------------------------------------
| epoch 159 step    69250 |    362 batches | lr 0.000145 | ms/batch 420.15 | loss  3.93 | ppl    50.975
| epoch 159 step    69300 |    412 batches | lr 0.000145 | ms/batch 319.00 | loss  3.97 | ppl    52.896
| epoch 160 step    69350 |     26 batches | lr 0.000145 | ms/batch 313.88 | loss  4.01 | ppl    55.391
| epoch 160 step    69400 |     76 batches | lr 0.000145 | ms/batch 320.96 | loss  3.94 | ppl    51.409
| epoch 160 step    69450 |    126 batches | lr 0.000145 | ms/batch 319.72 | loss  3.98 | ppl    53.695
| epoch 160 step    69500 |    176 batches | lr 0.000144 | ms/batch 320.13 | loss  3.98 | ppl    53.555
| epoch 160 step    69550 |    226 batches | lr 0.000144 | ms/batch 319.19 | loss  4.01 | ppl    55.007
| epoch 160 step    69600 |    276 batches | lr 0.000144 | ms/batch 318.54 | loss  4.03 | ppl    56.455
----------------------------------------------------------------------------------------------------
| Eval 174 at step    69600 | time: 132.56s | valid loss  4.21 | valid ppl    67.414
----------------------------------------------------------------------------------------------------
| epoch 160 step    69650 |    326 batches | lr 0.000144 | ms/batch 452.55 | loss  3.94 | ppl    51.202
| epoch 160 step    69700 |    376 batches | lr 0.000144 | ms/batch 318.85 | loss  3.96 | ppl    52.478
| epoch 160 step    69750 |    426 batches | lr 0.000143 | ms/batch 319.35 | loss  3.98 | ppl    53.530
| epoch 161 step    69800 |     40 batches | lr 0.000143 | ms/batch 312.19 | loss  3.95 | ppl    51.818
| epoch 161 step    69850 |     90 batches | lr 0.000143 | ms/batch 317.80 | loss  3.92 | ppl    50.529
| epoch 161 step    69900 |    140 batches | lr 0.000143 | ms/batch 319.22 | loss  3.98 | ppl    53.758
| epoch 161 step    69950 |    190 batches | lr 0.000142 | ms/batch 320.12 | loss  3.99 | ppl    53.838
| epoch 161 step    70000 |    240 batches | lr 0.000142 | ms/batch 321.17 | loss  3.99 | ppl    54.211
----------------------------------------------------------------------------------------------------
| Eval 175 at step    70000 | time: 132.48s | valid loss  4.22 | valid ppl    68.296
----------------------------------------------------------------------------------------------------
| epoch 161 step    70050 |    290 batches | lr 0.000142 | ms/batch 422.05 | loss  4.03 | ppl    56.514
| epoch 161 step    70100 |    340 batches | lr 0.000142 | ms/batch 320.87 | loss  3.91 | ppl    49.913
| epoch 161 step    70150 |    390 batches | lr 0.000142 | ms/batch 320.24 | loss  3.98 | ppl    53.720
| epoch 162 step    70200 |      4 batches | lr 0.000141 | ms/batch 314.61 | loss  4.00 | ppl    54.534
| epoch 162 step    70250 |     54 batches | lr 0.000141 | ms/batch 321.11 | loss  3.95 | ppl    51.927
| epoch 162 step    70300 |    104 batches | lr 0.000141 | ms/batch 320.73 | loss  3.94 | ppl    51.443
| epoch 162 step    70350 |    154 batches | lr 0.000141 | ms/batch 321.08 | loss  3.97 | ppl    53.076
| epoch 162 step    70400 |    204 batches | lr 0.00014 | ms/batch 319.27 | loss  3.97 | ppl    53.011
----------------------------------------------------------------------------------------------------
| Eval 176 at step    70400 | time: 132.95s | valid loss  4.21 | valid ppl    67.550
----------------------------------------------------------------------------------------------------
| epoch 162 step    70450 |    254 batches | lr 0.00014 | ms/batch 419.01 | loss  3.99 | ppl    54.188
| epoch 162 step    70500 |    304 batches | lr 0.00014 | ms/batch 318.75 | loss  4.02 | ppl    55.843
| epoch 162 step    70550 |    354 batches | lr 0.00014 | ms/batch 318.74 | loss  3.90 | ppl    49.335
| epoch 162 step    70600 |    404 batches | lr 0.00014 | ms/batch 318.43 | loss  3.97 | ppl    52.807
| epoch 163 step    70650 |     18 batches | lr 0.000139 | ms/batch 314.72 | loss  4.00 | ppl    54.735
| epoch 163 step    70700 |     68 batches | lr 0.000139 | ms/batch 320.81 | loss  3.91 | ppl    50.128
| epoch 163 step    70750 |    118 batches | lr 0.000139 | ms/batch 320.49 | loss  3.96 | ppl    52.369
| epoch 163 step    70800 |    168 batches | lr 0.000139 | ms/batch 320.08 | loss  3.97 | ppl    52.937
----------------------------------------------------------------------------------------------------
| Eval 177 at step    70800 | time: 132.57s | valid loss  4.21 | valid ppl    67.352
----------------------------------------------------------------------------------------------------
| epoch 163 step    70850 |    218 batches | lr 0.000139 | ms/batch 454.90 | loss  3.98 | ppl    53.689
| epoch 163 step    70900 |    268 batches | lr 0.000138 | ms/batch 321.54 | loss  4.00 | ppl    54.332
| epoch 163 step    70950 |    318 batches | lr 0.000138 | ms/batch 319.45 | loss  3.97 | ppl    52.956
| epoch 163 step    71000 |    368 batches | lr 0.000138 | ms/batch 319.97 | loss  3.93 | ppl    50.959
| epoch 163 step    71050 |    418 batches | lr 0.000138 | ms/batch 319.50 | loss  3.97 | ppl    53.034
| epoch 164 step    71100 |     32 batches | lr 0.000137 | ms/batch 313.98 | loss  3.99 | ppl    53.958
| epoch 164 step    71150 |     82 batches | lr 0.000137 | ms/batch 319.43 | loss  3.92 | ppl    50.377
| epoch 164 step    71200 |    132 batches | lr 0.000137 | ms/batch 319.94 | loss  3.98 | ppl    53.329
----------------------------------------------------------------------------------------------------
| Eval 178 at step    71200 | time: 132.74s | valid loss  4.21 | valid ppl    67.287
----------------------------------------------------------------------------------------------------
| epoch 164 step    71250 |    182 batches | lr 0.000137 | ms/batch 466.85 | loss  3.97 | ppl    53.057
| epoch 164 step    71300 |    232 batches | lr 0.000137 | ms/batch 319.68 | loss  4.00 | ppl    54.564
| epoch 164 step    71350 |    282 batches | lr 0.000136 | ms/batch 319.12 | loss  4.01 | ppl    55.237
| epoch 164 step    71400 |    332 batches | lr 0.000136 | ms/batch 320.76 | loss  3.92 | ppl    50.546
| epoch 164 step    71450 |    382 batches | lr 0.000136 | ms/batch 320.47 | loss  3.95 | ppl    52.000
| epoch 164 step    71500 |    432 batches | lr 0.000136 | ms/batch 320.16 | loss  4.00 | ppl    54.400
| epoch 165 step    71550 |     46 batches | lr 0.000136 | ms/batch 314.96 | loss  3.96 | ppl    52.380
| epoch 165 step    71600 |     96 batches | lr 0.000135 | ms/batch 321.72 | loss  3.92 | ppl    50.239
----------------------------------------------------------------------------------------------------
| Eval 179 at step    71600 | time: 132.86s | valid loss  4.22 | valid ppl    67.973
----------------------------------------------------------------------------------------------------
| epoch 165 step    71650 |    146 batches | lr 0.000135 | ms/batch 422.06 | loss  3.96 | ppl    52.601
| epoch 165 step    71700 |    196 batches | lr 0.000135 | ms/batch 321.48 | loss  3.97 | ppl    53.132
| epoch 165 step    71750 |    246 batches | lr 0.000135 | ms/batch 323.13 | loss  4.00 | ppl    54.622
| epoch 165 step    71800 |    296 batches | lr 0.000134 | ms/batch 319.38 | loss  4.03 | ppl    56.393
| epoch 165 step    71850 |    346 batches | lr 0.000134 | ms/batch 319.74 | loss  3.88 | ppl    48.599
| epoch 165 step    71900 |    396 batches | lr 0.000134 | ms/batch 319.15 | loss  3.96 | ppl    52.585
| epoch 166 step    71950 |     10 batches | lr 0.000134 | ms/batch 314.11 | loss  4.01 | ppl    54.932
| epoch 166 step    72000 |     60 batches | lr 0.000134 | ms/batch 321.37 | loss  3.93 | ppl    50.740
----------------------------------------------------------------------------------------------------
| Eval 180 at step    72000 | time: 133.57s | valid loss  4.21 | valid ppl    67.614
----------------------------------------------------------------------------------------------------
| epoch 166 step    72050 |    110 batches | lr 0.000133 | ms/batch 420.51 | loss  3.94 | ppl    51.403
| epoch 166 step    72100 |    160 batches | lr 0.000133 | ms/batch 319.95 | loss  3.95 | ppl    52.006
| epoch 166 step    72150 |    210 batches | lr 0.000133 | ms/batch 319.85 | loss  3.98 | ppl    53.718
| epoch 166 step    72200 |    260 batches | lr 0.000133 | ms/batch 320.14 | loss  3.99 | ppl    54.228
| epoch 166 step    72250 |    310 batches | lr 0.000133 | ms/batch 321.00 | loss  3.99 | ppl    53.888
| epoch 166 step    72300 |    360 batches | lr 0.000132 | ms/batch 319.81 | loss  3.92 | ppl    50.406
| epoch 166 step    72350 |    410 batches | lr 0.000132 | ms/batch 318.77 | loss  3.97 | ppl    53.026
| epoch 167 step    72400 |     24 batches | lr 0.000132 | ms/batch 313.72 | loss  3.99 | ppl    54.271
----------------------------------------------------------------------------------------------------
| Eval 181 at step    72400 | time: 132.69s | valid loss  4.22 | valid ppl    67.710
----------------------------------------------------------------------------------------------------
| epoch 167 step    72450 |     74 batches | lr 0.000132 | ms/batch 422.10 | loss  3.92 | ppl    50.529
| epoch 167 step    72500 |    124 batches | lr 0.000131 | ms/batch 321.71 | loss  3.95 | ppl    52.033
| epoch 167 step    72550 |    174 batches | lr 0.000131 | ms/batch 320.93 | loss  3.96 | ppl    52.663
| epoch 167 step    72600 |    224 batches | lr 0.000131 | ms/batch 321.89 | loss  4.01 | ppl    55.084
| epoch 167 step    72650 |    274 batches | lr 0.000131 | ms/batch 322.05 | loss  3.98 | ppl    53.647
| epoch 167 step    72700 |    324 batches | lr 0.000131 | ms/batch 318.89 | loss  3.93 | ppl    51.078
| epoch 167 step    72750 |    374 batches | lr 0.00013 | ms/batch 321.96 | loss  3.93 | ppl    50.746
| epoch 167 step    72800 |    424 batches | lr 0.00013 | ms/batch 321.01 | loss  3.99 | ppl    54.188
----------------------------------------------------------------------------------------------------
| Eval 182 at step    72800 | time: 133.53s | valid loss  4.21 | valid ppl    67.532
----------------------------------------------------------------------------------------------------
| epoch 168 step    72850 |     38 batches | lr 0.00013 | ms/batch 415.09 | loss  3.97 | ppl    52.958
| epoch 168 step    72900 |     88 batches | lr 0.00013 | ms/batch 320.99 | loss  3.89 | ppl    48.749
| epoch 168 step    72950 |    138 batches | lr 0.00013 | ms/batch 319.28 | loss  3.96 | ppl    52.554
| epoch 168 step    73000 |    188 batches | lr 0.000129 | ms/batch 319.86 | loss  3.96 | ppl    52.587
| epoch 168 step    73050 |    238 batches | lr 0.000129 | ms/batch 320.62 | loss  3.98 | ppl    53.469
| epoch 168 step    73100 |    288 batches | lr 0.000129 | ms/batch 320.04 | loss  4.02 | ppl    55.906
| epoch 168 step    73150 |    338 batches | lr 0.000129 | ms/batch 320.00 | loss  3.89 | ppl    49.051
| epoch 168 step    73200 |    388 batches | lr 0.000129 | ms/batch 320.24 | loss  3.96 | ppl    52.696
----------------------------------------------------------------------------------------------------
| Eval 183 at step    73200 | time: 132.81s | valid loss  4.22 | valid ppl    67.876
----------------------------------------------------------------------------------------------------
| epoch 169 step    73250 |      2 batches | lr 0.000128 | ms/batch 415.60 | loss  4.00 | ppl    54.485
| epoch 169 step    73300 |     52 batches | lr 0.000128 | ms/batch 319.63 | loss  3.94 | ppl    51.300
| epoch 169 step    73350 |    102 batches | lr 0.000128 | ms/batch 320.61 | loss  3.93 | ppl    50.969
| epoch 169 step    73400 |    152 batches | lr 0.000128 | ms/batch 319.65 | loss  3.96 | ppl    52.271
| epoch 169 step    73450 |    202 batches | lr 0.000127 | ms/batch 320.70 | loss  3.96 | ppl    52.564
| epoch 169 step    73500 |    252 batches | lr 0.000127 | ms/batch 319.65 | loss  3.99 | ppl    54.011
| epoch 169 step    73550 |    302 batches | lr 0.000127 | ms/batch 319.44 | loss  4.00 | ppl    54.812
| epoch 169 step    73600 |    352 batches | lr 0.000127 | ms/batch 320.39 | loss  3.89 | ppl    49.005
----------------------------------------------------------------------------------------------------
| Eval 184 at step    73600 | time: 132.78s | valid loss  4.22 | valid ppl    68.004
----------------------------------------------------------------------------------------------------
| epoch 169 step    73650 |    402 batches | lr 0.000127 | ms/batch 420.20 | loss  3.95 | ppl    51.792
| epoch 170 step    73700 |     16 batches | lr 0.000126 | ms/batch 313.41 | loss  3.98 | ppl    53.400
| epoch 170 step    73750 |     66 batches | lr 0.000126 | ms/batch 321.05 | loss  3.90 | ppl    49.435
| epoch 170 step    73800 |    116 batches | lr 0.000126 | ms/batch 319.99 | loss  3.93 | ppl    51.050
| epoch 170 step    73850 |    166 batches | lr 0.000126 | ms/batch 320.12 | loss  3.96 | ppl    52.345
| epoch 170 step    73900 |    216 batches | lr 0.000126 | ms/batch 319.53 | loss  3.96 | ppl    52.521
| epoch 170 step    73950 |    266 batches | lr 0.000125 | ms/batch 319.75 | loss  3.99 | ppl    53.804
| epoch 170 step    74000 |    316 batches | lr 0.000125 | ms/batch 324.14 | loss  3.96 | ppl    52.578
----------------------------------------------------------------------------------------------------
| Eval 185 at step    74000 | time: 132.89s | valid loss  4.22 | valid ppl    67.960
----------------------------------------------------------------------------------------------------
| epoch 170 step    74050 |    366 batches | lr 0.000125 | ms/batch 419.39 | loss  3.90 | ppl    49.391
| epoch 170 step    74100 |    416 batches | lr 0.000125 | ms/batch 327.52 | loss  3.95 | ppl    51.713
| epoch 171 step    74150 |     30 batches | lr 0.000124 | ms/batch 329.75 | loss  4.01 | ppl    55.039
| epoch 171 step    74200 |     80 batches | lr 0.000124 | ms/batch 336.55 | loss  3.89 | ppl    48.812
| epoch 171 step    74250 |    130 batches | lr 0.000124 | ms/batch 336.05 | loss  3.96 | ppl    52.457
| epoch 171 step    74300 |    180 batches | lr 0.000124 | ms/batch 335.30 | loss  3.97 | ppl    52.805
| epoch 171 step    74350 |    230 batches | lr 0.000124 | ms/batch 336.13 | loss  3.99 | ppl    53.829
| epoch 171 step    74400 |    280 batches | lr 0.000123 | ms/batch 335.82 | loss  3.97 | ppl    52.945
----------------------------------------------------------------------------------------------------
| Eval 186 at step    74400 | time: 137.86s | valid loss  4.21 | valid ppl    67.296
----------------------------------------------------------------------------------------------------
| epoch 171 step    74450 |    330 batches | lr 0.000123 | ms/batch 420.69 | loss  3.91 | ppl    50.084
| epoch 171 step    74500 |    380 batches | lr 0.000123 | ms/batch 319.50 | loss  3.92 | ppl    50.568
| epoch 171 step    74550 |    430 batches | lr 0.000123 | ms/batch 319.31 | loss  3.97 | ppl    53.059
| epoch 172 step    74600 |     44 batches | lr 0.000123 | ms/batch 313.83 | loss  3.95 | ppl    51.848
| epoch 172 step    74650 |     94 batches | lr 0.000122 | ms/batch 320.04 | loss  3.89 | ppl    48.804
| epoch 172 step    74700 |    144 batches | lr 0.000122 | ms/batch 318.33 | loss  3.96 | ppl    52.648
| epoch 172 step    74750 |    194 batches | lr 0.000122 | ms/batch 319.66 | loss  3.98 | ppl    53.609
| epoch 172 step    74800 |    244 batches | lr 0.000122 | ms/batch 318.24 | loss  3.99 | ppl    54.300
----------------------------------------------------------------------------------------------------
| Eval 187 at step    74800 | time: 132.46s | valid loss  4.21 | valid ppl    67.614
----------------------------------------------------------------------------------------------------
| epoch 172 step    74850 |    294 batches | lr 0.000122 | ms/batch 419.23 | loss  4.02 | ppl    55.497
| epoch 172 step    74900 |    344 batches | lr 0.000121 | ms/batch 319.38 | loss  3.85 | ppl    47.032
| epoch 172 step    74950 |    394 batches | lr 0.000121 | ms/batch 319.36 | loss  3.95 | ppl    52.027
| epoch 173 step    75000 |      8 batches | lr 0.000121 | ms/batch 312.43 | loss  3.99 | ppl    54.264
| epoch 173 step    75050 |     58 batches | lr 0.000121 | ms/batch 320.42 | loss  3.92 | ppl    50.462
| epoch 173 step    75100 |    108 batches | lr 0.000121 | ms/batch 320.00 | loss  3.93 | ppl    51.134
| epoch 173 step    75150 |    158 batches | lr 0.00012 | ms/batch 320.98 | loss  3.95 | ppl    52.076
| epoch 173 step    75200 |    208 batches | lr 0.00012 | ms/batch 321.76 | loss  3.93 | ppl    51.028
----------------------------------------------------------------------------------------------------
| Eval 188 at step    75200 | time: 132.69s | valid loss  4.21 | valid ppl    67.368
----------------------------------------------------------------------------------------------------
| epoch 173 step    75250 |    258 batches | lr 0.00012 | ms/batch 420.40 | loss  3.99 | ppl    54.313
| epoch 173 step    75300 |    308 batches | lr 0.00012 | ms/batch 320.66 | loss  3.96 | ppl    52.335
| epoch 173 step    75350 |    358 batches | lr 0.000119 | ms/batch 320.39 | loss  3.88 | ppl    48.398
| epoch 173 step    75400 |    408 batches | lr 0.000119 | ms/batch 319.85 | loss  3.91 | ppl    50.006
| epoch 174 step    75450 |     22 batches | lr 0.000119 | ms/batch 314.30 | loss  3.97 | ppl    53.233
| epoch 174 step    75500 |     72 batches | lr 0.000119 | ms/batch 319.44 | loss  3.91 | ppl    49.977
| epoch 174 step    75550 |    122 batches | lr 0.000119 | ms/batch 319.81 | loss  3.93 | ppl    50.655
| epoch 174 step    75600 |    172 batches | lr 0.000118 | ms/batch 319.84 | loss  3.96 | ppl    52.335
----------------------------------------------------------------------------------------------------
| Eval 189 at step    75600 | time: 132.76s | valid loss  4.21 | valid ppl    67.042
----------------------------------------------------------------------------------------------------
| epoch 174 step    75650 |    222 batches | lr 0.000118 | ms/batch 470.75 | loss  3.96 | ppl    52.453
| epoch 174 step    75700 |    272 batches | lr 0.000118 | ms/batch 336.25 | loss  3.96 | ppl    52.449
| epoch 174 step    75750 |    322 batches | lr 0.000118 | ms/batch 336.00 | loss  3.93 | ppl    50.889
| epoch 174 step    75800 |    372 batches | lr 0.000118 | ms/batch 336.09 | loss  3.91 | ppl    49.909
| epoch 174 step    75850 |    422 batches | lr 0.000117 | ms/batch 335.76 | loss  3.93 | ppl    50.810
| epoch 175 step    75900 |     36 batches | lr 0.000117 | ms/batch 315.54 | loss  3.94 | ppl    51.608
| epoch 175 step    75950 |     86 batches | lr 0.000117 | ms/batch 320.17 | loss  3.90 | ppl    49.571
| epoch 175 step    76000 |    136 batches | lr 0.000117 | ms/batch 319.62 | loss  3.95 | ppl    52.057
----------------------------------------------------------------------------------------------------
| Eval 190 at step    76000 | time: 136.77s | valid loss  4.21 | valid ppl    67.261
----------------------------------------------------------------------------------------------------
| epoch 175 step    76050 |    186 batches | lr 0.000117 | ms/batch 419.92 | loss  3.95 | ppl    51.735
| epoch 175 step    76100 |    236 batches | lr 0.000116 | ms/batch 318.97 | loss  3.96 | ppl    52.427
| epoch 175 step    76150 |    286 batches | lr 0.000116 | ms/batch 319.64 | loss  3.98 | ppl    53.523
| epoch 175 step    76200 |    336 batches | lr 0.000116 | ms/batch 319.75 | loss  3.84 | ppl    46.444
| epoch 175 step    76250 |    386 batches | lr 0.000116 | ms/batch 319.73 | loss  3.93 | ppl    50.943
| epoch 175 step    76300 |    436 batches | lr 0.000116 | ms/batch 315.69 | loss  3.98 | ppl    53.352
| epoch 176 step    76350 |     50 batches | lr 0.000115 | ms/batch 316.76 | loss  3.94 | ppl    51.654
| epoch 176 step    76400 |    100 batches | lr 0.000115 | ms/batch 318.87 | loss  3.90 | ppl    49.271
----------------------------------------------------------------------------------------------------
| Eval 191 at step    76400 | time: 132.48s | valid loss  4.22 | valid ppl    67.819
----------------------------------------------------------------------------------------------------
| epoch 176 step    76450 |    150 batches | lr 0.000115 | ms/batch 420.17 | loss  3.94 | ppl    51.370
| epoch 176 step    76500 |    200 batches | lr 0.000115 | ms/batch 319.99 | loss  3.94 | ppl    51.376
| epoch 176 step    76550 |    250 batches | lr 0.000114 | ms/batch 319.79 | loss  3.93 | ppl    51.046
| epoch 176 step    76600 |    300 batches | lr 0.000114 | ms/batch 326.32 | loss  3.98 | ppl    53.716
| epoch 176 step    76650 |    350 batches | lr 0.000114 | ms/batch 335.40 | loss  3.86 | ppl    47.267
| epoch 176 step    76700 |    400 batches | lr 0.000114 | ms/batch 319.61 | loss  3.93 | ppl    50.855
| epoch 177 step    76750 |     14 batches | lr 0.000114 | ms/batch 312.25 | loss  3.97 | ppl    53.132
| epoch 177 step    76800 |     64 batches | lr 0.000113 | ms/batch 319.30 | loss  3.87 | ppl    47.886
----------------------------------------------------------------------------------------------------
| Eval 192 at step    76800 | time: 133.63s | valid loss  4.21 | valid ppl    67.430
----------------------------------------------------------------------------------------------------
| epoch 177 step    76850 |    114 batches | lr 0.000113 | ms/batch 419.54 | loss  3.92 | ppl    50.226
| epoch 177 step    76900 |    164 batches | lr 0.000113 | ms/batch 320.29 | loss  3.94 | ppl    51.644
| epoch 177 step    76950 |    214 batches | lr 0.000113 | ms/batch 321.20 | loss  3.94 | ppl    51.495
| epoch 177 step    77000 |    264 batches | lr 0.000113 | ms/batch 320.51 | loss  3.95 | ppl    52.102
| epoch 177 step    77050 |    314 batches | lr 0.000112 | ms/batch 333.49 | loss  3.93 | ppl    50.905
| epoch 177 step    77100 |    364 batches | lr 0.000112 | ms/batch 327.80 | loss  3.87 | ppl    48.109
| epoch 177 step    77150 |    414 batches | lr 0.000112 | ms/batch 320.36 | loss  3.92 | ppl    50.412
| epoch 178 step    77200 |     28 batches | lr 0.000112 | ms/batch 314.72 | loss  3.96 | ppl    52.468
----------------------------------------------------------------------------------------------------
| Eval 193 at step    77200 | time: 133.92s | valid loss  4.20 | valid ppl    66.977
----------------------------------------------------------------------------------------------------
| epoch 178 step    77250 |     78 batches | lr 0.000112 | ms/batch 454.44 | loss  3.87 | ppl    47.744
| epoch 178 step    77300 |    128 batches | lr 0.000111 | ms/batch 321.60 | loss  3.92 | ppl    50.489
| epoch 178 step    77350 |    178 batches | lr 0.000111 | ms/batch 320.39 | loss  3.92 | ppl    50.556
| epoch 178 step    77400 |    228 batches | lr 0.000111 | ms/batch 319.78 | loss  3.92 | ppl    50.387
| epoch 178 step    77450 |    278 batches | lr 0.000111 | ms/batch 320.14 | loss  3.96 | ppl    52.241
| epoch 178 step    77500 |    328 batches | lr 0.000111 | ms/batch 319.14 | loss  3.90 | ppl    49.166
| epoch 178 step    77550 |    378 batches | lr 0.00011 | ms/batch 319.93 | loss  3.91 | ppl    50.145
| epoch 178 step    77600 |    428 batches | lr 0.00011 | ms/batch 320.35 | loss  3.96 | ppl    52.504
----------------------------------------------------------------------------------------------------
| Eval 194 at step    77600 | time: 133.14s | valid loss  4.21 | valid ppl    67.047
----------------------------------------------------------------------------------------------------
| epoch 179 step    77650 |     42 batches | lr 0.00011 | ms/batch 416.34 | loss  3.93 | ppl    50.895
| epoch 179 step    77700 |     92 batches | lr 0.00011 | ms/batch 320.86 | loss  3.87 | ppl    47.776
| epoch 179 step    77750 |    142 batches | lr 0.00011 | ms/batch 319.79 | loss  3.92 | ppl    50.306
| epoch 179 step    77800 |    192 batches | lr 0.000109 | ms/batch 319.49 | loss  3.93 | ppl    50.859
| epoch 179 step    77850 |    242 batches | lr 0.000109 | ms/batch 320.51 | loss  3.93 | ppl    50.685
| epoch 179 step    77900 |    292 batches | lr 0.000109 | ms/batch 319.46 | loss  3.99 | ppl    54.324
| epoch 179 step    77950 |    342 batches | lr 0.000109 | ms/batch 318.87 | loss  3.83 | ppl    46.225
| epoch 179 step    78000 |    392 batches | lr 0.000109 | ms/batch 319.62 | loss  3.92 | ppl    50.475
----------------------------------------------------------------------------------------------------
| Eval 195 at step    78000 | time: 132.88s | valid loss  4.21 | valid ppl    67.683
----------------------------------------------------------------------------------------------------
| epoch 180 step    78050 |      6 batches | lr 0.000108 | ms/batch 433.48 | loss  3.97 | ppl    53.171
| epoch 180 step    78100 |     56 batches | lr 0.000108 | ms/batch 335.74 | loss  3.87 | ppl    48.059
| epoch 180 step    78150 |    106 batches | lr 0.000108 | ms/batch 319.03 | loss  3.90 | ppl    49.210
| epoch 180 step    78200 |    156 batches | lr 0.000108 | ms/batch 318.57 | loss  3.92 | ppl    50.635
| epoch 180 step    78250 |    206 batches | lr 0.000108 | ms/batch 319.09 | loss  3.93 | ppl    50.909
| epoch 180 step    78300 |    256 batches | lr 0.000107 | ms/batch 326.40 | loss  3.95 | ppl    51.931
| epoch 180 step    78350 |    306 batches | lr 0.000107 | ms/batch 319.84 | loss  3.97 | ppl    52.889
| epoch 180 step    78400 |    356 batches | lr 0.000107 | ms/batch 320.33 | loss  3.87 | ppl    47.931
----------------------------------------------------------------------------------------------------
| Eval 196 at step    78400 | time: 134.47s | valid loss  4.21 | valid ppl    67.413
----------------------------------------------------------------------------------------------------
| epoch 180 step    78450 |    406 batches | lr 0.000107 | ms/batch 421.33 | loss  3.93 | ppl    50.885
| epoch 181 step    78500 |     20 batches | lr 0.000107 | ms/batch 314.71 | loss  3.97 | ppl    53.113
| epoch 181 step    78550 |     70 batches | lr 0.000106 | ms/batch 320.91 | loss  3.86 | ppl    47.349
| epoch 181 step    78600 |    120 batches | lr 0.000106 | ms/batch 320.24 | loss  3.92 | ppl    50.523
| epoch 181 step    78650 |    170 batches | lr 0.000106 | ms/batch 320.95 | loss  3.89 | ppl    49.026
| epoch 181 step    78700 |    220 batches | lr 0.000106 | ms/batch 319.93 | loss  3.93 | ppl    50.709
| epoch 181 step    78750 |    270 batches | lr 0.000105 | ms/batch 319.76 | loss  3.95 | ppl    52.159
| epoch 181 step    78800 |    320 batches | lr 0.000105 | ms/batch 320.36 | loss  3.90 | ppl    49.426
----------------------------------------------------------------------------------------------------
| Eval 197 at step    78800 | time: 132.91s | valid loss  4.20 | valid ppl    66.931
----------------------------------------------------------------------------------------------------
| epoch 181 step    78850 |    370 batches | lr 0.000105 | ms/batch 459.63 | loss  3.85 | ppl    47.203
| epoch 181 step    78900 |    420 batches | lr 0.000105 | ms/batch 335.18 | loss  3.92 | ppl    50.310
| epoch 182 step    78950 |     34 batches | lr 0.000105 | ms/batch 327.17 | loss  3.92 | ppl    50.336
| epoch 182 step    79000 |     84 batches | lr 0.000104 | ms/batch 320.27 | loss  3.88 | ppl    48.581
| epoch 182 step    79050 |    134 batches | lr 0.000104 | ms/batch 320.41 | loss  3.90 | ppl    49.414
| epoch 182 step    79100 |    184 batches | lr 0.000104 | ms/batch 321.29 | loss  3.91 | ppl    49.823
| epoch 182 step    79150 |    234 batches | lr 0.000104 | ms/batch 318.54 | loss  3.93 | ppl    51.158
| epoch 182 step    79200 |    284 batches | lr 0.000104 | ms/batch 319.67 | loss  3.94 | ppl    51.668
----------------------------------------------------------------------------------------------------
| Eval 198 at step    79200 | time: 134.45s | valid loss  4.20 | valid ppl    66.640
----------------------------------------------------------------------------------------------------
| epoch 182 step    79250 |    334 batches | lr 0.000103 | ms/batch 454.12 | loss  3.86 | ppl    47.375
| epoch 182 step    79300 |    384 batches | lr 0.000103 | ms/batch 319.49 | loss  3.91 | ppl    49.858
| epoch 182 step    79350 |    434 batches | lr 0.000103 | ms/batch 320.29 | loss  3.95 | ppl    51.881
| epoch 183 step    79400 |     48 batches | lr 0.000103 | ms/batch 313.51 | loss  3.87 | ppl    48.117
| epoch 183 step    79450 |     98 batches | lr 0.000103 | ms/batch 320.66 | loss  3.86 | ppl    47.649
| epoch 183 step    79500 |    148 batches | lr 0.000102 | ms/batch 318.96 | loss  3.92 | ppl    50.369
| epoch 183 step    79550 |    198 batches | lr 0.000102 | ms/batch 319.19 | loss  3.92 | ppl    50.294
| epoch 183 step    79600 |    248 batches | lr 0.000102 | ms/batch 319.14 | loss  3.94 | ppl    51.388
----------------------------------------------------------------------------------------------------
| Eval 199 at step    79600 | time: 132.53s | valid loss  4.21 | valid ppl    67.311
----------------------------------------------------------------------------------------------------
| epoch 183 step    79650 |    298 batches | lr 0.000102 | ms/batch 420.10 | loss  3.95 | ppl    51.796
| epoch 183 step    79700 |    348 batches | lr 0.000102 | ms/batch 318.91 | loss  3.81 | ppl    45.272
| epoch 183 step    79750 |    398 batches | lr 0.000101 | ms/batch 319.94 | loss  3.91 | ppl    50.086
| epoch 184 step    79800 |     12 batches | lr 0.000101 | ms/batch 313.42 | loss  3.96 | ppl    52.308
| epoch 184 step    79850 |     62 batches | lr 0.000101 | ms/batch 320.73 | loss  3.86 | ppl    47.519
| epoch 184 step    79900 |    112 batches | lr 0.000101 | ms/batch 319.70 | loss  3.89 | ppl    49.143
| epoch 184 step    79950 |    162 batches | lr 0.000101 | ms/batch 319.21 | loss  3.90 | ppl    49.397
| epoch 184 step    80000 |    212 batches | lr 0.0001 | ms/batch 319.95 | loss  3.92 | ppl    50.169
----------------------------------------------------------------------------------------------------
| Eval 200 at step    80000 | time: 132.60s | valid loss  4.20 | valid ppl    66.916
----------------------------------------------------------------------------------------------------
| epoch 184 step    80050 |    262 batches | lr 0.0001 | ms/batch 420.39 | loss  3.95 | ppl    52.124
| epoch 184 step    80100 |    312 batches | lr 0.0001 | ms/batch 320.02 | loss  3.93 | ppl    50.730
| epoch 184 step    80150 |    362 batches | lr 9.99e-05 | ms/batch 319.70 | loss  3.86 | ppl    47.386
| epoch 184 step    80200 |    412 batches | lr 9.97e-05 | ms/batch 320.22 | loss  3.92 | ppl    50.379
| epoch 185 step    80250 |     26 batches | lr 9.95e-05 | ms/batch 314.22 | loss  3.95 | ppl    52.096
| epoch 185 step    80300 |     76 batches | lr 9.93e-05 | ms/batch 321.18 | loss  3.85 | ppl    46.944
| epoch 185 step    80350 |    126 batches | lr 9.91e-05 | ms/batch 321.21 | loss  3.89 | ppl    48.827
| epoch 185 step    80400 |    176 batches | lr 9.89e-05 | ms/batch 320.30 | loss  3.91 | ppl    49.915
----------------------------------------------------------------------------------------------------
| Eval 201 at step    80400 | time: 132.89s | valid loss  4.21 | valid ppl    67.098
----------------------------------------------------------------------------------------------------
| epoch 185 step    80450 |    226 batches | lr 9.87e-05 | ms/batch 420.37 | loss  3.91 | ppl    49.928
| epoch 185 step    80500 |    276 batches | lr 9.85e-05 | ms/batch 319.17 | loss  3.96 | ppl    52.394
| epoch 185 step    80550 |    326 batches | lr 9.83e-05 | ms/batch 320.45 | loss  3.87 | ppl    48.170
| epoch 185 step    80600 |    376 batches | lr 9.81e-05 | ms/batch 320.76 | loss  3.90 | ppl    49.293
| epoch 185 step    80650 |    426 batches | lr 9.79e-05 | ms/batch 319.09 | loss  3.92 | ppl    50.460
| epoch 186 step    80700 |     40 batches | lr 9.77e-05 | ms/batch 312.70 | loss  3.92 | ppl    50.326
| epoch 186 step    80750 |     90 batches | lr 9.75e-05 | ms/batch 320.93 | loss  3.85 | ppl    47.190
| epoch 186 step    80800 |    140 batches | lr 9.73e-05 | ms/batch 321.02 | loss  3.91 | ppl    49.656
----------------------------------------------------------------------------------------------------
| Eval 202 at step    80800 | time: 132.74s | valid loss  4.20 | valid ppl    66.786
----------------------------------------------------------------------------------------------------
| epoch 186 step    80850 |    190 batches | lr 9.71e-05 | ms/batch 422.37 | loss  3.91 | ppl    49.850
| epoch 186 step    80900 |    240 batches | lr 9.69e-05 | ms/batch 320.48 | loss  3.93 | ppl    51.118
| epoch 186 step    80950 |    290 batches | lr 9.67e-05 | ms/batch 320.91 | loss  3.98 | ppl    53.461
| epoch 186 step    81000 |    340 batches | lr 9.65e-05 | ms/batch 320.97 | loss  3.83 | ppl    46.190
| epoch 186 step    81050 |    390 batches | lr 9.63e-05 | ms/batch 321.37 | loss  3.90 | ppl    49.565
| epoch 187 step    81100 |      4 batches | lr 9.61e-05 | ms/batch 314.69 | loss  3.94 | ppl    51.407
| epoch 187 step    81150 |     54 batches | lr 9.59e-05 | ms/batch 320.90 | loss  3.89 | ppl    48.947
| epoch 187 step    81200 |    104 batches | lr 9.57e-05 | ms/batch 320.97 | loss  3.87 | ppl    47.959
----------------------------------------------------------------------------------------------------
| Eval 203 at step    81200 | time: 133.13s | valid loss  4.22 | valid ppl    67.770
----------------------------------------------------------------------------------------------------
| epoch 187 step    81250 |    154 batches | lr 9.56e-05 | ms/batch 421.40 | loss  3.92 | ppl    50.491
| epoch 187 step    81300 |    204 batches | lr 9.54e-05 | ms/batch 320.02 | loss  3.92 | ppl    50.355
| epoch 187 step    81350 |    254 batches | lr 9.52e-05 | ms/batch 319.27 | loss  3.93 | ppl    50.758
| epoch 187 step    81400 |    304 batches | lr 9.5e-05 | ms/batch 319.35 | loss  3.94 | ppl    51.559
| epoch 187 step    81450 |    354 batches | lr 9.48e-05 | ms/batch 318.97 | loss  3.82 | ppl    45.512
| epoch 187 step    81500 |    404 batches | lr 9.46e-05 | ms/batch 319.71 | loss  3.90 | ppl    49.491
| epoch 188 step    81550 |     18 batches | lr 9.44e-05 | ms/batch 313.70 | loss  3.92 | ppl    50.509
| epoch 188 step    81600 |     68 batches | lr 9.42e-05 | ms/batch 320.59 | loss  3.85 | ppl    47.190
----------------------------------------------------------------------------------------------------
| Eval 204 at step    81600 | time: 132.66s | valid loss  4.20 | valid ppl    66.901
----------------------------------------------------------------------------------------------------
| epoch 188 step    81650 |    118 batches | lr 9.4e-05 | ms/batch 422.12 | loss  3.89 | ppl    48.934
| epoch 188 step    81700 |    168 batches | lr 9.38e-05 | ms/batch 319.95 | loss  3.91 | ppl    49.769
| epoch 188 step    81750 |    218 batches | lr 9.36e-05 | ms/batch 321.10 | loss  3.93 | ppl    50.730
| epoch 188 step    81800 |    268 batches | lr 9.34e-05 | ms/batch 321.27 | loss  3.93 | ppl    51.062
| epoch 188 step    81850 |    318 batches | lr 9.32e-05 | ms/batch 319.88 | loss  3.90 | ppl    49.289
| epoch 188 step    81900 |    368 batches | lr 9.3e-05 | ms/batch 320.67 | loss  3.86 | ppl    47.330
| epoch 188 step    81950 |    418 batches | lr 9.28e-05 | ms/batch 320.76 | loss  3.90 | ppl    49.260
| epoch 189 step    82000 |     32 batches | lr 9.26e-05 | ms/batch 313.27 | loss  3.92 | ppl    50.412
----------------------------------------------------------------------------------------------------
| Eval 205 at step    82000 | time: 132.94s | valid loss  4.21 | valid ppl    67.371
----------------------------------------------------------------------------------------------------
| epoch 189 step    82050 |     82 batches | lr 9.24e-05 | ms/batch 420.83 | loss  3.84 | ppl    46.678
| epoch 189 step    82100 |    132 batches | lr 9.22e-05 | ms/batch 320.50 | loss  3.91 | ppl    49.969
| epoch 189 step    82150 |    182 batches | lr 9.2e-05 | ms/batch 320.56 | loss  3.88 | ppl    48.646
| epoch 189 step    82200 |    232 batches | lr 9.19e-05 | ms/batch 320.11 | loss  3.94 | ppl    51.210
| epoch 189 step    82250 |    282 batches | lr 9.17e-05 | ms/batch 320.40 | loss  3.95 | ppl    51.755
| epoch 189 step    82300 |    332 batches | lr 9.15e-05 | ms/batch 321.74 | loss  3.85 | ppl    46.814
| epoch 189 step    82350 |    382 batches | lr 9.13e-05 | ms/batch 321.18 | loss  3.89 | ppl    49.028
| epoch 189 step    82400 |    432 batches | lr 9.11e-05 | ms/batch 320.23 | loss  3.92 | ppl    50.653
----------------------------------------------------------------------------------------------------
| Eval 206 at step    82400 | time: 133.28s | valid loss  4.20 | valid ppl    66.519
----------------------------------------------------------------------------------------------------
| epoch 190 step    82450 |     46 batches | lr 9.09e-05 | ms/batch 461.90 | loss  3.89 | ppl    48.678
| epoch 190 step    82500 |     96 batches | lr 9.07e-05 | ms/batch 319.98 | loss  3.85 | ppl    46.857
| epoch 190 step    82550 |    146 batches | lr 9.05e-05 | ms/batch 319.67 | loss  3.90 | ppl    49.189
| epoch 190 step    82600 |    196 batches | lr 9.03e-05 | ms/batch 319.97 | loss  3.89 | ppl    48.772
| epoch 190 step    82650 |    246 batches | lr 9.01e-05 | ms/batch 319.81 | loss  3.92 | ppl    50.229
| epoch 190 step    82700 |    296 batches | lr 8.99e-05 | ms/batch 320.17 | loss  3.95 | ppl    52.092
| epoch 190 step    82750 |    346 batches | lr 8.97e-05 | ms/batch 318.57 | loss  3.82 | ppl    45.624
| epoch 190 step    82800 |    396 batches | lr 8.95e-05 | ms/batch 319.14 | loss  3.90 | ppl    49.561
----------------------------------------------------------------------------------------------------
| Eval 207 at step    82800 | time: 133.20s | valid loss  4.21 | valid ppl    67.032
----------------------------------------------------------------------------------------------------
| epoch 191 step    82850 |     10 batches | lr 8.93e-05 | ms/batch 412.94 | loss  3.96 | ppl    52.236
| epoch 191 step    82900 |     60 batches | lr 8.92e-05 | ms/batch 319.21 | loss  3.84 | ppl    46.629
| epoch 191 step    82950 |    110 batches | lr 8.9e-05 | ms/batch 319.34 | loss  3.85 | ppl    47.184
| epoch 191 step    83000 |    160 batches | lr 8.88e-05 | ms/batch 318.94 | loss  3.89 | ppl    48.673
| epoch 191 step    83050 |    210 batches | lr 8.86e-05 | ms/batch 319.13 | loss  3.90 | ppl    49.588
| epoch 191 step    83100 |    260 batches | lr 8.84e-05 | ms/batch 319.37 | loss  3.94 | ppl    51.356
| epoch 191 step    83150 |    310 batches | lr 8.82e-05 | ms/batch 319.24 | loss  3.91 | ppl    49.997
| epoch 191 step    83200 |    360 batches | lr 8.8e-05 | ms/batch 318.60 | loss  3.82 | ppl    45.606
----------------------------------------------------------------------------------------------------
| Eval 208 at step    83200 | time: 132.37s | valid loss  4.20 | valid ppl    67.007
----------------------------------------------------------------------------------------------------
| epoch 191 step    83250 |    410 batches | lr 8.78e-05 | ms/batch 420.92 | loss  3.87 | ppl    47.969
| epoch 192 step    83300 |     24 batches | lr 8.76e-05 | ms/batch 314.05 | loss  3.93 | ppl    51.082
| epoch 192 step    83350 |     74 batches | lr 8.74e-05 | ms/batch 319.37 | loss  3.85 | ppl    47.070
| epoch 192 step    83400 |    124 batches | lr 8.72e-05 | ms/batch 320.19 | loss  3.91 | ppl    50.108
| epoch 192 step    83450 |    174 batches | lr 8.71e-05 | ms/batch 320.66 | loss  3.90 | ppl    49.196
| epoch 192 step    83500 |    224 batches | lr 8.69e-05 | ms/batch 320.32 | loss  3.92 | ppl    50.475
| epoch 192 step    83550 |    274 batches | lr 8.67e-05 | ms/batch 319.51 | loss  3.89 | ppl    48.844
| epoch 192 step    83600 |    324 batches | lr 8.65e-05 | ms/batch 320.37 | loss  3.88 | ppl    48.192
----------------------------------------------------------------------------------------------------
| Eval 209 at step    83600 | time: 132.75s | valid loss  4.21 | valid ppl    67.216
----------------------------------------------------------------------------------------------------
| epoch 192 step    83650 |    374 batches | lr 8.63e-05 | ms/batch 422.54 | loss  3.88 | ppl    48.368
| epoch 192 step    83700 |    424 batches | lr 8.61e-05 | ms/batch 320.52 | loss  3.89 | ppl    48.911
| epoch 193 step    83750 |     38 batches | lr 8.59e-05 | ms/batch 313.28 | loss  3.89 | ppl    49.100
| epoch 193 step    83800 |     88 batches | lr 8.57e-05 | ms/batch 325.59 | loss  3.83 | ppl    46.135
| epoch 193 step    83850 |    138 batches | lr 8.55e-05 | ms/batch 322.69 | loss  3.88 | ppl    48.637
| epoch 193 step    83900 |    188 batches | lr 8.54e-05 | ms/batch 336.39 | loss  3.89 | ppl    48.760
| epoch 193 step    83950 |    238 batches | lr 8.52e-05 | ms/batch 336.00 | loss  3.92 | ppl    50.501
| epoch 193 step    84000 |    288 batches | lr 8.5e-05 | ms/batch 335.10 | loss  3.94 | ppl    51.264
----------------------------------------------------------------------------------------------------
| Eval 210 at step    84000 | time: 135.60s | valid loss  4.20 | valid ppl    66.608
----------------------------------------------------------------------------------------------------
| epoch 193 step    84050 |    338 batches | lr 8.48e-05 | ms/batch 420.59 | loss  3.81 | ppl    45.039
| epoch 193 step    84100 |    388 batches | lr 8.46e-05 | ms/batch 318.99 | loss  3.90 | ppl    49.358
| epoch 194 step    84150 |      2 batches | lr 8.44e-05 | ms/batch 313.95 | loss  3.93 | ppl    50.816
| epoch 194 step    84200 |     52 batches | lr 8.42e-05 | ms/batch 318.91 | loss  3.84 | ppl    46.340
| epoch 194 step    84250 |    102 batches | lr 8.4e-05 | ms/batch 319.77 | loss  3.87 | ppl    47.864
| epoch 194 step    84300 |    152 batches | lr 8.38e-05 | ms/batch 319.32 | loss  3.90 | ppl    49.314
| epoch 194 step    84350 |    202 batches | lr 8.37e-05 | ms/batch 319.17 | loss  3.87 | ppl    47.813
| epoch 194 step    84400 |    252 batches | lr 8.35e-05 | ms/batch 319.25 | loss  3.91 | ppl    49.965
----------------------------------------------------------------------------------------------------
| Eval 211 at step    84400 | time: 132.50s | valid loss  4.20 | valid ppl    67.000
----------------------------------------------------------------------------------------------------
| epoch 194 step    84450 |    302 batches | lr 8.33e-05 | ms/batch 422.11 | loss  3.95 | ppl    51.822
| epoch 194 step    84500 |    352 batches | lr 8.31e-05 | ms/batch 320.19 | loss  3.81 | ppl    45.218
| epoch 194 step    84550 |    402 batches | lr 8.29e-05 | ms/batch 318.78 | loss  3.89 | ppl    48.941
| epoch 195 step    84600 |     16 batches | lr 8.27e-05 | ms/batch 313.00 | loss  3.90 | ppl    49.306
| epoch 195 step    84650 |     66 batches | lr 8.25e-05 | ms/batch 319.63 | loss  3.83 | ppl    46.043
| epoch 195 step    84700 |    116 batches | lr 8.23e-05 | ms/batch 319.02 | loss  3.87 | ppl    47.952
| epoch 195 step    84750 |    166 batches | lr 8.22e-05 | ms/batch 318.93 | loss  3.88 | ppl    48.439
| epoch 195 step    84800 |    216 batches | lr 8.2e-05 | ms/batch 319.28 | loss  3.89 | ppl    48.945
----------------------------------------------------------------------------------------------------
| Eval 212 at step    84800 | time: 132.54s | valid loss  4.20 | valid ppl    66.910
----------------------------------------------------------------------------------------------------
| epoch 195 step    84850 |    266 batches | lr 8.18e-05 | ms/batch 420.60 | loss  3.93 | ppl    50.827
| epoch 195 step    84900 |    316 batches | lr 8.16e-05 | ms/batch 320.41 | loss  3.88 | ppl    48.426
| epoch 195 step    84950 |    366 batches | lr 8.14e-05 | ms/batch 319.53 | loss  3.83 | ppl    45.919
| epoch 195 step    85000 |    416 batches | lr 8.12e-05 | ms/batch 319.84 | loss  3.88 | ppl    48.360
| epoch 196 step    85050 |     30 batches | lr 8.1e-05 | ms/batch 313.42 | loss  3.91 | ppl    50.118
| epoch 196 step    85100 |     80 batches | lr 8.09e-05 | ms/batch 319.71 | loss  3.83 | ppl    45.879
| epoch 196 step    85150 |    130 batches | lr 8.07e-05 | ms/batch 321.11 | loss  3.86 | ppl    47.247
| epoch 196 step    85200 |    180 batches | lr 8.05e-05 | ms/batch 320.54 | loss  3.89 | ppl    48.871
----------------------------------------------------------------------------------------------------
| Eval 213 at step    85200 | time: 132.80s | valid loss  4.20 | valid ppl    66.598
----------------------------------------------------------------------------------------------------
| epoch 196 step    85250 |    230 batches | lr 8.03e-05 | ms/batch 422.15 | loss  3.92 | ppl    50.566
| epoch 196 step    85300 |    280 batches | lr 8.01e-05 | ms/batch 320.79 | loss  3.91 | ppl    50.088
| epoch 196 step    85350 |    330 batches | lr 7.99e-05 | ms/batch 319.83 | loss  3.84 | ppl    46.487
| epoch 196 step    85400 |    380 batches | lr 7.97e-05 | ms/batch 320.51 | loss  3.88 | ppl    48.330
| epoch 196 step    85450 |    430 batches | lr 7.96e-05 | ms/batch 321.06 | loss  3.89 | ppl    48.987
| epoch 197 step    85500 |     44 batches | lr 7.94e-05 | ms/batch 314.67 | loss  3.86 | ppl    47.310
| epoch 197 step    85550 |     94 batches | lr 7.92e-05 | ms/batch 320.93 | loss  3.84 | ppl    46.713
| epoch 197 step    85600 |    144 batches | lr 7.9e-05 | ms/batch 321.41 | loss  3.91 | ppl    49.706
----------------------------------------------------------------------------------------------------
| Eval 214 at step    85600 | time: 133.06s | valid loss  4.20 | valid ppl    66.582
----------------------------------------------------------------------------------------------------
| epoch 197 step    85650 |    194 batches | lr 7.88e-05 | ms/batch 421.62 | loss  3.90 | ppl    49.273
| epoch 197 step    85700 |    244 batches | lr 7.86e-05 | ms/batch 320.97 | loss  3.90 | ppl    49.445
| epoch 197 step    85750 |    294 batches | lr 7.85e-05 | ms/batch 319.73 | loss  3.93 | ppl    51.130
| epoch 197 step    85800 |    344 batches | lr 7.83e-05 | ms/batch 321.39 | loss  3.79 | ppl    44.082
| epoch 197 step    85850 |    394 batches | lr 7.81e-05 | ms/batch 321.32 | loss  3.88 | ppl    48.326
| epoch 198 step    85900 |      8 batches | lr 7.79e-05 | ms/batch 314.37 | loss  3.90 | ppl    49.511
| epoch 198 step    85950 |     58 batches | lr 7.77e-05 | ms/batch 321.30 | loss  3.84 | ppl    46.525
| epoch 198 step    86000 |    108 batches | lr 7.75e-05 | ms/batch 320.75 | loss  3.86 | ppl    47.430
----------------------------------------------------------------------------------------------------
| Eval 215 at step    86000 | time: 133.07s | valid loss  4.21 | valid ppl    67.038
----------------------------------------------------------------------------------------------------
| epoch 198 step    86050 |    158 batches | lr 7.74e-05 | ms/batch 421.40 | loss  3.87 | ppl    47.871
| epoch 198 step    86100 |    208 batches | lr 7.72e-05 | ms/batch 320.97 | loss  3.88 | ppl    48.525
| epoch 198 step    86150 |    258 batches | lr 7.7e-05 | ms/batch 321.04 | loss  3.91 | ppl    49.691
| epoch 198 step    86200 |    308 batches | lr 7.68e-05 | ms/batch 321.25 | loss  3.89 | ppl    49.020
| epoch 198 step    86250 |    358 batches | lr 7.66e-05 | ms/batch 321.15 | loss  3.84 | ppl    46.350
| epoch 198 step    86300 |    408 batches | lr 7.65e-05 | ms/batch 320.45 | loss  3.86 | ppl    47.514
| epoch 199 step    86350 |     22 batches | lr 7.63e-05 | ms/batch 315.34 | loss  3.91 | ppl    50.055
| epoch 199 step    86400 |     72 batches | lr 7.61e-05 | ms/batch 321.46 | loss  3.83 | ppl    46.030
----------------------------------------------------------------------------------------------------
| Eval 216 at step    86400 | time: 133.15s | valid loss  4.20 | valid ppl    66.674
----------------------------------------------------------------------------------------------------
| epoch 199 step    86450 |    122 batches | lr 7.59e-05 | ms/batch 420.17 | loss  3.86 | ppl    47.590
| epoch 199 step    86500 |    172 batches | lr 7.57e-05 | ms/batch 320.30 | loss  3.84 | ppl    46.433
| epoch 199 step    86550 |    222 batches | lr 7.55e-05 | ms/batch 319.68 | loss  3.90 | ppl    49.306
| epoch 199 step    86600 |    272 batches | lr 7.54e-05 | ms/batch 319.79 | loss  3.92 | ppl    50.229
| epoch 199 step    86650 |    322 batches | lr 7.52e-05 | ms/batch 319.59 | loss  3.89 | ppl    48.766
| epoch 199 step    86700 |    372 batches | lr 7.5e-05 | ms/batch 318.77 | loss  3.85 | ppl    47.004
| epoch 199 step    86750 |    422 batches | lr 7.48e-05 | ms/batch 320.04 | loss  3.85 | ppl    46.762
| epoch 200 step    86800 |     36 batches | lr 7.46e-05 | ms/batch 312.96 | loss  3.88 | ppl    48.489
----------------------------------------------------------------------------------------------------
| Eval 217 at step    86800 | time: 132.55s | valid loss  4.21 | valid ppl    67.268
----------------------------------------------------------------------------------------------------
| epoch 200 step    86850 |     86 batches | lr 7.45e-05 | ms/batch 425.55 | loss  3.81 | ppl    45.237
| epoch 200 step    86900 |    136 batches | lr 7.43e-05 | ms/batch 321.22 | loss  3.89 | ppl    48.857
| epoch 200 step    86950 |    186 batches | lr 7.41e-05 | ms/batch 321.43 | loss  3.88 | ppl    48.202
| epoch 200 step    87000 |    236 batches | lr 7.39e-05 | ms/batch 320.52 | loss  3.90 | ppl    49.181
| epoch 200 step    87050 |    286 batches | lr 7.37e-05 | ms/batch 320.66 | loss  3.91 | ppl    50.061
| epoch 200 step    87100 |    336 batches | lr 7.36e-05 | ms/batch 320.83 | loss  3.81 | ppl    45.087
| epoch 200 step    87150 |    386 batches | lr 7.34e-05 | ms/batch 320.76 | loss  3.87 | ppl    47.954
| epoch 200 step    87200 |    436 batches | lr 7.32e-05 | ms/batch 315.87 | loss  3.90 | ppl    49.487
----------------------------------------------------------------------------------------------------
| Eval 218 at step    87200 | time: 133.37s | valid loss  4.20 | valid ppl    66.859
----------------------------------------------------------------------------------------------------
| epoch 201 step    87250 |     50 batches | lr 7.3e-05 | ms/batch 418.87 | loss  3.83 | ppl    46.153
| epoch 201 step    87300 |    100 batches | lr 7.29e-05 | ms/batch 321.45 | loss  3.82 | ppl    45.695
| epoch 201 step    87350 |    150 batches | lr 7.27e-05 | ms/batch 320.87 | loss  3.88 | ppl    48.350
| epoch 201 step    87400 |    200 batches | lr 7.25e-05 | ms/batch 320.12 | loss  3.88 | ppl    48.317
| epoch 201 step    87450 |    250 batches | lr 7.23e-05 | ms/batch 321.09 | loss  3.91 | ppl    49.732
| epoch 201 step    87500 |    300 batches | lr 7.21e-05 | ms/batch 323.69 | loss  3.92 | ppl    50.214
| epoch 201 step    87550 |    350 batches | lr 7.2e-05 | ms/batch 322.81 | loss  3.79 | ppl    44.367
| epoch 201 step    87600 |    400 batches | lr 7.18e-05 | ms/batch 320.59 | loss  3.88 | ppl    48.468
----------------------------------------------------------------------------------------------------
| Eval 219 at step    87600 | time: 133.49s | valid loss  4.20 | valid ppl    66.687
----------------------------------------------------------------------------------------------------
| epoch 202 step    87650 |     14 batches | lr 7.16e-05 | ms/batch 414.92 | loss  3.92 | ppl    50.157
| epoch 202 step    87700 |     64 batches | lr 7.14e-05 | ms/batch 320.88 | loss  3.82 | ppl    45.515
| epoch 202 step    87750 |    114 batches | lr 7.13e-05 | ms/batch 320.51 | loss  3.85 | ppl    47.118
| epoch 202 step    87800 |    164 batches | lr 7.11e-05 | ms/batch 319.23 | loss  3.88 | ppl    48.243
| epoch 202 step    87850 |    214 batches | lr 7.09e-05 | ms/batch 320.33 | loss  3.88 | ppl    48.506
| epoch 202 step    87900 |    264 batches | lr 7.07e-05 | ms/batch 320.44 | loss  3.87 | ppl    48.015
| epoch 202 step    87950 |    314 batches | lr 7.05e-05 | ms/batch 320.19 | loss  3.87 | ppl    48.143
| epoch 202 step    88000 |    364 batches | lr 7.04e-05 | ms/batch 320.44 | loss  3.80 | ppl    44.834
----------------------------------------------------------------------------------------------------
| Eval 220 at step    88000 | time: 132.90s | valid loss  4.20 | valid ppl    66.547
----------------------------------------------------------------------------------------------------
| epoch 202 step    88050 |    414 batches | lr 7.02e-05 | ms/batch 422.08 | loss  3.86 | ppl    47.251
| epoch 203 step    88100 |     28 batches | lr 7e-05 | ms/batch 314.82 | loss  3.90 | ppl    49.264
| epoch 203 step    88150 |     78 batches | lr 6.98e-05 | ms/batch 320.33 | loss  3.79 | ppl    44.450
| epoch 203 step    88200 |    128 batches | lr 6.97e-05 | ms/batch 320.96 | loss  3.87 | ppl    47.703
| epoch 203 step    88250 |    178 batches | lr 6.95e-05 | ms/batch 321.57 | loss  3.88 | ppl    48.456
| epoch 203 step    88300 |    228 batches | lr 6.93e-05 | ms/batch 321.37 | loss  3.87 | ppl    47.765
| epoch 203 step    88350 |    278 batches | lr 6.91e-05 | ms/batch 320.85 | loss  3.90 | ppl    49.414
| epoch 203 step    88400 |    328 batches | lr 6.9e-05 | ms/batch 320.89 | loss  3.81 | ppl    45.039
----------------------------------------------------------------------------------------------------
| Eval 221 at step    88400 | time: 133.09s | valid loss  4.20 | valid ppl    66.493
----------------------------------------------------------------------------------------------------
| epoch 203 step    88450 |    378 batches | lr 6.88e-05 | ms/batch 453.93 | loss  3.86 | ppl    47.386
| epoch 203 step    88500 |    428 batches | lr 6.86e-05 | ms/batch 319.21 | loss  3.87 | ppl    47.881
| epoch 204 step    88550 |     42 batches | lr 6.84e-05 | ms/batch 313.12 | loss  3.86 | ppl    47.389
| epoch 204 step    88600 |     92 batches | lr 6.83e-05 | ms/batch 318.39 | loss  3.80 | ppl    44.565
| epoch 204 step    88650 |    142 batches | lr 6.81e-05 | ms/batch 319.54 | loss  3.87 | ppl    47.832
| epoch 204 step    88700 |    192 batches | lr 6.79e-05 | ms/batch 319.44 | loss  3.88 | ppl    48.542
| epoch 204 step    88750 |    242 batches | lr 6.77e-05 | ms/batch 318.66 | loss  3.90 | ppl    49.268
| epoch 204 step    88800 |    292 batches | lr 6.76e-05 | ms/batch 320.52 | loss  3.91 | ppl    50.131
----------------------------------------------------------------------------------------------------
| Eval 222 at step    88800 | time: 132.38s | valid loss  4.20 | valid ppl    66.360
----------------------------------------------------------------------------------------------------
| epoch 204 step    88850 |    342 batches | lr 6.74e-05 | ms/batch 453.50 | loss  3.78 | ppl    43.962
| epoch 204 step    88900 |    392 batches | lr 6.72e-05 | ms/batch 321.75 | loss  3.87 | ppl    48.077
| epoch 205 step    88950 |      6 batches | lr 6.7e-05 | ms/batch 315.58 | loss  3.89 | ppl    48.896
| epoch 205 step    89000 |     56 batches | lr 6.69e-05 | ms/batch 318.57 | loss  3.82 | ppl    45.421
| epoch 205 step    89050 |    106 batches | lr 6.67e-05 | ms/batch 319.41 | loss  3.85 | ppl    46.889
| epoch 205 step    89100 |    156 batches | lr 6.65e-05 | ms/batch 320.64 | loss  3.86 | ppl    47.351
| epoch 205 step    89150 |    206 batches | lr 6.64e-05 | ms/batch 320.40 | loss  3.87 | ppl    47.896
| epoch 205 step    89200 |    256 batches | lr 6.62e-05 | ms/batch 320.61 | loss  3.90 | ppl    49.341
----------------------------------------------------------------------------------------------------
| Eval 223 at step    89200 | time: 132.90s | valid loss  4.20 | valid ppl    66.548
----------------------------------------------------------------------------------------------------
| epoch 205 step    89250 |    306 batches | lr 6.6e-05 | ms/batch 422.24 | loss  3.91 | ppl    49.757
| epoch 205 step    89300 |    356 batches | lr 6.58e-05 | ms/batch 320.93 | loss  3.80 | ppl    44.525
| epoch 205 step    89350 |    406 batches | lr 6.57e-05 | ms/batch 320.33 | loss  3.88 | ppl    48.218
| epoch 206 step    89400 |     20 batches | lr 6.55e-05 | ms/batch 314.02 | loss  3.89 | ppl    48.856
| epoch 206 step    89450 |     70 batches | lr 6.53e-05 | ms/batch 321.54 | loss  3.82 | ppl    45.389
| epoch 206 step    89500 |    120 batches | lr 6.52e-05 | ms/batch 324.88 | loss  3.87 | ppl    48.072
| epoch 206 step    89550 |    170 batches | lr 6.5e-05 | ms/batch 320.49 | loss  3.88 | ppl    48.487
| epoch 206 step    89600 |    220 batches | lr 6.48e-05 | ms/batch 320.26 | loss  3.88 | ppl    48.589
----------------------------------------------------------------------------------------------------
| Eval 224 at step    89600 | time: 133.21s | valid loss  4.20 | valid ppl    66.684
----------------------------------------------------------------------------------------------------
| epoch 206 step    89650 |    270 batches | lr 6.46e-05 | ms/batch 421.57 | loss  3.88 | ppl    48.432
| epoch 206 step    89700 |    320 batches | lr 6.45e-05 | ms/batch 319.66 | loss  3.84 | ppl    46.575
| epoch 206 step    89750 |    370 batches | lr 6.43e-05 | ms/batch 319.95 | loss  3.82 | ppl    45.497
| epoch 206 step    89800 |    420 batches | lr 6.41e-05 | ms/batch 320.28 | loss  3.85 | ppl    47.028
| epoch 207 step    89850 |     34 batches | lr 6.4e-05 | ms/batch 313.43 | loss  3.89 | ppl    49.146
| epoch 207 step    89900 |     84 batches | lr 6.38e-05 | ms/batch 319.96 | loss  3.79 | ppl    44.249
| epoch 207 step    89950 |    134 batches | lr 6.36e-05 | ms/batch 317.90 | loss  3.84 | ppl    46.709
| epoch 207 step    90000 |    184 batches | lr 6.35e-05 | ms/batch 319.36 | loss  3.85 | ppl    47.057
----------------------------------------------------------------------------------------------------
| Eval 225 at step    90000 | time: 132.59s | valid loss  4.20 | valid ppl    66.422
----------------------------------------------------------------------------------------------------
| epoch 207 step    90050 |    234 batches | lr 6.33e-05 | ms/batch 420.00 | loss  3.86 | ppl    47.601
| epoch 207 step    90100 |    284 batches | lr 6.31e-05 | ms/batch 319.26 | loss  3.91 | ppl    50.075
| epoch 207 step    90150 |    334 batches | lr 6.29e-05 | ms/batch 318.72 | loss  3.79 | ppl    44.160
| epoch 207 step    90200 |    384 batches | lr 6.28e-05 | ms/batch 319.77 | loss  3.85 | ppl    46.966
| epoch 207 step    90250 |    434 batches | lr 6.26e-05 | ms/batch 319.79 | loss  3.88 | ppl    48.581
| epoch 208 step    90300 |     48 batches | lr 6.24e-05 | ms/batch 313.82 | loss  3.81 | ppl    45.249
| epoch 208 step    90350 |     98 batches | lr 6.23e-05 | ms/batch 320.30 | loss  3.80 | ppl    44.859
| epoch 208 step    90400 |    148 batches | lr 6.21e-05 | ms/batch 320.74 | loss  3.86 | ppl    47.573
----------------------------------------------------------------------------------------------------
| Eval 226 at step    90400 | time: 132.63s | valid loss  4.20 | valid ppl    66.631
----------------------------------------------------------------------------------------------------
| epoch 208 step    90450 |    198 batches | lr 6.19e-05 | ms/batch 418.71 | loss  3.84 | ppl    46.442
| epoch 208 step    90500 |    248 batches | lr 6.18e-05 | ms/batch 319.96 | loss  3.89 | ppl    48.669
| epoch 208 step    90550 |    298 batches | lr 6.16e-05 | ms/batch 321.98 | loss  3.90 | ppl    49.345
| epoch 208 step    90600 |    348 batches | lr 6.14e-05 | ms/batch 320.19 | loss  3.76 | ppl    42.917
| epoch 208 step    90650 |    398 batches | lr 6.13e-05 | ms/batch 319.18 | loss  3.87 | ppl    47.772
| epoch 209 step    90700 |     12 batches | lr 6.11e-05 | ms/batch 313.84 | loss  3.88 | ppl    48.629
| epoch 209 step    90750 |     62 batches | lr 6.09e-05 | ms/batch 321.07 | loss  3.80 | ppl    44.843
| epoch 209 step    90800 |    112 batches | lr 6.08e-05 | ms/batch 319.47 | loss  3.83 | ppl    46.178
----------------------------------------------------------------------------------------------------
| Eval 227 at step    90800 | time: 132.71s | valid loss  4.20 | valid ppl    66.476
----------------------------------------------------------------------------------------------------
| epoch 209 step    90850 |    162 batches | lr 6.06e-05 | ms/batch 419.13 | loss  3.87 | ppl    47.714
| epoch 209 step    90900 |    212 batches | lr 6.04e-05 | ms/batch 318.37 | loss  3.84 | ppl    46.640
| epoch 209 step    90950 |    262 batches | lr 6.03e-05 | ms/batch 319.35 | loss  3.87 | ppl    47.888
| epoch 209 step    91000 |    312 batches | lr 6.01e-05 | ms/batch 319.11 | loss  3.85 | ppl    47.083
| epoch 209 step    91050 |    362 batches | lr 5.99e-05 | ms/batch 318.52 | loss  3.83 | ppl    46.072
| epoch 209 step    91100 |    412 batches | lr 5.98e-05 | ms/batch 318.70 | loss  3.84 | ppl    46.740
| epoch 210 step    91150 |     26 batches | lr 5.96e-05 | ms/batch 314.55 | loss  3.87 | ppl    48.038
| epoch 210 step    91200 |     76 batches | lr 5.94e-05 | ms/batch 319.93 | loss  3.82 | ppl    45.412
----------------------------------------------------------------------------------------------------
| Eval 228 at step    91200 | time: 132.42s | valid loss  4.20 | valid ppl    66.461
----------------------------------------------------------------------------------------------------
| epoch 210 step    91250 |    126 batches | lr 5.93e-05 | ms/batch 421.61 | loss  3.87 | ppl    47.819
| epoch 210 step    91300 |    176 batches | lr 5.91e-05 | ms/batch 318.29 | loss  3.87 | ppl    47.970
| epoch 210 step    91350 |    226 batches | lr 5.89e-05 | ms/batch 319.52 | loss  3.88 | ppl    48.224
| epoch 210 step    91400 |    276 batches | lr 5.88e-05 | ms/batch 318.15 | loss  3.89 | ppl    49.137
| epoch 210 step    91450 |    326 batches | lr 5.86e-05 | ms/batch 317.86 | loss  3.81 | ppl    45.165
| epoch 210 step    91500 |    376 batches | lr 5.84e-05 | ms/batch 320.78 | loss  3.83 | ppl    46.270
| epoch 210 step    91550 |    426 batches | lr 5.83e-05 | ms/batch 321.59 | loss  3.84 | ppl    46.664
| epoch 211 step    91600 |     40 batches | lr 5.81e-05 | ms/batch 314.06 | loss  3.87 | ppl    47.707
----------------------------------------------------------------------------------------------------
| Eval 229 at step    91600 | time: 132.61s | valid loss  4.20 | valid ppl    66.612
----------------------------------------------------------------------------------------------------
| epoch 211 step    91650 |     90 batches | lr 5.8e-05 | ms/batch 420.21 | loss  3.78 | ppl    43.974
| epoch 211 step    91700 |    140 batches | lr 5.78e-05 | ms/batch 320.05 | loss  3.87 | ppl    47.879
| epoch 211 step    91750 |    190 batches | lr 5.76e-05 | ms/batch 318.97 | loss  3.86 | ppl    47.438
| epoch 211 step    91800 |    240 batches | lr 5.75e-05 | ms/batch 319.83 | loss  3.86 | ppl    47.638
| epoch 211 step    91850 |    290 batches | lr 5.73e-05 | ms/batch 318.49 | loss  3.89 | ppl    49.008
| epoch 211 step    91900 |    340 batches | lr 5.71e-05 | ms/batch 319.44 | loss  3.76 | ppl    43.073
| epoch 211 step    91950 |    390 batches | lr 5.7e-05 | ms/batch 320.08 | loss  3.85 | ppl    46.861
| epoch 212 step    92000 |      4 batches | lr 5.68e-05 | ms/batch 311.61 | loss  3.88 | ppl    48.298
----------------------------------------------------------------------------------------------------
| Eval 230 at step    92000 | time: 132.43s | valid loss  4.20 | valid ppl    66.512
----------------------------------------------------------------------------------------------------
| epoch 212 step    92050 |     54 batches | lr 5.67e-05 | ms/batch 420.11 | loss  3.80 | ppl    44.616
| epoch 212 step    92100 |    104 batches | lr 5.65e-05 | ms/batch 319.94 | loss  3.83 | ppl    46.108
| epoch 212 step    92150 |    154 batches | lr 5.63e-05 | ms/batch 320.03 | loss  3.85 | ppl    46.982
| epoch 212 step    92200 |    204 batches | lr 5.62e-05 | ms/batch 318.94 | loss  3.83 | ppl    46.277
| epoch 212 step    92250 |    254 batches | lr 5.6e-05 | ms/batch 318.90 | loss  3.86 | ppl    47.277
| epoch 212 step    92300 |    304 batches | lr 5.58e-05 | ms/batch 321.18 | loss  3.87 | ppl    48.076
| epoch 212 step    92350 |    354 batches | lr 5.57e-05 | ms/batch 321.50 | loss  3.80 | ppl    44.726
| epoch 212 step    92400 |    404 batches | lr 5.55e-05 | ms/batch 319.75 | loss  3.86 | ppl    47.443
----------------------------------------------------------------------------------------------------
| Eval 231 at step    92400 | time: 133.03s | valid loss  4.20 | valid ppl    66.704
----------------------------------------------------------------------------------------------------
| epoch 213 step    92450 |     18 batches | lr 5.54e-05 | ms/batch 416.68 | loss  3.90 | ppl    49.406
| epoch 213 step    92500 |     68 batches | lr 5.52e-05 | ms/batch 320.83 | loss  3.78 | ppl    43.763
| epoch 213 step    92550 |    118 batches | lr 5.5e-05 | ms/batch 320.59 | loss  3.82 | ppl    45.793
| epoch 213 step    92600 |    168 batches | lr 5.49e-05 | ms/batch 320.52 | loss  3.83 | ppl    46.162
| epoch 213 step    92650 |    218 batches | lr 5.47e-05 | ms/batch 320.39 | loss  3.87 | ppl    47.914
| epoch 213 step    92700 |    268 batches | lr 5.46e-05 | ms/batch 321.09 | loss  3.86 | ppl    47.384
| epoch 213 step    92750 |    318 batches | lr 5.44e-05 | ms/batch 320.62 | loss  3.85 | ppl    46.964
| epoch 213 step    92800 |    368 batches | lr 5.42e-05 | ms/batch 318.92 | loss  3.80 | ppl    44.659
----------------------------------------------------------------------------------------------------
| Eval 232 at step    92800 | time: 132.94s | valid loss  4.20 | valid ppl    66.361
----------------------------------------------------------------------------------------------------
| epoch 213 step    92850 |    418 batches | lr 5.41e-05 | ms/batch 419.73 | loss  3.83 | ppl    45.976
| epoch 214 step    92900 |     32 batches | lr 5.39e-05 | ms/batch 312.80 | loss  3.86 | ppl    47.580
| epoch 214 step    92950 |     82 batches | lr 5.38e-05 | ms/batch 319.86 | loss  3.79 | ppl    44.144
| epoch 214 step    93000 |    132 batches | lr 5.36e-05 | ms/batch 319.02 | loss  3.86 | ppl    47.491
| epoch 214 step    93050 |    182 batches | lr 5.35e-05 | ms/batch 319.37 | loss  3.86 | ppl    47.360
| epoch 214 step    93100 |    232 batches | lr 5.33e-05 | ms/batch 319.32 | loss  3.84 | ppl    46.580
| epoch 214 step    93150 |    282 batches | lr 5.31e-05 | ms/batch 319.28 | loss  3.85 | ppl    47.197
| epoch 214 step    93200 |    332 batches | lr 5.3e-05 | ms/batch 319.14 | loss  3.77 | ppl    43.473
----------------------------------------------------------------------------------------------------
| Eval 233 at step    93200 | time: 132.42s | valid loss  4.20 | valid ppl    66.801
----------------------------------------------------------------------------------------------------
| epoch 214 step    93250 |    382 batches | lr 5.28e-05 | ms/batch 419.31 | loss  3.83 | ppl    45.847
| epoch 214 step    93300 |    432 batches | lr 5.27e-05 | ms/batch 321.96 | loss  3.87 | ppl    48.106
| epoch 215 step    93350 |     46 batches | lr 5.25e-05 | ms/batch 313.38 | loss  3.83 | ppl    45.913
| epoch 215 step    93400 |     96 batches | lr 5.23e-05 | ms/batch 319.42 | loss  3.77 | ppl    43.370
| epoch 215 step    93450 |    146 batches | lr 5.22e-05 | ms/batch 320.44 | loss  3.83 | ppl    46.261
| epoch 215 step    93500 |    196 batches | lr 5.2e-05 | ms/batch 320.25 | loss  3.85 | ppl    46.815
| epoch 215 step    93550 |    246 batches | lr 5.19e-05 | ms/batch 319.75 | loss  3.87 | ppl    47.882
| epoch 215 step    93600 |    296 batches | lr 5.17e-05 | ms/batch 319.55 | loss  3.90 | ppl    49.559
----------------------------------------------------------------------------------------------------
| Eval 234 at step    93600 | time: 132.72s | valid loss  4.20 | valid ppl    66.379
----------------------------------------------------------------------------------------------------
| epoch 215 step    93650 |    346 batches | lr 5.16e-05 | ms/batch 419.72 | loss  3.74 | ppl    41.926
| epoch 215 step    93700 |    396 batches | lr 5.14e-05 | ms/batch 319.41 | loss  3.84 | ppl    46.301
| epoch 216 step    93750 |     10 batches | lr 5.13e-05 | ms/batch 313.41 | loss  3.86 | ppl    47.426
| epoch 216 step    93800 |     60 batches | lr 5.11e-05 | ms/batch 319.56 | loss  3.78 | ppl    43.794
| epoch 216 step    93850 |    110 batches | lr 5.09e-05 | ms/batch 320.53 | loss  3.80 | ppl    44.654
| epoch 216 step    93900 |    160 batches | lr 5.08e-05 | ms/batch 318.39 | loss  3.82 | ppl    45.726
| epoch 216 step    93950 |    210 batches | lr 5.06e-05 | ms/batch 319.55 | loss  3.83 | ppl    46.028
| epoch 216 step    94000 |    260 batches | lr 5.05e-05 | ms/batch 318.35 | loss  3.87 | ppl    47.798
----------------------------------------------------------------------------------------------------
| Eval 235 at step    94000 | time: 132.43s | valid loss  4.20 | valid ppl    66.458
----------------------------------------------------------------------------------------------------
| epoch 216 step    94050 |    310 batches | lr 5.03e-05 | ms/batch 418.90 | loss  3.85 | ppl    47.039
| epoch 216 step    94100 |    360 batches | lr 5.02e-05 | ms/batch 319.80 | loss  3.78 | ppl    43.785
| epoch 216 step    94150 |    410 batches | lr 5e-05 | ms/batch 319.79 | loss  3.84 | ppl    46.404
| epoch 217 step    94200 |     24 batches | lr 4.99e-05 | ms/batch 314.44 | loss  3.86 | ppl    47.301
| epoch 217 step    94250 |     74 batches | lr 4.97e-05 | ms/batch 320.22 | loss  3.77 | ppl    43.255
| epoch 217 step    94300 |    124 batches | lr 4.96e-05 | ms/batch 318.84 | loss  3.83 | ppl    46.050
| epoch 217 step    94350 |    174 batches | lr 4.94e-05 | ms/batch 320.75 | loss  3.81 | ppl    45.281
| epoch 217 step    94400 |    224 batches | lr 4.93e-05 | ms/batch 320.44 | loss  3.88 | ppl    48.366
----------------------------------------------------------------------------------------------------
| Eval 236 at step    94400 | time: 132.68s | valid loss  4.20 | valid ppl    66.484
----------------------------------------------------------------------------------------------------
| epoch 217 step    94450 |    274 batches | lr 4.91e-05 | ms/batch 421.03 | loss  3.86 | ppl    47.345
| epoch 217 step    94500 |    324 batches | lr 4.89e-05 | ms/batch 318.87 | loss  3.82 | ppl    45.524
| epoch 217 step    94550 |    374 batches | lr 4.88e-05 | ms/batch 318.23 | loss  3.82 | ppl    45.654
| epoch 217 step    94600 |    424 batches | lr 4.86e-05 | ms/batch 319.16 | loss  3.85 | ppl    46.819
| epoch 218 step    94650 |     38 batches | lr 4.85e-05 | ms/batch 313.34 | loss  3.84 | ppl    46.666
| epoch 218 step    94700 |     88 batches | lr 4.83e-05 | ms/batch 318.28 | loss  3.80 | ppl    44.516
| epoch 218 step    94750 |    138 batches | lr 4.82e-05 | ms/batch 318.97 | loss  3.83 | ppl    46.145
| epoch 218 step    94800 |    188 batches | lr 4.8e-05 | ms/batch 318.67 | loss  3.84 | ppl    46.382
----------------------------------------------------------------------------------------------------
| Eval 237 at step    94800 | time: 132.34s | valid loss  4.20 | valid ppl    66.365
----------------------------------------------------------------------------------------------------
| epoch 218 step    94850 |    238 batches | lr 4.79e-05 | ms/batch 420.15 | loss  3.83 | ppl    46.272
| epoch 218 step    94900 |    288 batches | lr 4.77e-05 | ms/batch 319.76 | loss  3.89 | ppl    49.072
| epoch 218 step    94950 |    338 batches | lr 4.76e-05 | ms/batch 319.79 | loss  3.75 | ppl    42.398
| epoch 218 step    95000 |    388 batches | lr 4.74e-05 | ms/batch 319.10 | loss  3.83 | ppl    46.057
| epoch 219 step    95050 |      2 batches | lr 4.73e-05 | ms/batch 314.93 | loss  3.88 | ppl    48.451
| epoch 219 step    95100 |     52 batches | lr 4.71e-05 | ms/batch 320.00 | loss  3.82 | ppl    45.464
| epoch 219 step    95150 |    102 batches | lr 4.7e-05 | ms/batch 320.65 | loss  3.78 | ppl    43.777
| epoch 219 step    95200 |    152 batches | lr 4.68e-05 | ms/batch 321.22 | loss  3.83 | ppl    46.048
----------------------------------------------------------------------------------------------------
| Eval 238 at step    95200 | time: 132.76s | valid loss  4.19 | valid ppl    66.329
----------------------------------------------------------------------------------------------------
| epoch 219 step    95250 |    202 batches | lr 4.67e-05 | ms/batch 450.46 | loss  3.85 | ppl    47.160
| epoch 219 step    95300 |    252 batches | lr 4.65e-05 | ms/batch 320.26 | loss  3.87 | ppl    47.916
| epoch 219 step    95350 |    302 batches | lr 4.64e-05 | ms/batch 320.87 | loss  3.89 | ppl    48.787
| epoch 219 step    95400 |    352 batches | lr 4.62e-05 | ms/batch 320.05 | loss  3.75 | ppl    42.631
| epoch 219 step    95450 |    402 batches | lr 4.61e-05 | ms/batch 320.68 | loss  3.87 | ppl    47.985
| epoch 220 step    95500 |     16 batches | lr 4.59e-05 | ms/batch 312.35 | loss  3.85 | ppl    47.010
| epoch 220 step    95550 |     66 batches | lr 4.58e-05 | ms/batch 320.54 | loss  3.75 | ppl    42.470
| epoch 220 step    95600 |    116 batches | lr 4.56e-05 | ms/batch 320.48 | loss  3.79 | ppl    44.409
----------------------------------------------------------------------------------------------------
| Eval 239 at step    95600 | time: 132.74s | valid loss  4.20 | valid ppl    66.751
----------------------------------------------------------------------------------------------------
| epoch 220 step    95650 |    166 batches | lr 4.55e-05 | ms/batch 420.72 | loss  3.82 | ppl    45.542
| epoch 220 step    95700 |    216 batches | lr 4.53e-05 | ms/batch 320.58 | loss  3.84 | ppl    46.482
| epoch 220 step    95750 |    266 batches | lr 4.52e-05 | ms/batch 321.84 | loss  3.85 | ppl    47.160
| epoch 220 step    95800 |    316 batches | lr 4.5e-05 | ms/batch 320.74 | loss  3.82 | ppl    45.692
| epoch 220 step    95850 |    366 batches | lr 4.49e-05 | ms/batch 319.77 | loss  3.77 | ppl    43.492
| epoch 220 step    95900 |    416 batches | lr 4.48e-05 | ms/batch 319.08 | loss  3.83 | ppl    46.018
| epoch 221 step    95950 |     30 batches | lr 4.46e-05 | ms/batch 312.48 | loss  3.86 | ppl    47.551
| epoch 221 step    96000 |     80 batches | lr 4.45e-05 | ms/batch 320.59 | loss  3.76 | ppl    43.162
----------------------------------------------------------------------------------------------------
| Eval 240 at step    96000 | time: 132.76s | valid loss  4.20 | valid ppl    66.595
----------------------------------------------------------------------------------------------------
| epoch 221 step    96050 |    130 batches | lr 4.43e-05 | ms/batch 418.01 | loss  3.82 | ppl    45.558
| epoch 221 step    96100 |    180 batches | lr 4.42e-05 | ms/batch 319.85 | loss  3.84 | ppl    46.717
| epoch 221 step    96150 |    230 batches | lr 4.4e-05 | ms/batch 320.83 | loss  3.83 | ppl    45.899
| epoch 221 step    96200 |    280 batches | lr 4.39e-05 | ms/batch 320.06 | loss  3.87 | ppl    47.845
| epoch 221 step    96250 |    330 batches | lr 4.37e-05 | ms/batch 320.36 | loss  3.79 | ppl    44.080
| epoch 221 step    96300 |    380 batches | lr 4.36e-05 | ms/batch 320.70 | loss  3.81 | ppl    45.347
| epoch 221 step    96350 |    430 batches | lr 4.34e-05 | ms/batch 320.51 | loss  3.84 | ppl    46.306
| epoch 222 step    96400 |     44 batches | lr 4.33e-05 | ms/batch 313.58 | loss  3.81 | ppl    45.115
----------------------------------------------------------------------------------------------------
| Eval 241 at step    96400 | time: 132.71s | valid loss  4.20 | valid ppl    66.673
----------------------------------------------------------------------------------------------------
| epoch 222 step    96450 |     94 batches | lr 4.32e-05 | ms/batch 420.07 | loss  3.77 | ppl    43.328
| epoch 222 step    96500 |    144 batches | lr 4.3e-05 | ms/batch 320.18 | loss  3.83 | ppl    46.034
| epoch 222 step    96550 |    194 batches | lr 4.29e-05 | ms/batch 321.20 | loss  3.82 | ppl    45.740
| epoch 222 step    96600 |    244 batches | lr 4.27e-05 | ms/batch 318.95 | loss  3.84 | ppl    46.631
| epoch 222 step    96650 |    294 batches | lr 4.26e-05 | ms/batch 320.96 | loss  3.89 | ppl    49.098
| epoch 222 step    96700 |    344 batches | lr 4.24e-05 | ms/batch 318.87 | loss  3.73 | ppl    41.828
| epoch 222 step    96750 |    394 batches | lr 4.23e-05 | ms/batch 318.86 | loss  3.83 | ppl    45.852
| epoch 223 step    96800 |      8 batches | lr 4.21e-05 | ms/batch 313.10 | loss  3.87 | ppl    47.970
----------------------------------------------------------------------------------------------------
| Eval 242 at step    96800 | time: 132.58s | valid loss  4.19 | valid ppl    66.123
----------------------------------------------------------------------------------------------------
| epoch 223 step    96850 |     58 batches | lr 4.2e-05 | ms/batch 451.90 | loss  3.78 | ppl    43.857
| epoch 223 step    96900 |    108 batches | lr 4.19e-05 | ms/batch 319.87 | loss  3.81 | ppl    44.997
| epoch 223 step    96950 |    158 batches | lr 4.17e-05 | ms/batch 318.35 | loss  3.83 | ppl    46.172
| epoch 223 step    97000 |    208 batches | lr 4.16e-05 | ms/batch 318.70 | loss  3.82 | ppl    45.611
| epoch 223 step    97050 |    258 batches | lr 4.14e-05 | ms/batch 319.26 | loss  3.85 | ppl    47.214
| epoch 223 step    97100 |    308 batches | lr 4.13e-05 | ms/batch 318.76 | loss  3.86 | ppl    47.325
| epoch 223 step    97150 |    358 batches | lr 4.11e-05 | ms/batch 318.60 | loss  3.75 | ppl    42.471
| epoch 223 step    97200 |    408 batches | lr 4.1e-05 | ms/batch 320.02 | loss  3.80 | ppl    44.897
----------------------------------------------------------------------------------------------------
| Eval 243 at step    97200 | time: 132.59s | valid loss  4.19 | valid ppl    66.121
----------------------------------------------------------------------------------------------------
| epoch 224 step    97250 |     22 batches | lr 4.09e-05 | ms/batch 446.65 | loss  3.86 | ppl    47.634
| epoch 224 step    97300 |     72 batches | lr 4.07e-05 | ms/batch 319.30 | loss  3.78 | ppl    43.921
| epoch 224 step    97350 |    122 batches | lr 4.06e-05 | ms/batch 319.20 | loss  3.81 | ppl    45.075
| epoch 224 step    97400 |    172 batches | lr 4.04e-05 | ms/batch 318.27 | loss  3.83 | ppl    46.063
| epoch 224 step    97450 |    222 batches | lr 4.03e-05 | ms/batch 318.97 | loss  3.84 | ppl    46.384
| epoch 224 step    97500 |    272 batches | lr 4.02e-05 | ms/batch 318.45 | loss  3.84 | ppl    46.301
| epoch 224 step    97550 |    322 batches | lr 4e-05 | ms/batch 318.19 | loss  3.79 | ppl    44.068
| epoch 224 step    97600 |    372 batches | lr 3.99e-05 | ms/batch 319.77 | loss  3.78 | ppl    43.671
----------------------------------------------------------------------------------------------------
| Eval 244 at step    97600 | time: 132.30s | valid loss  4.19 | valid ppl    66.157
----------------------------------------------------------------------------------------------------
| epoch 224 step    97650 |    422 batches | lr 3.97e-05 | ms/batch 418.40 | loss  3.83 | ppl    46.214
| epoch 225 step    97700 |     36 batches | lr 3.96e-05 | ms/batch 312.12 | loss  3.84 | ppl    46.658
| epoch 225 step    97750 |     86 batches | lr 3.95e-05 | ms/batch 319.99 | loss  3.75 | ppl    42.599
| epoch 225 step    97800 |    136 batches | lr 3.93e-05 | ms/batch 320.93 | loss  3.86 | ppl    47.310
| epoch 225 step    97850 |    186 batches | lr 3.92e-05 | ms/batch 320.41 | loss  3.84 | ppl    46.297
| epoch 225 step    97900 |    236 batches | lr 3.9e-05 | ms/batch 320.85 | loss  3.84 | ppl    46.409
| epoch 225 step    97950 |    286 batches | lr 3.89e-05 | ms/batch 320.34 | loss  3.88 | ppl    48.644
| epoch 225 step    98000 |    336 batches | lr 3.88e-05 | ms/batch 321.50 | loss  3.74 | ppl    42.040
----------------------------------------------------------------------------------------------------
| Eval 245 at step    98000 | time: 132.75s | valid loss  4.20 | valid ppl    66.371
----------------------------------------------------------------------------------------------------
| epoch 225 step    98050 |    386 batches | lr 3.86e-05 | ms/batch 420.62 | loss  3.82 | ppl    45.633
| epoch 225 step    98100 |    436 batches | lr 3.85e-05 | ms/batch 315.88 | loss  3.83 | ppl    45.860
| epoch 226 step    98150 |     50 batches | lr 3.84e-05 | ms/batch 319.84 | loss  3.78 | ppl    43.984
| epoch 226 step    98200 |    100 batches | lr 3.82e-05 | ms/batch 321.43 | loss  3.80 | ppl    44.503
| epoch 226 step    98250 |    150 batches | lr 3.81e-05 | ms/batch 319.50 | loss  3.81 | ppl    45.200
| epoch 226 step    98300 |    200 batches | lr 3.79e-05 | ms/batch 318.69 | loss  3.83 | ppl    45.980
| epoch 226 step    98350 |    250 batches | lr 3.78e-05 | ms/batch 319.06 | loss  3.84 | ppl    46.595
| epoch 226 step    98400 |    300 batches | lr 3.77e-05 | ms/batch 319.52 | loss  3.88 | ppl    48.294
----------------------------------------------------------------------------------------------------
| Eval 246 at step    98400 | time: 132.72s | valid loss  4.19 | valid ppl    66.155
----------------------------------------------------------------------------------------------------
| epoch 226 step    98450 |    350 batches | lr 3.75e-05 | ms/batch 419.52 | loss  3.73 | ppl    41.765
| epoch 226 step    98500 |    400 batches | lr 3.74e-05 | ms/batch 318.77 | loss  3.82 | ppl    45.726
| epoch 227 step    98550 |     14 batches | lr 3.73e-05 | ms/batch 315.13 | loss  3.85 | ppl    46.966
| epoch 227 step    98600 |     64 batches | lr 3.71e-05 | ms/batch 322.51 | loss  3.78 | ppl    43.754
| epoch 227 step    98650 |    114 batches | lr 3.7e-05 | ms/batch 320.32 | loss  3.81 | ppl    45.089
| epoch 227 step    98700 |    164 batches | lr 3.69e-05 | ms/batch 320.41 | loss  3.79 | ppl    44.329
| epoch 227 step    98750 |    214 batches | lr 3.67e-05 | ms/batch 320.17 | loss  3.84 | ppl    46.397
| epoch 227 step    98800 |    264 batches | lr 3.66e-05 | ms/batch 321.05 | loss  3.85 | ppl    47.157
----------------------------------------------------------------------------------------------------
| Eval 247 at step    98800 | time: 132.88s | valid loss  4.20 | valid ppl    66.614
----------------------------------------------------------------------------------------------------
| epoch 227 step    98850 |    314 batches | lr 3.65e-05 | ms/batch 419.40 | loss  3.81 | ppl    45.327
| epoch 227 step    98900 |    364 batches | lr 3.63e-05 | ms/batch 318.02 | loss  3.77 | ppl    43.416
| epoch 227 step    98950 |    414 batches | lr 3.62e-05 | ms/batch 319.45 | loss  3.81 | ppl    45.242
| epoch 228 step    99000 |     28 batches | lr 3.61e-05 | ms/batch 314.11 | loss  3.85 | ppl    47.103
| epoch 228 step    99050 |     78 batches | lr 3.59e-05 | ms/batch 320.11 | loss  3.76 | ppl    43.053
| epoch 228 step    99100 |    128 batches | lr 3.58e-05 | ms/batch 319.86 | loss  3.81 | ppl    45.223
| epoch 228 step    99150 |    178 batches | lr 3.57e-05 | ms/batch 319.82 | loss  3.83 | ppl    46.081
| epoch 228 step    99200 |    228 batches | lr 3.55e-05 | ms/batch 320.55 | loss  3.83 | ppl    46.250
----------------------------------------------------------------------------------------------------
| Eval 248 at step    99200 | time: 132.59s | valid loss  4.19 | valid ppl    66.224
----------------------------------------------------------------------------------------------------
| epoch 228 step    99250 |    278 batches | lr 3.54e-05 | ms/batch 420.17 | loss  3.85 | ppl    46.867
| epoch 228 step    99300 |    328 batches | lr 3.53e-05 | ms/batch 319.20 | loss  3.80 | ppl    44.733
| epoch 228 step    99350 |    378 batches | lr 3.51e-05 | ms/batch 320.01 | loss  3.81 | ppl    45.228
| epoch 228 step    99400 |    428 batches | lr 3.5e-05 | ms/batch 319.91 | loss  3.83 | ppl    45.982
| epoch 229 step    99450 |     42 batches | lr 3.49e-05 | ms/batch 311.88 | loss  3.80 | ppl    44.605
| epoch 229 step    99500 |     92 batches | lr 3.47e-05 | ms/batch 320.01 | loss  3.77 | ppl    43.514
| epoch 229 step    99550 |    142 batches | lr 3.46e-05 | ms/batch 319.78 | loss  3.79 | ppl    44.444
| epoch 229 step    99600 |    192 batches | lr 3.45e-05 | ms/batch 320.55 | loss  3.82 | ppl    45.724
----------------------------------------------------------------------------------------------------
| Eval 249 at step    99600 | time: 132.59s | valid loss  4.19 | valid ppl    66.209
----------------------------------------------------------------------------------------------------
| epoch 229 step    99650 |    242 batches | lr 3.43e-05 | ms/batch 420.53 | loss  3.85 | ppl    46.846
| epoch 229 step    99700 |    292 batches | lr 3.42e-05 | ms/batch 320.19 | loss  3.85 | ppl    47.017
| epoch 229 step    99750 |    342 batches | lr 3.41e-05 | ms/batch 320.35 | loss  3.71 | ppl    40.820
| epoch 229 step    99800 |    392 batches | lr 3.39e-05 | ms/batch 320.59 | loss  3.83 | ppl    46.261
| epoch 230 step    99850 |      6 batches | lr 3.38e-05 | ms/batch 312.76 | loss  3.87 | ppl    47.817
| epoch 230 step    99900 |     56 batches | lr 3.37e-05 | ms/batch 320.79 | loss  3.76 | ppl    42.987
| epoch 230 step    99950 |    106 batches | lr 3.36e-05 | ms/batch 320.13 | loss  3.77 | ppl    43.218
| epoch 230 step   100000 |    156 batches | lr 3.34e-05 | ms/batch 319.48 | loss  3.83 | ppl    46.171
----------------------------------------------------------------------------------------------------
| Eval 250 at step   100000 | time: 132.75s | valid loss  4.19 | valid ppl    66.130
----------------------------------------------------------------------------------------------------
| epoch 230 step   100050 |    206 batches | lr 3.33e-05 | ms/batch 421.43 | loss  3.82 | ppl    45.583
| epoch 230 step   100100 |    256 batches | lr 3.32e-05 | ms/batch 320.16 | loss  3.84 | ppl    46.728
| epoch 230 step   100150 |    306 batches | lr 3.3e-05 | ms/batch 320.76 | loss  3.83 | ppl    46.037
| epoch 230 step   100200 |    356 batches | lr 3.29e-05 | ms/batch 320.25 | loss  3.73 | ppl    41.733
| epoch 230 step   100250 |    406 batches | lr 3.28e-05 | ms/batch 320.43 | loss  3.80 | ppl    44.712
| epoch 231 step   100300 |     20 batches | lr 3.27e-05 | ms/batch 314.47 | loss  3.84 | ppl    46.571
| epoch 231 step   100350 |     70 batches | lr 3.25e-05 | ms/batch 318.51 | loss  3.75 | ppl    42.618
| epoch 231 step   100400 |    120 batches | lr 3.24e-05 | ms/batch 318.83 | loss  3.82 | ppl    45.749
----------------------------------------------------------------------------------------------------
| Eval 251 at step   100400 | time: 132.74s | valid loss  4.20 | valid ppl    66.638
----------------------------------------------------------------------------------------------------
| epoch 231 step   100450 |    170 batches | lr 3.23e-05 | ms/batch 419.81 | loss  3.79 | ppl    44.080
| epoch 231 step   100500 |    220 batches | lr 3.21e-05 | ms/batch 319.30 | loss  3.83 | ppl    46.093
| epoch 231 step   100550 |    270 batches | lr 3.2e-05 | ms/batch 319.43 | loss  3.83 | ppl    46.214
| epoch 231 step   100600 |    320 batches | lr 3.19e-05 | ms/batch 318.33 | loss  3.81 | ppl    45.043
| epoch 231 step   100650 |    370 batches | lr 3.18e-05 | ms/batch 319.05 | loss  3.77 | ppl    43.529
| epoch 231 step   100700 |    420 batches | lr 3.16e-05 | ms/batch 319.74 | loss  3.80 | ppl    44.583
| epoch 232 step   100750 |     34 batches | lr 3.15e-05 | ms/batch 313.64 | loss  3.84 | ppl    46.433
| epoch 232 step   100800 |     84 batches | lr 3.14e-05 | ms/batch 319.17 | loss  3.74 | ppl    42.057
----------------------------------------------------------------------------------------------------
| Eval 252 at step   100800 | time: 132.43s | valid loss  4.19 | valid ppl    66.061
----------------------------------------------------------------------------------------------------
| epoch 232 step   100850 |    134 batches | lr 3.13e-05 | ms/batch 454.31 | loss  3.81 | ppl    45.343
| epoch 232 step   100900 |    184 batches | lr 3.11e-05 | ms/batch 320.94 | loss  3.82 | ppl    45.708
| epoch 232 step   100950 |    234 batches | lr 3.1e-05 | ms/batch 320.98 | loss  3.83 | ppl    45.888
| epoch 232 step   101000 |    284 batches | lr 3.09e-05 | ms/batch 322.00 | loss  3.84 | ppl    46.564
| epoch 232 step   101050 |    334 batches | lr 3.08e-05 | ms/batch 319.90 | loss  3.75 | ppl    42.509
| epoch 232 step   101100 |    384 batches | lr 3.06e-05 | ms/batch 319.71 | loss  3.83 | ppl    46.245
| epoch 232 step   101150 |    434 batches | lr 3.05e-05 | ms/batch 319.75 | loss  3.85 | ppl    46.771
| epoch 233 step   101200 |     48 batches | lr 3.04e-05 | ms/batch 313.48 | loss  3.78 | ppl    43.734
----------------------------------------------------------------------------------------------------
| Eval 253 at step   101200 | time: 132.87s | valid loss  4.20 | valid ppl    66.461
----------------------------------------------------------------------------------------------------
| epoch 233 step   101250 |     98 batches | lr 3.03e-05 | ms/batch 428.75 | loss  3.77 | ppl    43.548
| epoch 233 step   101300 |    148 batches | lr 3.01e-05 | ms/batch 335.61 | loss  3.80 | ppl    44.537
| epoch 233 step   101350 |    198 batches | lr 3e-05 | ms/batch 336.32 | loss  3.80 | ppl    44.810
| epoch 233 step   101400 |    248 batches | lr 2.99e-05 | ms/batch 336.61 | loss  3.83 | ppl    46.082
| epoch 233 step   101450 |    298 batches | lr 2.98e-05 | ms/batch 331.24 | loss  3.85 | ppl    47.039
| epoch 233 step   101500 |    348 batches | lr 2.96e-05 | ms/batch 320.15 | loss  3.73 | ppl    41.512
| epoch 233 step   101550 |    398 batches | lr 2.95e-05 | ms/batch 319.79 | loss  3.81 | ppl    45.138
| epoch 234 step   101600 |     12 batches | lr 2.94e-05 | ms/batch 312.52 | loss  3.86 | ppl    47.251
----------------------------------------------------------------------------------------------------
| Eval 254 at step   101600 | time: 136.06s | valid loss  4.19 | valid ppl    66.147
----------------------------------------------------------------------------------------------------
| epoch 234 step   101650 |     62 batches | lr 2.93e-05 | ms/batch 419.46 | loss  3.76 | ppl    43.093
| epoch 234 step   101700 |    112 batches | lr 2.92e-05 | ms/batch 319.89 | loss  3.78 | ppl    43.744
| epoch 234 step   101750 |    162 batches | lr 2.9e-05 | ms/batch 319.80 | loss  3.81 | ppl    45.013
| epoch 234 step   101800 |    212 batches | lr 2.89e-05 | ms/batch 319.49 | loss  3.83 | ppl    46.138
| epoch 234 step   101850 |    262 batches | lr 2.88e-05 | ms/batch 319.06 | loss  3.82 | ppl    45.570
| epoch 234 step   101900 |    312 batches | lr 2.87e-05 | ms/batch 319.25 | loss  3.84 | ppl    46.312
| epoch 234 step   101950 |    362 batches | lr 2.86e-05 | ms/batch 318.21 | loss  3.77 | ppl    43.211
| epoch 234 step   102000 |    412 batches | lr 2.84e-05 | ms/batch 317.98 | loss  3.79 | ppl    44.308
----------------------------------------------------------------------------------------------------
| Eval 255 at step   102000 | time: 132.63s | valid loss  4.19 | valid ppl    66.229
----------------------------------------------------------------------------------------------------
| epoch 235 step   102050 |     26 batches | lr 2.83e-05 | ms/batch 413.45 | loss  3.85 | ppl    47.019
| epoch 235 step   102100 |     76 batches | lr 2.82e-05 | ms/batch 318.85 | loss  3.75 | ppl    42.548
| epoch 235 step   102150 |    126 batches | lr 2.81e-05 | ms/batch 318.46 | loss  3.80 | ppl    44.706
| epoch 235 step   102200 |    176 batches | lr 2.8e-05 | ms/batch 318.46 | loss  3.81 | ppl    45.327
| epoch 235 step   102250 |    226 batches | lr 2.78e-05 | ms/batch 318.77 | loss  3.81 | ppl    45.129
| epoch 235 step   102300 |    276 batches | lr 2.77e-05 | ms/batch 318.70 | loss  3.85 | ppl    46.850
| epoch 235 step   102350 |    326 batches | lr 2.76e-05 | ms/batch 318.09 | loss  3.79 | ppl    44.357
| epoch 235 step   102400 |    376 batches | lr 2.75e-05 | ms/batch 319.44 | loss  3.78 | ppl    43.866
----------------------------------------------------------------------------------------------------
| Eval 256 at step   102400 | time: 132.22s | valid loss  4.19 | valid ppl    66.020
----------------------------------------------------------------------------------------------------
| epoch 235 step   102450 |    426 batches | lr 2.74e-05 | ms/batch 454.19 | loss  3.83 | ppl    45.976
| epoch 236 step   102500 |     40 batches | lr 2.72e-05 | ms/batch 314.24 | loss  3.80 | ppl    44.911
| epoch 236 step   102550 |     90 batches | lr 2.71e-05 | ms/batch 319.87 | loss  3.74 | ppl    42.142
| epoch 236 step   102600 |    140 batches | lr 2.7e-05 | ms/batch 319.02 | loss  3.82 | ppl    45.556
| epoch 236 step   102650 |    190 batches | lr 2.69e-05 | ms/batch 319.05 | loss  3.81 | ppl    44.967
| epoch 236 step   102700 |    240 batches | lr 2.68e-05 | ms/batch 318.61 | loss  3.81 | ppl    45.136
| epoch 236 step   102750 |    290 batches | lr 2.67e-05 | ms/batch 318.24 | loss  3.88 | ppl    48.538
| epoch 236 step   102800 |    340 batches | lr 2.65e-05 | ms/batch 317.49 | loss  3.71 | ppl    41.030
----------------------------------------------------------------------------------------------------
| Eval 257 at step   102800 | time: 132.36s | valid loss  4.19 | valid ppl    66.086
----------------------------------------------------------------------------------------------------
| epoch 236 step   102850 |    390 batches | lr 2.64e-05 | ms/batch 419.15 | loss  3.81 | ppl    45.267
| epoch 237 step   102900 |      4 batches | lr 2.63e-05 | ms/batch 314.07 | loss  3.83 | ppl    46.203
| epoch 237 step   102950 |     54 batches | lr 2.62e-05 | ms/batch 317.98 | loss  3.78 | ppl    43.661
| epoch 237 step   103000 |    104 batches | lr 2.61e-05 | ms/batch 318.93 | loss  3.79 | ppl    44.099
| epoch 237 step   103050 |    154 batches | lr 2.6e-05 | ms/batch 319.14 | loss  3.80 | ppl    44.536
| epoch 237 step   103100 |    204 batches | lr 2.58e-05 | ms/batch 320.09 | loss  3.80 | ppl    44.593
| epoch 237 step   103150 |    254 batches | lr 2.57e-05 | ms/batch 319.41 | loss  3.84 | ppl    46.359
| epoch 237 step   103200 |    304 batches | lr 2.56e-05 | ms/batch 319.93 | loss  3.84 | ppl    46.380
----------------------------------------------------------------------------------------------------
| Eval 258 at step   103200 | time: 132.47s | valid loss  4.19 | valid ppl    65.936
----------------------------------------------------------------------------------------------------
| epoch 237 step   103250 |    354 batches | lr 2.55e-05 | ms/batch 454.08 | loss  3.75 | ppl    42.433
| epoch 237 step   103300 |    404 batches | lr 2.54e-05 | ms/batch 320.76 | loss  3.79 | ppl    44.416
| epoch 238 step   103350 |     18 batches | lr 2.53e-05 | ms/batch 314.08 | loss  3.84 | ppl    46.750
| epoch 238 step   103400 |     68 batches | lr 2.52e-05 | ms/batch 320.69 | loss  3.75 | ppl    42.335
| epoch 238 step   103450 |    118 batches | lr 2.5e-05 | ms/batch 320.51 | loss  3.80 | ppl    44.864
| epoch 238 step   103500 |    168 batches | lr 2.49e-05 | ms/batch 319.40 | loss  3.79 | ppl    44.149
| epoch 238 step   103550 |    218 batches | lr 2.48e-05 | ms/batch 319.32 | loss  3.83 | ppl    45.887
| epoch 238 step   103600 |    268 batches | lr 2.47e-05 | ms/batch 321.25 | loss  3.82 | ppl    45.722
----------------------------------------------------------------------------------------------------
| Eval 259 at step   103600 | time: 132.85s | valid loss  4.20 | valid ppl    66.411
----------------------------------------------------------------------------------------------------
| epoch 238 step   103650 |    318 batches | lr 2.46e-05 | ms/batch 420.76 | loss  3.81 | ppl    45.285
| epoch 238 step   103700 |    368 batches | lr 2.45e-05 | ms/batch 320.48 | loss  3.77 | ppl    43.319
| epoch 238 step   103750 |    418 batches | lr 2.44e-05 | ms/batch 319.86 | loss  3.80 | ppl    44.591
| epoch 239 step   103800 |     32 batches | lr 2.43e-05 | ms/batch 313.58 | loss  3.84 | ppl    46.440
| epoch 239 step   103850 |     82 batches | lr 2.41e-05 | ms/batch 319.83 | loss  3.75 | ppl    42.629
| epoch 239 step   103900 |    132 batches | lr 2.4e-05 | ms/batch 319.37 | loss  3.79 | ppl    44.196
| epoch 239 step   103950 |    182 batches | lr 2.39e-05 | ms/batch 319.34 | loss  3.81 | ppl    44.925
| epoch 239 step   104000 |    232 batches | lr 2.38e-05 | ms/batch 318.60 | loss  3.83 | ppl    45.856
----------------------------------------------------------------------------------------------------
| Eval 260 at step   104000 | time: 132.57s | valid loss  4.19 | valid ppl    66.028
----------------------------------------------------------------------------------------------------
| epoch 239 step   104050 |    282 batches | lr 2.37e-05 | ms/batch 420.25 | loss  3.84 | ppl    46.638
| epoch 239 step   104100 |    332 batches | lr 2.36e-05 | ms/batch 319.52 | loss  3.75 | ppl    42.387
| epoch 239 step   104150 |    382 batches | lr 2.35e-05 | ms/batch 319.05 | loss  3.78 | ppl    44.005
| epoch 239 step   104200 |    432 batches | lr 2.34e-05 | ms/batch 319.08 | loss  3.83 | ppl    46.169
| epoch 240 step   104250 |     46 batches | lr 2.33e-05 | ms/batch 315.45 | loss  3.80 | ppl    44.555
| epoch 240 step   104300 |     96 batches | lr 2.32e-05 | ms/batch 320.64 | loss  3.74 | ppl    42.264
| epoch 240 step   104350 |    146 batches | lr 2.3e-05 | ms/batch 320.93 | loss  3.79 | ppl    44.447
| epoch 240 step   104400 |    196 batches | lr 2.29e-05 | ms/batch 320.34 | loss  3.82 | ppl    45.718
----------------------------------------------------------------------------------------------------
| Eval 261 at step   104400 | time: 132.77s | valid loss  4.19 | valid ppl    66.030
----------------------------------------------------------------------------------------------------
| epoch 240 step   104450 |    246 batches | lr 2.28e-05 | ms/batch 421.47 | loss  3.81 | ppl    45.297
| epoch 240 step   104500 |    296 batches | lr 2.27e-05 | ms/batch 320.48 | loss  3.86 | ppl    47.306
| epoch 240 step   104550 |    346 batches | lr 2.26e-05 | ms/batch 319.92 | loss  3.71 | ppl    40.996
| epoch 240 step   104600 |    396 batches | lr 2.25e-05 | ms/batch 321.12 | loss  3.79 | ppl    44.338
| epoch 241 step   104650 |     10 batches | lr 2.24e-05 | ms/batch 319.01 | loss  3.83 | ppl    45.983
| epoch 241 step   104700 |     60 batches | lr 2.23e-05 | ms/batch 322.16 | loss  3.75 | ppl    42.478
| epoch 241 step   104750 |    110 batches | lr 2.22e-05 | ms/batch 318.37 | loss  3.77 | ppl    43.489
| epoch 241 step   104800 |    160 batches | lr 2.21e-05 | ms/batch 319.44 | loss  3.77 | ppl    43.502
----------------------------------------------------------------------------------------------------
| Eval 262 at step   104800 | time: 133.10s | valid loss  4.19 | valid ppl    65.947
----------------------------------------------------------------------------------------------------
| epoch 241 step   104850 |    210 batches | lr 2.2e-05 | ms/batch 420.22 | loss  3.81 | ppl    45.008
| epoch 241 step   104900 |    260 batches | lr 2.19e-05 | ms/batch 319.39 | loss  3.84 | ppl    46.391
| epoch 241 step   104950 |    310 batches | lr 2.18e-05 | ms/batch 318.56 | loss  3.82 | ppl    45.702
| epoch 241 step   105000 |    360 batches | lr 2.16e-05 | ms/batch 319.85 | loss  3.75 | ppl    42.713
| epoch 241 step   105050 |    410 batches | lr 2.15e-05 | ms/batch 319.74 | loss  3.80 | ppl    44.853
| epoch 242 step   105100 |     24 batches | lr 2.14e-05 | ms/batch 313.03 | loss  3.83 | ppl    46.070
| epoch 242 step   105150 |     74 batches | lr 2.13e-05 | ms/batch 319.62 | loss  3.74 | ppl    41.942
| epoch 242 step   105200 |    124 batches | lr 2.12e-05 | ms/batch 319.86 | loss  3.80 | ppl    44.708
----------------------------------------------------------------------------------------------------
| Eval 263 at step   105200 | time: 132.53s | valid loss  4.20 | valid ppl    66.419
----------------------------------------------------------------------------------------------------
| epoch 242 step   105250 |    174 batches | lr 2.11e-05 | ms/batch 421.13 | loss  3.81 | ppl    45.371
| epoch 242 step   105300 |    224 batches | lr 2.1e-05 | ms/batch 325.09 | loss  3.84 | ppl    46.415
| epoch 242 step   105350 |    274 batches | lr 2.09e-05 | ms/batch 334.81 | loss  3.83 | ppl    46.232
| epoch 242 step   105400 |    324 batches | lr 2.08e-05 | ms/batch 325.34 | loss  3.76 | ppl    42.907
| epoch 242 step   105450 |    374 batches | lr 2.07e-05 | ms/batch 321.31 | loss  3.79 | ppl    44.438
| epoch 242 step   105500 |    424 batches | lr 2.06e-05 | ms/batch 318.06 | loss  3.80 | ppl    44.808
| epoch 243 step   105550 |     38 batches | lr 2.05e-05 | ms/batch 313.35 | loss  3.81 | ppl    45.076
| epoch 243 step   105600 |     88 batches | lr 2.04e-05 | ms/batch 318.74 | loss  3.74 | ppl    41.926
----------------------------------------------------------------------------------------------------
| Eval 264 at step   105600 | time: 133.85s | valid loss  4.19 | valid ppl    66.101
----------------------------------------------------------------------------------------------------
| epoch 243 step   105650 |    138 batches | lr 2.03e-05 | ms/batch 418.54 | loss  3.81 | ppl    45.239
| epoch 243 step   105700 |    188 batches | lr 2.02e-05 | ms/batch 318.73 | loss  3.81 | ppl    44.948
| epoch 243 step   105750 |    238 batches | lr 2.01e-05 | ms/batch 318.55 | loss  3.80 | ppl    44.874
| epoch 243 step   105800 |    288 batches | lr 2e-05 | ms/batch 319.19 | loss  3.87 | ppl    47.797
| epoch 243 step   105850 |    338 batches | lr 1.99e-05 | ms/batch 319.27 | loss  3.74 | ppl    42.009
| epoch 243 step   105900 |    388 batches | lr 1.98e-05 | ms/batch 318.34 | loss  3.79 | ppl    44.431
| epoch 244 step   105950 |      2 batches | lr 1.97e-05 | ms/batch 312.55 | loss  3.84 | ppl    46.491
| epoch 244 step   106000 |     52 batches | lr 1.96e-05 | ms/batch 319.93 | loss  3.75 | ppl    42.554
----------------------------------------------------------------------------------------------------
| Eval 265 at step   106000 | time: 132.28s | valid loss  4.19 | valid ppl    66.328
----------------------------------------------------------------------------------------------------
| epoch 244 step   106050 |    102 batches | lr 1.95e-05 | ms/batch 420.75 | loss  3.75 | ppl    42.476
| epoch 244 step   106100 |    152 batches | lr 1.94e-05 | ms/batch 320.23 | loss  3.81 | ppl    44.941
| epoch 244 step   106150 |    202 batches | lr 1.93e-05 | ms/batch 321.23 | loss  3.81 | ppl    45.043
| epoch 244 step   106200 |    252 batches | lr 1.92e-05 | ms/batch 320.31 | loss  3.81 | ppl    45.101
| epoch 244 step   106250 |    302 batches | lr 1.91e-05 | ms/batch 318.46 | loss  3.85 | ppl    46.977
| epoch 244 step   106300 |    352 batches | lr 1.9e-05 | ms/batch 319.09 | loss  3.71 | ppl    40.803
| epoch 244 step   106350 |    402 batches | lr 1.89e-05 | ms/batch 318.93 | loss  3.80 | ppl    44.699
| epoch 245 step   106400 |     16 batches | lr 1.88e-05 | ms/batch 312.84 | loss  3.84 | ppl    46.346
----------------------------------------------------------------------------------------------------
| Eval 266 at step   106400 | time: 132.58s | valid loss  4.19 | valid ppl    66.098
----------------------------------------------------------------------------------------------------
| epoch 245 step   106450 |     66 batches | lr 1.87e-05 | ms/batch 419.77 | loss  3.76 | ppl    42.834
| epoch 245 step   106500 |    116 batches | lr 1.86e-05 | ms/batch 319.50 | loss  3.79 | ppl    44.251
| epoch 245 step   106550 |    166 batches | lr 1.85e-05 | ms/batch 321.81 | loss  3.78 | ppl    43.924
| epoch 245 step   106600 |    216 batches | lr 1.84e-05 | ms/batch 320.61 | loss  3.80 | ppl    44.764
| epoch 245 step   106650 |    266 batches | lr 1.83e-05 | ms/batch 320.50 | loss  3.80 | ppl    44.506
| epoch 245 step   106700 |    316 batches | lr 1.82e-05 | ms/batch 322.44 | loss  3.80 | ppl    44.612
| epoch 245 step   106750 |    366 batches | lr 1.81e-05 | ms/batch 319.84 | loss  3.74 | ppl    42.114
| epoch 245 step   106800 |    416 batches | lr 1.8e-05 | ms/batch 320.41 | loss  3.78 | ppl    43.946
----------------------------------------------------------------------------------------------------
| Eval 267 at step   106800 | time: 133.25s | valid loss  4.19 | valid ppl    66.033
----------------------------------------------------------------------------------------------------
| epoch 246 step   106850 |     30 batches | lr 1.79e-05 | ms/batch 412.38 | loss  3.83 | ppl    46.122
| epoch 246 step   106900 |     80 batches | lr 1.78e-05 | ms/batch 320.00 | loss  3.75 | ppl    42.478
| epoch 246 step   106950 |    130 batches | lr 1.77e-05 | ms/batch 320.54 | loss  3.77 | ppl    43.445
| epoch 246 step   107000 |    180 batches | lr 1.76e-05 | ms/batch 320.09 | loss  3.80 | ppl    44.544
| epoch 246 step   107050 |    230 batches | lr 1.75e-05 | ms/batch 320.74 | loss  3.82 | ppl    45.604
| epoch 246 step   107100 |    280 batches | lr 1.74e-05 | ms/batch 320.30 | loss  3.83 | ppl    45.965
| epoch 246 step   107150 |    330 batches | lr 1.73e-05 | ms/batch 320.68 | loss  3.74 | ppl    42.264
| epoch 246 step   107200 |    380 batches | lr 1.72e-05 | ms/batch 318.36 | loss  3.79 | ppl    44.433
----------------------------------------------------------------------------------------------------
| Eval 268 at step   107200 | time: 132.65s | valid loss  4.19 | valid ppl    65.914
----------------------------------------------------------------------------------------------------
| epoch 246 step   107250 |    430 batches | lr 1.71e-05 | ms/batch 450.80 | loss  3.79 | ppl    44.447
| epoch 247 step   107300 |     44 batches | lr 1.7e-05 | ms/batch 312.55 | loss  3.79 | ppl    44.294
| epoch 247 step   107350 |     94 batches | lr 1.69e-05 | ms/batch 318.62 | loss  3.75 | ppl    42.470
| epoch 247 step   107400 |    144 batches | lr 1.68e-05 | ms/batch 317.91 | loss  3.79 | ppl    44.267
| epoch 247 step   107450 |    194 batches | lr 1.67e-05 | ms/batch 319.08 | loss  3.79 | ppl    44.352
| epoch 247 step   107500 |    244 batches | lr 1.67e-05 | ms/batch 318.97 | loss  3.82 | ppl    45.379
| epoch 247 step   107550 |    294 batches | lr 1.66e-05 | ms/batch 317.97 | loss  3.86 | ppl    47.540
| epoch 247 step   107600 |    344 batches | lr 1.65e-05 | ms/batch 318.17 | loss  3.71 | ppl    40.723
----------------------------------------------------------------------------------------------------
| Eval 269 at step   107600 | time: 132.09s | valid loss  4.19 | valid ppl    65.962
----------------------------------------------------------------------------------------------------
| epoch 247 step   107650 |    394 batches | lr 1.64e-05 | ms/batch 419.40 | loss  3.81 | ppl    45.013
| epoch 248 step   107700 |      8 batches | lr 1.63e-05 | ms/batch 314.10 | loss  3.82 | ppl    45.441
| epoch 248 step   107750 |     58 batches | lr 1.62e-05 | ms/batch 321.39 | loss  3.75 | ppl    42.405
| epoch 248 step   107800 |    108 batches | lr 1.61e-05 | ms/batch 319.67 | loss  3.77 | ppl    43.270
| epoch 248 step   107850 |    158 batches | lr 1.6e-05 | ms/batch 321.91 | loss  3.80 | ppl    44.815
| epoch 248 step   107900 |    208 batches | lr 1.59e-05 | ms/batch 321.35 | loss  3.78 | ppl    43.873
| epoch 248 step   107950 |    258 batches | lr 1.58e-05 | ms/batch 320.39 | loss  3.82 | ppl    45.379
| epoch 248 step   108000 |    308 batches | lr 1.57e-05 | ms/batch 320.66 | loss  3.83 | ppl    45.860
----------------------------------------------------------------------------------------------------
| Eval 270 at step   108000 | time: 132.97s | valid loss  4.19 | valid ppl    66.058
----------------------------------------------------------------------------------------------------
| epoch 248 step   108050 |    358 batches | lr 1.56e-05 | ms/batch 423.00 | loss  3.75 | ppl    42.706
| epoch 248 step   108100 |    408 batches | lr 1.55e-05 | ms/batch 329.02 | loss  3.80 | ppl    44.541
| epoch 249 step   108150 |     22 batches | lr 1.55e-05 | ms/batch 319.16 | loss  3.82 | ppl    45.770
| epoch 249 step   108200 |     72 batches | lr 1.54e-05 | ms/batch 319.31 | loss  3.73 | ppl    41.754
| epoch 249 step   108250 |    122 batches | lr 1.53e-05 | ms/batch 319.34 | loss  3.80 | ppl    44.713
| epoch 249 step   108300 |    172 batches | lr 1.52e-05 | ms/batch 318.95 | loss  3.77 | ppl    43.555
| epoch 249 step   108350 |    222 batches | lr 1.51e-05 | ms/batch 318.84 | loss  3.82 | ppl    45.570
| epoch 249 step   108400 |    272 batches | lr 1.5e-05 | ms/batch 318.86 | loss  3.81 | ppl    45.073
----------------------------------------------------------------------------------------------------
| Eval 271 at step   108400 | time: 133.30s | valid loss  4.19 | valid ppl    66.115
----------------------------------------------------------------------------------------------------
| epoch 249 step   108450 |    322 batches | lr 1.49e-05 | ms/batch 418.36 | loss  3.76 | ppl    42.856
| epoch 249 step   108500 |    372 batches | lr 1.48e-05 | ms/batch 320.15 | loss  3.76 | ppl    42.876
| epoch 249 step   108550 |    422 batches | lr 1.47e-05 | ms/batch 319.75 | loss  3.77 | ppl    43.287
| epoch 250 step   108600 |     36 batches | lr 1.47e-05 | ms/batch 312.02 | loss  3.82 | ppl    45.599
| epoch 250 step   108650 |     86 batches | lr 1.46e-05 | ms/batch 319.30 | loss  3.73 | ppl    41.653
| epoch 250 step   108700 |    136 batches | lr 1.45e-05 | ms/batch 318.90 | loss  3.78 | ppl    43.684
| epoch 250 step   108750 |    186 batches | lr 1.44e-05 | ms/batch 318.43 | loss  3.80 | ppl    44.734
| epoch 250 step   108800 |    236 batches | lr 1.43e-05 | ms/batch 319.24 | loss  3.80 | ppl    44.888
----------------------------------------------------------------------------------------------------
| Eval 272 at step   108800 | time: 132.30s | valid loss  4.19 | valid ppl    66.189
----------------------------------------------------------------------------------------------------
| epoch 250 step   108850 |    286 batches | lr 1.42e-05 | ms/batch 419.64 | loss  3.85 | ppl    47.105
| epoch 250 step   108900 |    336 batches | lr 1.41e-05 | ms/batch 318.84 | loss  3.72 | ppl    41.245
| epoch 250 step   108950 |    386 batches | lr 1.4e-05 | ms/batch 319.91 | loss  3.78 | ppl    43.922
| epoch 250 step   109000 |    436 batches | lr 1.4e-05 | ms/batch 314.45 | loss  3.80 | ppl    44.892
| epoch 251 step   109050 |     50 batches | lr 1.39e-05 | ms/batch 317.00 | loss  3.77 | ppl    43.434
| epoch 251 step   109100 |    100 batches | lr 1.38e-05 | ms/batch 318.99 | loss  3.78 | ppl    43.789
| epoch 251 step   109150 |    150 batches | lr 1.37e-05 | ms/batch 318.92 | loss  3.78 | ppl    43.840
| epoch 251 step   109200 |    200 batches | lr 1.36e-05 | ms/batch 320.10 | loss  3.81 | ppl    45.249
----------------------------------------------------------------------------------------------------
| Eval 273 at step   109200 | time: 132.42s | valid loss  4.19 | valid ppl    66.188
----------------------------------------------------------------------------------------------------
| epoch 251 step   109250 |    250 batches | lr 1.35e-05 | ms/batch 420.56 | loss  3.79 | ppl    44.419
| epoch 251 step   109300 |    300 batches | lr 1.34e-05 | ms/batch 319.45 | loss  3.82 | ppl    45.474
| epoch 251 step   109350 |    350 batches | lr 1.34e-05 | ms/batch 319.87 | loss  3.71 | ppl    40.881
| epoch 251 step   109400 |    400 batches | lr 1.33e-05 | ms/batch 320.40 | loss  3.81 | ppl    44.997
| epoch 252 step   109450 |     14 batches | lr 1.32e-05 | ms/batch 313.54 | loss  3.81 | ppl    45.115
| epoch 252 step   109500 |     64 batches | lr 1.31e-05 | ms/batch 319.50 | loss  3.74 | ppl    42.187
| epoch 252 step   109550 |    114 batches | lr 1.3e-05 | ms/batch 318.71 | loss  3.75 | ppl    42.553
| epoch 252 step   109600 |    164 batches | lr 1.29e-05 | ms/batch 319.38 | loss  3.78 | ppl    43.674
----------------------------------------------------------------------------------------------------
| Eval 274 at step   109600 | time: 132.57s | valid loss  4.19 | valid ppl    66.046
----------------------------------------------------------------------------------------------------
| epoch 252 step   109650 |    214 batches | lr 1.29e-05 | ms/batch 421.67 | loss  3.80 | ppl    44.878
| epoch 252 step   109700 |    264 batches | lr 1.28e-05 | ms/batch 320.69 | loss  3.81 | ppl    45.138
| epoch 252 step   109750 |    314 batches | lr 1.27e-05 | ms/batch 320.51 | loss  3.79 | ppl    44.404
| epoch 252 step   109800 |    364 batches | lr 1.26e-05 | ms/batch 320.97 | loss  3.73 | ppl    41.749
| epoch 252 step   109850 |    414 batches | lr 1.25e-05 | ms/batch 319.52 | loss  3.79 | ppl    44.308
| epoch 253 step   109900 |     28 batches | lr 1.25e-05 | ms/batch 314.30 | loss  3.82 | ppl    45.611
| epoch 253 step   109950 |     78 batches | lr 1.24e-05 | ms/batch 320.75 | loss  3.72 | ppl    41.181
| epoch 253 step   110000 |    128 batches | lr 1.23e-05 | ms/batch 318.63 | loss  3.77 | ppl    43.419
----------------------------------------------------------------------------------------------------
| Eval 275 at step   110000 | time: 132.85s | valid loss  4.19 | valid ppl    66.091
----------------------------------------------------------------------------------------------------
| epoch 253 step   110050 |    178 batches | lr 1.22e-05 | ms/batch 418.80 | loss  3.79 | ppl    44.205
| epoch 253 step   110100 |    228 batches | lr 1.21e-05 | ms/batch 319.05 | loss  3.79 | ppl    44.291
| epoch 253 step   110150 |    278 batches | lr 1.2e-05 | ms/batch 318.27 | loss  3.82 | ppl    45.499
| epoch 253 step   110200 |    328 batches | lr 1.2e-05 | ms/batch 318.66 | loss  3.74 | ppl    42.197
| epoch 253 step   110250 |    378 batches | lr 1.19e-05 | ms/batch 317.96 | loss  3.76 | ppl    42.903
| epoch 253 step   110300 |    428 batches | lr 1.18e-05 | ms/batch 319.01 | loss  3.80 | ppl    44.598
| epoch 254 step   110350 |     42 batches | lr 1.17e-05 | ms/batch 312.05 | loss  3.76 | ppl    43.133
| epoch 254 step   110400 |     92 batches | lr 1.16e-05 | ms/batch 318.44 | loss  3.70 | ppl    40.320
----------------------------------------------------------------------------------------------------
| Eval 276 at step   110400 | time: 132.16s | valid loss  4.19 | valid ppl    65.999
----------------------------------------------------------------------------------------------------
| epoch 254 step   110450 |    142 batches | lr 1.16e-05 | ms/batch 421.42 | loss  3.81 | ppl    45.265
| epoch 254 step   110500 |    192 batches | lr 1.15e-05 | ms/batch 319.16 | loss  3.80 | ppl    44.715
| epoch 254 step   110550 |    242 batches | lr 1.14e-05 | ms/batch 319.57 | loss  3.83 | ppl    46.016
| epoch 254 step   110600 |    292 batches | lr 1.13e-05 | ms/batch 319.05 | loss  3.85 | ppl    47.162
| epoch 254 step   110650 |    342 batches | lr 1.13e-05 | ms/batch 317.91 | loss  3.71 | ppl    40.652
| epoch 254 step   110700 |    392 batches | lr 1.12e-05 | ms/batch 321.80 | loss  3.82 | ppl    45.387
| epoch 255 step   110750 |      6 batches | lr 1.11e-05 | ms/batch 315.07 | loss  3.84 | ppl    46.513
| epoch 255 step   110800 |     56 batches | lr 1.1e-05 | ms/batch 320.12 | loss  3.73 | ppl    41.826
----------------------------------------------------------------------------------------------------
| Eval 277 at step   110800 | time: 132.70s | valid loss  4.19 | valid ppl    66.075
----------------------------------------------------------------------------------------------------
| epoch 255 step   110850 |    106 batches | lr 1.1e-05 | ms/batch 421.40 | loss  3.74 | ppl    41.967
| epoch 255 step   110900 |    156 batches | lr 1.09e-05 | ms/batch 320.28 | loss  3.77 | ppl    43.226
| epoch 255 step   110950 |    206 batches | lr 1.08e-05 | ms/batch 319.86 | loss  3.78 | ppl    43.671
| epoch 255 step   111000 |    256 batches | lr 1.07e-05 | ms/batch 319.26 | loss  3.80 | ppl    44.769
| epoch 255 step   111050 |    306 batches | lr 1.06e-05 | ms/batch 319.43 | loss  3.81 | ppl    45.073
| epoch 255 step   111100 |    356 batches | lr 1.06e-05 | ms/batch 320.24 | loss  3.69 | ppl    40.192
| epoch 255 step   111150 |    406 batches | lr 1.05e-05 | ms/batch 320.31 | loss  3.78 | ppl    43.977
| epoch 256 step   111200 |     20 batches | lr 1.04e-05 | ms/batch 314.52 | loss  3.80 | ppl    44.689
----------------------------------------------------------------------------------------------------
| Eval 278 at step   111200 | time: 132.76s | valid loss  4.19 | valid ppl    65.965
----------------------------------------------------------------------------------------------------
| epoch 256 step   111250 |     70 batches | lr 1.03e-05 | ms/batch 420.88 | loss  3.73 | ppl    41.846
| epoch 256 step   111300 |    120 batches | lr 1.03e-05 | ms/batch 321.35 | loss  3.79 | ppl    44.253
| epoch 256 step   111350 |    170 batches | lr 1.02e-05 | ms/batch 320.58 | loss  3.80 | ppl    44.577
| epoch 256 step   111400 |    220 batches | lr 1.01e-05 | ms/batch 320.15 | loss  3.81 | ppl    45.345
| epoch 256 step   111450 |    270 batches | lr 1e-05 | ms/batch 319.43 | loss  3.82 | ppl    45.453
| epoch 256 step   111500 |    320 batches | lr 9.98e-06 | ms/batch 319.62 | loss  3.79 | ppl    44.258
| epoch 256 step   111550 |    370 batches | lr 9.9e-06 | ms/batch 319.17 | loss  3.77 | ppl    43.445
| epoch 256 step   111600 |    420 batches | lr 9.83e-06 | ms/batch 318.05 | loss  3.77 | ppl    43.240
----------------------------------------------------------------------------------------------------
| Eval 279 at step   111600 | time: 132.93s | valid loss  4.19 | valid ppl    65.920
----------------------------------------------------------------------------------------------------
| epoch 257 step   111650 |     34 batches | lr 9.76e-06 | ms/batch 412.40 | loss  3.79 | ppl    44.333
| epoch 257 step   111700 |     84 batches | lr 9.69e-06 | ms/batch 319.73 | loss  3.72 | ppl    41.161
| epoch 257 step   111750 |    134 batches | lr 9.61e-06 | ms/batch 318.80 | loss  3.82 | ppl    45.382
| epoch 257 step   111800 |    184 batches | lr 9.54e-06 | ms/batch 318.68 | loss  3.78 | ppl    43.647
| epoch 257 step   111850 |    234 batches | lr 9.47e-06 | ms/batch 318.88 | loss  3.79 | ppl    44.101
| epoch 257 step   111900 |    284 batches | lr 9.4e-06 | ms/batch 318.08 | loss  3.84 | ppl    46.331
| epoch 257 step   111950 |    334 batches | lr 9.33e-06 | ms/batch 318.46 | loss  3.71 | ppl    40.699
| epoch 257 step   112000 |    384 batches | lr 9.26e-06 | ms/batch 319.23 | loss  3.80 | ppl    44.548
----------------------------------------------------------------------------------------------------
| Eval 280 at step   112000 | time: 132.20s | valid loss  4.19 | valid ppl    65.910
----------------------------------------------------------------------------------------------------
| epoch 257 step   112050 |    434 batches | lr 9.19e-06 | ms/batch 454.15 | loss  3.81 | ppl    45.338
| epoch 258 step   112100 |     48 batches | lr 9.12e-06 | ms/batch 314.21 | loss  3.78 | ppl    43.811
| epoch 258 step   112150 |     98 batches | lr 9.05e-06 | ms/batch 320.03 | loss  3.73 | ppl    41.547
| epoch 258 step   112200 |    148 batches | lr 8.98e-06 | ms/batch 320.41 | loss  3.78 | ppl    43.946
| epoch 258 step   112250 |    198 batches | lr 8.91e-06 | ms/batch 320.20 | loss  3.79 | ppl    44.244
| epoch 258 step   112300 |    248 batches | lr 8.84e-06 | ms/batch 321.83 | loss  3.82 | ppl    45.645
| epoch 258 step   112350 |    298 batches | lr 8.77e-06 | ms/batch 319.26 | loss  3.84 | ppl    46.364
| epoch 258 step   112400 |    348 batches | lr 8.7e-06 | ms/batch 318.45 | loss  3.71 | ppl    40.811
----------------------------------------------------------------------------------------------------
| Eval 281 at step   112400 | time: 132.77s | valid loss  4.19 | valid ppl    65.883
----------------------------------------------------------------------------------------------------
| epoch 258 step   112450 |    398 batches | lr 8.63e-06 | ms/batch 451.07 | loss  3.78 | ppl    43.811
| epoch 259 step   112500 |     12 batches | lr 8.57e-06 | ms/batch 314.08 | loss  3.81 | ppl    45.363
| epoch 259 step   112550 |     62 batches | lr 8.5e-06 | ms/batch 319.29 | loss  3.72 | ppl    41.295
| epoch 259 step   112600 |    112 batches | lr 8.43e-06 | ms/batch 319.99 | loss  3.76 | ppl    42.979
| epoch 259 step   112650 |    162 batches | lr 8.36e-06 | ms/batch 320.36 | loss  3.79 | ppl    44.355
| epoch 259 step   112700 |    212 batches | lr 8.3e-06 | ms/batch 320.75 | loss  3.80 | ppl    44.670
| epoch 259 step   112750 |    262 batches | lr 8.23e-06 | ms/batch 318.88 | loss  3.81 | ppl    45.341
| epoch 259 step   112800 |    312 batches | lr 8.16e-06 | ms/batch 318.39 | loss  3.79 | ppl    44.303
----------------------------------------------------------------------------------------------------
| Eval 282 at step   112800 | time: 132.53s | valid loss  4.19 | valid ppl    65.943
----------------------------------------------------------------------------------------------------
| epoch 259 step   112850 |    362 batches | lr 8.1e-06 | ms/batch 419.44 | loss  3.73 | ppl    41.767
| epoch 259 step   112900 |    412 batches | lr 8.03e-06 | ms/batch 320.76 | loss  3.80 | ppl    44.503
| epoch 260 step   112950 |     26 batches | lr 7.96e-06 | ms/batch 313.58 | loss  3.81 | ppl    45.308
| epoch 260 step   113000 |     76 batches | lr 7.9e-06 | ms/batch 320.64 | loss  3.74 | ppl    41.919
| epoch 260 step   113050 |    126 batches | lr 7.83e-06 | ms/batch 320.91 | loss  3.78 | ppl    43.813
| epoch 260 step   113100 |    176 batches | lr 7.77e-06 | ms/batch 321.35 | loss  3.78 | ppl    43.878
| epoch 260 step   113150 |    226 batches | lr 7.7e-06 | ms/batch 321.37 | loss  3.79 | ppl    44.249
| epoch 260 step   113200 |    276 batches | lr 7.64e-06 | ms/batch 320.81 | loss  3.83 | ppl    45.958
----------------------------------------------------------------------------------------------------
| Eval 283 at step   113200 | time: 132.98s | valid loss  4.19 | valid ppl    66.133
----------------------------------------------------------------------------------------------------
| epoch 260 step   113250 |    326 batches | lr 7.58e-06 | ms/batch 432.37 | loss  3.75 | ppl    42.561
| epoch 260 step   113300 |    376 batches | lr 7.51e-06 | ms/batch 335.44 | loss  3.76 | ppl    42.992
| epoch 260 step   113350 |    426 batches | lr 7.45e-06 | ms/batch 335.73 | loss  3.80 | ppl    44.831
| epoch 261 step   113400 |     40 batches | lr 7.38e-06 | ms/batch 314.15 | loss  3.78 | ppl    43.635
| epoch 261 step   113450 |     90 batches | lr 7.32e-06 | ms/batch 319.69 | loss  3.71 | ppl    40.902
| epoch 261 step   113500 |    140 batches | lr 7.26e-06 | ms/batch 320.85 | loss  3.78 | ppl    43.811
| epoch 261 step   113550 |    190 batches | lr 7.2e-06 | ms/batch 320.99 | loss  3.77 | ppl    43.270
| epoch 261 step   113600 |    240 batches | lr 7.13e-06 | ms/batch 319.66 | loss  3.79 | ppl    44.376
----------------------------------------------------------------------------------------------------
| Eval 284 at step   113600 | time: 134.93s | valid loss  4.19 | valid ppl    66.049
----------------------------------------------------------------------------------------------------
| epoch 261 step   113650 |    290 batches | lr 7.07e-06 | ms/batch 421.48 | loss  3.84 | ppl    46.695
| epoch 261 step   113700 |    340 batches | lr 7.01e-06 | ms/batch 321.12 | loss  3.72 | ppl    41.100
| epoch 261 step   113750 |    390 batches | lr 6.95e-06 | ms/batch 320.68 | loss  3.78 | ppl    44.017
| epoch 262 step   113800 |      4 batches | lr 6.89e-06 | ms/batch 314.69 | loss  3.82 | ppl    45.592
| epoch 262 step   113850 |     54 batches | lr 6.83e-06 | ms/batch 320.17 | loss  3.74 | ppl    42.086
| epoch 262 step   113900 |    104 batches | lr 6.77e-06 | ms/batch 320.74 | loss  3.73 | ppl    41.517
| epoch 262 step   113950 |    154 batches | lr 6.71e-06 | ms/batch 321.28 | loss  3.78 | ppl    43.921
| epoch 262 step   114000 |    204 batches | lr 6.65e-06 | ms/batch 320.48 | loss  3.77 | ppl    43.540
----------------------------------------------------------------------------------------------------
| Eval 285 at step   114000 | time: 133.03s | valid loss  4.19 | valid ppl    65.972
----------------------------------------------------------------------------------------------------
| epoch 262 step   114050 |    254 batches | lr 6.59e-06 | ms/batch 421.32 | loss  3.81 | ppl    45.080
| epoch 262 step   114100 |    304 batches | lr 6.53e-06 | ms/batch 320.74 | loss  3.82 | ppl    45.464
| epoch 262 step   114150 |    354 batches | lr 6.47e-06 | ms/batch 320.25 | loss  3.71 | ppl    40.820
| epoch 262 step   114200 |    404 batches | lr 6.41e-06 | ms/batch 320.49 | loss  3.80 | ppl    44.916
| epoch 263 step   114250 |     18 batches | lr 6.35e-06 | ms/batch 314.62 | loss  3.80 | ppl    44.822
| epoch 263 step   114300 |     68 batches | lr 6.29e-06 | ms/batch 320.84 | loss  3.74 | ppl    42.294
| epoch 263 step   114350 |    118 batches | lr 6.23e-06 | ms/batch 320.80 | loss  3.79 | ppl    44.037
| epoch 263 step   114400 |    168 batches | lr 6.17e-06 | ms/batch 320.45 | loss  3.78 | ppl    43.867
----------------------------------------------------------------------------------------------------
| Eval 286 at step   114400 | time: 132.97s | valid loss  4.19 | valid ppl    65.888
----------------------------------------------------------------------------------------------------
| epoch 263 step   114450 |    218 batches | lr 6.12e-06 | ms/batch 432.46 | loss  3.80 | ppl    44.722
| epoch 263 step   114500 |    268 batches | lr 6.06e-06 | ms/batch 320.91 | loss  3.80 | ppl    44.904
| epoch 263 step   114550 |    318 batches | lr 6e-06 | ms/batch 319.95 | loss  3.80 | ppl    44.624
| epoch 263 step   114600 |    368 batches | lr 5.94e-06 | ms/batch 321.75 | loss  3.73 | ppl    41.885
| epoch 263 step   114650 |    418 batches | lr 5.89e-06 | ms/batch 319.54 | loss  3.79 | ppl    44.383
| epoch 264 step   114700 |     32 batches | lr 5.83e-06 | ms/batch 313.46 | loss  3.81 | ppl    45.184
| epoch 264 step   114750 |     82 batches | lr 5.77e-06 | ms/batch 320.39 | loss  3.72 | ppl    41.303
| epoch 264 step   114800 |    132 batches | lr 5.72e-06 | ms/batch 319.65 | loss  3.80 | ppl    44.492
----------------------------------------------------------------------------------------------------
| Eval 287 at step   114800 | time: 133.44s | valid loss  4.19 | valid ppl    65.981
----------------------------------------------------------------------------------------------------
| epoch 264 step   114850 |    182 batches | lr 5.66e-06 | ms/batch 420.63 | loss  3.78 | ppl    43.628
| epoch 264 step   114900 |    232 batches | lr 5.61e-06 | ms/batch 319.79 | loss  3.78 | ppl    43.991
| epoch 264 step   114950 |    282 batches | lr 5.55e-06 | ms/batch 318.87 | loss  3.82 | ppl    45.577
| epoch 264 step   115000 |    332 batches | lr 5.5e-06 | ms/batch 319.89 | loss  3.73 | ppl    41.691
| epoch 264 step   115050 |    382 batches | lr 5.44e-06 | ms/batch 319.34 | loss  3.76 | ppl    43.149
| epoch 264 step   115100 |    432 batches | lr 5.39e-06 | ms/batch 319.52 | loss  3.81 | ppl    45.207
| epoch 265 step   115150 |     46 batches | lr 5.34e-06 | ms/batch 313.72 | loss  3.78 | ppl    43.845
| epoch 265 step   115200 |     96 batches | lr 5.28e-06 | ms/batch 320.40 | loss  3.71 | ppl    40.951
----------------------------------------------------------------------------------------------------
| Eval 288 at step   115200 | time: 132.58s | valid loss  4.19 | valid ppl    66.005
----------------------------------------------------------------------------------------------------
| epoch 265 step   115250 |    146 batches | lr 5.23e-06 | ms/batch 421.67 | loss  3.79 | ppl    44.390
| epoch 265 step   115300 |    196 batches | lr 5.17e-06 | ms/batch 320.41 | loss  3.81 | ppl    45.038
| epoch 265 step   115350 |    246 batches | lr 5.12e-06 | ms/batch 318.93 | loss  3.82 | ppl    45.661
| epoch 265 step   115400 |    296 batches | lr 5.07e-06 | ms/batch 319.62 | loss  3.83 | ppl    46.192
| epoch 265 step   115450 |    346 batches | lr 5.02e-06 | ms/batch 320.13 | loss  3.69 | ppl    40.151
| epoch 265 step   115500 |    396 batches | lr 4.96e-06 | ms/batch 319.14 | loss  3.79 | ppl    44.068
| epoch 266 step   115550 |     10 batches | lr 4.91e-06 | ms/batch 313.61 | loss  3.82 | ppl    45.389
| epoch 266 step   115600 |     60 batches | lr 4.86e-06 | ms/batch 319.67 | loss  3.75 | ppl    42.435
----------------------------------------------------------------------------------------------------
| Eval 289 at step   115600 | time: 132.65s | valid loss  4.19 | valid ppl    65.969
----------------------------------------------------------------------------------------------------
| epoch 266 step   115650 |    110 batches | lr 4.81e-06 | ms/batch 419.47 | loss  3.77 | ppl    43.295
| epoch 266 step   115700 |    160 batches | lr 4.76e-06 | ms/batch 319.70 | loss  3.81 | ppl    44.996
| epoch 266 step   115750 |    210 batches | lr 4.71e-06 | ms/batch 320.94 | loss  3.76 | ppl    43.108
| epoch 266 step   115800 |    260 batches | lr 4.66e-06 | ms/batch 320.63 | loss  3.81 | ppl    45.083
| epoch 266 step   115850 |    310 batches | lr 4.61e-06 | ms/batch 320.75 | loss  3.79 | ppl    44.312
| epoch 266 step   115900 |    360 batches | lr 4.56e-06 | ms/batch 320.03 | loss  3.72 | ppl    41.373
| epoch 266 step   115950 |    410 batches | lr 4.51e-06 | ms/batch 320.08 | loss  3.77 | ppl    43.514
| epoch 267 step   116000 |     24 batches | lr 4.46e-06 | ms/batch 313.94 | loss  3.81 | ppl    45.108
----------------------------------------------------------------------------------------------------
| Eval 290 at step   116000 | time: 132.78s | valid loss  4.19 | valid ppl    65.909
----------------------------------------------------------------------------------------------------
| epoch 267 step   116050 |     74 batches | lr 4.41e-06 | ms/batch 420.10 | loss  3.71 | ppl    40.993
| epoch 267 step   116100 |    124 batches | lr 4.36e-06 | ms/batch 319.95 | loss  3.78 | ppl    43.823
| epoch 267 step   116150 |    174 batches | lr 4.31e-06 | ms/batch 320.49 | loss  3.77 | ppl    43.404
| epoch 267 step   116200 |    224 batches | lr 4.26e-06 | ms/batch 319.42 | loss  3.80 | ppl    44.527
| epoch 267 step   116250 |    274 batches | lr 4.21e-06 | ms/batch 320.11 | loss  3.80 | ppl    44.782
| epoch 267 step   116300 |    324 batches | lr 4.17e-06 | ms/batch 319.71 | loss  3.75 | ppl    42.719
| epoch 267 step   116350 |    374 batches | lr 4.12e-06 | ms/batch 319.53 | loss  3.77 | ppl    43.550
| epoch 267 step   116400 |    424 batches | lr 4.07e-06 | ms/batch 319.69 | loss  3.77 | ppl    43.484
----------------------------------------------------------------------------------------------------
| Eval 291 at step   116400 | time: 132.96s | valid loss  4.19 | valid ppl    65.823
----------------------------------------------------------------------------------------------------
| epoch 268 step   116450 |     38 batches | lr 4.02e-06 | ms/batch 459.00 | loss  3.81 | ppl    45.159
| epoch 268 step   116500 |     88 batches | lr 3.98e-06 | ms/batch 319.15 | loss  3.71 | ppl    41.004
| epoch 268 step   116550 |    138 batches | lr 3.93e-06 | ms/batch 319.38 | loss  3.78 | ppl    43.979
| epoch 268 step   116600 |    188 batches | lr 3.89e-06 | ms/batch 320.02 | loss  3.80 | ppl    44.530
| epoch 268 step   116650 |    238 batches | lr 3.84e-06 | ms/batch 319.72 | loss  3.83 | ppl    45.958
| epoch 268 step   116700 |    288 batches | lr 3.79e-06 | ms/batch 319.82 | loss  3.83 | ppl    46.160
| epoch 268 step   116750 |    338 batches | lr 3.75e-06 | ms/batch 321.43 | loss  3.71 | ppl    40.946
| epoch 268 step   116800 |    388 batches | lr 3.7e-06 | ms/batch 320.35 | loss  3.78 | ppl    43.659
----------------------------------------------------------------------------------------------------
| Eval 292 at step   116800 | time: 132.68s | valid loss  4.19 | valid ppl    65.804
----------------------------------------------------------------------------------------------------
| epoch 269 step   116850 |      2 batches | lr 3.66e-06 | ms/batch 453.01 | loss  3.80 | ppl    44.775
| epoch 269 step   116900 |     52 batches | lr 3.61e-06 | ms/batch 319.53 | loss  3.73 | ppl    41.491
| epoch 269 step   116950 |    102 batches | lr 3.57e-06 | ms/batch 318.68 | loss  3.75 | ppl    42.362
| epoch 269 step   117000 |    152 batches | lr 3.53e-06 | ms/batch 319.75 | loss  3.78 | ppl    43.693
| epoch 269 step   117050 |    202 batches | lr 3.48e-06 | ms/batch 319.93 | loss  3.79 | ppl    44.165
| epoch 269 step   117100 |    252 batches | lr 3.44e-06 | ms/batch 318.10 | loss  3.82 | ppl    45.442
| epoch 269 step   117150 |    302 batches | lr 3.39e-06 | ms/batch 320.24 | loss  3.82 | ppl    45.517
| epoch 269 step   117200 |    352 batches | lr 3.35e-06 | ms/batch 319.52 | loss  3.69 | ppl    40.025
----------------------------------------------------------------------------------------------------
| Eval 293 at step   117200 | time: 132.67s | valid loss  4.19 | valid ppl    65.900
----------------------------------------------------------------------------------------------------
| epoch 269 step   117250 |    402 batches | lr 3.31e-06 | ms/batch 419.08 | loss  3.79 | ppl    44.123
| epoch 270 step   117300 |     16 batches | lr 3.27e-06 | ms/batch 314.06 | loss  3.81 | ppl    45.061
| epoch 270 step   117350 |     66 batches | lr 3.22e-06 | ms/batch 319.59 | loss  3.74 | ppl    42.086
| epoch 270 step   117400 |    116 batches | lr 3.18e-06 | ms/batch 319.22 | loss  3.78 | ppl    43.725
| epoch 270 step   117450 |    166 batches | lr 3.14e-06 | ms/batch 319.98 | loss  3.77 | ppl    43.424
| epoch 270 step   117500 |    216 batches | lr 3.1e-06 | ms/batch 319.89 | loss  3.78 | ppl    43.782
| epoch 270 step   117550 |    266 batches | lr 3.06e-06 | ms/batch 320.86 | loss  3.81 | ppl    44.964
| epoch 270 step   117600 |    316 batches | lr 3.02e-06 | ms/batch 319.98 | loss  3.79 | ppl    44.423
----------------------------------------------------------------------------------------------------
| Eval 294 at step   117600 | time: 132.63s | valid loss  4.19 | valid ppl    65.875
----------------------------------------------------------------------------------------------------
| epoch 270 step   117650 |    366 batches | lr 2.98e-06 | ms/batch 418.66 | loss  3.74 | ppl    42.137
| epoch 270 step   117700 |    416 batches | lr 2.94e-06 | ms/batch 318.67 | loss  3.79 | ppl    44.393
| epoch 271 step   117750 |     30 batches | lr 2.9e-06 | ms/batch 313.79 | loss  3.80 | ppl    44.845
| epoch 271 step   117800 |     80 batches | lr 2.86e-06 | ms/batch 318.48 | loss  3.74 | ppl    41.945
| epoch 271 step   117850 |    130 batches | lr 2.82e-06 | ms/batch 318.34 | loss  3.79 | ppl    44.220
| epoch 271 step   117900 |    180 batches | lr 2.78e-06 | ms/batch 319.82 | loss  3.78 | ppl    43.830
| epoch 271 step   117950 |    230 batches | lr 2.74e-06 | ms/batch 319.01 | loss  3.81 | ppl    45.105
| epoch 271 step   118000 |    280 batches | lr 2.7e-06 | ms/batch 319.77 | loss  3.83 | ppl    46.149
----------------------------------------------------------------------------------------------------
| Eval 295 at step   118000 | time: 132.34s | valid loss  4.19 | valid ppl    65.960
----------------------------------------------------------------------------------------------------
| epoch 271 step   118050 |    330 batches | lr 2.66e-06 | ms/batch 419.00 | loss  3.75 | ppl    42.636
| epoch 271 step   118100 |    380 batches | lr 2.62e-06 | ms/batch 319.46 | loss  3.77 | ppl    43.461
| epoch 271 step   118150 |    430 batches | lr 2.59e-06 | ms/batch 320.00 | loss  3.79 | ppl    44.098
| epoch 272 step   118200 |     44 batches | lr 2.55e-06 | ms/batch 313.28 | loss  3.75 | ppl    42.425
| epoch 272 step   118250 |     94 batches | lr 2.51e-06 | ms/batch 319.68 | loss  3.72 | ppl    41.300
| epoch 272 step   118300 |    144 batches | lr 2.48e-06 | ms/batch 320.24 | loss  3.81 | ppl    45.364
| epoch 272 step   118350 |    194 batches | lr 2.44e-06 | ms/batch 320.30 | loss  3.78 | ppl    43.936
| epoch 272 step   118400 |    244 batches | lr 2.4e-06 | ms/batch 319.50 | loss  3.81 | ppl    45.302
----------------------------------------------------------------------------------------------------
| Eval 296 at step   118400 | time: 132.60s | valid loss  4.19 | valid ppl    65.941
----------------------------------------------------------------------------------------------------
| epoch 272 step   118450 |    294 batches | lr 2.37e-06 | ms/batch 420.56 | loss  3.84 | ppl    46.631
| epoch 272 step   118500 |    344 batches | lr 2.33e-06 | ms/batch 318.95 | loss  3.70 | ppl    40.294
| epoch 272 step   118550 |    394 batches | lr 2.29e-06 | ms/batch 319.31 | loss  3.80 | ppl    44.525
| epoch 273 step   118600 |      8 batches | lr 2.26e-06 | ms/batch 311.93 | loss  3.82 | ppl    45.574
| epoch 273 step   118650 |     58 batches | lr 2.22e-06 | ms/batch 319.18 | loss  3.73 | ppl    41.559
| epoch 273 step   118700 |    108 batches | lr 2.19e-06 | ms/batch 320.32 | loss  3.75 | ppl    42.734
| epoch 273 step   118750 |    158 batches | lr 2.15e-06 | ms/batch 319.90 | loss  3.76 | ppl    42.945
| epoch 273 step   118800 |    208 batches | lr 2.12e-06 | ms/batch 319.40 | loss  3.78 | ppl    43.919
----------------------------------------------------------------------------------------------------
| Eval 297 at step   118800 | time: 132.47s | valid loss  4.19 | valid ppl    65.905
----------------------------------------------------------------------------------------------------
| epoch 273 step   118850 |    258 batches | lr 2.09e-06 | ms/batch 418.89 | loss  3.81 | ppl    45.350
| epoch 273 step   118900 |    308 batches | lr 2.05e-06 | ms/batch 319.66 | loss  3.80 | ppl    44.773
| epoch 273 step   118950 |    358 batches | lr 2.02e-06 | ms/batch 319.88 | loss  3.72 | ppl    41.327
| epoch 273 step   119000 |    408 batches | lr 1.99e-06 | ms/batch 317.69 | loss  3.75 | ppl    42.594
| epoch 274 step   119050 |     22 batches | lr 1.95e-06 | ms/batch 313.57 | loss  3.79 | ppl    44.288
| epoch 274 step   119100 |     72 batches | lr 1.92e-06 | ms/batch 319.45 | loss  3.71 | ppl    40.736
| epoch 274 step   119150 |    122 batches | lr 1.89e-06 | ms/batch 319.23 | loss  3.76 | ppl    43.027
| epoch 274 step   119200 |    172 batches | lr 1.86e-06 | ms/batch 319.29 | loss  3.77 | ppl    43.367
----------------------------------------------------------------------------------------------------
| Eval 298 at step   119200 | time: 132.39s | valid loss  4.19 | valid ppl    65.936
----------------------------------------------------------------------------------------------------
| epoch 274 step   119250 |    222 batches | lr 1.82e-06 | ms/batch 419.45 | loss  3.78 | ppl    43.994
| epoch 274 step   119300 |    272 batches | lr 1.79e-06 | ms/batch 319.33 | loss  3.79 | ppl    44.263
| epoch 274 step   119350 |    322 batches | lr 1.76e-06 | ms/batch 320.11 | loss  3.77 | ppl    43.289
| epoch 274 step   119400 |    372 batches | lr 1.73e-06 | ms/batch 318.45 | loss  3.75 | ppl    42.669
| epoch 274 step   119450 |    422 batches | lr 1.7e-06 | ms/batch 319.97 | loss  3.75 | ppl    42.593
| epoch 275 step   119500 |     36 batches | lr 1.67e-06 | ms/batch 312.74 | loss  3.81 | ppl    45.124
| epoch 275 step   119550 |     86 batches | lr 1.64e-06 | ms/batch 318.88 | loss  3.71 | ppl    40.876
| epoch 275 step   119600 |    136 batches | lr 1.61e-06 | ms/batch 319.44 | loss  3.79 | ppl    44.411
----------------------------------------------------------------------------------------------------
| Eval 299 at step   119600 | time: 132.40s | valid loss  4.19 | valid ppl    65.951
----------------------------------------------------------------------------------------------------
| epoch 275 step   119650 |    186 batches | lr 1.58e-06 | ms/batch 418.17 | loss  3.78 | ppl    43.695
| epoch 275 step   119700 |    236 batches | lr 1.55e-06 | ms/batch 323.48 | loss  3.79 | ppl    44.198
| epoch 275 step   119750 |    286 batches | lr 1.52e-06 | ms/batch 336.37 | loss  3.83 | ppl    46.221
| epoch 275 step   119800 |    336 batches | lr 1.49e-06 | ms/batch 334.90 | loss  3.70 | ppl    40.297
| epoch 275 step   119850 |    386 batches | lr 1.46e-06 | ms/batch 320.64 | loss  3.79 | ppl    44.037
| epoch 275 step   119900 |    436 batches | lr 1.44e-06 | ms/batch 315.43 | loss  3.79 | ppl    44.191
| epoch 276 step   119950 |     50 batches | lr 1.41e-06 | ms/batch 318.26 | loss  3.75 | ppl    42.410
| epoch 276 step   120000 |    100 batches | lr 1.38e-06 | ms/batch 320.73 | loss  3.78 | ppl    43.640
----------------------------------------------------------------------------------------------------
| Eval 300 at step   120000 | time: 134.44s | valid loss  4.19 | valid ppl    65.898
----------------------------------------------------------------------------------------------------
| epoch 276 step   120050 |    150 batches | lr 1.35e-06 | ms/batch 421.22 | loss  3.80 | ppl    44.583
| epoch 276 step   120100 |    200 batches | lr 1.33e-06 | ms/batch 320.86 | loss  3.78 | ppl    43.731
| epoch 276 step   120150 |    250 batches | lr 1.3e-06 | ms/batch 320.90 | loss  3.81 | ppl    44.957
| epoch 276 step   120200 |    300 batches | lr 1.27e-06 | ms/batch 320.20 | loss  3.82 | ppl    45.400
| epoch 276 step   120250 |    350 batches | lr 1.25e-06 | ms/batch 320.03 | loss  3.71 | ppl    40.857
| epoch 276 step   120300 |    400 batches | lr 1.22e-06 | ms/batch 320.84 | loss  3.79 | ppl    44.393
| epoch 277 step   120350 |     14 batches | lr 1.19e-06 | ms/batch 313.19 | loss  3.80 | ppl    44.818
| epoch 277 step   120400 |     64 batches | lr 1.17e-06 | ms/batch 321.13 | loss  3.74 | ppl    41.904
----------------------------------------------------------------------------------------------------
| Eval 301 at step   120400 | time: 132.89s | valid loss  4.19 | valid ppl    65.905
----------------------------------------------------------------------------------------------------
| epoch 277 step   120450 |    114 batches | lr 1.14e-06 | ms/batch 419.96 | loss  3.75 | ppl    42.621
| epoch 277 step   120500 |    164 batches | lr 1.12e-06 | ms/batch 319.62 | loss  3.79 | ppl    44.074
| epoch 277 step   120550 |    214 batches | lr 1.09e-06 | ms/batch 319.56 | loss  3.79 | ppl    44.411
| epoch 277 step   120600 |    264 batches | lr 1.07e-06 | ms/batch 319.76 | loss  3.76 | ppl    43.011
| epoch 277 step   120650 |    314 batches | lr 1.04e-06 | ms/batch 319.75 | loss  3.80 | ppl    44.527
| epoch 277 step   120700 |    364 batches | lr 1.02e-06 | ms/batch 319.49 | loss  3.71 | ppl    40.718
| epoch 277 step   120750 |    414 batches | lr 9.97e-07 | ms/batch 319.65 | loss  3.76 | ppl    43.039
| epoch 278 step   120800 |     28 batches | lr 9.74e-07 | ms/batch 315.14 | loss  3.82 | ppl    45.430
----------------------------------------------------------------------------------------------------
| Eval 302 at step   120800 | time: 132.66s | valid loss  4.19 | valid ppl    65.872
----------------------------------------------------------------------------------------------------
| epoch 278 step   120850 |     78 batches | lr 9.51e-07 | ms/batch 422.26 | loss  3.73 | ppl    41.593
| epoch 278 step   120900 |    128 batches | lr 9.28e-07 | ms/batch 319.26 | loss  3.78 | ppl    43.770
| epoch 278 step   120950 |    178 batches | lr 9.06e-07 | ms/batch 321.16 | loss  3.78 | ppl    43.929
| epoch 278 step   121000 |    228 batches | lr 8.84e-07 | ms/batch 320.96 | loss  3.80 | ppl    44.490
| epoch 278 step   121050 |    278 batches | lr 8.62e-07 | ms/batch 320.44 | loss  3.84 | ppl    46.413
| epoch 278 step   121100 |    328 batches | lr 8.4e-07 | ms/batch 321.19 | loss  3.74 | ppl    41.995
| epoch 278 step   121150 |    378 batches | lr 8.19e-07 | ms/batch 318.92 | loss  3.77 | ppl    43.319
| epoch 278 step   121200 |    428 batches | lr 7.97e-07 | ms/batch 321.42 | loss  3.80 | ppl    44.605
----------------------------------------------------------------------------------------------------
| Eval 303 at step   121200 | time: 133.30s | valid loss  4.19 | valid ppl    65.909
----------------------------------------------------------------------------------------------------
| epoch 279 step   121250 |     42 batches | lr 7.77e-07 | ms/batch 415.41 | loss  3.77 | ppl    43.478
| epoch 279 step   121300 |     92 batches | lr 7.56e-07 | ms/batch 319.85 | loss  3.72 | ppl    41.445
| epoch 279 step   121350 |    142 batches | lr 7.36e-07 | ms/batch 320.29 | loss  3.79 | ppl    44.333
| epoch 279 step   121400 |    192 batches | lr 7.16e-07 | ms/batch 319.57 | loss  3.78 | ppl    44.017
| epoch 279 step   121450 |    242 batches | lr 6.96e-07 | ms/batch 318.87 | loss  3.80 | ppl    44.776
| epoch 279 step   121500 |    292 batches | lr 6.77e-07 | ms/batch 319.26 | loss  3.81 | ppl    45.371
| epoch 279 step   121550 |    342 batches | lr 6.57e-07 | ms/batch 318.33 | loss  3.68 | ppl    39.834
| epoch 279 step   121600 |    392 batches | lr 6.39e-07 | ms/batch 321.35 | loss  3.78 | ppl    43.938
----------------------------------------------------------------------------------------------------
| Eval 304 at step   121600 | time: 132.64s | valid loss  4.19 | valid ppl    65.897
----------------------------------------------------------------------------------------------------
| epoch 280 step   121650 |      6 batches | lr 6.2e-07 | ms/batch 413.72 | loss  3.81 | ppl    45.302
| epoch 280 step   121700 |     56 batches | lr 6.02e-07 | ms/batch 320.04 | loss  3.75 | ppl    42.332
| epoch 280 step   121750 |    106 batches | lr 5.83e-07 | ms/batch 320.37 | loss  3.75 | ppl    42.679
| epoch 280 step   121800 |    156 batches | lr 5.66e-07 | ms/batch 320.62 | loss  3.77 | ppl    43.206
| epoch 280 step   121850 |    206 batches | lr 5.48e-07 | ms/batch 320.54 | loss  3.77 | ppl    43.273
| epoch 280 step   121900 |    256 batches | lr 5.31e-07 | ms/batch 319.94 | loss  3.80 | ppl    44.729
| epoch 280 step   121950 |    306 batches | lr 5.14e-07 | ms/batch 320.25 | loss  3.80 | ppl    44.808
| epoch 280 step   122000 |    356 batches | lr 4.97e-07 | ms/batch 320.59 | loss  3.71 | ppl    40.926
----------------------------------------------------------------------------------------------------
| Eval 305 at step   122000 | time: 132.80s | valid loss  4.19 | valid ppl    65.899
----------------------------------------------------------------------------------------------------
| epoch 280 step   122050 |    406 batches | lr 4.81e-07 | ms/batch 420.93 | loss  3.73 | ppl    41.867
| epoch 281 step   122100 |     20 batches | lr 4.65e-07 | ms/batch 313.64 | loss  3.80 | ppl    44.637
| epoch 281 step   122150 |     70 batches | lr 4.49e-07 | ms/batch 320.99 | loss  3.71 | ppl    40.868
| epoch 281 step   122200 |    120 batches | lr 4.33e-07 | ms/batch 320.51 | loss  3.78 | ppl    44.003
| epoch 281 step   122250 |    170 batches | lr 4.18e-07 | ms/batch 320.58 | loss  3.77 | ppl    43.431
| epoch 281 step   122300 |    220 batches | lr 4.03e-07 | ms/batch 320.18 | loss  3.78 | ppl    43.847
| epoch 281 step   122350 |    270 batches | lr 3.88e-07 | ms/batch 320.96 | loss  3.80 | ppl    44.586
| epoch 281 step   122400 |    320 batches | lr 3.73e-07 | ms/batch 320.69 | loss  3.76 | ppl    42.950
----------------------------------------------------------------------------------------------------
| Eval 306 at step   122400 | time: 132.94s | valid loss  4.19 | valid ppl    65.886
----------------------------------------------------------------------------------------------------
| epoch 281 step   122450 |    370 batches | lr 3.59e-07 | ms/batch 421.94 | loss  3.73 | ppl    41.819
| epoch 281 step   122500 |    420 batches | lr 3.45e-07 | ms/batch 319.80 | loss  3.79 | ppl    44.206
| epoch 282 step   122550 |     34 batches | lr 3.32e-07 | ms/batch 315.76 | loss  3.79 | ppl    44.277
| epoch 282 step   122600 |     84 batches | lr 3.18e-07 | ms/batch 321.00 | loss  3.71 | ppl    40.811
| epoch 282 step   122650 |    134 batches | lr 3.05e-07 | ms/batch 319.96 | loss  3.77 | ppl    43.531
| epoch 282 step   122700 |    184 batches | lr 2.92e-07 | ms/batch 333.86 | loss  3.78 | ppl    44.006
| epoch 282 step   122750 |    234 batches | lr 2.8e-07 | ms/batch 335.82 | loss  3.78 | ppl    43.838
| epoch 282 step   122800 |    284 batches | lr 2.67e-07 | ms/batch 332.85 | loss  3.80 | ppl    44.558
----------------------------------------------------------------------------------------------------
| Eval 307 at step   122800 | time: 135.03s | valid loss  4.19 | valid ppl    65.896
----------------------------------------------------------------------------------------------------
| epoch 282 step   122850 |    334 batches | lr 2.55e-07 | ms/batch 420.33 | loss  3.71 | ppl    40.779
| epoch 282 step   122900 |    384 batches | lr 2.44e-07 | ms/batch 319.94 | loss  3.77 | ppl    43.562
| epoch 282 step   122950 |    434 batches | lr 2.32e-07 | ms/batch 320.26 | loss  3.81 | ppl    45.371
| epoch 283 step   123000 |     48 batches | lr 2.21e-07 | ms/batch 314.41 | loss  3.74 | ppl    42.250
| epoch 283 step   123050 |     98 batches | lr 2.1e-07 | ms/batch 319.41 | loss  3.73 | ppl    41.741
| epoch 283 step   123100 |    148 batches | lr 1.99e-07 | ms/batch 320.65 | loss  3.79 | ppl    44.194
| epoch 283 step   123150 |    198 batches | lr 1.89e-07 | ms/batch 319.87 | loss  3.79 | ppl    44.113
| epoch 283 step   123200 |    248 batches | lr 1.79e-07 | ms/batch 319.51 | loss  3.79 | ppl    44.450
----------------------------------------------------------------------------------------------------
| Eval 308 at step   123200 | time: 132.72s | valid loss  4.19 | valid ppl    65.890
----------------------------------------------------------------------------------------------------
| epoch 283 step   123250 |    298 batches | lr 1.69e-07 | ms/batch 420.01 | loss  3.83 | ppl    46.281
| epoch 283 step   123300 |    348 batches | lr 1.6e-07 | ms/batch 318.50 | loss  3.70 | ppl    40.422
| epoch 283 step   123350 |    398 batches | lr 1.5e-07 | ms/batch 319.90 | loss  3.78 | ppl    43.818
| epoch 284 step   123400 |     12 batches | lr 1.41e-07 | ms/batch 315.49 | loss  3.82 | ppl    45.386
| epoch 284 step   123450 |     62 batches | lr 1.33e-07 | ms/batch 320.77 | loss  3.74 | ppl    42.123
| epoch 284 step   123500 |    112 batches | lr 1.24e-07 | ms/batch 320.32 | loss  3.76 | ppl    42.876
| epoch 284 step   123550 |    162 batches | lr 1.16e-07 | ms/batch 319.13 | loss  3.79 | ppl    44.294
| epoch 284 step   123600 |    212 batches | lr 1.08e-07 | ms/batch 318.57 | loss  3.79 | ppl    44.301
----------------------------------------------------------------------------------------------------
| Eval 309 at step   123600 | time: 132.63s | valid loss  4.19 | valid ppl    65.888
----------------------------------------------------------------------------------------------------
| epoch 284 step   123650 |    262 batches | lr 1.01e-07 | ms/batch 419.90 | loss  3.81 | ppl    45.078
| epoch 284 step   123700 |    312 batches | lr 9.34e-08 | ms/batch 320.44 | loss  3.79 | ppl    44.445
| epoch 284 step   123750 |    362 batches | lr 8.64e-08 | ms/batch 320.55 | loss  3.71 | ppl    40.851
| epoch 284 step   123800 |    412 batches | lr 7.96e-08 | ms/batch 319.51 | loss  3.77 | ppl    43.165
| epoch 285 step   123850 |     26 batches | lr 7.31e-08 | ms/batch 313.07 | loss  3.83 | ppl    46.039
| epoch 285 step   123900 |     76 batches | lr 6.69e-08 | ms/batch 319.17 | loss  3.73 | ppl    41.505
| epoch 285 step   123950 |    126 batches | lr 6.09e-08 | ms/batch 320.40 | loss  3.79 | ppl    44.163
| epoch 285 step   124000 |    176 batches | lr 5.53e-08 | ms/batch 321.08 | loss  3.79 | ppl    44.303
----------------------------------------------------------------------------------------------------
| Eval 310 at step   124000 | time: 132.71s | valid loss  4.19 | valid ppl    65.891
----------------------------------------------------------------------------------------------------
| epoch 285 step   124050 |    226 batches | lr 4.99e-08 | ms/batch 418.68 | loss  3.80 | ppl    44.609
| epoch 285 step   124100 |    276 batches | lr 4.48e-08 | ms/batch 319.24 | loss  3.84 | ppl    46.391
| epoch 285 step   124150 |    326 batches | lr 3.99e-08 | ms/batch 319.08 | loss  3.75 | ppl    42.594
| epoch 285 step   124200 |    376 batches | lr 3.54e-08 | ms/batch 319.33 | loss  3.77 | ppl    43.497
| epoch 285 step   124250 |    426 batches | lr 3.11e-08 | ms/batch 318.56 | loss  3.78 | ppl    43.599
| epoch 286 step   124300 |     40 batches | lr 2.71e-08 | ms/batch 314.02 | loss  3.78 | ppl    43.654
| epoch 286 step   124350 |     90 batches | lr 2.34e-08 | ms/batch 320.46 | loss  3.74 | ppl    42.264
| epoch 286 step   124400 |    140 batches | lr 1.99e-08 | ms/batch 317.53 | loss  3.76 | ppl    43.006
----------------------------------------------------------------------------------------------------
| Eval 311 at step   124400 | time: 132.34s | valid loss  4.19 | valid ppl    65.896
----------------------------------------------------------------------------------------------------
| epoch 286 step   124450 |    190 batches | lr 1.67e-08 | ms/batch 418.43 | loss  3.80 | ppl    44.706
| epoch 286 step   124500 |    240 batches | lr 1.38e-08 | ms/batch 318.35 | loss  3.81 | ppl    45.052
| epoch 286 step   124550 |    290 batches | lr 1.12e-08 | ms/batch 318.37 | loss  3.83 | ppl    46.100
| epoch 286 step   124600 |    340 batches | lr 8.84e-09 | ms/batch 318.89 | loss  3.69 | ppl    39.959
| epoch 286 step   124650 |    390 batches | lr 6.77e-09 | ms/batch 318.11 | loss  3.76 | ppl    43.066
| epoch 287 step   124700 |      4 batches | lr 4.97e-09 | ms/batch 313.22 | loss  3.82 | ppl    45.717
| epoch 287 step   124750 |     54 batches | lr 3.45e-09 | ms/batch 318.48 | loss  3.75 | ppl    42.370
| epoch 287 step   124800 |    104 batches | lr 2.21e-09 | ms/batch 319.12 | loss  3.74 | ppl    41.965
----------------------------------------------------------------------------------------------------
| Eval 312 at step   124800 | time: 132.15s | valid loss  4.19 | valid ppl    65.895
----------------------------------------------------------------------------------------------------
| epoch 287 step   124850 |    154 batches | lr 1.24e-09 | ms/batch 419.05 | loss  3.78 | ppl    43.744
| epoch 287 step   124900 |    204 batches | lr 5.53e-10 | ms/batch 319.97 | loss  3.79 | ppl    44.163
| epoch 287 step   124950 |    254 batches | lr 1.38e-10 | ms/batch 320.02 | loss  3.79 | ppl    44.267
| epoch 287 step   125000 |    304 batches | lr 0 | ms/batch 319.78 | loss  3.82 | ppl    45.765
----------------------------------------------------------------------------------------------------
End of training
====================================================================================================
| End of training | test loss  4.15 | test ppl    63.333
====================================================================================================
