tokens per iteration will be: 491,520
Initializing a new model from scratch
config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.75,
    2.0,
    2.25
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    4,
    4,
    4,
    4,
    5,
    5
  ],
  "num_query_heads": [
    10,
    12,
    12,
    14,
    16,
    18,
    18,
    20
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.5,
    1.75,
    2.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 954,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    5
  ],
  "num_query_heads": [
    6,
    6,
    6,
    6,
    8,
    8,
    8,
    10
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

num decayed parameter tensors: 33, with 87,875,802 parameters
num non-decayed parameter tensors: 33, with 17,242 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
number of parameters: 87.89M
number of transformer parameters: 39.95M
step 0: train loss 11.0824, val loss 11.0972
iter 0: loss 11.1670, time 52417.72ms 
iter 1: loss 11.1964, time 5166.68ms 
iter 2: loss 10.9772, time 5192.56ms 
iter 3: loss 10.9381, time 5247.04ms 
iter 4: loss 10.6868, time 5217.55ms 
iter 5: loss 10.2933, time 5249.02ms 
iter 6: loss 9.8838, time 5243.99ms 
iter 7: loss 9.6257, time 5225.78ms 
iter 8: loss 9.4372, time 5198.02ms 
iter 9: loss 8.8309, time 5223.94ms 
iter 10: loss 8.2376, time 5208.27ms 
iter 11: loss 7.7188, time 5205.95ms 
iter 12: loss 7.3131, time 5238.09ms 
iter 13: loss 7.2960, time 5216.57ms 
iter 14: loss 7.2612, time 5216.21ms 
iter 15: loss 7.0461, time 5228.38ms 
iter 16: loss 6.9177, time 5224.82ms 
iter 17: loss 7.1365, time 5219.30ms 
iter 18: loss 7.3780, time 5248.23ms 
iter 19: loss 6.7642, time 5244.43ms 
iter 20: loss 7.0415, time 5250.60ms 
iter 21: loss 7.0627, time 5266.98ms 
iter 22: loss 6.6603, time 5247.82ms 
iter 23: loss 6.7183, time 5255.19ms 
iter 24: loss 6.7415, time 5248.43ms 
iter 25: loss 6.7362, time 5253.51ms 
iter 26: loss 6.4557, time 5249.26ms 
iter 27: loss 6.3664, time 5246.99ms 
iter 28: loss 6.6904, time 5254.55ms 
iter 29: loss 6.5080, time 5250.59ms 
iter 30: loss 6.6046, time 5244.06ms 
iter 31: loss 6.4872, time 5178.81ms 
iter 32: loss 6.4448, time 5229.48ms 
iter 33: loss 6.3290, time 5236.76ms 
iter 34: loss 6.4428, time 5225.39ms 
iter 35: loss 6.2665, time 5200.68ms 
iter 36: loss 6.4535, time 5237.85ms 
iter 37: loss 6.3796, time 5227.56ms 
iter 38: loss 5.9802, time 5160.94ms 
iter 39: loss 6.1633, time 5223.91ms 
iter 40: loss 6.0559, time 5176.98ms 
iter 41: loss 6.1787, time 5111.30ms 
iter 42: loss 5.8368, time 5098.29ms 
iter 43: loss 5.8146, time 5110.34ms 
iter 44: loss 6.1911, time 5081.21ms 
iter 45: loss 6.1345, time 5237.18ms 
iter 46: loss 5.9547, time 5254.21ms 
iter 47: loss 5.9064, time 5249.57ms 
iter 48: loss 5.9052, time 5255.42ms 
iter 49: loss 5.8065, time 5261.45ms 
step 50: train loss 5.8838, val loss 5.8387
iter 50: loss 6.0590, time 20051.85ms 
iter 51: loss 5.6708, time 5248.02ms 
iter 52: loss 5.5770, time 5254.20ms 
iter 53: loss 6.0077, time 5249.55ms 
iter 54: loss 5.5519, time 5248.32ms 
iter 55: loss 5.8065, time 5239.11ms 
iter 56: loss 5.8892, time 5243.73ms 
iter 57: loss 5.7186, time 5240.49ms 
iter 58: loss 6.1017, time 5232.25ms 
iter 59: loss 5.7956, time 5240.42ms 
iter 60: loss 5.5698, time 5245.95ms 
iter 61: loss 5.4394, time 5254.70ms 
iter 62: loss 5.8488, time 5234.35ms 
iter 63: loss 5.5895, time 5234.70ms 
iter 64: loss 5.3920, time 5239.93ms 
iter 65: loss 5.6251, time 5248.99ms 
iter 66: loss 5.4129, time 5226.32ms 
iter 67: loss 5.5608, time 5234.76ms 
iter 68: loss 5.8131, time 5234.88ms 
iter 69: loss 5.3174, time 5253.85ms 
iter 70: loss 5.5905, time 5254.52ms 
iter 71: loss 5.4140, time 5263.84ms 
iter 72: loss 5.6387, time 5255.39ms 
iter 73: loss 5.4189, time 5259.04ms 
iter 74: loss 5.3358, time 5266.05ms 
iter 75: loss 5.3901, time 5256.30ms 
iter 76: loss 5.4938, time 5260.64ms 
iter 77: loss 5.5161, time 5254.47ms 
iter 78: loss 5.4944, time 5256.42ms 
iter 79: loss 5.3912, time 5249.07ms 
iter 80: loss 5.5526, time 5257.27ms 
iter 81: loss 5.3438, time 5314.48ms 
iter 82: loss 5.5104, time 5254.00ms 
iter 83: loss 5.3976, time 5260.80ms 
iter 84: loss 5.6104, time 5273.52ms 
iter 85: loss 5.5547, time 5252.41ms 
iter 86: loss 5.4724, time 5263.50ms 
iter 87: loss 5.1667, time 5252.96ms 
iter 88: loss 5.0782, time 5256.40ms 
iter 89: loss 5.2279, time 5261.25ms 
iter 90: loss 5.1772, time 5269.05ms 
iter 91: loss 5.4733, time 5276.47ms 
iter 92: loss 5.0491, time 5249.38ms 
iter 93: loss 5.1254, time 5250.64ms 
iter 94: loss 5.1864, time 5253.86ms 
iter 95: loss 4.9593, time 5251.22ms 
iter 96: loss 5.1228, time 5247.78ms 
iter 97: loss 5.1301, time 5253.86ms 
iter 98: loss 5.0842, time 5256.05ms 
iter 99: loss 5.3183, time 5229.36ms 
step 100: train loss 5.0495, val loss 5.0189
iter 100: loss 4.9196, time 20055.29ms 
iter 101: loss 4.9095, time 5268.54ms 
iter 102: loss 4.9567, time 5259.60ms 
iter 103: loss 4.9789, time 5291.08ms 
iter 104: loss 5.0161, time 5277.26ms 
iter 105: loss 4.9695, time 5254.72ms 
iter 106: loss 4.8136, time 5268.84ms 
iter 107: loss 4.7877, time 5323.11ms 
iter 108: loss 4.9006, time 5263.01ms 
iter 109: loss 4.8412, time 5259.68ms 
iter 110: loss 4.7245, time 5331.55ms 
iter 111: loss 4.9170, time 5329.42ms 
iter 112: loss 4.9151, time 5250.90ms 
iter 113: loss 4.7914, time 5251.31ms 
iter 114: loss 4.7085, time 5248.48ms 
iter 115: loss 4.7072, time 5248.07ms 
iter 116: loss 4.7214, time 5253.43ms 
iter 117: loss 4.7011, time 5247.94ms 
iter 118: loss 4.9194, time 5254.19ms 
iter 119: loss 4.7372, time 5254.78ms 
iter 120: loss 4.7747, time 5251.43ms 
iter 121: loss 4.7873, time 5253.46ms 
iter 122: loss 4.8024, time 5257.16ms 
iter 123: loss 4.8118, time 5253.56ms 
iter 124: loss 4.6143, time 5243.81ms 
iter 125: loss 4.6480, time 5260.01ms 
iter 126: loss 4.4997, time 5257.67ms 
iter 127: loss 4.6982, time 5262.36ms 
iter 128: loss 4.5226, time 5271.41ms 
iter 129: loss 4.4813, time 5252.65ms 
iter 130: loss 4.5734, time 5261.24ms 
iter 131: loss 4.5374, time 5250.48ms 
iter 132: loss 4.4540, time 5244.39ms 
iter 133: loss 4.6578, time 5248.42ms 
iter 134: loss 4.4120, time 5264.98ms 
iter 135: loss 4.5628, time 5271.00ms 
iter 136: loss 4.5950, time 5269.44ms 
iter 137: loss 4.5443, time 5259.42ms 
iter 138: loss 4.2715, time 5263.07ms 
iter 139: loss 4.4401, time 5253.82ms 
iter 140: loss 4.7301, time 5254.58ms 
iter 141: loss 4.6138, time 5260.33ms 
iter 142: loss 4.2001, time 5257.67ms 
iter 143: loss 4.2477, time 5255.63ms 
iter 144: loss 4.6200, time 5250.89ms 
iter 145: loss 4.4529, time 5256.19ms 
iter 146: loss 4.2379, time 5249.85ms 
iter 147: loss 4.2706, time 5253.10ms 
iter 148: loss 4.4513, time 5252.00ms 
iter 149: loss 4.4982, time 5265.27ms 
step 150: train loss 4.3801, val loss 4.3035
iter 150: loss 4.4016, time 19975.30ms 
iter 151: loss 4.5342, time 5248.90ms 
iter 152: loss 5.0719, time 5258.88ms 
iter 153: loss 4.2599, time 5263.29ms 
iter 154: loss 4.4490, time 5263.72ms 
iter 155: loss 4.2969, time 5258.22ms 
iter 156: loss 4.4716, time 5259.90ms 
iter 157: loss 4.3717, time 5267.03ms 
iter 158: loss 4.1489, time 5230.10ms 
iter 159: loss 4.6690, time 5257.45ms 
iter 160: loss 4.1177, time 5257.86ms 
iter 161: loss 4.2570, time 5229.30ms 
iter 162: loss 4.2813, time 5254.84ms 
iter 163: loss 4.1944, time 5265.66ms 
iter 164: loss 4.3933, time 5337.04ms 
iter 165: loss 4.2446, time 5254.87ms 
iter 166: loss 4.4838, time 5251.77ms 
iter 167: loss 4.2098, time 5250.95ms 
iter 168: loss 4.1121, time 5178.76ms 
iter 169: loss 4.1805, time 5236.74ms 
iter 170: loss 4.3196, time 5260.55ms 
iter 171: loss 4.0166, time 5247.60ms 
iter 172: loss 4.1836, time 5251.15ms 
iter 173: loss 4.0173, time 5251.82ms 
iter 174: loss 4.0877, time 5253.31ms 
iter 175: loss 4.2008, time 5177.46ms 
iter 176: loss 4.1468, time 5256.94ms 
iter 177: loss 4.4282, time 5242.69ms 
iter 178: loss 4.1797, time 5221.30ms 
iter 179: loss 4.3824, time 5282.59ms 
iter 180: loss 4.0485, time 5237.47ms 
iter 181: loss 4.1162, time 5101.86ms 
iter 182: loss 4.0532, time 5303.95ms 
iter 183: loss 4.0938, time 5203.83ms 
iter 184: loss 4.0988, time 5278.31ms 
iter 185: loss 4.2316, time 5260.63ms 
iter 186: loss 4.1546, time 5245.08ms 
iter 187: loss 4.1291, time 5251.61ms 
iter 188: loss 4.1867, time 5222.92ms 
iter 189: loss 3.9960, time 5243.96ms 
iter 190: loss 4.0050, time 5260.30ms 
iter 191: loss 3.9787, time 5257.94ms 
iter 192: loss 4.2410, time 5257.81ms 
iter 193: loss 4.2626, time 5240.11ms 
iter 194: loss 4.0887, time 5238.23ms 
iter 195: loss 4.1950, time 5237.35ms 
iter 196: loss 4.1409, time 5243.16ms 
iter 197: loss 3.9115, time 5256.68ms 
iter 198: loss 3.9839, time 5264.67ms 
iter 199: loss 3.8185, time 5257.80ms 
step 200: train loss 4.0378, val loss 4.0106
iter 200: loss 4.2057, time 20053.38ms 
iter 201: loss 4.0085, time 5270.24ms 
iter 202: loss 3.9854, time 5253.67ms 
iter 203: loss 3.9825, time 5259.13ms 
iter 204: loss 3.8561, time 5205.07ms 
iter 205: loss 3.9801, time 5245.61ms 
iter 206: loss 4.0762, time 5263.30ms 
iter 207: loss 3.9687, time 5271.29ms 
iter 208: loss 3.8645, time 5255.94ms 
iter 209: loss 4.0759, time 5267.50ms 
iter 210: loss 4.2792, time 5264.74ms 
iter 211: loss 4.0627, time 5256.85ms 
iter 212: loss 4.1856, time 5273.55ms 
iter 213: loss 3.8933, time 5289.44ms 
iter 214: loss 3.9180, time 5254.24ms 
iter 215: loss 3.8860, time 5257.00ms 
iter 216: loss 4.0898, time 5249.52ms 
iter 217: loss 4.2907, time 5258.93ms 
iter 218: loss 3.9823, time 5261.67ms 
iter 219: loss 4.0788, time 5263.83ms 
iter 220: loss 4.1080, time 5248.72ms 
iter 221: loss 3.9430, time 5245.41ms 
iter 222: loss 3.7716, time 5247.52ms 
iter 223: loss 4.0019, time 5245.48ms 
iter 224: loss 4.3112, time 5246.12ms 
iter 225: loss 3.9351, time 5245.83ms 
iter 226: loss 4.1433, time 5257.23ms 
iter 227: loss 3.9650, time 5214.69ms 
iter 228: loss 3.7468, time 5248.01ms 
iter 229: loss 4.0837, time 5244.91ms 
iter 230: loss 3.8080, time 5211.36ms 
iter 231: loss 3.8246, time 5243.64ms 
iter 232: loss 3.9753, time 5247.56ms 
iter 233: loss 3.9337, time 5249.88ms 
iter 234: loss 4.1083, time 5249.44ms 
iter 235: loss 3.8448, time 5249.66ms 
iter 236: loss 3.9997, time 5249.78ms 
iter 237: loss 3.7483, time 5250.84ms 
iter 238: loss 3.7755, time 5247.55ms 
iter 239: loss 4.0104, time 5264.33ms 
iter 240: loss 3.9341, time 5255.81ms 
iter 241: loss 3.9198, time 5253.39ms 
iter 242: loss 3.9078, time 5257.35ms 
iter 243: loss 3.9982, time 5240.65ms 
iter 244: loss 3.7985, time 5270.34ms 
iter 245: loss 3.7987, time 5257.95ms 
iter 246: loss 3.7402, time 5254.93ms 
iter 247: loss 3.9489, time 5266.43ms 
iter 248: loss 4.0871, time 5215.77ms 
iter 249: loss 4.0350, time 5252.26ms 
step 250: train loss 3.8922, val loss 3.8376
iter 250: loss 3.9079, time 20003.37ms 
iter 251: loss 3.8668, time 5243.16ms 
iter 252: loss 3.7723, time 5254.96ms 
iter 253: loss 3.8231, time 5187.01ms 
iter 254: loss 3.8624, time 5243.98ms 
iter 255: loss 3.8151, time 5262.20ms 
iter 256: loss 3.8789, time 5254.40ms 
iter 257: loss 3.9603, time 5271.76ms 
iter 258: loss 3.6897, time 5235.34ms 
iter 259: loss 3.9231, time 5226.41ms 
iter 260: loss 3.7310, time 5235.95ms 
iter 261: loss 3.7900, time 5249.85ms 
iter 262: loss 3.9930, time 5220.35ms 
iter 263: loss 3.9778, time 5225.42ms 
iter 264: loss 3.9241, time 5268.81ms 
iter 265: loss 3.6365, time 5254.82ms 
iter 266: loss 3.8700, time 5213.56ms 
iter 267: loss 3.9564, time 5252.21ms 
iter 268: loss 3.7080, time 5251.55ms 
iter 269: loss 3.6192, time 5236.33ms 
iter 270: loss 3.7080, time 5254.78ms 
iter 271: loss 3.6761, time 5233.46ms 
iter 272: loss 3.7127, time 5236.31ms 
iter 273: loss 4.0396, time 5227.43ms 
iter 274: loss 3.7756, time 5225.18ms 
iter 275: loss 3.7705, time 5228.19ms 
iter 276: loss 3.7997, time 5252.46ms 
iter 277: loss 3.7881, time 5240.45ms 
iter 278: loss 4.0745, time 5239.60ms 
iter 279: loss 3.7657, time 5246.09ms 
iter 280: loss 3.6939, time 5242.04ms 
iter 281: loss 3.6109, time 5221.71ms 
iter 282: loss 3.8376, time 5248.30ms 
iter 283: loss 3.9319, time 5218.96ms 
iter 284: loss 3.7027, time 5241.99ms 
iter 285: loss 3.8259, time 5236.32ms 
iter 286: loss 3.6597, time 5241.67ms 
iter 287: loss 3.6566, time 5248.39ms 
iter 288: loss 3.7417, time 5235.83ms 
iter 289: loss 3.6482, time 5223.85ms 
iter 290: loss 3.7424, time 5242.80ms 
iter 291: loss 3.9070, time 5239.90ms 
iter 292: loss 4.0021, time 5249.28ms 
iter 293: loss 4.0848, time 5226.03ms 
iter 294: loss 3.8809, time 5258.66ms 
iter 295: loss 3.7563, time 5259.11ms 
iter 296: loss 3.6117, time 5267.01ms 
iter 297: loss 3.7342, time 5256.10ms 
iter 298: loss 3.9471, time 5255.56ms 
iter 299: loss 3.6241, time 5249.44ms 
step 300: train loss 3.7690, val loss 3.7493
iter 300: loss 3.7928, time 20042.78ms 
iter 301: loss 3.7068, time 5259.74ms 
iter 302: loss 3.7266, time 5270.32ms 
iter 303: loss 3.8885, time 5244.44ms 
iter 304: loss 3.8519, time 5255.60ms 
iter 305: loss 3.9160, time 5257.99ms 
iter 306: loss 3.8853, time 5254.82ms 
iter 307: loss 3.7919, time 5266.96ms 
iter 308: loss 3.7799, time 5257.63ms 
iter 309: loss 3.7303, time 5250.60ms 
iter 310: loss 3.6588, time 5252.81ms 
iter 311: loss 3.7752, time 5246.01ms 
iter 312: loss 3.5343, time 5268.92ms 
iter 313: loss 3.7711, time 5251.70ms 
iter 314: loss 3.6042, time 5252.29ms 
iter 315: loss 3.6793, time 5260.06ms 
iter 316: loss 3.8187, time 5253.85ms 
iter 317: loss 4.0241, time 5251.73ms 
iter 318: loss 3.9675, time 5257.93ms 
iter 319: loss 3.6006, time 5255.73ms 
iter 320: loss 3.8226, time 5260.87ms 
iter 321: loss 3.6268, time 5253.21ms 
iter 322: loss 3.7912, time 5257.40ms 
iter 323: loss 3.7971, time 5253.17ms 
iter 324: loss 3.7226, time 5255.73ms 
iter 325: loss 3.6253, time 5253.36ms 
iter 326: loss 3.6430, time 5253.65ms 
iter 327: loss 3.6993, time 5248.79ms 
iter 328: loss 3.9537, time 5258.21ms 
iter 329: loss 3.9250, time 5247.41ms 
iter 330: loss 3.7612, time 5259.16ms 
iter 331: loss 3.5635, time 5268.62ms 
iter 332: loss 3.7396, time 5268.37ms 
iter 333: loss 3.5893, time 5258.09ms 
iter 334: loss 3.6625, time 5263.55ms 
iter 335: loss 4.1023, time 5263.86ms 
iter 336: loss 3.6848, time 5257.00ms 
iter 337: loss 3.6582, time 5253.98ms 
iter 338: loss 3.6283, time 5252.74ms 
iter 339: loss 3.8543, time 5257.24ms 
iter 340: loss 3.7023, time 5265.79ms 
iter 341: loss 3.7694, time 5260.95ms 
iter 342: loss 3.7158, time 5256.26ms 
iter 343: loss 3.7102, time 5255.45ms 
iter 344: loss 3.7173, time 5253.51ms 
iter 345: loss 3.7998, time 5252.14ms 
iter 346: loss 3.7095, time 5261.37ms 
iter 347: loss 3.8649, time 5254.99ms 
iter 348: loss 3.7236, time 5254.58ms 
iter 349: loss 3.7106, time 5250.69ms 
step 350: train loss 3.6828, val loss 3.6728
iter 350: loss 3.7265, time 20069.70ms 
iter 351: loss 3.5970, time 5265.49ms 
iter 352: loss 3.6137, time 5261.78ms 
iter 353: loss 3.7350, time 5265.85ms 
iter 354: loss 3.7029, time 5254.66ms 
iter 355: loss 3.6315, time 5252.49ms 
iter 356: loss 3.5543, time 5250.66ms 
iter 357: loss 3.6731, time 5251.46ms 
iter 358: loss 3.6803, time 5263.49ms 
iter 359: loss 3.4885, time 5252.78ms 
iter 360: loss 3.7073, time 5266.01ms 
iter 361: loss 3.5791, time 5261.01ms 
iter 362: loss 3.5800, time 5354.38ms 
iter 363: loss 3.7152, time 5258.34ms 
iter 364: loss 3.7247, time 5258.77ms 
iter 365: loss 3.6092, time 5266.38ms 
iter 366: loss 3.5984, time 5262.22ms 
iter 367: loss 3.8819, time 5267.18ms 
iter 368: loss 3.5408, time 5250.53ms 
iter 369: loss 3.5222, time 5264.22ms 
iter 370: loss 3.4239, time 5259.41ms 
iter 371: loss 3.6778, time 5264.53ms 
iter 372: loss 3.8658, time 5248.19ms 
iter 373: loss 3.7378, time 5250.14ms 
iter 374: loss 3.5316, time 5256.97ms 
iter 375: loss 3.5180, time 5252.45ms 
iter 376: loss 3.5475, time 5253.51ms 
iter 377: loss 3.6333, time 5257.14ms 
iter 378: loss 3.6445, time 5253.39ms 
iter 379: loss 3.6384, time 5261.13ms 
iter 380: loss 3.6852, time 5257.68ms 
iter 381: loss 3.6795, time 5252.08ms 
iter 382: loss 3.6328, time 5254.02ms 
iter 383: loss 3.7269, time 5255.74ms 
iter 384: loss 3.8221, time 5253.03ms 
iter 385: loss 3.7245, time 5259.35ms 
iter 386: loss 3.5850, time 5259.76ms 
iter 387: loss 3.6303, time 5256.65ms 
iter 388: loss 3.4963, time 5255.34ms 
iter 389: loss 3.6175, time 5242.61ms 
iter 390: loss 3.7366, time 5255.69ms 
iter 391: loss 3.5596, time 5257.69ms 
iter 392: loss 3.4777, time 5253.32ms 
iter 393: loss 3.6443, time 5250.51ms 
iter 394: loss 3.5647, time 5266.89ms 
iter 395: loss 3.6772, time 5248.06ms 
iter 396: loss 3.5322, time 5251.96ms 
iter 397: loss 3.8301, time 5250.55ms 
iter 398: loss 3.6262, time 5257.24ms 
iter 399: loss 3.6498, time 5264.17ms 
step 400: train loss 3.6389, val loss 3.5975
iter 400: loss 3.7179, time 19901.35ms 
iter 401: loss 3.5728, time 5267.18ms 
iter 402: loss 3.5543, time 5262.24ms 
iter 403: loss 3.6316, time 5261.43ms 
iter 404: loss 3.5625, time 5265.35ms 
iter 405: loss 3.6248, time 5265.76ms 
iter 406: loss 3.4983, time 5254.18ms 
iter 407: loss 3.5607, time 5268.09ms 
iter 408: loss 3.6290, time 5256.50ms 
iter 409: loss 3.7295, time 5262.80ms 
iter 410: loss 3.6812, time 5274.99ms 
iter 411: loss 3.6891, time 5256.84ms 
iter 412: loss 3.7638, time 5250.05ms 
iter 413: loss 3.5623, time 5249.11ms 
iter 414: loss 3.5988, time 5255.97ms 
iter 415: loss 3.3158, time 5250.88ms 
iter 416: loss 3.7719, time 5263.58ms 
iter 417: loss 3.5998, time 5257.29ms 
iter 418: loss 3.6814, time 5255.41ms 
iter 419: loss 3.4247, time 5265.39ms 
iter 420: loss 3.5561, time 5254.96ms 
iter 421: loss 3.5637, time 5255.90ms 
iter 422: loss 3.6059, time 5249.49ms 
iter 423: loss 3.5903, time 5266.89ms 
iter 424: loss 3.5834, time 5252.34ms 
iter 425: loss 3.5372, time 5252.41ms 
iter 426: loss 3.7215, time 5258.11ms 
iter 427: loss 3.6192, time 5265.74ms 
iter 428: loss 3.6634, time 5267.38ms 
iter 429: loss 3.6190, time 5265.07ms 
iter 430: loss 3.6815, time 5264.90ms 
iter 431: loss 3.5188, time 5265.65ms 
iter 432: loss 3.4482, time 5270.19ms 
iter 433: loss 3.6518, time 5264.23ms 
iter 434: loss 3.6232, time 5270.86ms 
iter 435: loss 3.5993, time 5266.09ms 
iter 436: loss 3.4670, time 5253.55ms 
iter 437: loss 3.4920, time 5260.32ms 
iter 438: loss 3.4629, time 5267.92ms 
iter 439: loss 3.5942, time 5262.00ms 
iter 440: loss 3.6464, time 5260.39ms 
iter 441: loss 3.6563, time 5262.45ms 
iter 442: loss 3.8004, time 5250.80ms 
iter 443: loss 3.6806, time 5253.47ms 
iter 444: loss 3.4898, time 5256.13ms 
iter 445: loss 3.5464, time 5259.26ms 
iter 446: loss 3.4589, time 5258.24ms 
iter 447: loss 3.6953, time 5255.81ms 
iter 448: loss 3.5757, time 5267.86ms 
iter 449: loss 3.4499, time 5256.45ms 
step 450: train loss 3.5751, val loss 3.5548
iter 450: loss 3.7158, time 19782.79ms 
iter 451: loss 3.4445, time 5259.16ms 
iter 452: loss 3.5595, time 5253.31ms 
iter 453: loss 3.5875, time 5258.43ms 
iter 454: loss 3.5143, time 5250.73ms 
iter 455: loss 3.5889, time 5255.11ms 
iter 456: loss 3.6870, time 5250.16ms 
iter 457: loss 3.5095, time 5255.35ms 
iter 458: loss 3.6194, time 5252.98ms 
iter 459: loss 3.4334, time 5257.44ms 
iter 460: loss 3.6523, time 5251.35ms 
iter 461: loss 3.5744, time 5252.98ms 
iter 462: loss 3.6116, time 5258.64ms 
iter 463: loss 3.6344, time 5259.05ms 
iter 464: loss 3.6232, time 5257.44ms 
iter 465: loss 3.4165, time 5256.83ms 
iter 466: loss 3.6604, time 5254.04ms 
iter 467: loss 3.5264, time 5268.74ms 
iter 468: loss 3.5722, time 5262.17ms 
iter 469: loss 3.3517, time 5263.85ms 
iter 470: loss 3.1848, time 5256.60ms 
iter 471: loss 3.7011, time 5227.81ms 
iter 472: loss 3.3926, time 5267.23ms 
iter 473: loss 3.5740, time 5241.57ms 
iter 474: loss 3.5430, time 5254.66ms 
iter 475: loss 3.5615, time 5258.99ms 
iter 476: loss 3.7405, time 5249.64ms 
iter 477: loss 3.7289, time 5255.69ms 
iter 478: loss 3.5843, time 5250.86ms 
iter 479: loss 3.5464, time 5231.25ms 
iter 480: loss 3.5474, time 5249.64ms 
iter 481: loss 3.6667, time 5259.14ms 
iter 482: loss 3.6453, time 5251.06ms 
iter 483: loss 3.4854, time 5255.14ms 
iter 484: loss 3.6503, time 5259.67ms 
iter 485: loss 3.4186, time 5261.77ms 
iter 486: loss 3.6276, time 5245.68ms 
iter 487: loss 3.4718, time 5254.03ms 
iter 488: loss 3.3762, time 5249.63ms 
iter 489: loss 3.5173, time 5265.67ms 
iter 490: loss 3.5708, time 5255.96ms 
iter 491: loss 3.4856, time 5246.77ms 
iter 492: loss 3.5008, time 5209.96ms 
iter 493: loss 3.4774, time 5235.91ms 
iter 494: loss 3.7688, time 5255.96ms 
iter 495: loss 3.5120, time 5254.44ms 
iter 496: loss 3.4738, time 5252.93ms 
iter 497: loss 3.5374, time 5264.88ms 
iter 498: loss 3.7807, time 5264.19ms 
iter 499: loss 3.5670, time 5259.36ms 
step 500: train loss 3.5069, val loss 3.4779
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 500: loss 3.4722, time 21738.23ms 
iter 501: loss 3.4765, time 5236.86ms 
iter 502: loss 3.4857, time 5102.67ms 
iter 503: loss 3.5006, time 5050.96ms 
iter 504: loss 3.4528, time 5129.01ms 
iter 505: loss 3.5398, time 5237.83ms 
iter 506: loss 3.3332, time 5257.15ms 
iter 507: loss 3.5241, time 5267.07ms 
iter 508: loss 3.6595, time 5260.47ms 
iter 509: loss 3.3593, time 5272.59ms 
iter 510: loss 3.3866, time 5254.08ms 
iter 511: loss 3.4491, time 5253.78ms 
iter 512: loss 3.4804, time 5255.08ms 
iter 513: loss 3.3428, time 5258.49ms 
iter 514: loss 3.5181, time 5236.26ms 
iter 515: loss 3.4747, time 5250.63ms 
iter 516: loss 3.3920, time 5253.01ms 
iter 517: loss 3.4643, time 5248.99ms 
iter 518: loss 3.4662, time 5258.51ms 
iter 519: loss 3.6778, time 5249.91ms 
iter 520: loss 3.4495, time 5248.52ms 
iter 521: loss 3.5807, time 5257.78ms 
iter 522: loss 3.3364, time 5255.36ms 
iter 523: loss 3.3047, time 5261.81ms 
iter 524: loss 3.7616, time 5267.81ms 
iter 525: loss 3.5784, time 5246.87ms 
iter 526: loss 3.4849, time 5251.01ms 
iter 527: loss 3.4486, time 5274.76ms 
iter 528: loss 3.3816, time 5233.43ms 
iter 529: loss 3.5486, time 5220.30ms 
iter 530: loss 3.3356, time 5250.94ms 
iter 531: loss 3.2413, time 5255.02ms 
iter 532: loss 3.3559, time 5250.41ms 
iter 533: loss 3.4889, time 5240.00ms 
iter 534: loss 3.4132, time 5262.59ms 
iter 535: loss 3.5857, time 5263.04ms 
iter 536: loss 3.5528, time 5258.54ms 
iter 537: loss 3.4589, time 5267.56ms 
iter 538: loss 3.4201, time 5253.22ms 
iter 539: loss 3.4880, time 5256.05ms 
iter 540: loss 3.3309, time 5255.88ms 
iter 541: loss 3.5071, time 5261.76ms 
iter 542: loss 3.3115, time 5278.74ms 
iter 543: loss 3.4104, time 5259.75ms 
iter 544: loss 3.4510, time 5259.71ms 
iter 545: loss 3.4469, time 5254.95ms 
iter 546: loss 3.4700, time 5260.77ms 
iter 547: loss 3.4281, time 5235.92ms 
iter 548: loss 3.4986, time 5261.18ms 
iter 549: loss 3.3992, time 5254.12ms 
step 550: train loss 3.4702, val loss 3.4358
iter 550: loss 3.3731, time 20009.21ms 
iter 551: loss 3.4704, time 5252.90ms 
iter 552: loss 3.7167, time 5266.98ms 
iter 553: loss 3.3951, time 5264.33ms 
iter 554: loss 3.4369, time 5251.74ms 
iter 555: loss 3.4229, time 5247.54ms 
iter 556: loss 3.5219, time 5254.97ms 
iter 557: loss 3.4288, time 5247.87ms 
iter 558: loss 3.4164, time 5253.73ms 
iter 559: loss 3.5560, time 5263.21ms 
iter 560: loss 3.4852, time 5266.26ms 
iter 561: loss 3.5888, time 5258.27ms 
iter 562: loss 3.2291, time 5255.80ms 
iter 563: loss 3.4011, time 5267.57ms 
iter 564: loss 3.2858, time 5257.40ms 
iter 565: loss 3.3605, time 5259.80ms 
iter 566: loss 3.3376, time 5258.18ms 
iter 567: loss 3.5264, time 5273.01ms 
iter 568: loss 3.3857, time 5248.85ms 
iter 569: loss 3.5288, time 5257.45ms 
iter 570: loss 3.4501, time 5243.99ms 
iter 571: loss 3.4689, time 5242.12ms 
iter 572: loss 3.5844, time 5255.95ms 
iter 573: loss 3.2821, time 5234.42ms 
iter 574: loss 3.4869, time 5250.99ms 
iter 575: loss 3.6443, time 5246.85ms 
iter 576: loss 3.4058, time 5242.99ms 
iter 577: loss 3.3754, time 5248.07ms 
iter 578: loss 3.4811, time 5237.99ms 
iter 579: loss 3.3599, time 5233.99ms 
iter 580: loss 3.5498, time 5233.01ms 
iter 581: loss 3.6107, time 5232.72ms 
iter 582: loss 3.6142, time 5233.92ms 
iter 583: loss 3.4937, time 5250.86ms 
iter 584: loss 3.4502, time 5241.32ms 
iter 585: loss 3.4568, time 5244.57ms 
iter 586: loss 3.4135, time 5264.45ms 
iter 587: loss 3.4746, time 5245.20ms 
iter 588: loss 3.2548, time 5245.32ms 
iter 589: loss 3.3744, time 5239.40ms 
iter 590: loss 3.3372, time 5246.68ms 
iter 591: loss 3.4266, time 5239.09ms 
iter 592: loss 3.4155, time 5240.93ms 
iter 593: loss 3.3221, time 5254.07ms 
iter 594: loss 3.5330, time 5238.35ms 
iter 595: loss 3.4398, time 5238.05ms 
iter 596: loss 3.4358, time 5245.04ms 
iter 597: loss 3.4706, time 5246.35ms 
iter 598: loss 3.3774, time 5259.96ms 
iter 599: loss 3.4909, time 5253.44ms 
step 600: train loss 3.4234, val loss 3.4043
iter 600: loss 3.3977, time 20002.36ms 
iter 601: loss 3.4401, time 5252.52ms 
iter 602: loss 3.4881, time 5248.10ms 
iter 603: loss 3.4281, time 5255.30ms 
iter 604: loss 3.4121, time 5252.89ms 
iter 605: loss 3.2185, time 5254.89ms 
iter 606: loss 3.3691, time 5249.77ms 
iter 607: loss 3.2642, time 5243.64ms 
iter 608: loss 3.3364, time 5264.37ms 
iter 609: loss 3.5040, time 5249.29ms 
iter 610: loss 3.5608, time 5255.96ms 
iter 611: loss 3.3867, time 5256.09ms 
iter 612: loss 3.3945, time 5259.47ms 
iter 613: loss 3.5280, time 5259.77ms 
iter 614: loss 3.5851, time 5258.81ms 
iter 615: loss 3.3436, time 5248.61ms 
iter 616: loss 3.6160, time 5254.29ms 
iter 617: loss 3.2494, time 5254.81ms 
iter 618: loss 3.5066, time 5254.91ms 
iter 619: loss 3.3814, time 5265.33ms 
iter 620: loss 3.2738, time 5261.90ms 
iter 621: loss 3.3575, time 5270.28ms 
iter 622: loss 3.7747, time 5269.97ms 
iter 623: loss 3.3707, time 5258.86ms 
iter 624: loss 3.2922, time 5251.58ms 
iter 625: loss 3.3513, time 5246.97ms 
iter 626: loss 3.4121, time 5252.28ms 
iter 627: loss 3.4188, time 5251.54ms 
iter 628: loss 3.3891, time 5246.95ms 
iter 629: loss 3.4956, time 5249.52ms 
iter 630: loss 3.3743, time 5259.24ms 
iter 631: loss 3.4628, time 5275.52ms 
iter 632: loss 3.3661, time 5254.73ms 
iter 633: loss 3.4723, time 5256.86ms 
iter 634: loss 3.3783, time 5261.19ms 
iter 635: loss 3.6295, time 5261.47ms 
iter 636: loss 3.4075, time 5266.60ms 
iter 637: loss 3.2928, time 5258.21ms 
iter 638: loss 3.4442, time 5256.45ms 
iter 639: loss 3.4312, time 5264.60ms 
iter 640: loss 3.3673, time 5262.00ms 
iter 641: loss 3.4672, time 5263.55ms 
iter 642: loss 3.2478, time 5256.16ms 
iter 643: loss 3.4036, time 5265.95ms 
iter 644: loss 3.3364, time 5260.43ms 
iter 645: loss 3.4658, time 5261.19ms 
iter 646: loss 3.3298, time 5253.99ms 
iter 647: loss 3.4337, time 5264.57ms 
iter 648: loss 3.4437, time 5254.91ms 
iter 649: loss 3.3376, time 5258.79ms 
step 650: train loss 3.3869, val loss 3.3702
iter 650: loss 3.4078, time 20034.06ms 
iter 651: loss 3.4565, time 5256.89ms 
iter 652: loss 3.4195, time 5245.64ms 
iter 653: loss 3.3792, time 5247.17ms 
iter 654: loss 3.5290, time 5243.34ms 
iter 655: loss 3.1878, time 5246.01ms 
iter 656: loss 3.5854, time 5253.68ms 
iter 657: loss 3.4231, time 5256.08ms 
iter 658: loss 3.2414, time 5252.08ms 
iter 659: loss 3.5264, time 5248.46ms 
iter 660: loss 3.4920, time 5246.46ms 
iter 661: loss 3.3622, time 5249.24ms 
iter 662: loss 3.3753, time 5245.41ms 
iter 663: loss 3.3677, time 5249.89ms 
iter 664: loss 3.2842, time 5266.00ms 
iter 665: loss 3.2733, time 5254.11ms 
iter 666: loss 3.3397, time 5251.45ms 
iter 667: loss 3.3168, time 5250.44ms 
iter 668: loss 3.1773, time 5252.98ms 
iter 669: loss 3.4190, time 5245.27ms 
iter 670: loss 3.2800, time 5261.02ms 
iter 671: loss 3.4486, time 5246.64ms 
iter 672: loss 3.4919, time 5252.52ms 
iter 673: loss 3.3487, time 5251.22ms 
iter 674: loss 3.3085, time 5247.33ms 
iter 675: loss 3.5651, time 5249.99ms 
iter 676: loss 3.3850, time 5248.37ms 
iter 677: loss 3.3190, time 5244.96ms 
iter 678: loss 3.4222, time 5244.41ms 
iter 679: loss 3.2683, time 5250.17ms 
iter 680: loss 3.3592, time 5249.71ms 
iter 681: loss 3.3180, time 5263.85ms 
iter 682: loss 3.2294, time 5265.29ms 
iter 683: loss 3.3388, time 5259.10ms 
iter 684: loss 3.3799, time 5258.21ms 
iter 685: loss 3.5235, time 5257.74ms 
iter 686: loss 3.4667, time 5257.58ms 
iter 687: loss 3.5330, time 5261.78ms 
iter 688: loss 3.6669, time 5259.48ms 
iter 689: loss 3.1754, time 5264.43ms 
iter 690: loss 3.4235, time 5256.67ms 
iter 691: loss 3.3962, time 5252.23ms 
iter 692: loss 3.3333, time 5254.39ms 
iter 693: loss 3.2996, time 5259.56ms 
iter 694: loss 3.4811, time 5256.97ms 
iter 695: loss 3.1793, time 5253.98ms 
iter 696: loss 3.4388, time 5259.78ms 
iter 697: loss 3.3898, time 5248.26ms 
iter 698: loss 3.2051, time 5255.30ms 
iter 699: loss 3.5338, time 5262.28ms 
step 700: train loss 3.3470, val loss 3.3307
iter 700: loss 3.3100, time 20022.87ms 
iter 701: loss 3.3374, time 5257.75ms 
iter 702: loss 3.1817, time 5262.57ms 
iter 703: loss 3.4878, time 5254.67ms 
iter 704: loss 3.2620, time 5265.48ms 
iter 705: loss 3.4279, time 5249.39ms 
iter 706: loss 3.2639, time 5246.04ms 
iter 707: loss 3.2544, time 5250.19ms 
iter 708: loss 3.4285, time 5257.99ms 
iter 709: loss 3.3537, time 5254.94ms 
iter 710: loss 3.2042, time 5250.58ms 
iter 711: loss 3.5209, time 5263.10ms 
iter 712: loss 3.2558, time 5250.74ms 
iter 713: loss 3.3490, time 5251.92ms 
iter 714: loss 3.2475, time 5254.43ms 
iter 715: loss 3.2965, time 5259.98ms 
iter 716: loss 3.3543, time 5251.55ms 
iter 717: loss 3.1207, time 5245.35ms 
iter 718: loss 3.3289, time 5248.46ms 
iter 719: loss 3.2733, time 5261.35ms 
iter 720: loss 3.2910, time 5236.36ms 
iter 721: loss 3.2868, time 5251.21ms 
iter 722: loss 3.3103, time 5247.52ms 
iter 723: loss 3.2974, time 5247.11ms 
iter 724: loss 3.3066, time 5256.52ms 
iter 725: loss 3.4514, time 5272.02ms 
iter 726: loss 3.2320, time 5268.78ms 
iter 727: loss 3.3761, time 5257.92ms 
iter 728: loss 3.2827, time 5280.46ms 
iter 729: loss 3.2645, time 5269.93ms 
iter 730: loss 3.2993, time 5254.05ms 
iter 731: loss 3.2157, time 5246.53ms 
iter 732: loss 3.3195, time 5247.44ms 
iter 733: loss 3.2285, time 5251.93ms 
iter 734: loss 3.3927, time 5256.12ms 
iter 735: loss 3.3512, time 5258.13ms 
iter 736: loss 3.3024, time 5247.41ms 
iter 737: loss 3.3553, time 5250.77ms 
iter 738: loss 3.2313, time 5257.38ms 
iter 739: loss 3.3972, time 5260.13ms 
iter 740: loss 3.5462, time 5263.80ms 
iter 741: loss 3.1328, time 5255.72ms 
iter 742: loss 3.3361, time 5264.95ms 
iter 743: loss 3.2948, time 5257.55ms 
iter 744: loss 3.4447, time 5258.49ms 
iter 745: loss 3.2866, time 5261.58ms 
iter 746: loss 3.3238, time 5260.31ms 
iter 747: loss 3.3663, time 5272.03ms 
iter 748: loss 3.2836, time 5268.01ms 
iter 749: loss 3.3207, time 5265.08ms 
step 750: train loss 3.3154, val loss 3.3035
iter 750: loss 3.4752, time 20057.59ms 
iter 751: loss 3.3108, time 5260.34ms 
iter 752: loss 3.5334, time 5261.70ms 
iter 753: loss 3.2638, time 5254.62ms 
iter 754: loss 3.5408, time 5262.25ms 
iter 755: loss 3.2781, time 5268.04ms 
iter 756: loss 3.3254, time 5272.41ms 
iter 757: loss 3.3104, time 5266.04ms 
iter 758: loss 3.2193, time 5282.33ms 
iter 759: loss 3.3360, time 5268.85ms 
iter 760: loss 3.3080, time 5229.25ms 
iter 761: loss 3.3952, time 5275.26ms 
iter 762: loss 3.1987, time 5289.36ms 
iter 763: loss 3.5552, time 5270.46ms 
iter 764: loss 3.3526, time 5268.68ms 
iter 765: loss 3.1642, time 5261.49ms 
iter 766: loss 3.3348, time 5250.10ms 
iter 767: loss 3.3670, time 5251.51ms 
iter 768: loss 3.3222, time 5253.56ms 
iter 769: loss 3.2792, time 5250.70ms 
iter 770: loss 3.3118, time 5248.94ms 
iter 771: loss 3.2814, time 5267.41ms 
iter 772: loss 3.1773, time 5255.73ms 
iter 773: loss 3.3290, time 5251.00ms 
iter 774: loss 3.1425, time 5254.52ms 
iter 775: loss 3.5191, time 5253.90ms 
iter 776: loss 3.4268, time 5254.27ms 
iter 777: loss 3.3183, time 5252.70ms 
iter 778: loss 3.1526, time 5255.40ms 
iter 779: loss 3.2757, time 5253.17ms 
iter 780: loss 3.3200, time 5251.56ms 
iter 781: loss 3.1912, time 5249.25ms 
iter 782: loss 3.2634, time 5265.95ms 
iter 783: loss 3.1844, time 5243.23ms 
iter 784: loss 3.4267, time 5246.27ms 
iter 785: loss 3.1290, time 5246.16ms 
iter 786: loss 3.0998, time 5261.25ms 
iter 787: loss 3.2951, time 5252.59ms 
iter 788: loss 3.3545, time 5227.50ms 
iter 789: loss 3.2071, time 5247.47ms 
iter 790: loss 3.1411, time 5258.07ms 
iter 791: loss 3.1630, time 5253.26ms 
iter 792: loss 3.2671, time 5263.32ms 
iter 793: loss 3.3101, time 5248.22ms 
iter 794: loss 3.2980, time 5277.78ms 
iter 795: loss 3.1473, time 5254.25ms 
iter 796: loss 3.2872, time 5258.97ms 
iter 797: loss 3.4529, time 5260.88ms 
iter 798: loss 3.2965, time 5263.63ms 
iter 799: loss 3.2170, time 5282.00ms 
step 800: train loss 3.2561, val loss 3.2542
iter 800: loss 3.2413, time 20025.19ms 
iter 801: loss 3.1779, time 5250.54ms 
iter 802: loss 3.2142, time 5254.57ms 
iter 803: loss 3.1378, time 5251.39ms 
iter 804: loss 3.2644, time 5252.43ms 
iter 805: loss 3.0968, time 5251.82ms 
iter 806: loss 3.0703, time 5256.54ms 
iter 807: loss 3.5111, time 5239.22ms 
iter 808: loss 3.4389, time 5257.14ms 
iter 809: loss 3.2974, time 5228.10ms 
iter 810: loss 3.4994, time 5267.40ms 
iter 811: loss 3.2785, time 5264.81ms 
iter 812: loss 3.3738, time 5254.40ms 
iter 813: loss 3.2543, time 5251.96ms 
iter 814: loss 3.2130, time 5260.65ms 
iter 815: loss 3.2455, time 5259.27ms 
iter 816: loss 3.4212, time 5259.82ms 
iter 817: loss 3.1072, time 5260.88ms 
iter 818: loss 3.1346, time 5256.88ms 
iter 819: loss 3.3622, time 5257.46ms 
iter 820: loss 3.2870, time 5267.74ms 
iter 821: loss 3.2784, time 5252.86ms 
iter 822: loss 3.4318, time 5270.26ms 
iter 823: loss 3.1454, time 5258.01ms 
iter 824: loss 3.2234, time 5256.40ms 
iter 825: loss 3.2252, time 5268.30ms 
iter 826: loss 3.3075, time 5237.00ms 
iter 827: loss 3.4976, time 5259.66ms 
iter 828: loss 3.2973, time 5266.84ms 
iter 829: loss 3.1699, time 5255.17ms 
iter 830: loss 3.2762, time 5242.59ms 
iter 831: loss 3.2144, time 5239.43ms 
iter 832: loss 3.4378, time 5219.66ms 
iter 833: loss 3.3360, time 5257.36ms 
iter 834: loss 3.0221, time 5247.26ms 
iter 835: loss 3.2839, time 5250.92ms 
iter 836: loss 3.0476, time 5253.51ms 
iter 837: loss 3.2667, time 5251.42ms 
iter 838: loss 3.1626, time 5256.20ms 
iter 839: loss 3.3944, time 5248.04ms 
iter 840: loss 3.3111, time 5245.58ms 
iter 841: loss 3.1868, time 5254.86ms 
iter 842: loss 3.1307, time 5248.60ms 
iter 843: loss 3.2797, time 5248.01ms 
iter 844: loss 3.3034, time 5260.28ms 
iter 845: loss 3.1364, time 5251.47ms 
iter 846: loss 3.3139, time 5255.47ms 
iter 847: loss 3.3385, time 5251.61ms 
iter 848: loss 3.4139, time 5250.85ms 
iter 849: loss 3.2436, time 5254.58ms 
step 850: train loss 3.2355, val loss 3.2373
iter 850: loss 3.3605, time 20034.33ms 
iter 851: loss 3.4446, time 5249.71ms 
iter 852: loss 3.1538, time 5249.30ms 
iter 853: loss 3.3487, time 5247.01ms 
iter 854: loss 3.3761, time 5261.57ms 
iter 855: loss 3.2970, time 5260.93ms 
iter 856: loss 3.4687, time 5248.63ms 
iter 857: loss 3.3762, time 5238.44ms 
iter 858: loss 3.1934, time 5234.75ms 
iter 859: loss 3.1779, time 5241.72ms 
iter 860: loss 3.0776, time 5235.98ms 
iter 861: loss 3.1365, time 5234.04ms 
iter 862: loss 3.2732, time 5234.79ms 
iter 863: loss 3.2645, time 5241.43ms 
iter 864: loss 3.2185, time 5237.24ms 
iter 865: loss 3.1181, time 5249.28ms 
iter 866: loss 3.2868, time 5243.26ms 
iter 867: loss 3.2819, time 5250.80ms 
iter 868: loss 3.4990, time 5221.14ms 
iter 869: loss 3.3993, time 5194.88ms 
iter 870: loss 3.1198, time 5220.19ms 
iter 871: loss 3.4238, time 5261.40ms 
iter 872: loss 3.0679, time 5252.86ms 
iter 873: loss 3.2564, time 5248.92ms 
iter 874: loss 3.1328, time 5256.88ms 
iter 875: loss 3.3988, time 5258.34ms 
iter 876: loss 3.1418, time 5283.71ms 
iter 877: loss 3.2268, time 5278.30ms 
iter 878: loss 3.3341, time 5259.12ms 
iter 879: loss 3.0954, time 5263.51ms 
iter 880: loss 3.2206, time 5255.02ms 
iter 881: loss 3.1942, time 5257.70ms 
iter 882: loss 3.3013, time 5258.97ms 
iter 883: loss 3.2236, time 5251.91ms 
iter 884: loss 3.6502, time 5251.65ms 
iter 885: loss 3.1680, time 5260.94ms 
iter 886: loss 3.0889, time 5265.32ms 
iter 887: loss 3.3245, time 5267.01ms 
iter 888: loss 3.1741, time 5254.29ms 
iter 889: loss 3.1071, time 5253.98ms 
iter 890: loss 2.9905, time 5273.17ms 
iter 891: loss 3.5259, time 5262.84ms 
iter 892: loss 3.3292, time 5264.37ms 
iter 893: loss 3.3580, time 5254.40ms 
iter 894: loss 3.2141, time 5251.61ms 
iter 895: loss 3.3339, time 5250.55ms 
iter 896: loss 3.4891, time 5251.13ms 
iter 897: loss 3.0725, time 5272.02ms 
iter 898: loss 3.2959, time 5248.02ms 
iter 899: loss 3.2615, time 5252.46ms 
step 900: train loss 3.1981, val loss 3.2053
iter 900: loss 3.2948, time 20052.12ms 
iter 901: loss 3.3410, time 5249.74ms 
iter 902: loss 3.3482, time 5263.85ms 
iter 903: loss 3.3175, time 5259.79ms 
iter 904: loss 3.1969, time 5260.65ms 
iter 905: loss 3.2313, time 5266.42ms 
iter 906: loss 3.2709, time 5269.29ms 
iter 907: loss 3.3645, time 5286.33ms 
iter 908: loss 3.2107, time 5269.17ms 
iter 909: loss 3.1658, time 5256.32ms 
iter 910: loss 3.1429, time 5262.11ms 
iter 911: loss 3.2776, time 5262.79ms 
iter 912: loss 3.1999, time 5259.12ms 
iter 913: loss 3.1370, time 5258.66ms 
iter 914: loss 3.1371, time 5259.80ms 
iter 915: loss 3.1676, time 5275.40ms 
iter 916: loss 3.1539, time 5256.62ms 
iter 917: loss 3.2341, time 5268.06ms 
iter 918: loss 3.3434, time 5292.43ms 
iter 919: loss 3.2551, time 5261.11ms 
iter 920: loss 3.1159, time 5259.76ms 
iter 921: loss 3.1744, time 5267.38ms 
iter 922: loss 3.2158, time 5210.58ms 
iter 923: loss 3.1730, time 5246.66ms 
iter 924: loss 2.9805, time 5251.35ms 
iter 925: loss 3.2860, time 5250.96ms 
iter 926: loss 3.0096, time 5263.57ms 
iter 927: loss 3.1509, time 5264.36ms 
iter 928: loss 3.3153, time 5254.86ms 
iter 929: loss 3.1326, time 5257.02ms 
iter 930: loss 3.5265, time 5253.17ms 
iter 931: loss 3.1451, time 5258.76ms 
iter 932: loss 3.2357, time 5246.33ms 
iter 933: loss 3.0289, time 5251.03ms 
iter 934: loss 3.2285, time 5251.62ms 
iter 935: loss 3.1451, time 5264.93ms 
iter 936: loss 3.2057, time 5259.26ms 
iter 937: loss 3.4328, time 5244.29ms 
iter 938: loss 3.3513, time 5263.10ms 
iter 939: loss 3.1367, time 5258.17ms 
iter 940: loss 3.2874, time 5261.41ms 
iter 941: loss 3.2095, time 5269.32ms 
iter 942: loss 3.1754, time 5279.51ms 
iter 943: loss 3.1450, time 5267.29ms 
iter 944: loss 3.0632, time 5276.93ms 
iter 945: loss 3.2659, time 5276.38ms 
iter 946: loss 3.0352, time 5248.54ms 
iter 947: loss 3.2563, time 5247.79ms 
iter 948: loss 3.2271, time 5212.44ms 
iter 949: loss 3.0568, time 5225.28ms 
step 950: train loss 3.1633, val loss 3.1873
iter 950: loss 3.0639, time 20034.66ms 
iter 951: loss 3.1993, time 5267.27ms 
iter 952: loss 3.2120, time 5255.02ms 
iter 953: loss 3.4104, time 5261.91ms 
iter 954: loss 3.2206, time 5254.96ms 
iter 955: loss 3.0995, time 5206.97ms 
iter 956: loss 3.1917, time 5256.73ms 
iter 957: loss 3.2954, time 5255.97ms 
iter 958: loss 3.0905, time 5250.72ms 
iter 959: loss 3.0371, time 5259.38ms 
iter 960: loss 3.2158, time 5254.72ms 
iter 961: loss 3.1881, time 5255.80ms 
iter 962: loss 3.2017, time 5260.40ms 
iter 963: loss 3.2877, time 5257.09ms 
iter 964: loss 3.1110, time 5249.81ms 
iter 965: loss 2.9869, time 5218.04ms 
iter 966: loss 3.3235, time 5271.07ms 
iter 967: loss 3.0347, time 5266.57ms 
iter 968: loss 3.4763, time 5263.83ms 
iter 969: loss 3.1278, time 5262.06ms 
iter 970: loss 3.0611, time 5262.06ms 
iter 971: loss 3.0080, time 5253.90ms 
iter 972: loss 3.1098, time 5246.86ms 
iter 973: loss 3.0564, time 5253.14ms 
iter 974: loss 2.9648, time 5298.85ms 
iter 975: loss 3.0828, time 5341.85ms 
iter 976: loss 3.3561, time 5257.03ms 
iter 977: loss 3.0957, time 5255.74ms 
iter 978: loss 3.2906, time 5256.24ms 
iter 979: loss 3.2292, time 5253.00ms 
iter 980: loss 3.2497, time 5255.76ms 
iter 981: loss 3.3049, time 5246.56ms 
iter 982: loss 3.1375, time 5248.17ms 
iter 983: loss 3.0795, time 5248.83ms 
iter 984: loss 3.0610, time 5261.88ms 
iter 985: loss 3.0591, time 5264.72ms 
iter 986: loss 3.0116, time 5253.16ms 
iter 987: loss 3.0492, time 5255.63ms 
iter 988: loss 3.4365, time 5261.70ms 
iter 989: loss 3.0780, time 5258.92ms 
iter 990: loss 3.2155, time 5260.67ms 
iter 991: loss 3.3002, time 5260.49ms 
iter 992: loss 3.0239, time 5263.08ms 
iter 993: loss 3.0343, time 5410.82ms 
iter 994: loss 2.9554, time 5412.34ms 
iter 995: loss 3.3134, time 5380.99ms 
iter 996: loss 2.9940, time 5248.47ms 
iter 997: loss 3.1714, time 5253.01ms 
iter 998: loss 3.1733, time 5251.57ms 
iter 999: loss 3.1955, time 5344.10ms 
step 1000: train loss 3.1693, val loss 3.1653
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1000: loss 3.0506, time 21803.03ms 
iter 1001: loss 3.0231, time 5248.98ms 
iter 1002: loss 3.0748, time 5255.24ms 
iter 1003: loss 3.3412, time 5256.29ms 
iter 1004: loss 3.3182, time 5257.58ms 
iter 1005: loss 3.2380, time 5265.11ms 
iter 1006: loss 3.2146, time 5272.13ms 
iter 1007: loss 3.1154, time 5263.13ms 
iter 1008: loss 3.1950, time 5261.71ms 
iter 1009: loss 3.1966, time 5255.74ms 
iter 1010: loss 3.0757, time 5255.72ms 
iter 1011: loss 3.3091, time 5257.77ms 
iter 1012: loss 3.3669, time 5260.75ms 
iter 1013: loss 3.0233, time 5257.39ms 
iter 1014: loss 3.0501, time 5261.21ms 
iter 1015: loss 3.3378, time 5257.03ms 
iter 1016: loss 3.0829, time 5253.81ms 
iter 1017: loss 3.1049, time 5258.94ms 
iter 1018: loss 3.0676, time 5261.41ms 
iter 1019: loss 3.2225, time 5257.32ms 
iter 1020: loss 3.2938, time 5250.16ms 
iter 1021: loss 3.1649, time 5251.03ms 
iter 1022: loss 3.2664, time 5264.82ms 
iter 1023: loss 3.1204, time 5255.74ms 
iter 1024: loss 3.2286, time 5253.36ms 
iter 1025: loss 3.0647, time 5250.31ms 
iter 1026: loss 3.1958, time 5261.89ms 
iter 1027: loss 3.0484, time 5253.23ms 
iter 1028: loss 3.1711, time 5250.75ms 
iter 1029: loss 3.2435, time 5265.78ms 
iter 1030: loss 3.0433, time 5268.17ms 
iter 1031: loss 3.1747, time 5258.39ms 
iter 1032: loss 3.2833, time 5228.40ms 
iter 1033: loss 3.3109, time 5263.25ms 
iter 1034: loss 3.0998, time 5260.33ms 
iter 1035: loss 3.1264, time 5259.09ms 
iter 1036: loss 3.1243, time 5256.97ms 
iter 1037: loss 3.0184, time 5266.14ms 
iter 1038: loss 3.2703, time 5263.77ms 
iter 1039: loss 3.1595, time 5263.98ms 
iter 1040: loss 3.1825, time 5250.75ms 
iter 1041: loss 3.0960, time 5260.84ms 
iter 1042: loss 3.0206, time 5267.73ms 
iter 1043: loss 3.1648, time 5253.59ms 
iter 1044: loss 3.1276, time 5246.26ms 
iter 1045: loss 3.1558, time 5251.66ms 
iter 1046: loss 3.2465, time 5255.10ms 
iter 1047: loss 3.1232, time 5262.70ms 
iter 1048: loss 2.9269, time 5253.92ms 
iter 1049: loss 3.1092, time 5251.29ms 
step 1050: train loss 3.1266, val loss 3.1478
iter 1050: loss 3.1692, time 20048.75ms 
iter 1051: loss 3.1943, time 5247.59ms 
iter 1052: loss 3.1081, time 5258.69ms 
iter 1053: loss 3.0733, time 5248.91ms 
iter 1054: loss 3.1446, time 5246.26ms 
iter 1055: loss 3.2022, time 5254.06ms 
iter 1056: loss 3.1499, time 5256.32ms 
iter 1057: loss 3.1607, time 5258.03ms 
iter 1058: loss 3.1985, time 5259.72ms 
iter 1059: loss 3.1475, time 5254.98ms 
iter 1060: loss 3.1624, time 5251.40ms 
iter 1061: loss 3.2973, time 5249.62ms 
iter 1062: loss 3.1267, time 5222.75ms 
iter 1063: loss 3.0892, time 5259.07ms 
iter 1064: loss 3.0869, time 5261.92ms 
iter 1065: loss 3.2347, time 5252.74ms 
iter 1066: loss 3.1478, time 5248.99ms 
iter 1067: loss 3.0813, time 5229.16ms 
iter 1068: loss 3.0334, time 5261.21ms 
iter 1069: loss 3.2032, time 5252.88ms 
iter 1070: loss 3.1383, time 5248.56ms 
iter 1071: loss 3.1698, time 5253.59ms 
iter 1072: loss 2.9692, time 5216.68ms 
iter 1073: loss 3.1876, time 5177.15ms 
iter 1074: loss 3.0942, time 5249.65ms 
iter 1075: loss 3.0899, time 5249.83ms 
iter 1076: loss 3.0266, time 5254.31ms 
iter 1077: loss 3.1294, time 5253.88ms 
iter 1078: loss 3.1411, time 5249.62ms 
iter 1079: loss 3.0798, time 5248.55ms 
iter 1080: loss 3.1089, time 5252.01ms 
iter 1081: loss 3.4463, time 5263.57ms 
iter 1082: loss 3.0858, time 5267.65ms 
iter 1083: loss 3.1360, time 5266.37ms 
iter 1084: loss 3.0005, time 5250.40ms 
iter 1085: loss 3.0772, time 5260.96ms 
iter 1086: loss 3.2017, time 5257.44ms 
iter 1087: loss 3.1180, time 5252.37ms 
iter 1088: loss 3.0134, time 5260.62ms 
iter 1089: loss 3.0630, time 5258.98ms 
iter 1090: loss 3.1392, time 5250.29ms 
iter 1091: loss 3.1007, time 5271.88ms 
iter 1092: loss 3.1322, time 5259.79ms 
iter 1093: loss 3.1164, time 5256.17ms 
iter 1094: loss 3.4936, time 5262.95ms 
iter 1095: loss 3.0983, time 5254.96ms 
iter 1096: loss 3.0773, time 5260.58ms 
iter 1097: loss 3.0170, time 5273.43ms 
iter 1098: loss 3.0828, time 5215.80ms 
iter 1099: loss 3.1479, time 5269.05ms 
step 1100: train loss 3.1229, val loss 3.1440
iter 1100: loss 3.2296, time 19998.50ms 
iter 1101: loss 3.1972, time 5266.71ms 
iter 1102: loss 3.1987, time 5250.48ms 
iter 1103: loss 3.0915, time 5250.04ms 
iter 1104: loss 3.3617, time 5254.64ms 
iter 1105: loss 3.0722, time 5261.33ms 
iter 1106: loss 3.3300, time 5258.37ms 
iter 1107: loss 3.3417, time 5241.61ms 
iter 1108: loss 3.1182, time 5244.84ms 
iter 1109: loss 3.0623, time 5243.55ms 
iter 1110: loss 3.2472, time 5236.46ms 
iter 1111: loss 3.1183, time 5162.21ms 
iter 1112: loss 3.0371, time 5097.14ms 
iter 1113: loss 3.1187, time 5076.86ms 
iter 1114: loss 3.5026, time 5099.20ms 
iter 1115: loss 3.0787, time 5215.36ms 
iter 1116: loss 3.0403, time 5236.75ms 
iter 1117: loss 3.2136, time 5244.69ms 
iter 1118: loss 3.0883, time 5156.64ms 
iter 1119: loss 3.2320, time 5215.57ms 
iter 1120: loss 3.0624, time 5248.66ms 
iter 1121: loss 3.1707, time 5235.11ms 
iter 1122: loss 3.0990, time 5236.39ms 
iter 1123: loss 3.1150, time 5236.11ms 
iter 1124: loss 3.0427, time 5235.28ms 
iter 1125: loss 3.0696, time 5250.54ms 
iter 1126: loss 2.9744, time 5226.41ms 
iter 1127: loss 3.0732, time 5244.15ms 
iter 1128: loss 2.8630, time 5237.71ms 
iter 1129: loss 3.3578, time 5247.57ms 
iter 1130: loss 3.1720, time 5235.88ms 
iter 1131: loss 3.1326, time 5234.58ms 
iter 1132: loss 3.0510, time 5235.59ms 
iter 1133: loss 3.0671, time 5231.06ms 
iter 1134: loss 3.2246, time 5234.32ms 
iter 1135: loss 3.2063, time 5234.57ms 
iter 1136: loss 3.2670, time 5243.79ms 
iter 1137: loss 3.0545, time 5284.79ms 
iter 1138: loss 3.1574, time 5248.31ms 
iter 1139: loss 3.1197, time 5247.01ms 
iter 1140: loss 2.9963, time 5240.04ms 
iter 1141: loss 3.0806, time 5242.65ms 
iter 1142: loss 3.2948, time 5244.86ms 
iter 1143: loss 3.0936, time 5236.04ms 
iter 1144: loss 3.1119, time 5234.31ms 
iter 1145: loss 3.0257, time 5227.46ms 
iter 1146: loss 2.9553, time 5244.72ms 
iter 1147: loss 2.9678, time 5232.26ms 
iter 1148: loss 2.9917, time 5241.96ms 
iter 1149: loss 3.2457, time 5295.89ms 
step 1150: train loss 3.0852, val loss 3.1152
iter 1150: loss 3.1693, time 20429.39ms 
iter 1151: loss 3.0694, time 5255.42ms 
iter 1152: loss 3.3900, time 5253.98ms 
iter 1153: loss 3.3029, time 5269.56ms 
iter 1154: loss 3.0014, time 5384.74ms 
iter 1155: loss 3.1362, time 5352.43ms 
iter 1156: loss 3.1151, time 5258.93ms 
iter 1157: loss 3.0378, time 5258.70ms 
iter 1158: loss 3.0143, time 5252.56ms 
iter 1159: loss 3.1613, time 5261.26ms 
iter 1160: loss 3.1812, time 5262.70ms 
iter 1161: loss 3.0959, time 5251.77ms 
iter 1162: loss 3.0245, time 5254.90ms 
iter 1163: loss 3.3632, time 5256.74ms 
iter 1164: loss 3.1624, time 5252.03ms 
iter 1165: loss 3.2990, time 5255.80ms 
iter 1166: loss 3.0725, time 5254.36ms 
iter 1167: loss 3.0596, time 5254.94ms 
iter 1168: loss 3.1435, time 5261.96ms 
iter 1169: loss 3.1118, time 5253.69ms 
iter 1170: loss 3.2012, time 5269.04ms 
iter 1171: loss 3.0295, time 5272.93ms 
iter 1172: loss 2.8917, time 5256.31ms 
iter 1173: loss 3.0266, time 5244.74ms 
iter 1174: loss 3.1238, time 5250.00ms 
iter 1175: loss 3.1621, time 5235.13ms 
iter 1176: loss 3.1092, time 5242.92ms 
iter 1177: loss 3.2555, time 5231.52ms 
iter 1178: loss 2.9380, time 5235.89ms 
iter 1179: loss 3.4037, time 5223.20ms 
iter 1180: loss 3.1349, time 5268.17ms 
iter 1181: loss 3.0377, time 5269.79ms 
iter 1182: loss 3.1571, time 5259.93ms 
iter 1183: loss 2.9537, time 5261.41ms 
iter 1184: loss 3.2697, time 5271.18ms 
iter 1185: loss 3.0565, time 5288.13ms 
iter 1186: loss 2.9579, time 5260.69ms 
iter 1187: loss 3.0016, time 5260.91ms 
iter 1188: loss 3.0183, time 5257.69ms 
iter 1189: loss 3.2610, time 5249.06ms 
iter 1190: loss 3.1362, time 5253.41ms 
iter 1191: loss 3.0093, time 5241.16ms 
iter 1192: loss 3.0259, time 5233.13ms 
iter 1193: loss 3.0911, time 5193.56ms 
iter 1194: loss 3.0318, time 5235.93ms 
iter 1195: loss 3.0373, time 5243.00ms 
iter 1196: loss 3.1438, time 5241.08ms 
iter 1197: loss 3.0422, time 5231.58ms 
iter 1198: loss 3.1833, time 5243.80ms 
iter 1199: loss 3.0793, time 5260.67ms 
step 1200: train loss 3.0711, val loss 3.1050
iter 1200: loss 3.0623, time 19997.66ms 
iter 1201: loss 3.0406, time 5244.08ms 
iter 1202: loss 3.0885, time 5229.39ms 
iter 1203: loss 3.1712, time 5241.40ms 
iter 1204: loss 3.0010, time 5237.73ms 
iter 1205: loss 3.0223, time 5236.67ms 
iter 1206: loss 3.1031, time 5235.74ms 
iter 1207: loss 3.2557, time 5218.61ms 
iter 1208: loss 3.0656, time 5099.36ms 
iter 1209: loss 3.1827, time 5081.53ms 
iter 1210: loss 3.0394, time 5113.37ms 
iter 1211: loss 3.1960, time 5106.17ms 
iter 1212: loss 3.0564, time 5248.89ms 
iter 1213: loss 3.0213, time 5185.04ms 
iter 1214: loss 3.0826, time 5068.40ms 
iter 1215: loss 3.2814, time 5080.35ms 
iter 1216: loss 2.8900, time 5086.76ms 
iter 1217: loss 3.2501, time 5091.18ms 
iter 1218: loss 3.0346, time 5130.11ms 
iter 1219: loss 3.0645, time 5263.13ms 
iter 1220: loss 2.9694, time 5262.87ms 
iter 1221: loss 3.0534, time 5257.00ms 
iter 1222: loss 3.0501, time 5251.86ms 
iter 1223: loss 2.9776, time 5257.42ms 
iter 1224: loss 3.1029, time 5256.49ms 
iter 1225: loss 3.0483, time 5256.82ms 
iter 1226: loss 2.9755, time 5256.84ms 
iter 1227: loss 3.0965, time 5260.69ms 
iter 1228: loss 2.9931, time 5254.07ms 
iter 1229: loss 2.9630, time 5252.30ms 
iter 1230: loss 3.0275, time 5260.90ms 
iter 1231: loss 3.0885, time 5253.71ms 
iter 1232: loss 3.0816, time 5247.53ms 
iter 1233: loss 3.2011, time 5223.44ms 
iter 1234: loss 3.2758, time 5238.87ms 
iter 1235: loss 3.1417, time 5260.38ms 
iter 1236: loss 3.1187, time 5264.74ms 
iter 1237: loss 3.0007, time 5257.82ms 
iter 1238: loss 3.1385, time 5270.92ms 
iter 1239: loss 3.0539, time 5266.37ms 
iter 1240: loss 3.1118, time 5257.39ms 
iter 1241: loss 2.8987, time 5265.20ms 
iter 1242: loss 3.0960, time 5270.98ms 
iter 1243: loss 3.1298, time 5256.74ms 
iter 1244: loss 3.1405, time 5254.91ms 
iter 1245: loss 3.0839, time 5255.28ms 
iter 1246: loss 2.9050, time 5264.47ms 
iter 1247: loss 3.1223, time 5250.66ms 
iter 1248: loss 2.8941, time 5261.11ms 
iter 1249: loss 2.9464, time 5262.56ms 
step 1250: train loss 3.0684, val loss 3.0940
iter 1250: loss 2.9192, time 20065.82ms 
iter 1251: loss 2.9881, time 5367.47ms 
iter 1252: loss 3.0041, time 5388.07ms 
iter 1253: loss 2.9993, time 5413.03ms 
iter 1254: loss 3.1793, time 5400.24ms 
iter 1255: loss 3.0293, time 5389.01ms 
iter 1256: loss 2.9417, time 5404.78ms 
iter 1257: loss 3.0980, time 5395.68ms 
iter 1258: loss 3.1395, time 5392.44ms 
iter 1259: loss 3.0264, time 5423.80ms 
iter 1260: loss 2.9587, time 5394.17ms 
iter 1261: loss 3.0106, time 5390.72ms 
iter 1262: loss 2.9323, time 5399.07ms 
iter 1263: loss 3.0702, time 5385.96ms 
iter 1264: loss 2.9774, time 5337.23ms 
iter 1265: loss 3.0130, time 5262.02ms 
iter 1266: loss 2.9087, time 5260.67ms 
iter 1267: loss 3.0110, time 5267.48ms 
iter 1268: loss 2.9865, time 5271.80ms 
iter 1269: loss 2.9726, time 5269.51ms 
iter 1270: loss 2.9560, time 5250.45ms 
iter 1271: loss 3.0453, time 5251.94ms 
iter 1272: loss 2.9213, time 5258.70ms 
iter 1273: loss 2.8426, time 5255.96ms 
iter 1274: loss 3.0815, time 5273.72ms 
iter 1275: loss 3.1243, time 5248.44ms 
iter 1276: loss 2.9345, time 5245.96ms 
iter 1277: loss 3.0129, time 5252.60ms 
iter 1278: loss 3.1591, time 5248.03ms 
iter 1279: loss 3.2245, time 5248.96ms 
iter 1280: loss 3.1270, time 5209.16ms 
iter 1281: loss 3.1150, time 5260.44ms 
iter 1282: loss 3.1025, time 5255.61ms 
iter 1283: loss 3.0335, time 5257.97ms 
iter 1284: loss 2.8027, time 5274.42ms 
iter 1285: loss 3.1930, time 5267.30ms 
iter 1286: loss 2.9919, time 5254.13ms 
iter 1287: loss 3.0733, time 5275.52ms 
iter 1288: loss 3.1649, time 5283.85ms 
iter 1289: loss 3.1641, time 5270.80ms 
iter 1290: loss 2.9583, time 5263.67ms 
iter 1291: loss 3.0881, time 5256.29ms 
iter 1292: loss 3.0096, time 5256.71ms 
iter 1293: loss 2.9353, time 5257.05ms 
iter 1294: loss 2.9490, time 5261.05ms 
iter 1295: loss 2.8254, time 5253.92ms 
iter 1296: loss 3.1360, time 5236.97ms 
iter 1297: loss 2.9186, time 5219.90ms 
iter 1298: loss 2.9144, time 5269.68ms 
iter 1299: loss 2.8808, time 5265.83ms 
step 1300: train loss 3.0321, val loss 3.0937
iter 1300: loss 3.1994, time 20189.11ms 
iter 1301: loss 2.9809, time 5262.22ms 
iter 1302: loss 3.1137, time 5266.35ms 
iter 1303: loss 2.9880, time 5262.07ms 
iter 1304: loss 3.1129, time 5267.25ms 
iter 1305: loss 3.1551, time 5266.65ms 
iter 1306: loss 2.9656, time 5272.33ms 
iter 1307: loss 2.9716, time 5266.14ms 
iter 1308: loss 2.9625, time 5257.38ms 
iter 1309: loss 3.1869, time 5263.82ms 
iter 1310: loss 3.1474, time 5261.11ms 
iter 1311: loss 2.9953, time 5256.70ms 
iter 1312: loss 2.9935, time 5262.50ms 
iter 1313: loss 2.9932, time 5261.60ms 
iter 1314: loss 2.9685, time 5260.70ms 
iter 1315: loss 3.0390, time 5254.64ms 
iter 1316: loss 2.9974, time 5253.76ms 
iter 1317: loss 3.0093, time 5257.19ms 
iter 1318: loss 3.0517, time 5258.54ms 
iter 1319: loss 2.9513, time 5253.99ms 
iter 1320: loss 2.8031, time 5259.36ms 
iter 1321: loss 2.9577, time 5257.15ms 
iter 1322: loss 3.0613, time 5250.01ms 
iter 1323: loss 2.9145, time 5248.66ms 
iter 1324: loss 2.9803, time 5258.88ms 
iter 1325: loss 3.1340, time 5248.38ms 
iter 1326: loss 2.8196, time 5230.51ms 
iter 1327: loss 3.1045, time 5257.45ms 
iter 1328: loss 3.1302, time 5264.61ms 
iter 1329: loss 3.0936, time 5263.83ms 
iter 1330: loss 3.0141, time 5263.04ms 
iter 1331: loss 3.0252, time 5259.24ms 
iter 1332: loss 3.0128, time 5260.10ms 
iter 1333: loss 3.2501, time 5251.79ms 
iter 1334: loss 3.0244, time 5255.35ms 
iter 1335: loss 3.0203, time 5246.51ms 
iter 1336: loss 2.9012, time 5251.81ms 
iter 1337: loss 3.1094, time 5257.30ms 
iter 1338: loss 2.9007, time 5247.15ms 
iter 1339: loss 3.0963, time 5234.89ms 
iter 1340: loss 3.0944, time 5260.92ms 
iter 1341: loss 3.0396, time 5256.76ms 
iter 1342: loss 2.9073, time 5260.63ms 
iter 1343: loss 3.0215, time 5263.97ms 
iter 1344: loss 2.8472, time 5248.75ms 
iter 1345: loss 2.9202, time 5251.38ms 
iter 1346: loss 3.1636, time 5254.81ms 
iter 1347: loss 3.1179, time 5262.00ms 
iter 1348: loss 3.0377, time 5259.77ms 
iter 1349: loss 3.1271, time 5249.81ms 
step 1350: train loss 3.0277, val loss 3.0675
iter 1350: loss 3.0325, time 20054.95ms 
iter 1351: loss 3.0750, time 5265.15ms 
iter 1352: loss 3.1001, time 5266.76ms 
iter 1353: loss 3.1964, time 5257.66ms 
iter 1354: loss 2.9876, time 5258.09ms 
iter 1355: loss 3.1638, time 5272.29ms 
iter 1356: loss 2.9930, time 5271.34ms 
iter 1357: loss 3.1317, time 5257.97ms 
iter 1358: loss 2.9509, time 5262.84ms 
iter 1359: loss 2.8691, time 5264.60ms 
iter 1360: loss 3.3101, time 5260.93ms 
iter 1361: loss 3.0242, time 5260.07ms 
iter 1362: loss 3.0935, time 5262.13ms 
iter 1363: loss 3.1021, time 5261.39ms 
iter 1364: loss 2.9776, time 5267.36ms 
iter 1365: loss 3.2667, time 5251.11ms 
iter 1366: loss 3.0840, time 5259.63ms 
iter 1367: loss 3.2569, time 5262.72ms 
iter 1368: loss 3.1999, time 5243.96ms 
iter 1369: loss 2.9683, time 5254.73ms 
iter 1370: loss 3.0763, time 5254.76ms 
iter 1371: loss 3.2377, time 5255.43ms 
iter 1372: loss 3.0880, time 5258.43ms 
iter 1373: loss 2.9749, time 5252.68ms 
iter 1374: loss 3.1091, time 5251.71ms 
iter 1375: loss 3.0650, time 5259.39ms 
iter 1376: loss 2.9711, time 5255.14ms 
iter 1377: loss 2.8315, time 5268.27ms 
iter 1378: loss 3.0433, time 5266.84ms 
iter 1379: loss 3.1334, time 5249.37ms 
iter 1380: loss 2.9310, time 5257.90ms 
iter 1381: loss 2.8827, time 5252.97ms 
iter 1382: loss 3.1646, time 5262.17ms 
iter 1383: loss 2.8776, time 5260.23ms 
iter 1384: loss 3.0175, time 5259.11ms 
iter 1385: loss 2.9994, time 5256.02ms 
iter 1386: loss 2.9344, time 5260.50ms 
iter 1387: loss 3.1566, time 5261.75ms 
iter 1388: loss 3.0025, time 5257.50ms 
iter 1389: loss 2.8523, time 5273.71ms 
iter 1390: loss 3.0049, time 5256.18ms 
iter 1391: loss 2.9831, time 5252.14ms 
iter 1392: loss 3.1553, time 5260.48ms 
iter 1393: loss 3.0940, time 5261.72ms 
iter 1394: loss 3.1120, time 5267.85ms 
iter 1395: loss 2.8028, time 5261.96ms 
iter 1396: loss 2.9564, time 5259.58ms 
iter 1397: loss 3.1633, time 5252.66ms 
iter 1398: loss 2.9777, time 5261.64ms 
iter 1399: loss 3.0543, time 5256.47ms 
step 1400: train loss 2.9986, val loss 3.0447
iter 1400: loss 2.8737, time 19864.67ms 
iter 1401: loss 2.9819, time 5075.22ms 
iter 1402: loss 3.0843, time 5078.91ms 
iter 1403: loss 2.7917, time 5109.86ms 
iter 1404: loss 2.9616, time 5070.10ms 
iter 1405: loss 2.9909, time 5073.76ms 
iter 1406: loss 3.2397, time 5085.01ms 
iter 1407: loss 2.8786, time 5072.10ms 
iter 1408: loss 3.0245, time 5087.29ms 
iter 1409: loss 2.9742, time 5091.58ms 
iter 1410: loss 2.8362, time 5093.78ms 
iter 1411: loss 3.0047, time 5100.01ms 
iter 1412: loss 3.1259, time 5074.21ms 
iter 1413: loss 2.9056, time 5072.20ms 
iter 1414: loss 2.9687, time 5069.45ms 
iter 1415: loss 2.8485, time 5064.94ms 
iter 1416: loss 2.8261, time 5081.05ms 
iter 1417: loss 2.8786, time 5082.15ms 
iter 1418: loss 3.0135, time 5082.73ms 
iter 1419: loss 2.9192, time 5110.70ms 
iter 1420: loss 2.9478, time 5263.63ms 
iter 1421: loss 3.1118, time 5259.49ms 
iter 1422: loss 3.1882, time 5249.39ms 
iter 1423: loss 3.0918, time 5244.46ms 
iter 1424: loss 3.1299, time 5264.12ms 
iter 1425: loss 3.0365, time 5263.44ms 
iter 1426: loss 3.0907, time 5261.15ms 
iter 1427: loss 3.0056, time 5262.83ms 
iter 1428: loss 2.9878, time 5267.05ms 
iter 1429: loss 3.2375, time 5266.88ms 
iter 1430: loss 2.9939, time 5264.00ms 
iter 1431: loss 2.9348, time 5271.69ms 
iter 1432: loss 2.9897, time 5271.59ms 
iter 1433: loss 2.9721, time 5266.96ms 
iter 1434: loss 2.9658, time 5265.31ms 
iter 1435: loss 3.0434, time 5272.78ms 
iter 1436: loss 2.9051, time 5263.57ms 
iter 1437: loss 2.9974, time 5243.07ms 
iter 1438: loss 3.0362, time 5186.26ms 
iter 1439: loss 3.0213, time 5265.49ms 
iter 1440: loss 2.9405, time 5284.35ms 
iter 1441: loss 2.9665, time 5294.37ms 
iter 1442: loss 2.9066, time 5261.31ms 
iter 1443: loss 2.9941, time 5325.95ms 
iter 1444: loss 2.9745, time 5262.70ms 
iter 1445: loss 3.0587, time 5294.72ms 
iter 1446: loss 3.0061, time 5268.43ms 
iter 1447: loss 3.1136, time 5275.20ms 
iter 1448: loss 3.1654, time 5299.94ms 
iter 1449: loss 3.1574, time 5319.95ms 
step 1450: train loss 3.0069, val loss 3.0559
iter 1450: loss 3.0415, time 20162.00ms 
iter 1451: loss 2.7086, time 5256.03ms 
iter 1452: loss 2.8331, time 5253.21ms 
iter 1453: loss 2.9834, time 5258.63ms 
iter 1454: loss 3.1664, time 5260.70ms 
iter 1455: loss 2.9185, time 5259.63ms 
iter 1456: loss 3.0177, time 5255.87ms 
iter 1457: loss 3.0240, time 5262.08ms 
iter 1458: loss 2.9255, time 5249.23ms 
iter 1459: loss 2.8870, time 5256.54ms 
iter 1460: loss 2.8465, time 5336.21ms 
iter 1461: loss 3.0843, time 5345.33ms 
iter 1462: loss 3.0136, time 5273.58ms 
iter 1463: loss 2.8479, time 5277.79ms 
iter 1464: loss 2.8664, time 5274.70ms 
iter 1465: loss 2.9820, time 5278.00ms 
iter 1466: loss 2.8712, time 5274.01ms 
iter 1467: loss 2.9252, time 5276.91ms 
iter 1468: loss 2.8717, time 5273.90ms 
iter 1469: loss 3.0614, time 5256.65ms 
iter 1470: loss 2.9996, time 5263.38ms 
iter 1471: loss 2.9598, time 5246.25ms 
iter 1472: loss 3.0919, time 5255.94ms 
iter 1473: loss 2.7859, time 5252.10ms 
iter 1474: loss 3.0986, time 5250.12ms 
iter 1475: loss 2.9300, time 5246.73ms 
iter 1476: loss 2.8951, time 5249.21ms 
iter 1477: loss 2.8409, time 5246.69ms 
iter 1478: loss 3.1493, time 5258.64ms 
iter 1479: loss 3.2622, time 5261.85ms 
iter 1480: loss 2.9267, time 5263.44ms 
iter 1481: loss 3.2334, time 5286.31ms 
iter 1482: loss 2.9152, time 5285.55ms 
iter 1483: loss 3.0992, time 5267.70ms 
iter 1484: loss 2.8802, time 5255.93ms 
iter 1485: loss 3.0158, time 5269.68ms 
iter 1486: loss 3.1153, time 5256.79ms 
iter 1487: loss 3.2034, time 5254.44ms 
iter 1488: loss 2.9020, time 5249.77ms 
iter 1489: loss 2.9067, time 5251.53ms 
iter 1490: loss 2.9571, time 5253.54ms 
iter 1491: loss 3.1184, time 5260.11ms 
iter 1492: loss 3.0865, time 5259.99ms 
iter 1493: loss 2.9324, time 5258.57ms 
iter 1494: loss 3.1047, time 5255.14ms 
iter 1495: loss 2.8296, time 5260.96ms 
iter 1496: loss 3.1427, time 5255.76ms 
iter 1497: loss 3.0321, time 5253.31ms 
iter 1498: loss 3.2453, time 5255.86ms 
iter 1499: loss 2.9031, time 5264.66ms 
step 1500: train loss 2.9780, val loss 3.0285
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1500: loss 3.0868, time 21840.65ms 
iter 1501: loss 3.0927, time 5251.57ms 
iter 1502: loss 2.8561, time 5231.72ms 
iter 1503: loss 2.8205, time 5313.33ms 
iter 1504: loss 3.2064, time 5371.17ms 
iter 1505: loss 3.1873, time 5347.54ms 
iter 1506: loss 2.9893, time 5354.92ms 
iter 1507: loss 3.0090, time 5336.43ms 
iter 1508: loss 3.0035, time 5342.69ms 
iter 1509: loss 2.8291, time 5253.06ms 
iter 1510: loss 2.9607, time 5254.65ms 
iter 1511: loss 2.9295, time 5259.18ms 
iter 1512: loss 2.8751, time 5260.64ms 
iter 1513: loss 2.9907, time 5254.59ms 
iter 1514: loss 3.0709, time 5257.93ms 
iter 1515: loss 2.9952, time 5241.77ms 
iter 1516: loss 2.8320, time 5232.19ms 
iter 1517: loss 2.9181, time 5228.38ms 
iter 1518: loss 3.0497, time 5282.60ms 
iter 1519: loss 3.0101, time 5230.36ms 
iter 1520: loss 2.8286, time 5241.69ms 
iter 1521: loss 3.1300, time 5240.48ms 
iter 1522: loss 2.8954, time 5245.86ms 
iter 1523: loss 2.9745, time 5250.41ms 
iter 1524: loss 2.7932, time 5231.04ms 
iter 1525: loss 3.0322, time 5250.76ms 
iter 1526: loss 2.9673, time 5254.77ms 
iter 1527: loss 3.0455, time 5259.19ms 
iter 1528: loss 2.8924, time 5275.56ms 
iter 1529: loss 2.9246, time 5277.03ms 
iter 1530: loss 3.1303, time 5284.49ms 
iter 1531: loss 2.9894, time 5274.56ms 
iter 1532: loss 3.0548, time 5253.48ms 
iter 1533: loss 2.8220, time 5253.66ms 
iter 1534: loss 2.9251, time 5255.84ms 
iter 1535: loss 2.9103, time 5269.12ms 
iter 1536: loss 3.0503, time 5259.70ms 
iter 1537: loss 2.8215, time 5273.86ms 
iter 1538: loss 3.0056, time 5263.53ms 
iter 1539: loss 2.8864, time 5259.79ms 
iter 1540: loss 3.0843, time 5276.66ms 
iter 1541: loss 2.8722, time 5270.57ms 
iter 1542: loss 2.8401, time 5262.08ms 
iter 1543: loss 3.1669, time 5250.31ms 
iter 1544: loss 2.8307, time 5249.09ms 
iter 1545: loss 2.8398, time 5248.51ms 
iter 1546: loss 3.0636, time 5248.46ms 
iter 1547: loss 3.0677, time 5250.09ms 
iter 1548: loss 2.9576, time 5251.07ms 
iter 1549: loss 2.8860, time 5250.21ms 
step 1550: train loss 2.9776, val loss 3.0397
iter 1550: loss 2.8657, time 20027.88ms 
iter 1551: loss 2.8872, time 5254.38ms 
iter 1552: loss 3.0340, time 5265.53ms 
iter 1553: loss 2.9529, time 5253.52ms 
iter 1554: loss 2.9973, time 5248.46ms 
iter 1555: loss 2.8875, time 5251.93ms 
iter 1556: loss 3.2043, time 5248.65ms 
iter 1557: loss 2.8819, time 5244.12ms 
iter 1558: loss 3.1157, time 5250.60ms 
iter 1559: loss 2.8316, time 5249.30ms 
iter 1560: loss 2.9471, time 5228.12ms 
iter 1561: loss 3.1166, time 5255.96ms 
iter 1562: loss 2.9744, time 5246.88ms 
iter 1563: loss 2.8334, time 5258.57ms 
iter 1564: loss 3.0108, time 5253.31ms 
iter 1565: loss 2.7941, time 5256.21ms 
iter 1566: loss 2.9188, time 5206.01ms 
iter 1567: loss 2.9980, time 5248.78ms 
iter 1568: loss 3.0736, time 5245.26ms 
iter 1569: loss 2.9319, time 5251.81ms 
iter 1570: loss 2.9215, time 5253.87ms 
iter 1571: loss 3.0743, time 5249.75ms 
iter 1572: loss 3.0438, time 5246.57ms 
iter 1573: loss 2.8878, time 5248.92ms 
iter 1574: loss 3.0145, time 5251.73ms 
iter 1575: loss 3.1426, time 5253.82ms 
iter 1576: loss 2.9460, time 5245.66ms 
iter 1577: loss 3.0352, time 5249.63ms 
iter 1578: loss 3.0747, time 5262.95ms 
iter 1579: loss 2.9974, time 5210.72ms 
iter 1580: loss 2.9962, time 5251.80ms 
iter 1581: loss 2.7638, time 5242.93ms 
iter 1582: loss 3.0023, time 5240.36ms 
iter 1583: loss 2.7893, time 5220.97ms 
iter 1584: loss 2.9904, time 5221.96ms 
iter 1585: loss 2.9777, time 5236.85ms 
iter 1586: loss 3.0480, time 5260.11ms 
iter 1587: loss 2.9340, time 5264.28ms 
iter 1588: loss 2.9329, time 5265.83ms 
iter 1589: loss 2.8961, time 5269.12ms 
iter 1590: loss 3.0808, time 5251.96ms 
iter 1591: loss 2.9306, time 5252.52ms 
iter 1592: loss 2.9735, time 5256.84ms 
iter 1593: loss 2.9361, time 5232.63ms 
iter 1594: loss 2.9508, time 5272.01ms 
iter 1595: loss 3.1289, time 5246.01ms 
iter 1596: loss 2.8065, time 5251.35ms 
iter 1597: loss 3.0582, time 5255.83ms 
iter 1598: loss 2.7263, time 5242.13ms 
iter 1599: loss 2.7703, time 5246.35ms 
step 1600: train loss 2.9468, val loss 3.0206
iter 1600: loss 2.9080, time 20052.71ms 
iter 1601: loss 2.8788, time 5254.73ms 
iter 1602: loss 2.9517, time 5244.96ms 
iter 1603: loss 2.9117, time 5249.28ms 
iter 1604: loss 2.8640, time 5258.93ms 
iter 1605: loss 2.9237, time 5254.69ms 
iter 1606: loss 2.9669, time 5249.53ms 
iter 1607: loss 2.7503, time 5254.11ms 
iter 1608: loss 2.8807, time 5256.97ms 
iter 1609: loss 2.7812, time 5267.78ms 
iter 1610: loss 2.9940, time 5254.14ms 
iter 1611: loss 2.9450, time 5253.60ms 
iter 1612: loss 2.9382, time 5256.92ms 
iter 1613: loss 2.8638, time 5265.00ms 
iter 1614: loss 2.8445, time 5224.78ms 
iter 1615: loss 2.9574, time 5254.82ms 
iter 1616: loss 3.0666, time 5248.90ms 
iter 1617: loss 2.8638, time 5246.97ms 
iter 1618: loss 2.7597, time 5249.44ms 
iter 1619: loss 2.7585, time 5263.04ms 
iter 1620: loss 2.9927, time 5252.72ms 
iter 1621: loss 3.0022, time 5252.19ms 
iter 1622: loss 2.9669, time 5219.99ms 
iter 1623: loss 2.9903, time 5251.72ms 
iter 1624: loss 3.1099, time 5252.27ms 
iter 1625: loss 3.2298, time 5255.90ms 
iter 1626: loss 2.9530, time 5246.54ms 
iter 1627: loss 3.0676, time 5252.68ms 
iter 1628: loss 2.9769, time 5226.14ms 
iter 1629: loss 2.8100, time 5225.68ms 
iter 1630: loss 2.9978, time 5236.22ms 
iter 1631: loss 3.0491, time 5234.44ms 
iter 1632: loss 2.9232, time 5222.50ms 
iter 1633: loss 2.9325, time 5110.10ms 
iter 1634: loss 3.0330, time 5119.56ms 
iter 1635: loss 2.9155, time 5124.27ms 
iter 1636: loss 3.0200, time 5139.94ms 
iter 1637: loss 2.8888, time 5093.73ms 
iter 1638: loss 2.9069, time 5169.95ms 
iter 1639: loss 2.9792, time 5247.79ms 
iter 1640: loss 2.9798, time 5236.80ms 
iter 1641: loss 3.0447, time 5178.58ms 
iter 1642: loss 2.7570, time 5265.70ms 
iter 1643: loss 2.8694, time 5293.09ms 
iter 1644: loss 3.1850, time 5113.80ms 
iter 1645: loss 2.8620, time 5074.75ms 
iter 1646: loss 2.9844, time 5077.14ms 
iter 1647: loss 2.8234, time 5178.26ms 
iter 1648: loss 3.0824, time 5267.02ms 
iter 1649: loss 2.7913, time 5259.78ms 
step 1650: train loss 2.9433, val loss 3.0216
iter 1650: loss 2.9885, time 20011.26ms 
iter 1651: loss 2.8184, time 5261.65ms 
iter 1652: loss 2.8555, time 5252.72ms 
iter 1653: loss 2.7932, time 5263.89ms 
iter 1654: loss 2.8439, time 5247.82ms 
iter 1655: loss 3.1516, time 5252.83ms 
iter 1656: loss 2.9678, time 5246.77ms 
iter 1657: loss 2.8615, time 5254.61ms 
iter 1658: loss 3.0186, time 5247.87ms 
iter 1659: loss 3.0128, time 5252.65ms 
iter 1660: loss 2.6998, time 5251.24ms 
iter 1661: loss 3.0323, time 5119.68ms 
iter 1662: loss 3.0311, time 5104.93ms 
iter 1663: loss 2.7725, time 5252.55ms 
iter 1664: loss 2.9402, time 5254.56ms 
iter 1665: loss 3.0136, time 5251.39ms 
iter 1666: loss 2.9738, time 5248.77ms 
iter 1667: loss 2.9894, time 5246.08ms 
iter 1668: loss 2.9985, time 5255.48ms 
iter 1669: loss 2.8291, time 5264.67ms 
iter 1670: loss 2.7604, time 5246.50ms 
iter 1671: loss 3.1625, time 5265.54ms 
iter 1672: loss 2.9663, time 5270.99ms 
iter 1673: loss 2.9741, time 5263.14ms 
iter 1674: loss 2.7152, time 5256.68ms 
iter 1675: loss 3.0987, time 5245.50ms 
iter 1676: loss 2.9798, time 5208.30ms 
iter 1677: loss 2.7689, time 5098.28ms 
iter 1678: loss 2.9424, time 5099.75ms 
iter 1679: loss 2.9560, time 5085.24ms 
iter 1680: loss 3.0434, time 5109.03ms 
iter 1681: loss 2.8749, time 5082.25ms 
iter 1682: loss 2.9713, time 5095.82ms 
iter 1683: loss 2.9234, time 5115.37ms 
iter 1684: loss 3.1367, time 5270.13ms 
iter 1685: loss 2.9402, time 5293.88ms 
iter 1686: loss 2.8666, time 5220.27ms 
iter 1687: loss 3.1519, time 5245.13ms 
iter 1688: loss 2.9514, time 5242.45ms 
iter 1689: loss 2.7657, time 5243.59ms 
iter 1690: loss 2.8948, time 5247.84ms 
iter 1691: loss 2.8778, time 5346.16ms 
iter 1692: loss 2.6982, time 5251.07ms 
iter 1693: loss 2.9603, time 5256.42ms 
iter 1694: loss 2.8683, time 5257.11ms 
iter 1695: loss 2.9715, time 5252.26ms 
iter 1696: loss 2.8352, time 5264.64ms 
iter 1697: loss 2.7789, time 5356.43ms 
iter 1698: loss 2.7818, time 5253.52ms 
iter 1699: loss 2.8931, time 5366.43ms 
step 1700: train loss 2.9262, val loss 3.0092
iter 1700: loss 2.9249, time 20175.36ms 
iter 1701: loss 3.0237, time 5254.77ms 
iter 1702: loss 2.9236, time 5258.45ms 
iter 1703: loss 2.8814, time 5236.06ms 
iter 1704: loss 2.8174, time 5196.00ms 
iter 1705: loss 2.8478, time 5385.93ms 
iter 1706: loss 2.8797, time 5419.14ms 
iter 1707: loss 2.8757, time 5411.81ms 
iter 1708: loss 2.9894, time 5416.15ms 
iter 1709: loss 2.9393, time 5399.15ms 
iter 1710: loss 3.0366, time 5240.84ms 
iter 1711: loss 2.8924, time 5257.82ms 
iter 1712: loss 3.0298, time 5252.95ms 
iter 1713: loss 2.9956, time 5256.97ms 
iter 1714: loss 2.9052, time 5233.17ms 
iter 1715: loss 3.0997, time 5245.13ms 
iter 1716: loss 3.0869, time 5243.56ms 
iter 1717: loss 3.1101, time 5233.61ms 
iter 1718: loss 2.9256, time 5242.12ms 
iter 1719: loss 2.8782, time 5241.96ms 
iter 1720: loss 2.9277, time 5246.65ms 
iter 1721: loss 2.8000, time 5256.02ms 
iter 1722: loss 2.8386, time 5245.09ms 
iter 1723: loss 2.7965, time 5242.44ms 
iter 1724: loss 2.8872, time 5235.58ms 
iter 1725: loss 3.0128, time 5223.45ms 
iter 1726: loss 2.8392, time 5245.41ms 
iter 1727: loss 2.8603, time 5242.10ms 
iter 1728: loss 2.9450, time 5239.27ms 
iter 1729: loss 2.8885, time 5236.75ms 
iter 1730: loss 2.9219, time 5245.74ms 
iter 1731: loss 2.9915, time 5238.26ms 
iter 1732: loss 2.8558, time 5247.49ms 
iter 1733: loss 2.9565, time 5238.29ms 
iter 1734: loss 2.8951, time 5238.42ms 
iter 1735: loss 2.9643, time 5231.21ms 
iter 1736: loss 3.0320, time 5241.83ms 
iter 1737: loss 2.8918, time 5259.36ms 
iter 1738: loss 3.0818, time 5252.02ms 
iter 1739: loss 3.1049, time 5245.31ms 
iter 1740: loss 3.0315, time 5250.60ms 
iter 1741: loss 2.8010, time 5258.91ms 
iter 1742: loss 2.8644, time 5261.07ms 
iter 1743: loss 2.8660, time 5250.16ms 
iter 1744: loss 2.9772, time 5249.09ms 
iter 1745: loss 2.8001, time 5257.28ms 
iter 1746: loss 2.9707, time 5254.85ms 
iter 1747: loss 2.9344, time 5244.80ms 
iter 1748: loss 3.0469, time 5245.38ms 
iter 1749: loss 2.8663, time 5254.36ms 
step 1750: train loss 2.9141, val loss 3.0069
iter 1750: loss 2.6240, time 20075.12ms 
iter 1751: loss 3.0276, time 5247.88ms 
iter 1752: loss 2.9638, time 5262.95ms 
iter 1753: loss 3.0079, time 5272.91ms 
iter 1754: loss 2.8709, time 5252.55ms 
iter 1755: loss 2.8209, time 5259.97ms 
iter 1756: loss 2.8396, time 5255.98ms 
iter 1757: loss 2.9525, time 5255.55ms 
iter 1758: loss 2.9088, time 5247.92ms 
iter 1759: loss 3.0166, time 5248.31ms 
iter 1760: loss 2.9963, time 5250.53ms 
iter 1761: loss 2.9434, time 5263.80ms 
iter 1762: loss 3.0125, time 5260.31ms 
iter 1763: loss 2.8763, time 5255.31ms 
iter 1764: loss 2.9568, time 5252.16ms 
iter 1765: loss 2.9170, time 5256.20ms 
iter 1766: loss 2.9434, time 5246.24ms 
iter 1767: loss 2.8214, time 5254.70ms 
iter 1768: loss 2.8756, time 5247.95ms 
iter 1769: loss 2.9304, time 5256.07ms 
iter 1770: loss 3.0437, time 5247.82ms 
iter 1771: loss 2.8351, time 5255.96ms 
iter 1772: loss 2.8444, time 5255.86ms 
iter 1773: loss 2.9923, time 5246.14ms 
iter 1774: loss 2.9332, time 5236.35ms 
iter 1775: loss 2.9960, time 5245.15ms 
iter 1776: loss 2.8626, time 5236.03ms 
iter 1777: loss 3.0623, time 5239.63ms 
iter 1778: loss 3.0229, time 5244.40ms 
iter 1779: loss 3.0344, time 5238.30ms 
iter 1780: loss 2.7112, time 5244.05ms 
iter 1781: loss 3.0029, time 5238.81ms 
iter 1782: loss 2.8142, time 5237.87ms 
iter 1783: loss 2.9000, time 5241.66ms 
iter 1784: loss 2.9691, time 5241.64ms 
iter 1785: loss 3.0596, time 5240.77ms 
iter 1786: loss 2.9424, time 5223.94ms 
iter 1787: loss 3.0703, time 5237.99ms 
iter 1788: loss 3.0446, time 5240.90ms 
iter 1789: loss 2.9683, time 5241.90ms 
iter 1790: loss 2.7008, time 5232.67ms 
iter 1791: loss 2.8276, time 5238.21ms 
iter 1792: loss 2.7624, time 5234.29ms 
iter 1793: loss 2.8859, time 5232.63ms 
iter 1794: loss 2.9314, time 5239.17ms 
iter 1795: loss 2.8066, time 5246.06ms 
iter 1796: loss 3.0606, time 5244.60ms 
iter 1797: loss 2.8632, time 5234.00ms 
iter 1798: loss 2.8300, time 5206.25ms 
iter 1799: loss 2.9185, time 5231.03ms 
step 1800: train loss 2.9132, val loss 3.0094
iter 1800: loss 2.9592, time 19956.26ms 
iter 1801: loss 3.1014, time 5073.56ms 
iter 1802: loss 2.8913, time 5061.67ms 
iter 1803: loss 3.1204, time 5095.24ms 
iter 1804: loss 2.7167, time 5087.83ms 
iter 1805: loss 2.8073, time 5177.99ms 
iter 1806: loss 2.8416, time 5255.02ms 
iter 1807: loss 3.0088, time 5156.88ms 
iter 1808: loss 2.9530, time 5088.09ms 
iter 1809: loss 2.7956, time 5162.30ms 
iter 1810: loss 2.9052, time 5251.28ms 
iter 1811: loss 3.1499, time 5243.70ms 
iter 1812: loss 2.8098, time 5258.18ms 
iter 1813: loss 3.0535, time 5270.07ms 
iter 1814: loss 3.1538, time 5274.43ms 
iter 1815: loss 3.0186, time 5266.79ms 
iter 1816: loss 2.8308, time 5280.01ms 
iter 1817: loss 2.9872, time 5264.21ms 
iter 1818: loss 2.8323, time 5250.13ms 
iter 1819: loss 2.7498, time 5242.85ms 
iter 1820: loss 2.7996, time 5258.54ms 
iter 1821: loss 2.7713, time 5255.65ms 
iter 1822: loss 3.0539, time 5257.60ms 
iter 1823: loss 3.1420, time 5261.91ms 
iter 1824: loss 2.9361, time 5248.91ms 
iter 1825: loss 2.8757, time 5265.26ms 
iter 1826: loss 2.9806, time 5267.35ms 
iter 1827: loss 2.8176, time 5250.84ms 
iter 1828: loss 3.1098, time 5262.26ms 
iter 1829: loss 2.9404, time 5263.90ms 
iter 1830: loss 2.8925, time 5260.09ms 
iter 1831: loss 2.8336, time 5259.00ms 
iter 1832: loss 2.7244, time 5261.67ms 
iter 1833: loss 3.0017, time 5272.54ms 
iter 1834: loss 2.6307, time 5267.75ms 
iter 1835: loss 2.9835, time 5272.64ms 
iter 1836: loss 2.8724, time 5282.16ms 
iter 1837: loss 2.7940, time 5285.52ms 
iter 1838: loss 3.0287, time 5254.19ms 
iter 1839: loss 2.8020, time 5263.98ms 
iter 1840: loss 2.8493, time 5256.52ms 
iter 1841: loss 2.6096, time 5262.27ms 
iter 1842: loss 2.6215, time 5249.94ms 
iter 1843: loss 3.0283, time 5256.27ms 
iter 1844: loss 2.8829, time 5251.17ms 
iter 1845: loss 2.7301, time 5257.27ms 
iter 1846: loss 2.8670, time 5251.26ms 
iter 1847: loss 2.9300, time 5319.10ms 
iter 1848: loss 2.9247, time 5390.72ms 
iter 1849: loss 3.0861, time 5247.78ms 
step 1850: train loss 2.8999, val loss 2.9924
iter 1850: loss 2.9252, time 20157.83ms 
iter 1851: loss 2.7822, time 5259.23ms 
iter 1852: loss 2.6652, time 5245.82ms 
iter 1853: loss 2.9993, time 5260.58ms 
iter 1854: loss 2.9318, time 5267.92ms 
iter 1855: loss 2.8761, time 5269.98ms 
iter 1856: loss 3.0626, time 5257.93ms 
iter 1857: loss 2.9457, time 5229.07ms 
iter 1858: loss 2.8010, time 5259.29ms 
iter 1859: loss 2.9205, time 5263.52ms 
iter 1860: loss 2.8142, time 5234.78ms 
iter 1861: loss 2.8192, time 5275.09ms 
iter 1862: loss 2.8652, time 5272.84ms 
iter 1863: loss 2.7834, time 5259.35ms 
iter 1864: loss 2.7296, time 5269.27ms 
iter 1865: loss 2.8297, time 5275.21ms 
iter 1866: loss 2.7263, time 5263.17ms 
iter 1867: loss 2.8463, time 5267.23ms 
iter 1868: loss 2.9058, time 5253.25ms 
iter 1869: loss 2.9394, time 5244.31ms 
iter 1870: loss 2.9635, time 5277.81ms 
iter 1871: loss 2.8981, time 5270.55ms 
iter 1872: loss 2.9700, time 5262.00ms 
iter 1873: loss 2.9198, time 5263.52ms 
iter 1874: loss 2.7815, time 5227.11ms 
iter 1875: loss 2.8745, time 5273.35ms 
iter 1876: loss 2.7083, time 5262.50ms 
iter 1877: loss 3.0384, time 5262.96ms 
iter 1878: loss 2.8296, time 5240.07ms 
iter 1879: loss 2.8927, time 5253.84ms 
iter 1880: loss 2.8110, time 5259.64ms 
iter 1881: loss 2.7646, time 5270.08ms 
iter 1882: loss 2.8308, time 5254.12ms 
iter 1883: loss 2.9147, time 5256.76ms 
iter 1884: loss 2.8969, time 5258.52ms 
iter 1885: loss 2.7025, time 5255.18ms 
iter 1886: loss 2.9076, time 5257.67ms 
iter 1887: loss 3.0617, time 5276.53ms 
iter 1888: loss 2.8815, time 5262.33ms 
iter 1889: loss 2.9362, time 5264.48ms 
iter 1890: loss 2.7735, time 5255.93ms 
iter 1891: loss 2.8512, time 5252.23ms 
iter 1892: loss 2.9555, time 5235.93ms 
iter 1893: loss 2.7738, time 5256.78ms 
iter 1894: loss 2.9082, time 5255.01ms 
iter 1895: loss 2.9942, time 5256.19ms 
iter 1896: loss 2.7717, time 5258.90ms 
iter 1897: loss 3.0045, time 5257.78ms 
iter 1898: loss 2.7970, time 5257.15ms 
iter 1899: loss 2.9840, time 5257.56ms 
step 1900: train loss 2.8979, val loss 2.9740
iter 1900: loss 3.0869, time 20074.59ms 
iter 1901: loss 3.0283, time 5257.38ms 
iter 1902: loss 3.0161, time 5258.94ms 
iter 1903: loss 2.8720, time 5255.36ms 
iter 1904: loss 3.0032, time 5245.01ms 
iter 1905: loss 3.1016, time 5100.15ms 
iter 1906: loss 2.9225, time 5117.40ms 
iter 1907: loss 2.9014, time 5100.33ms 
iter 1908: loss 2.8109, time 5076.69ms 
iter 1909: loss 2.9586, time 5099.96ms 
iter 1910: loss 2.7691, time 5097.31ms 
iter 1911: loss 2.7917, time 5085.71ms 
iter 1912: loss 2.8863, time 5084.13ms 
iter 1913: loss 2.8999, time 5082.96ms 
iter 1914: loss 3.0171, time 5092.66ms 
iter 1915: loss 2.8500, time 5088.42ms 
iter 1916: loss 2.8205, time 5093.21ms 
iter 1917: loss 2.9501, time 5084.83ms 
iter 1918: loss 3.0055, time 5103.02ms 
iter 1919: loss 2.8947, time 5072.45ms 
iter 1920: loss 2.9121, time 5058.12ms 
iter 1921: loss 2.7876, time 5086.71ms 
iter 1922: loss 2.9147, time 5076.01ms 
iter 1923: loss 2.9808, time 5080.54ms 
iter 1924: loss 2.8909, time 5228.21ms 
iter 1925: loss 2.9837, time 5262.20ms 
iter 1926: loss 2.9251, time 5262.26ms 
iter 1927: loss 2.7862, time 5131.98ms 
iter 1928: loss 2.6431, time 5094.75ms 
iter 1929: loss 2.8868, time 5090.60ms 
iter 1930: loss 2.9381, time 5108.65ms 
iter 1931: loss 2.8690, time 5085.47ms 
iter 1932: loss 2.7704, time 5093.08ms 
iter 1933: loss 2.8617, time 5104.51ms 
iter 1934: loss 3.0798, time 5096.18ms 
iter 1935: loss 2.8029, time 5099.02ms 
iter 1936: loss 3.0388, time 5108.34ms 
iter 1937: loss 2.9996, time 5234.60ms 
iter 1938: loss 2.7671, time 5267.10ms 
iter 1939: loss 2.9415, time 5267.15ms 
iter 1940: loss 2.9280, time 5194.87ms 
iter 1941: loss 2.8458, time 5084.18ms 
iter 1942: loss 2.7595, time 5092.10ms 
iter 1943: loss 2.8182, time 5087.30ms 
iter 1944: loss 2.7827, time 5186.28ms 
iter 1945: loss 2.7871, time 5278.94ms 
iter 1946: loss 2.7998, time 5249.51ms 
iter 1947: loss 2.8493, time 5262.27ms 
iter 1948: loss 2.9621, time 5253.19ms 
iter 1949: loss 2.8027, time 5255.82ms 
step 1950: train loss 2.8971, val loss 2.9691
iter 1950: loss 2.9407, time 20048.15ms 
iter 1951: loss 2.9457, time 5254.56ms 
iter 1952: loss 3.0395, time 5260.98ms 
iter 1953: loss 2.9093, time 5251.88ms 
iter 1954: loss 3.0338, time 5249.77ms 
iter 1955: loss 2.9394, time 5225.23ms 
iter 1956: loss 2.8490, time 5191.36ms 
iter 1957: loss 2.9777, time 5091.99ms 
iter 1958: loss 2.9140, time 5074.65ms 
iter 1959: loss 2.7912, time 5104.11ms 
iter 1960: loss 2.7971, time 5098.80ms 
iter 1961: loss 2.7785, time 5112.78ms 
iter 1962: loss 2.7477, time 5085.13ms 
iter 1963: loss 2.7704, time 5071.71ms 
iter 1964: loss 2.8815, time 5099.12ms 
iter 1965: loss 3.0267, time 5110.65ms 
iter 1966: loss 2.8309, time 5092.62ms 
iter 1967: loss 2.8704, time 5073.87ms 
iter 1968: loss 2.9459, time 5104.81ms 
iter 1969: loss 2.9471, time 5105.73ms 
iter 1970: loss 2.5809, time 5084.77ms 
iter 1971: loss 2.6659, time 5250.53ms 
iter 1972: loss 2.9782, time 5263.88ms 
iter 1973: loss 2.9120, time 5251.81ms 
iter 1974: loss 2.9205, time 5247.46ms 
iter 1975: loss 2.8189, time 5267.03ms 
iter 1976: loss 3.0436, time 5252.54ms 
iter 1977: loss 2.8723, time 5259.34ms 
iter 1978: loss 3.0346, time 5259.28ms 
iter 1979: loss 2.6201, time 5255.20ms 
iter 1980: loss 2.8451, time 5239.98ms 
iter 1981: loss 3.0066, time 5198.64ms 
iter 1982: loss 2.7383, time 5265.05ms 
iter 1983: loss 2.6911, time 5263.68ms 
iter 1984: loss 2.9094, time 5253.41ms 
iter 1985: loss 2.8506, time 5266.86ms 
iter 1986: loss 2.7635, time 5256.50ms 
iter 1987: loss 2.8623, time 5256.98ms 
iter 1988: loss 2.7603, time 5231.98ms 
iter 1989: loss 2.7484, time 5251.72ms 
iter 1990: loss 2.7449, time 5252.23ms 
iter 1991: loss 2.7979, time 5257.01ms 
iter 1992: loss 3.0157, time 5256.48ms 
iter 1993: loss 2.8851, time 5252.03ms 
iter 1994: loss 2.9816, time 5251.48ms 
iter 1995: loss 2.7749, time 5252.03ms 
iter 1996: loss 2.9503, time 5252.72ms 
iter 1997: loss 3.0412, time 5252.28ms 
iter 1998: loss 2.8356, time 5260.39ms 
iter 1999: loss 3.1092, time 5256.86ms 
step 2000: train loss 2.8770, val loss 2.9708
iter 2000: loss 2.9548, time 20064.14ms 
iter 2001: loss 2.8979, time 5256.10ms 
iter 2002: loss 2.8706, time 5259.32ms 
iter 2003: loss 2.9120, time 5259.61ms 
iter 2004: loss 2.7938, time 5259.57ms 
iter 2005: loss 2.7122, time 5252.56ms 
iter 2006: loss 2.6380, time 5259.54ms 
iter 2007: loss 2.8873, time 5256.93ms 
iter 2008: loss 2.9163, time 5190.32ms 
iter 2009: loss 2.8146, time 5019.98ms 
iter 2010: loss 2.8143, time 5119.26ms 
iter 2011: loss 3.0232, time 5272.97ms 
iter 2012: loss 2.7803, time 5268.79ms 
iter 2013: loss 2.8133, time 5269.56ms 
iter 2014: loss 2.9070, time 5285.94ms 
iter 2015: loss 2.9051, time 5284.98ms 
iter 2016: loss 2.7431, time 5257.14ms 
iter 2017: loss 2.8970, time 5239.96ms 
iter 2018: loss 2.9155, time 5252.25ms 
iter 2019: loss 2.7338, time 5261.53ms 
iter 2020: loss 2.9308, time 5236.39ms 
iter 2021: loss 2.7702, time 5255.07ms 
iter 2022: loss 2.8748, time 5259.35ms 
iter 2023: loss 2.8209, time 5258.48ms 
iter 2024: loss 2.8699, time 5225.34ms 
iter 2025: loss 3.0416, time 5249.91ms 
iter 2026: loss 2.8312, time 5259.46ms 
iter 2027: loss 2.9911, time 5253.83ms 
iter 2028: loss 3.0049, time 5265.00ms 
iter 2029: loss 2.8323, time 5255.01ms 
iter 2030: loss 2.8062, time 5255.83ms 
iter 2031: loss 2.9491, time 5250.56ms 
iter 2032: loss 2.7910, time 5253.71ms 
iter 2033: loss 2.7991, time 5247.19ms 
iter 2034: loss 3.1196, time 5250.60ms 
iter 2035: loss 2.8003, time 5252.35ms 
iter 2036: loss 2.9755, time 5257.88ms 
iter 2037: loss 2.8025, time 5265.60ms 
iter 2038: loss 2.8335, time 5268.38ms 
iter 2039: loss 2.7981, time 5253.28ms 
iter 2040: loss 2.9876, time 5250.80ms 
iter 2041: loss 2.9629, time 5252.26ms 
iter 2042: loss 2.8010, time 5264.10ms 
iter 2043: loss 2.8813, time 5259.72ms 
iter 2044: loss 2.8927, time 5268.87ms 
iter 2045: loss 3.1784, time 5253.25ms 
iter 2046: loss 2.7696, time 5253.86ms 
iter 2047: loss 2.8999, time 5260.82ms 
iter 2048: loss 2.9022, time 5268.49ms 
iter 2049: loss 2.9298, time 5260.57ms 
step 2050: train loss 2.8708, val loss 2.9594
iter 2050: loss 2.7331, time 19917.74ms 
iter 2051: loss 2.8460, time 5266.80ms 
iter 2052: loss 2.6969, time 5265.62ms 
iter 2053: loss 2.6738, time 5264.12ms 
iter 2054: loss 2.9077, time 5262.82ms 
iter 2055: loss 2.8747, time 5266.62ms 
iter 2056: loss 2.8366, time 5256.85ms 
iter 2057: loss 2.8741, time 5246.94ms 
iter 2058: loss 2.8470, time 5256.88ms 
iter 2059: loss 2.9594, time 5262.41ms 
iter 2060: loss 2.9380, time 5253.85ms 
iter 2061: loss 2.7904, time 5255.00ms 
iter 2062: loss 2.8989, time 5248.34ms 
iter 2063: loss 2.7617, time 5256.75ms 
iter 2064: loss 2.8526, time 5256.53ms 
iter 2065: loss 2.7710, time 5251.48ms 
iter 2066: loss 2.8856, time 5254.31ms 
iter 2067: loss 2.8805, time 5251.05ms 
iter 2068: loss 2.7535, time 5256.30ms 
iter 2069: loss 2.9576, time 5253.50ms 
iter 2070: loss 2.7420, time 5255.70ms 
iter 2071: loss 2.7505, time 5247.73ms 
iter 2072: loss 2.7187, time 5251.11ms 
iter 2073: loss 2.9398, time 5250.39ms 
iter 2074: loss 3.0223, time 5254.85ms 
iter 2075: loss 2.8843, time 5255.63ms 
iter 2076: loss 2.7045, time 5250.40ms 
iter 2077: loss 2.9586, time 5253.24ms 
iter 2078: loss 2.9465, time 5256.03ms 
iter 2079: loss 2.9991, time 5257.35ms 
iter 2080: loss 2.8675, time 5247.53ms 
iter 2081: loss 2.8041, time 5256.35ms 
iter 2082: loss 2.9189, time 5250.84ms 
iter 2083: loss 2.9659, time 5256.84ms 
iter 2084: loss 2.8891, time 5250.88ms 
iter 2085: loss 2.7038, time 5251.94ms 
iter 2086: loss 2.8068, time 5261.24ms 
iter 2087: loss 3.0557, time 5257.28ms 
iter 2088: loss 2.9894, time 5256.23ms 
iter 2089: loss 2.9374, time 5253.30ms 
iter 2090: loss 2.8322, time 5257.24ms 
iter 2091: loss 2.8426, time 5252.26ms 
iter 2092: loss 2.9459, time 5256.06ms 
iter 2093: loss 2.8106, time 5263.98ms 
iter 2094: loss 2.7443, time 5257.90ms 
iter 2095: loss 2.8374, time 5253.54ms 
iter 2096: loss 2.8783, time 5260.55ms 
iter 2097: loss 3.0917, time 5266.86ms 
iter 2098: loss 2.9145, time 5274.22ms 
iter 2099: loss 2.7999, time 5272.73ms 
step 2100: train loss 2.8563, val loss 2.9535
iter 2100: loss 2.8840, time 20143.93ms 
iter 2101: loss 2.7735, time 5257.78ms 
iter 2102: loss 3.0588, time 5257.93ms 
iter 2103: loss 2.6673, time 5254.89ms 
iter 2104: loss 2.7500, time 5257.44ms 
iter 2105: loss 3.0234, time 5253.70ms 
iter 2106: loss 2.7870, time 5249.25ms 
iter 2107: loss 2.9473, time 5255.01ms 
iter 2108: loss 2.9990, time 5265.34ms 
iter 2109: loss 2.7706, time 5275.60ms 
iter 2110: loss 2.4123, time 5271.22ms 
iter 2111: loss 2.8909, time 5267.63ms 
iter 2112: loss 2.7624, time 5268.01ms 
iter 2113: loss 2.8668, time 5261.51ms 
iter 2114: loss 2.7013, time 5261.57ms 
iter 2115: loss 2.8741, time 5262.32ms 
iter 2116: loss 2.8733, time 5247.98ms 
iter 2117: loss 2.8019, time 5277.90ms 
iter 2118: loss 2.7231, time 5273.68ms 
iter 2119: loss 2.6538, time 5265.30ms 
iter 2120: loss 2.9164, time 5259.96ms 
iter 2121: loss 2.8429, time 5263.28ms 
iter 2122: loss 2.9706, time 5261.36ms 
iter 2123: loss 2.8147, time 5259.90ms 
iter 2124: loss 2.7820, time 5262.99ms 
iter 2125: loss 2.9640, time 5257.80ms 
iter 2126: loss 2.6108, time 5261.86ms 
iter 2127: loss 2.9293, time 5265.32ms 
iter 2128: loss 2.8116, time 5260.83ms 
iter 2129: loss 2.8672, time 5264.24ms 
iter 2130: loss 2.7323, time 5255.30ms 
iter 2131: loss 3.0435, time 5252.68ms 
iter 2132: loss 2.5901, time 5265.89ms 
iter 2133: loss 2.7905, time 5222.37ms 
iter 2134: loss 2.7200, time 5258.49ms 
iter 2135: loss 2.7674, time 5251.42ms 
iter 2136: loss 2.8530, time 5259.72ms 
iter 2137: loss 2.9176, time 5251.72ms 
iter 2138: loss 2.8505, time 5247.37ms 
iter 2139: loss 2.8371, time 5258.82ms 
iter 2140: loss 2.7609, time 5256.25ms 
iter 2141: loss 2.7443, time 5258.82ms 
iter 2142: loss 2.8995, time 5258.38ms 
iter 2143: loss 2.9251, time 5260.07ms 
iter 2144: loss 2.7696, time 5251.19ms 
iter 2145: loss 3.0410, time 5247.94ms 
iter 2146: loss 2.9704, time 5263.24ms 
iter 2147: loss 2.8501, time 5279.02ms 
iter 2148: loss 3.1311, time 5275.32ms 
iter 2149: loss 2.7589, time 5256.88ms 
step 2150: train loss 2.8407, val loss 2.9372
iter 2150: loss 2.9078, time 20061.84ms 
iter 2151: loss 2.6679, time 5257.15ms 
iter 2152: loss 2.9793, time 5265.26ms 
iter 2153: loss 3.0024, time 5250.34ms 
iter 2154: loss 2.8987, time 5406.21ms 
iter 2155: loss 2.8427, time 5381.97ms 
iter 2156: loss 2.9949, time 5367.50ms 
iter 2157: loss 2.7292, time 5385.04ms 
iter 2158: loss 2.7898, time 5413.99ms 
iter 2159: loss 2.9005, time 5291.67ms 
iter 2160: loss 2.8350, time 5248.71ms 
iter 2161: loss 2.6462, time 5252.74ms 
iter 2162: loss 2.7988, time 5256.01ms 
iter 2163: loss 2.9928, time 5341.89ms 
iter 2164: loss 2.7920, time 5397.89ms 
iter 2165: loss 2.9568, time 5388.32ms 
iter 2166: loss 2.7245, time 5424.20ms 
iter 2167: loss 2.9338, time 5398.87ms 
iter 2168: loss 2.8279, time 5316.75ms 
iter 2169: loss 3.0438, time 5252.49ms 
iter 2170: loss 2.6385, time 5251.09ms 
iter 2171: loss 2.8156, time 5262.75ms 
iter 2172: loss 2.7362, time 5247.95ms 
iter 2173: loss 2.9897, time 5262.22ms 
iter 2174: loss 2.7539, time 5263.94ms 
iter 2175: loss 2.8647, time 5264.08ms 
iter 2176: loss 2.7861, time 5261.39ms 
iter 2177: loss 3.0992, time 5280.20ms 
iter 2178: loss 2.8857, time 5260.20ms 
iter 2179: loss 2.7023, time 5252.34ms 
iter 2180: loss 2.8027, time 5254.77ms 
iter 2181: loss 2.7920, time 5255.92ms 
iter 2182: loss 2.8860, time 5259.40ms 
iter 2183: loss 3.0530, time 5258.31ms 
iter 2184: loss 2.6783, time 5253.02ms 
iter 2185: loss 2.6790, time 5254.84ms 
iter 2186: loss 2.8530, time 5279.93ms 
iter 2187: loss 2.6910, time 5274.15ms 
iter 2188: loss 3.0121, time 5274.23ms 
iter 2189: loss 2.7998, time 5266.89ms 
iter 2190: loss 2.9339, time 5290.01ms 
iter 2191: loss 2.9334, time 5283.30ms 
iter 2192: loss 2.9572, time 5265.72ms 
iter 2193: loss 2.8839, time 5266.36ms 
iter 2194: loss 2.7655, time 5261.39ms 
iter 2195: loss 2.7316, time 5264.81ms 
iter 2196: loss 2.8854, time 5260.02ms 
iter 2197: loss 2.9433, time 5274.08ms 
iter 2198: loss 2.8108, time 5266.55ms 
iter 2199: loss 2.8066, time 5251.98ms 
step 2200: train loss 2.8363, val loss 2.9346
iter 2200: loss 2.9387, time 19991.84ms 
iter 2201: loss 2.7169, time 5267.61ms 
iter 2202: loss 2.8981, time 5264.33ms 
iter 2203: loss 2.8640, time 5256.27ms 
iter 2204: loss 2.6820, time 5268.09ms 
iter 2205: loss 2.7131, time 5260.60ms 
iter 2206: loss 2.6498, time 5257.67ms 
iter 2207: loss 2.8284, time 5264.50ms 
iter 2208: loss 2.5965, time 5264.25ms 
iter 2209: loss 2.8039, time 5262.82ms 
iter 2210: loss 2.8020, time 5262.79ms 
iter 2211: loss 2.7799, time 5261.40ms 
iter 2212: loss 2.8040, time 5262.07ms 
iter 2213: loss 2.7499, time 5266.05ms 
iter 2214: loss 2.8648, time 5257.25ms 
iter 2215: loss 2.8206, time 5272.66ms 
iter 2216: loss 2.9405, time 5264.84ms 
iter 2217: loss 2.8723, time 5281.57ms 
iter 2218: loss 2.7949, time 5277.67ms 
iter 2219: loss 2.7432, time 5257.64ms 
iter 2220: loss 2.7588, time 5255.01ms 
iter 2221: loss 2.7331, time 5263.33ms 
iter 2222: loss 2.9797, time 5255.91ms 
iter 2223: loss 2.8086, time 5254.94ms 
iter 2224: loss 2.8182, time 5257.35ms 
iter 2225: loss 2.8784, time 5248.21ms 
iter 2226: loss 2.8722, time 5261.15ms 
iter 2227: loss 2.9071, time 5265.72ms 
iter 2228: loss 2.7391, time 5255.35ms 
iter 2229: loss 2.8040, time 5280.05ms 
iter 2230: loss 2.9025, time 5259.34ms 
iter 2231: loss 2.9497, time 5265.28ms 
iter 2232: loss 2.8052, time 5262.77ms 
iter 2233: loss 2.9985, time 5269.89ms 
iter 2234: loss 2.8293, time 5269.99ms 
iter 2235: loss 2.9283, time 5262.87ms 
iter 2236: loss 3.0138, time 5258.86ms 
iter 2237: loss 2.9514, time 5268.83ms 
iter 2238: loss 2.9084, time 5270.18ms 
iter 2239: loss 2.7340, time 5261.69ms 
iter 2240: loss 2.7942, time 5274.83ms 
iter 2241: loss 2.7958, time 5260.49ms 
iter 2242: loss 2.9075, time 5252.76ms 
iter 2243: loss 2.9231, time 5249.23ms 
iter 2244: loss 2.9964, time 5275.13ms 
iter 2245: loss 2.7553, time 5263.14ms 
iter 2246: loss 2.6967, time 5255.06ms 
iter 2247: loss 2.8222, time 5262.95ms 
iter 2248: loss 3.1603, time 5260.71ms 
iter 2249: loss 2.8262, time 5257.27ms 
step 2250: train loss 2.8411, val loss 2.9261
iter 2250: loss 2.7786, time 19948.95ms 
iter 2251: loss 2.6347, time 5258.75ms 
iter 2252: loss 2.7358, time 5252.75ms 
iter 2253: loss 2.9896, time 5250.97ms 
iter 2254: loss 3.0366, time 5250.59ms 
iter 2255: loss 2.9558, time 5253.82ms 
iter 2256: loss 2.8097, time 5250.31ms 
iter 2257: loss 2.9429, time 5249.37ms 
iter 2258: loss 2.9067, time 5253.06ms 
iter 2259: loss 2.8978, time 5251.70ms 
iter 2260: loss 2.8973, time 5263.06ms 
iter 2261: loss 2.7635, time 5249.37ms 
iter 2262: loss 2.7693, time 5253.42ms 
iter 2263: loss 2.6544, time 5253.33ms 
iter 2264: loss 2.7551, time 5254.41ms 
iter 2265: loss 2.6444, time 5260.21ms 
iter 2266: loss 2.7836, time 5257.05ms 
iter 2267: loss 2.8200, time 5258.17ms 
iter 2268: loss 2.8058, time 5264.78ms 
iter 2269: loss 2.7522, time 5257.68ms 
iter 2270: loss 2.6496, time 5259.15ms 
iter 2271: loss 2.7883, time 5258.40ms 
iter 2272: loss 2.8394, time 5251.05ms 
iter 2273: loss 2.7088, time 5254.91ms 
iter 2274: loss 2.7267, time 5254.50ms 
iter 2275: loss 2.9993, time 5266.76ms 
iter 2276: loss 2.9979, time 5255.73ms 
iter 2277: loss 3.0007, time 5266.65ms 
iter 2278: loss 2.8797, time 5273.37ms 
iter 2279: loss 2.7123, time 5258.01ms 
iter 2280: loss 2.8302, time 5266.72ms 
iter 2281: loss 2.7041, time 5260.62ms 
iter 2282: loss 2.7949, time 5268.11ms 
iter 2283: loss 2.8508, time 5269.82ms 
iter 2284: loss 2.8567, time 5271.71ms 
iter 2285: loss 3.0300, time 5255.38ms 
iter 2286: loss 3.0456, time 5259.90ms 
iter 2287: loss 2.6602, time 5257.12ms 
iter 2288: loss 2.7015, time 5258.01ms 
iter 2289: loss 2.6884, time 5264.22ms 
iter 2290: loss 2.9634, time 5263.83ms 
iter 2291: loss 2.8578, time 5263.13ms 
iter 2292: loss 2.8948, time 5274.43ms 
iter 2293: loss 2.7597, time 5272.40ms 
iter 2294: loss 3.0524, time 5261.17ms 
iter 2295: loss 2.8817, time 5246.50ms 
iter 2296: loss 2.8222, time 5241.14ms 
iter 2297: loss 2.8275, time 5254.54ms 
iter 2298: loss 2.8893, time 5260.98ms 
iter 2299: loss 2.8463, time 5265.02ms 
step 2300: train loss 2.8127, val loss 2.9149
iter 2300: loss 2.7616, time 19904.06ms 
iter 2301: loss 2.8334, time 5257.21ms 
iter 2302: loss 2.7392, time 5251.15ms 
iter 2303: loss 3.0315, time 5253.53ms 
iter 2304: loss 2.7962, time 5264.95ms 
iter 2305: loss 2.8715, time 5238.45ms 
iter 2306: loss 2.9485, time 5253.46ms 
iter 2307: loss 3.0159, time 5251.55ms 
iter 2308: loss 2.8856, time 5252.65ms 
iter 2309: loss 2.7523, time 5253.06ms 
iter 2310: loss 2.9162, time 5257.58ms 
iter 2311: loss 2.7778, time 5258.86ms 
iter 2312: loss 2.7868, time 5259.05ms 
iter 2313: loss 3.0271, time 5239.62ms 
iter 2314: loss 2.8410, time 5263.81ms 
iter 2315: loss 2.7098, time 5263.31ms 
iter 2316: loss 2.8129, time 5258.75ms 
iter 2317: loss 2.6710, time 5258.47ms 
iter 2318: loss 2.7338, time 5258.03ms 
iter 2319: loss 2.6859, time 5258.14ms 
iter 2320: loss 2.8506, time 5256.28ms 
iter 2321: loss 2.9284, time 5281.76ms 
iter 2322: loss 2.7372, time 5262.83ms 
iter 2323: loss 2.8746, time 5338.62ms 
iter 2324: loss 2.8553, time 5251.62ms 
iter 2325: loss 2.5091, time 5251.58ms 
iter 2326: loss 2.8983, time 5267.59ms 
iter 2327: loss 2.7740, time 5252.94ms 
iter 2328: loss 2.9190, time 5251.11ms 
iter 2329: loss 2.7735, time 5273.82ms 
iter 2330: loss 2.7150, time 5257.52ms 
iter 2331: loss 2.6962, time 5256.29ms 
iter 2332: loss 2.9268, time 5252.16ms 
iter 2333: loss 2.8134, time 5253.71ms 
iter 2334: loss 3.0524, time 5253.19ms 
iter 2335: loss 2.6330, time 5251.05ms 
iter 2336: loss 2.7000, time 5250.92ms 
iter 2337: loss 2.7876, time 5266.79ms 
iter 2338: loss 2.7820, time 5251.32ms 
iter 2339: loss 2.8462, time 5254.76ms 
iter 2340: loss 2.8570, time 5247.66ms 
iter 2341: loss 2.9462, time 5251.45ms 
iter 2342: loss 2.7933, time 5247.83ms 
iter 2343: loss 2.8097, time 5254.24ms 
iter 2344: loss 2.7761, time 5251.85ms 
iter 2345: loss 2.7702, time 5252.63ms 
iter 2346: loss 3.0472, time 5268.83ms 
iter 2347: loss 2.9179, time 5255.96ms 
iter 2348: loss 3.0776, time 5260.27ms 
iter 2349: loss 2.8741, time 5266.08ms 
step 2350: train loss 2.8068, val loss 2.9179
iter 2350: loss 2.6301, time 20069.42ms 
iter 2351: loss 2.4317, time 5251.04ms 
iter 2352: loss 2.9065, time 5273.27ms 
iter 2353: loss 2.7852, time 5255.40ms 
iter 2354: loss 2.8643, time 5259.65ms 
iter 2355: loss 2.8391, time 5266.92ms 
iter 2356: loss 2.9257, time 5257.31ms 
iter 2357: loss 2.9821, time 5253.10ms 
iter 2358: loss 2.5822, time 5258.41ms 
iter 2359: loss 2.8675, time 5264.06ms 
iter 2360: loss 2.7467, time 5252.72ms 
iter 2361: loss 2.8462, time 5267.38ms 
iter 2362: loss 2.8912, time 5261.17ms 
iter 2363: loss 2.7011, time 5259.97ms 
iter 2364: loss 2.8947, time 5257.74ms 
iter 2365: loss 2.8349, time 5256.81ms 
iter 2366: loss 2.7235, time 5263.31ms 
iter 2367: loss 2.8648, time 5260.73ms 
iter 2368: loss 2.9237, time 5264.35ms 
iter 2369: loss 2.9414, time 5261.85ms 
iter 2370: loss 2.6365, time 5253.48ms 
iter 2371: loss 2.8745, time 5255.35ms 
iter 2372: loss 2.6988, time 5252.84ms 
iter 2373: loss 2.9113, time 5270.80ms 
iter 2374: loss 2.6231, time 5207.94ms 
iter 2375: loss 2.6271, time 5227.54ms 
iter 2376: loss 2.7529, time 5264.84ms 
iter 2377: loss 2.7291, time 5250.03ms 
iter 2378: loss 2.9377, time 5250.86ms 
iter 2379: loss 2.7749, time 5254.66ms 
iter 2380: loss 2.8497, time 5262.32ms 
iter 2381: loss 2.7379, time 5251.98ms 
iter 2382: loss 2.8529, time 5258.68ms 
iter 2383: loss 2.7613, time 5265.30ms 
iter 2384: loss 2.6885, time 5262.96ms 
iter 2385: loss 2.5183, time 5266.45ms 
iter 2386: loss 2.9224, time 5278.24ms 
iter 2387: loss 2.7482, time 5270.78ms 
iter 2388: loss 2.8250, time 5267.04ms 
iter 2389: loss 2.7360, time 5270.20ms 
iter 2390: loss 2.7635, time 5256.83ms 
iter 2391: loss 2.9222, time 5260.29ms 
iter 2392: loss 2.6671, time 5265.24ms 
iter 2393: loss 2.9534, time 5251.86ms 
iter 2394: loss 2.6890, time 5256.13ms 
iter 2395: loss 2.7402, time 5252.61ms 
iter 2396: loss 2.8421, time 5255.11ms 
iter 2397: loss 2.9466, time 5235.86ms 
iter 2398: loss 2.8475, time 5271.34ms 
iter 2399: loss 2.8012, time 5267.34ms 
step 2400: train loss 2.8048, val loss 2.9208
iter 2400: loss 2.8893, time 20093.41ms 
iter 2401: loss 2.9610, time 5263.11ms 
iter 2402: loss 2.7920, time 5260.63ms 
iter 2403: loss 2.8957, time 5263.26ms 
iter 2404: loss 2.7124, time 5264.53ms 
iter 2405: loss 2.9391, time 5263.94ms 
iter 2406: loss 2.8961, time 5252.57ms 
iter 2407: loss 2.8081, time 5253.64ms 
iter 2408: loss 2.7137, time 5250.14ms 
iter 2409: loss 2.8875, time 5252.57ms 
iter 2410: loss 2.7598, time 5253.10ms 
iter 2411: loss 2.8802, time 5259.29ms 
iter 2412: loss 2.7455, time 5277.81ms 
iter 2413: loss 2.8528, time 5277.17ms 
iter 2414: loss 2.7599, time 5269.10ms 
iter 2415: loss 2.6871, time 5274.14ms 
iter 2416: loss 2.9070, time 5294.49ms 
iter 2417: loss 2.9902, time 5273.32ms 
iter 2418: loss 2.8587, time 5269.62ms 
iter 2419: loss 2.7007, time 5264.37ms 
iter 2420: loss 2.7822, time 5268.61ms 
iter 2421: loss 2.8729, time 5253.88ms 
iter 2422: loss 2.6824, time 5263.65ms 
iter 2423: loss 2.9351, time 5264.94ms 
iter 2424: loss 2.6974, time 5257.45ms 
iter 2425: loss 2.8049, time 5251.58ms 
iter 2426: loss 2.7748, time 5255.93ms 
iter 2427: loss 2.9512, time 5262.11ms 
iter 2428: loss 2.6943, time 5258.89ms 
iter 2429: loss 2.8110, time 5258.25ms 
iter 2430: loss 2.7609, time 5255.04ms 
iter 2431: loss 2.7255, time 5252.19ms 
iter 2432: loss 2.6006, time 5270.52ms 
iter 2433: loss 2.7847, time 5249.84ms 
iter 2434: loss 2.9483, time 5270.30ms 
iter 2435: loss 2.9782, time 5274.86ms 
iter 2436: loss 2.6930, time 5270.53ms 
iter 2437: loss 2.5911, time 5259.33ms 
iter 2438: loss 2.7972, time 5264.64ms 
iter 2439: loss 2.8818, time 5263.04ms 
iter 2440: loss 2.8272, time 5209.99ms 
iter 2441: loss 2.6423, time 5256.33ms 
iter 2442: loss 2.6138, time 5265.39ms 
iter 2443: loss 2.7762, time 5268.44ms 
iter 2444: loss 2.7766, time 5272.67ms 
iter 2445: loss 2.7850, time 5254.07ms 
iter 2446: loss 3.0176, time 5254.04ms 
iter 2447: loss 2.8188, time 5256.51ms 
iter 2448: loss 2.7061, time 5253.08ms 
iter 2449: loss 2.6449, time 5266.01ms 
step 2450: train loss 2.8047, val loss 2.9196
iter 2450: loss 2.6660, time 20072.82ms 
iter 2451: loss 2.9321, time 5255.98ms 
iter 2452: loss 2.7666, time 5265.73ms 
iter 2453: loss 2.9042, time 5254.02ms 
iter 2454: loss 3.0070, time 5255.89ms 
iter 2455: loss 2.6582, time 5257.17ms 
iter 2456: loss 2.8106, time 5267.49ms 
iter 2457: loss 2.9584, time 5265.41ms 
iter 2458: loss 2.6693, time 5271.49ms 
iter 2459: loss 2.7438, time 5266.68ms 
iter 2460: loss 2.8017, time 5262.05ms 
iter 2461: loss 2.8368, time 5257.99ms 
iter 2462: loss 2.8209, time 5267.44ms 
iter 2463: loss 2.7563, time 5258.63ms 
iter 2464: loss 2.9431, time 5268.73ms 
iter 2465: loss 2.8072, time 5258.39ms 
iter 2466: loss 2.7224, time 5253.98ms 
iter 2467: loss 2.8065, time 5261.67ms 
iter 2468: loss 2.9138, time 5253.32ms 
iter 2469: loss 2.9648, time 5257.56ms 
iter 2470: loss 2.8212, time 5256.27ms 
iter 2471: loss 2.5003, time 5272.94ms 
iter 2472: loss 2.6768, time 5268.51ms 
iter 2473: loss 2.6722, time 5264.89ms 
iter 2474: loss 2.6307, time 5255.50ms 
iter 2475: loss 2.9697, time 5258.24ms 
iter 2476: loss 2.6364, time 5267.17ms 
iter 2477: loss 2.8922, time 5254.76ms 
iter 2478: loss 2.8570, time 5259.35ms 
iter 2479: loss 2.7691, time 5258.14ms 
iter 2480: loss 2.8499, time 5250.99ms 
iter 2481: loss 2.8641, time 5265.87ms 
iter 2482: loss 2.7367, time 5271.80ms 
iter 2483: loss 2.8005, time 5274.75ms 
iter 2484: loss 2.9417, time 5252.24ms 
iter 2485: loss 2.7659, time 5265.32ms 
iter 2486: loss 2.7539, time 5253.54ms 
iter 2487: loss 2.8491, time 5252.42ms 
iter 2488: loss 2.8664, time 5255.41ms 
iter 2489: loss 2.9238, time 5254.74ms 
iter 2490: loss 2.8764, time 5224.36ms 
iter 2491: loss 2.6173, time 5259.65ms 
iter 2492: loss 2.5734, time 5257.06ms 
iter 2493: loss 3.0426, time 5267.00ms 
iter 2494: loss 2.8037, time 5251.52ms 
iter 2495: loss 2.7303, time 5253.44ms 
iter 2496: loss 2.8903, time 5253.27ms 
iter 2497: loss 2.6723, time 5253.22ms 
iter 2498: loss 2.4958, time 5255.19ms 
iter 2499: loss 3.0137, time 5260.46ms 
step 2500: train loss 2.7888, val loss 2.9188
iter 2500: loss 2.6123, time 19965.04ms 
iter 2501: loss 2.7717, time 5253.22ms 
iter 2502: loss 2.7083, time 5279.60ms 
iter 2503: loss 2.6786, time 5271.92ms 
iter 2504: loss 2.5725, time 5268.86ms 
iter 2505: loss 2.8413, time 5265.20ms 
iter 2506: loss 2.7083, time 5277.36ms 
iter 2507: loss 2.8238, time 5257.20ms 
iter 2508: loss 2.9498, time 5269.15ms 
iter 2509: loss 2.6482, time 5270.75ms 
iter 2510: loss 2.7999, time 5265.03ms 
iter 2511: loss 2.7287, time 5261.52ms 
iter 2512: loss 2.6489, time 5256.39ms 
iter 2513: loss 2.5001, time 5275.91ms 
iter 2514: loss 2.7826, time 5213.15ms 
iter 2515: loss 2.7340, time 5266.15ms 
iter 2516: loss 2.7213, time 5269.52ms 
iter 2517: loss 2.6156, time 5259.78ms 
iter 2518: loss 2.7247, time 5272.56ms 
iter 2519: loss 2.7768, time 5256.28ms 
iter 2520: loss 2.7373, time 5259.17ms 
iter 2521: loss 2.9616, time 5259.25ms 
iter 2522: loss 2.7349, time 5250.07ms 
iter 2523: loss 2.7435, time 5250.35ms 
iter 2524: loss 2.7825, time 5255.80ms 
iter 2525: loss 2.9551, time 5258.67ms 
iter 2526: loss 2.8815, time 5258.90ms 
iter 2527: loss 2.8030, time 5255.67ms 
iter 2528: loss 2.7805, time 5264.60ms 
iter 2529: loss 2.8599, time 5253.73ms 
iter 2530: loss 2.7089, time 5254.28ms 
iter 2531: loss 2.7895, time 5258.16ms 
iter 2532: loss 2.6699, time 5253.35ms 
iter 2533: loss 2.6667, time 5260.32ms 
iter 2534: loss 2.7906, time 5257.91ms 
iter 2535: loss 2.7134, time 5252.37ms 
iter 2536: loss 2.8069, time 5257.67ms 
iter 2537: loss 2.8099, time 5249.02ms 
iter 2538: loss 2.5986, time 5247.76ms 
iter 2539: loss 2.8630, time 5252.83ms 
iter 2540: loss 2.6063, time 5256.31ms 
iter 2541: loss 2.8121, time 5261.94ms 
iter 2542: loss 2.6498, time 5261.42ms 
iter 2543: loss 2.8556, time 5274.50ms 
iter 2544: loss 2.7593, time 5254.93ms 
iter 2545: loss 2.7199, time 5256.60ms 
iter 2546: loss 2.8003, time 5266.13ms 
iter 2547: loss 2.8104, time 5258.02ms 
iter 2548: loss 2.9010, time 5235.89ms 
iter 2549: loss 2.7817, time 5253.49ms 
step 2550: train loss 2.7967, val loss 2.9104
iter 2550: loss 2.8375, time 20058.15ms 
iter 2551: loss 2.7921, time 5254.89ms 
iter 2552: loss 2.9244, time 5259.02ms 
iter 2553: loss 2.9524, time 5257.24ms 
iter 2554: loss 2.8560, time 5261.74ms 
iter 2555: loss 2.7118, time 5254.84ms 
iter 2556: loss 2.7120, time 5241.01ms 
iter 2557: loss 2.7774, time 5254.07ms 
iter 2558: loss 2.5508, time 5254.35ms 
iter 2559: loss 2.7518, time 5250.03ms 
iter 2560: loss 2.8197, time 5256.84ms 
iter 2561: loss 2.9549, time 5258.91ms 
iter 2562: loss 2.7881, time 5254.42ms 
iter 2563: loss 2.8333, time 5246.99ms 
iter 2564: loss 2.8836, time 5252.73ms 
iter 2565: loss 2.7464, time 5242.44ms 
iter 2566: loss 2.6525, time 5253.62ms 
iter 2567: loss 2.4923, time 5255.21ms 
iter 2568: loss 2.8307, time 5252.24ms 
iter 2569: loss 3.0629, time 5257.57ms 
iter 2570: loss 2.7733, time 5253.15ms 
iter 2571: loss 2.8918, time 5254.82ms 
iter 2572: loss 2.6072, time 5251.03ms 
iter 2573: loss 2.7118, time 5260.81ms 
iter 2574: loss 2.8523, time 5266.20ms 
iter 2575: loss 2.6023, time 5180.04ms 
iter 2576: loss 2.9388, time 5252.98ms 
iter 2577: loss 2.7105, time 5258.08ms 
iter 2578: loss 2.9103, time 5234.60ms 
iter 2579: loss 2.6714, time 5255.44ms 
iter 2580: loss 2.9465, time 5250.66ms 
iter 2581: loss 2.6906, time 5261.82ms 
iter 2582: loss 2.7687, time 5260.26ms 
iter 2583: loss 2.8689, time 5265.86ms 
iter 2584: loss 2.7811, time 5264.79ms 
iter 2585: loss 2.6759, time 5268.13ms 
iter 2586: loss 2.7638, time 5264.79ms 
iter 2587: loss 2.8958, time 5260.38ms 
iter 2588: loss 2.5880, time 5258.88ms 
iter 2589: loss 2.5680, time 5261.63ms 
iter 2590: loss 2.8880, time 5253.11ms 
iter 2591: loss 2.7477, time 5256.80ms 
iter 2592: loss 2.7648, time 5264.83ms 
iter 2593: loss 2.6391, time 5266.64ms 
iter 2594: loss 2.9425, time 5272.68ms 
iter 2595: loss 2.7107, time 5258.72ms 
iter 2596: loss 2.4861, time 5263.28ms 
iter 2597: loss 2.6519, time 5258.27ms 
iter 2598: loss 2.7595, time 5205.02ms 
iter 2599: loss 2.6696, time 5256.64ms 
step 2600: train loss 2.7837, val loss 2.9136
iter 2600: loss 2.8839, time 20029.82ms 
iter 2601: loss 2.5565, time 5257.62ms 
iter 2602: loss 2.7233, time 5252.82ms 
iter 2603: loss 2.8392, time 5234.45ms 
iter 2604: loss 2.9347, time 5270.49ms 
iter 2605: loss 2.7062, time 5277.09ms 
iter 2606: loss 2.7431, time 5264.15ms 
iter 2607: loss 2.8333, time 5270.52ms 
iter 2608: loss 2.7595, time 5289.47ms 
iter 2609: loss 2.8177, time 5291.28ms 
iter 2610: loss 2.6891, time 5260.46ms 
iter 2611: loss 2.7998, time 5255.16ms 
iter 2612: loss 2.8570, time 5264.51ms 
iter 2613: loss 2.6881, time 5272.80ms 
iter 2614: loss 2.9572, time 5260.44ms 
iter 2615: loss 2.8286, time 5266.75ms 
iter 2616: loss 2.8526, time 5262.73ms 
iter 2617: loss 2.8057, time 5257.45ms 
iter 2618: loss 2.6459, time 5261.37ms 
iter 2619: loss 2.8417, time 5248.73ms 
iter 2620: loss 2.7762, time 5253.64ms 
iter 2621: loss 3.0604, time 5256.74ms 
iter 2622: loss 2.5659, time 5257.41ms 
iter 2623: loss 2.8050, time 5262.09ms 
iter 2624: loss 2.7782, time 5262.04ms 
iter 2625: loss 2.6586, time 5253.27ms 
iter 2626: loss 2.7430, time 5254.46ms 
iter 2627: loss 2.9220, time 5254.28ms 
iter 2628: loss 2.9350, time 5265.48ms 
iter 2629: loss 2.7785, time 5259.44ms 
iter 2630: loss 2.7074, time 5256.74ms 
iter 2631: loss 2.8067, time 5260.49ms 
iter 2632: loss 2.6423, time 5252.08ms 
iter 2633: loss 2.8931, time 5263.04ms 
iter 2634: loss 2.9069, time 5259.75ms 
iter 2635: loss 2.6707, time 5264.59ms 
iter 2636: loss 2.7555, time 5285.24ms 
iter 2637: loss 3.0153, time 5282.56ms 
iter 2638: loss 2.8780, time 5253.20ms 
iter 2639: loss 2.7246, time 5264.01ms 
iter 2640: loss 2.8271, time 5254.20ms 
iter 2641: loss 2.9295, time 5257.25ms 
iter 2642: loss 2.9057, time 5253.32ms 
iter 2643: loss 2.7955, time 5252.21ms 
iter 2644: loss 2.7117, time 5253.63ms 
iter 2645: loss 2.6343, time 5255.40ms 
iter 2646: loss 2.7562, time 5253.46ms 
iter 2647: loss 2.7046, time 5239.51ms 
iter 2648: loss 2.6471, time 5228.19ms 
iter 2649: loss 2.9375, time 5260.83ms 
step 2650: train loss 2.7819, val loss 2.8852
iter 2650: loss 2.7375, time 20049.57ms 
iter 2651: loss 3.0294, time 5259.60ms 
iter 2652: loss 2.8981, time 5251.54ms 
iter 2653: loss 2.7606, time 5267.47ms 
iter 2654: loss 2.8196, time 5263.29ms 
iter 2655: loss 2.9293, time 5249.89ms 
iter 2656: loss 2.7379, time 5269.71ms 
iter 2657: loss 2.8230, time 5268.42ms 
iter 2658: loss 2.6464, time 5253.50ms 
iter 2659: loss 2.9653, time 5263.47ms 
iter 2660: loss 2.8026, time 5262.51ms 
iter 2661: loss 2.6776, time 5254.34ms 
iter 2662: loss 2.7643, time 5251.67ms 
iter 2663: loss 2.7903, time 5260.13ms 
iter 2664: loss 2.6798, time 5260.49ms 
iter 2665: loss 2.7715, time 5253.66ms 
iter 2666: loss 2.9142, time 5261.78ms 
iter 2667: loss 2.8419, time 5263.70ms 
iter 2668: loss 3.0823, time 5263.41ms 
iter 2669: loss 2.6791, time 5256.03ms 
iter 2670: loss 3.0241, time 5251.03ms 
iter 2671: loss 2.9410, time 5261.92ms 
iter 2672: loss 2.7578, time 5256.02ms 
iter 2673: loss 2.6626, time 5258.67ms 
iter 2674: loss 2.7996, time 5257.77ms 
iter 2675: loss 2.5978, time 5258.15ms 
iter 2676: loss 2.5970, time 5258.18ms 
iter 2677: loss 2.7933, time 5278.66ms 
iter 2678: loss 2.6941, time 5253.53ms 
iter 2679: loss 2.7903, time 5252.30ms 
iter 2680: loss 2.9243, time 5255.48ms 
iter 2681: loss 2.8449, time 5259.50ms 
iter 2682: loss 2.7518, time 5259.79ms 
iter 2683: loss 2.9217, time 5381.89ms 
iter 2684: loss 2.7479, time 5250.57ms 
iter 2685: loss 2.7337, time 5251.36ms 
iter 2686: loss 2.7036, time 5262.14ms 
iter 2687: loss 2.7413, time 5266.33ms 
iter 2688: loss 2.7701, time 5262.30ms 
iter 2689: loss 3.0255, time 5254.72ms 
iter 2690: loss 2.8337, time 5255.53ms 
iter 2691: loss 2.9424, time 5252.18ms 
iter 2692: loss 2.9094, time 5409.65ms 
iter 2693: loss 2.5665, time 5253.31ms 
iter 2694: loss 2.9062, time 5260.73ms 
iter 2695: loss 2.8242, time 5267.72ms 
iter 2696: loss 2.7181, time 5272.36ms 
iter 2697: loss 2.9847, time 5258.82ms 
iter 2698: loss 3.0161, time 5261.45ms 
iter 2699: loss 2.7928, time 5261.33ms 
step 2700: train loss 2.7549, val loss 2.8966
iter 2700: loss 2.9197, time 20097.57ms 
iter 2701: loss 2.6715, time 5253.62ms 
iter 2702: loss 2.8266, time 5265.75ms 
iter 2703: loss 2.9084, time 5268.00ms 
iter 2704: loss 2.9763, time 5257.29ms 
iter 2705: loss 2.7664, time 5261.03ms 
iter 2706: loss 2.7230, time 5266.17ms 
iter 2707: loss 2.7124, time 5256.90ms 
iter 2708: loss 3.0760, time 5258.52ms 
iter 2709: loss 2.7026, time 5264.94ms 
iter 2710: loss 2.7622, time 5256.07ms 
iter 2711: loss 2.6571, time 5261.11ms 
iter 2712: loss 3.0207, time 5263.79ms 
iter 2713: loss 2.5861, time 5256.11ms 
iter 2714: loss 2.6034, time 5261.20ms 
iter 2715: loss 2.7194, time 5249.02ms 
iter 2716: loss 2.7798, time 5252.52ms 
iter 2717: loss 2.8661, time 5253.59ms 
iter 2718: loss 2.9509, time 5251.92ms 
iter 2719: loss 2.8656, time 5262.34ms 
iter 2720: loss 2.6806, time 5256.23ms 
iter 2721: loss 2.8580, time 5262.24ms 
iter 2722: loss 2.7328, time 5255.54ms 
iter 2723: loss 2.7610, time 5254.54ms 
iter 2724: loss 2.9145, time 5254.26ms 
iter 2725: loss 2.7306, time 5256.99ms 
iter 2726: loss 2.7198, time 5259.96ms 
iter 2727: loss 2.8446, time 5259.50ms 
iter 2728: loss 2.8760, time 5252.43ms 
iter 2729: loss 2.7377, time 5255.45ms 
iter 2730: loss 2.8517, time 5254.18ms 
iter 2731: loss 2.7464, time 5258.63ms 
iter 2732: loss 2.7034, time 5258.70ms 
iter 2733: loss 2.8947, time 5263.50ms 
iter 2734: loss 2.9925, time 5255.63ms 
iter 2735: loss 2.6732, time 5252.32ms 
iter 2736: loss 2.8687, time 5257.96ms 
iter 2737: loss 2.8029, time 5255.85ms 
iter 2738: loss 2.8230, time 5240.19ms 
iter 2739: loss 2.7505, time 5259.02ms 
iter 2740: loss 2.8191, time 5263.60ms 
iter 2741: loss 2.7293, time 5261.30ms 
iter 2742: loss 2.7052, time 5255.86ms 
iter 2743: loss 3.0079, time 5261.48ms 
iter 2744: loss 2.7905, time 5252.41ms 
iter 2745: loss 2.7533, time 5255.89ms 
iter 2746: loss 2.6029, time 5251.38ms 
iter 2747: loss 2.6361, time 5257.54ms 
iter 2748: loss 2.7768, time 5256.77ms 
iter 2749: loss 2.7150, time 5257.01ms 
step 2750: train loss 2.7652, val loss 2.9004
iter 2750: loss 2.5923, time 20131.00ms 
iter 2751: loss 2.7602, time 5261.21ms 
iter 2752: loss 2.7565, time 5263.48ms 
iter 2753: loss 2.6942, time 5256.09ms 
iter 2754: loss 2.8332, time 5222.16ms 
iter 2755: loss 2.6288, time 5210.89ms 
iter 2756: loss 2.7822, time 5213.49ms 
iter 2757: loss 2.7786, time 5211.22ms 
iter 2758: loss 2.7771, time 5213.64ms 
iter 2759: loss 2.6035, time 5214.41ms 
iter 2760: loss 2.4966, time 5216.31ms 
iter 2761: loss 2.8551, time 5216.40ms 
iter 2762: loss 2.9059, time 5214.01ms 
iter 2763: loss 2.7430, time 5221.26ms 
iter 2764: loss 2.8605, time 5251.07ms 
iter 2765: loss 2.7756, time 5248.46ms 
iter 2766: loss 2.7621, time 5257.38ms 
iter 2767: loss 2.7281, time 5255.50ms 
iter 2768: loss 2.9210, time 5259.72ms 
iter 2769: loss 2.9400, time 5254.56ms 
iter 2770: loss 2.8461, time 5264.04ms 
iter 2771: loss 2.8624, time 5254.76ms 
iter 2772: loss 2.5593, time 5255.59ms 
iter 2773: loss 2.6474, time 5260.34ms 
iter 2774: loss 2.7469, time 5254.49ms 
iter 2775: loss 2.7412, time 5255.97ms 
iter 2776: loss 2.6526, time 5247.60ms 
iter 2777: loss 2.7877, time 5215.18ms 
iter 2778: loss 2.8882, time 5266.67ms 
iter 2779: loss 2.9542, time 5262.36ms 
iter 2780: loss 2.7607, time 5264.71ms 
iter 2781: loss 2.8309, time 5264.83ms 
iter 2782: loss 2.8185, time 5256.58ms 
iter 2783: loss 2.7136, time 5253.19ms 
iter 2784: loss 2.6476, time 5262.89ms 
iter 2785: loss 2.6796, time 5265.71ms 
iter 2786: loss 2.9198, time 5257.30ms 
iter 2787: loss 2.6155, time 5263.45ms 
iter 2788: loss 2.7157, time 5253.29ms 
iter 2789: loss 2.6441, time 5267.24ms 
iter 2790: loss 2.6936, time 5267.61ms 
iter 2791: loss 2.6761, time 5283.76ms 
iter 2792: loss 2.8101, time 5269.14ms 
iter 2793: loss 2.8134, time 5279.84ms 
iter 2794: loss 2.7106, time 5303.10ms 
iter 2795: loss 2.8224, time 5286.27ms 
iter 2796: loss 2.7092, time 5266.85ms 
iter 2797: loss 2.6676, time 5255.94ms 
iter 2798: loss 2.7623, time 5253.63ms 
iter 2799: loss 3.0824, time 5276.89ms 
step 2800: train loss 2.7392, val loss 2.8933
iter 2800: loss 2.7134, time 20002.26ms 
iter 2801: loss 2.6970, time 5253.71ms 
iter 2802: loss 2.7206, time 5261.83ms 
iter 2803: loss 2.7664, time 5254.21ms 
iter 2804: loss 2.9354, time 5264.30ms 
iter 2805: loss 2.8106, time 5263.34ms 
iter 2806: loss 2.6431, time 5260.93ms 
iter 2807: loss 2.7485, time 5251.12ms 
iter 2808: loss 2.7991, time 5255.92ms 
iter 2809: loss 2.9581, time 5260.16ms 
iter 2810: loss 2.7285, time 5233.69ms 
iter 2811: loss 2.9678, time 5258.46ms 
iter 2812: loss 2.7365, time 5266.25ms 
iter 2813: loss 2.7831, time 5254.38ms 
iter 2814: loss 2.6690, time 5258.23ms 
iter 2815: loss 2.8496, time 5257.61ms 
iter 2816: loss 2.8551, time 5264.12ms 
iter 2817: loss 2.7280, time 5258.47ms 
iter 2818: loss 2.8794, time 5129.53ms 
iter 2819: loss 2.8431, time 5088.43ms 
iter 2820: loss 2.8531, time 5223.77ms 
iter 2821: loss 2.7756, time 5222.16ms 
iter 2822: loss 2.7693, time 5216.08ms 
iter 2823: loss 2.8498, time 5225.09ms 
iter 2824: loss 2.6716, time 5221.20ms 
iter 2825: loss 2.9257, time 5212.59ms 
iter 2826: loss 2.7148, time 5214.61ms 
iter 2827: loss 2.5601, time 5218.80ms 
iter 2828: loss 2.8144, time 5214.16ms 
iter 2829: loss 2.6947, time 5213.47ms 
iter 2830: loss 2.6534, time 5215.97ms 
iter 2831: loss 2.7658, time 5212.46ms 
iter 2832: loss 2.7542, time 5206.45ms 
iter 2833: loss 2.8263, time 5103.96ms 
iter 2834: loss 2.7360, time 5119.68ms 
iter 2835: loss 2.7149, time 5109.67ms 
iter 2836: loss 2.6503, time 5066.67ms 
iter 2837: loss 2.6448, time 5094.73ms 
iter 2838: loss 2.7225, time 5079.54ms 
iter 2839: loss 2.6935, time 5103.09ms 
iter 2840: loss 2.6665, time 5076.07ms 
iter 2841: loss 2.6555, time 5075.03ms 
iter 2842: loss 2.7529, time 5108.03ms 
iter 2843: loss 2.7418, time 5110.05ms 
iter 2844: loss 2.7260, time 5129.30ms 
iter 2845: loss 2.8060, time 5141.42ms 
iter 2846: loss 2.7197, time 5269.73ms 
iter 2847: loss 2.6763, time 5272.69ms 
iter 2848: loss 2.8691, time 5281.87ms 
iter 2849: loss 2.7277, time 5276.71ms 
step 2850: train loss 2.7372, val loss 2.8815
iter 2850: loss 2.7182, time 20011.04ms 
iter 2851: loss 2.6661, time 5255.06ms 
iter 2852: loss 2.9786, time 5258.37ms 
iter 2853: loss 2.7570, time 5256.46ms 
iter 2854: loss 2.6857, time 5260.11ms 
iter 2855: loss 2.7482, time 5254.63ms 
iter 2856: loss 2.6960, time 5251.59ms 
iter 2857: loss 2.8015, time 5261.19ms 
iter 2858: loss 2.5292, time 5251.09ms 
iter 2859: loss 2.8362, time 5255.66ms 
iter 2860: loss 2.7045, time 5253.18ms 
iter 2861: loss 2.5875, time 5253.27ms 
iter 2862: loss 2.4657, time 5254.76ms 
iter 2863: loss 2.4881, time 5254.81ms 
iter 2864: loss 2.7577, time 5271.98ms 
iter 2865: loss 2.7559, time 5266.24ms 
iter 2866: loss 2.8443, time 5255.75ms 
iter 2867: loss 2.8228, time 5257.34ms 
iter 2868: loss 2.8662, time 5250.25ms 
iter 2869: loss 2.6154, time 5259.92ms 
iter 2870: loss 2.4834, time 5263.05ms 
iter 2871: loss 2.8531, time 5261.00ms 
iter 2872: loss 2.7964, time 5252.77ms 
iter 2873: loss 2.6136, time 5255.15ms 
iter 2874: loss 2.7265, time 5252.77ms 
iter 2875: loss 2.7045, time 5254.70ms 
iter 2876: loss 2.7387, time 5263.06ms 
iter 2877: loss 2.7481, time 5260.94ms 
iter 2878: loss 2.8430, time 5270.56ms 
iter 2879: loss 2.8127, time 5264.70ms 
iter 2880: loss 2.8338, time 5266.74ms 
iter 2881: loss 2.6459, time 5260.87ms 
iter 2882: loss 2.7177, time 5257.61ms 
iter 2883: loss 2.4719, time 5263.88ms 
iter 2884: loss 2.8297, time 5257.69ms 
iter 2885: loss 2.8140, time 5255.28ms 
iter 2886: loss 2.7272, time 5254.66ms 
iter 2887: loss 2.8944, time 5254.03ms 
iter 2888: loss 2.9328, time 5264.77ms 
iter 2889: loss 2.8523, time 5256.98ms 
iter 2890: loss 2.7322, time 5259.04ms 
iter 2891: loss 2.7160, time 5253.78ms 
iter 2892: loss 2.5683, time 5254.18ms 
iter 2893: loss 2.6510, time 5251.71ms 
iter 2894: loss 2.8712, time 5255.40ms 
iter 2895: loss 2.6372, time 5259.91ms 
iter 2896: loss 2.6414, time 5249.92ms 
iter 2897: loss 2.6513, time 5260.81ms 
iter 2898: loss 2.6444, time 5252.68ms 
iter 2899: loss 2.8345, time 5251.08ms 
step 2900: train loss 2.7475, val loss 2.8863
iter 2900: loss 2.6057, time 20039.42ms 
iter 2901: loss 2.9629, time 5261.10ms 
iter 2902: loss 2.6354, time 5271.13ms 
iter 2903: loss 2.6879, time 5259.83ms 
iter 2904: loss 2.6441, time 5256.66ms 
iter 2905: loss 2.8401, time 5258.54ms 
iter 2906: loss 2.5711, time 5254.94ms 
iter 2907: loss 2.7225, time 5266.22ms 
iter 2908: loss 2.8277, time 5255.73ms 
iter 2909: loss 2.7594, time 5259.94ms 
iter 2910: loss 2.7066, time 5260.25ms 
iter 2911: loss 2.8788, time 5252.41ms 
iter 2912: loss 2.7004, time 5269.18ms 
iter 2913: loss 2.8801, time 5254.29ms 
iter 2914: loss 2.8249, time 5262.23ms 
iter 2915: loss 2.8594, time 5257.76ms 
iter 2916: loss 2.4169, time 5264.20ms 
iter 2917: loss 2.8318, time 5256.72ms 
iter 2918: loss 2.6201, time 5255.86ms 
iter 2919: loss 2.7017, time 5256.35ms 
iter 2920: loss 2.7279, time 5261.08ms 
iter 2921: loss 2.8034, time 5258.77ms 
iter 2922: loss 2.8975, time 5259.67ms 
iter 2923: loss 2.8477, time 5259.68ms 
iter 2924: loss 2.7664, time 5255.89ms 
iter 2925: loss 2.5958, time 5255.39ms 
iter 2926: loss 2.6410, time 5235.16ms 
iter 2927: loss 2.6695, time 5255.51ms 
iter 2928: loss 2.6994, time 5263.70ms 
iter 2929: loss 2.7064, time 5258.59ms 
iter 2930: loss 2.7024, time 5262.70ms 
iter 2931: loss 2.9420, time 5254.12ms 
iter 2932: loss 2.8541, time 5261.87ms 
iter 2933: loss 2.5686, time 5261.01ms 
iter 2934: loss 2.9042, time 5260.11ms 
iter 2935: loss 2.8136, time 5266.14ms 
iter 2936: loss 2.7160, time 5259.43ms 
iter 2937: loss 2.9310, time 5259.21ms 
iter 2938: loss 3.0289, time 5264.90ms 
iter 2939: loss 2.6804, time 5223.19ms 
iter 2940: loss 2.7093, time 5252.48ms 
iter 2941: loss 2.7450, time 5259.23ms 
iter 2942: loss 2.5844, time 5257.56ms 
iter 2943: loss 2.5600, time 5258.11ms 
iter 2944: loss 2.8032, time 5254.24ms 
iter 2945: loss 2.6739, time 5253.08ms 
iter 2946: loss 2.7162, time 5252.84ms 
iter 2947: loss 2.6942, time 5257.72ms 
iter 2948: loss 2.6161, time 5255.11ms 
iter 2949: loss 2.7285, time 5257.62ms 
step 2950: train loss 2.7240, val loss 2.8533
iter 2950: loss 2.6416, time 20053.65ms 
iter 2951: loss 2.7064, time 5253.61ms 
iter 2952: loss 2.8212, time 5261.58ms 
iter 2953: loss 2.8054, time 5254.79ms 
iter 2954: loss 2.6824, time 5254.86ms 
iter 2955: loss 2.5326, time 5255.35ms 
iter 2956: loss 2.9446, time 5256.21ms 
iter 2957: loss 2.7246, time 5252.53ms 
iter 2958: loss 2.7833, time 5254.86ms 
iter 2959: loss 2.4539, time 5226.75ms 
iter 2960: loss 2.7386, time 5258.79ms 
iter 2961: loss 2.6897, time 5257.34ms 
iter 2962: loss 2.5161, time 5257.90ms 
iter 2963: loss 2.8589, time 5260.51ms 
iter 2964: loss 2.7720, time 5265.67ms 
iter 2965: loss 2.7263, time 5262.46ms 
iter 2966: loss 2.6660, time 5272.82ms 
iter 2967: loss 2.6100, time 5264.72ms 
iter 2968: loss 2.7832, time 5266.10ms 
iter 2969: loss 2.6340, time 5264.71ms 
iter 2970: loss 2.8360, time 5254.96ms 
iter 2971: loss 2.6564, time 5256.86ms 
iter 2972: loss 2.6928, time 5277.18ms 
iter 2973: loss 2.7806, time 5277.09ms 
iter 2974: loss 2.7120, time 5269.63ms 
iter 2975: loss 2.6721, time 5286.54ms 
iter 2976: loss 2.7086, time 5276.73ms 
iter 2977: loss 2.8418, time 5254.14ms 
iter 2978: loss 2.7752, time 5271.40ms 
iter 2979: loss 2.8471, time 5259.90ms 
iter 2980: loss 2.7333, time 5260.30ms 
iter 2981: loss 2.7006, time 5253.19ms 
iter 2982: loss 2.7257, time 5250.61ms 
iter 2983: loss 2.6147, time 5252.17ms 
iter 2984: loss 2.6113, time 5261.75ms 
iter 2985: loss 2.7418, time 5260.09ms 
iter 2986: loss 2.8025, time 5251.97ms 
iter 2987: loss 2.8659, time 5250.41ms 
iter 2988: loss 2.7542, time 5256.50ms 
iter 2989: loss 2.6653, time 5247.92ms 
iter 2990: loss 2.7914, time 5252.59ms 
iter 2991: loss 2.8840, time 5271.30ms 
iter 2992: loss 2.5730, time 5257.07ms 
iter 2993: loss 2.8029, time 5250.41ms 
iter 2994: loss 2.6656, time 5250.92ms 
iter 2995: loss 2.8490, time 5255.45ms 
iter 2996: loss 2.7547, time 5271.12ms 
iter 2997: loss 2.7503, time 5239.35ms 
iter 2998: loss 2.6792, time 5267.74ms 
iter 2999: loss 2.4969, time 5270.69ms 
step 3000: train loss 2.7263, val loss 2.8758
iter 3000: loss 2.4099, time 20076.62ms 
iter 3001: loss 2.8510, time 5260.53ms 
iter 3002: loss 2.7590, time 5251.05ms 
iter 3003: loss 2.6011, time 5261.26ms 
iter 3004: loss 2.7559, time 5251.42ms 
iter 3005: loss 2.8554, time 5251.26ms 
iter 3006: loss 3.0318, time 5259.83ms 
iter 3007: loss 2.6738, time 5267.31ms 
iter 3008: loss 2.8095, time 5281.09ms 
iter 3009: loss 2.5807, time 5251.43ms 
iter 3010: loss 2.7254, time 5251.99ms 
iter 3011: loss 2.6332, time 5248.06ms 
iter 3012: loss 2.7197, time 5258.16ms 
iter 3013: loss 2.8144, time 5261.64ms 
iter 3014: loss 2.7567, time 5255.58ms 
iter 3015: loss 2.7405, time 5251.70ms 
iter 3016: loss 2.6427, time 5249.83ms 
iter 3017: loss 2.7620, time 5246.12ms 
iter 3018: loss 2.5961, time 5244.68ms 
iter 3019: loss 2.7173, time 5261.71ms 
iter 3020: loss 2.6480, time 5257.09ms 
iter 3021: loss 2.7296, time 5276.28ms 
iter 3022: loss 2.7090, time 5253.72ms 
iter 3023: loss 2.5994, time 5252.10ms 
iter 3024: loss 2.6021, time 5262.35ms 
iter 3025: loss 2.7931, time 5256.64ms 
iter 3026: loss 2.8192, time 5254.42ms 
iter 3027: loss 2.7688, time 5267.49ms 
iter 3028: loss 2.6811, time 5271.66ms 
iter 3029: loss 2.9647, time 5255.49ms 
iter 3030: loss 2.8461, time 5254.91ms 
iter 3031: loss 2.6477, time 5253.12ms 
iter 3032: loss 2.6697, time 5253.67ms 
iter 3033: loss 2.5878, time 5262.40ms 
iter 3034: loss 2.5898, time 5257.79ms 
iter 3035: loss 2.9386, time 5262.60ms 
iter 3036: loss 2.7453, time 5299.69ms 
iter 3037: loss 2.5376, time 5329.97ms 
iter 3038: loss 2.4745, time 5347.77ms 
iter 3039: loss 2.5174, time 5357.44ms 
iter 3040: loss 2.5443, time 5367.90ms 
iter 3041: loss 2.6561, time 5264.94ms 
iter 3042: loss 2.9121, time 5227.18ms 
iter 3043: loss 2.7214, time 5251.18ms 
iter 3044: loss 2.7972, time 5331.15ms 
iter 3045: loss 2.7631, time 5327.81ms 
iter 3046: loss 2.6181, time 5329.30ms 
iter 3047: loss 2.7948, time 5348.03ms 
iter 3048: loss 2.5480, time 5328.62ms 
iter 3049: loss 2.7452, time 5350.83ms 
step 3050: train loss 2.7324, val loss 2.8752
iter 3050: loss 2.8114, time 20213.94ms 
iter 3051: loss 2.7236, time 5226.76ms 
iter 3052: loss 2.5814, time 5241.80ms 
iter 3053: loss 2.7021, time 5254.77ms 
iter 3054: loss 2.5161, time 5255.79ms 
iter 3055: loss 2.8280, time 5262.89ms 
iter 3056: loss 2.8801, time 5250.69ms 
iter 3057: loss 2.7602, time 5259.08ms 
iter 3058: loss 2.6479, time 5229.70ms 
iter 3059: loss 2.7581, time 5253.25ms 
iter 3060: loss 2.6905, time 5251.23ms 
iter 3061: loss 2.4993, time 5254.47ms 
iter 3062: loss 2.6081, time 5272.44ms 
iter 3063: loss 2.8217, time 5261.20ms 
iter 3064: loss 2.6092, time 5259.75ms 
iter 3065: loss 2.7640, time 5261.33ms 
iter 3066: loss 2.6344, time 5266.27ms 
iter 3067: loss 2.5583, time 5257.17ms 
iter 3068: loss 2.8218, time 5252.53ms 
iter 3069: loss 2.6213, time 5253.97ms 
iter 3070: loss 2.6978, time 5256.62ms 
iter 3071: loss 2.7469, time 5263.21ms 
iter 3072: loss 2.8237, time 5247.06ms 
iter 3073: loss 2.8019, time 5242.79ms 
iter 3074: loss 2.9697, time 5246.31ms 
iter 3075: loss 2.9337, time 5252.14ms 
iter 3076: loss 2.8535, time 5243.12ms 
iter 3077: loss 2.6302, time 5256.08ms 
iter 3078: loss 2.7642, time 5267.45ms 
iter 3079: loss 2.7901, time 5250.43ms 
iter 3080: loss 2.7675, time 5260.78ms 
iter 3081: loss 2.6831, time 5267.34ms 
iter 3082: loss 2.8315, time 5263.76ms 
iter 3083: loss 2.6853, time 5276.06ms 
iter 3084: loss 2.7686, time 5266.66ms 
iter 3085: loss 2.7296, time 5266.37ms 
iter 3086: loss 2.3804, time 5260.09ms 
iter 3087: loss 2.8744, time 5263.22ms 
iter 3088: loss 2.6280, time 5252.22ms 
iter 3089: loss 2.7456, time 5259.75ms 
iter 3090: loss 2.9507, time 5256.57ms 
iter 3091: loss 2.8181, time 5264.94ms 
iter 3092: loss 2.7968, time 5250.44ms 
iter 3093: loss 2.7755, time 5254.75ms 
iter 3094: loss 2.8695, time 5250.08ms 
iter 3095: loss 2.7846, time 5252.61ms 
iter 3096: loss 2.6571, time 5254.41ms 
iter 3097: loss 2.6308, time 5270.46ms 
iter 3098: loss 2.9343, time 5254.79ms 
iter 3099: loss 2.5305, time 5250.98ms 
step 3100: train loss 2.7208, val loss 2.8676
iter 3100: loss 2.7009, time 20070.13ms 
iter 3101: loss 2.6751, time 5250.11ms 
iter 3102: loss 2.6940, time 5252.26ms 
iter 3103: loss 2.5905, time 5258.71ms 
iter 3104: loss 2.7257, time 5261.14ms 
iter 3105: loss 2.8061, time 5257.42ms 
iter 3106: loss 2.8456, time 5257.45ms 
iter 3107: loss 2.7622, time 5263.08ms 
iter 3108: loss 2.9205, time 5252.31ms 
iter 3109: loss 2.8012, time 5251.08ms 
iter 3110: loss 2.8939, time 5258.60ms 
iter 3111: loss 2.9741, time 5265.81ms 
iter 3112: loss 2.7697, time 5259.07ms 
iter 3113: loss 2.8118, time 5268.55ms 
iter 3114: loss 2.7846, time 5259.19ms 
iter 3115: loss 2.5549, time 5260.70ms 
iter 3116: loss 2.6131, time 5262.35ms 
iter 3117: loss 2.8086, time 5264.64ms 
iter 3118: loss 2.8904, time 5241.29ms 
iter 3119: loss 3.0235, time 5273.20ms 
iter 3120: loss 2.6223, time 5257.03ms 
iter 3121: loss 2.7069, time 5254.86ms 
iter 3122: loss 2.5304, time 5254.86ms 
iter 3123: loss 2.6735, time 5251.76ms 
iter 3124: loss 2.8795, time 5256.63ms 
iter 3125: loss 2.6715, time 5258.99ms 
iter 3126: loss 2.7087, time 5254.16ms 
iter 3127: loss 2.6021, time 5308.49ms 
iter 3128: loss 2.3609, time 5254.52ms 
iter 3129: loss 2.8643, time 5254.70ms 
iter 3130: loss 2.5311, time 5277.45ms 
iter 3131: loss 2.6306, time 5263.31ms 
iter 3132: loss 2.9701, time 5253.60ms 
iter 3133: loss 2.5362, time 5256.62ms 
iter 3134: loss 2.6528, time 5264.16ms 
iter 3135: loss 2.7930, time 5253.31ms 
iter 3136: loss 2.5810, time 5259.74ms 
iter 3137: loss 2.7635, time 5257.00ms 
iter 3138: loss 2.6484, time 5259.33ms 
iter 3139: loss 2.5921, time 5250.98ms 
iter 3140: loss 2.8624, time 5273.71ms 
iter 3141: loss 2.7961, time 5256.36ms 
iter 3142: loss 2.9369, time 5265.98ms 
iter 3143: loss 2.8505, time 5256.73ms 
iter 3144: loss 2.5136, time 5256.01ms 
iter 3145: loss 2.7529, time 5259.39ms 
iter 3146: loss 2.5721, time 5270.30ms 
iter 3147: loss 2.6967, time 5267.13ms 
iter 3148: loss 2.7937, time 5267.11ms 
iter 3149: loss 2.7297, time 5267.62ms 
step 3150: train loss 2.7089, val loss 2.8791
iter 3150: loss 2.9372, time 20020.73ms 
iter 3151: loss 2.7240, time 5298.17ms 
iter 3152: loss 2.5005, time 5261.77ms 
iter 3153: loss 2.6609, time 5270.85ms 
iter 3154: loss 2.6212, time 5259.49ms 
iter 3155: loss 2.7105, time 5262.40ms 
iter 3156: loss 2.8824, time 5255.07ms 
iter 3157: loss 2.7179, time 5259.68ms 
iter 3158: loss 2.6651, time 5262.37ms 
iter 3159: loss 2.6178, time 5225.88ms 
iter 3160: loss 2.6940, time 5233.73ms 
iter 3161: loss 2.8201, time 5261.14ms 
iter 3162: loss 2.6592, time 5271.45ms 
iter 3163: loss 2.6150, time 5258.26ms 
iter 3164: loss 2.8173, time 5278.23ms 
iter 3165: loss 2.7069, time 5259.63ms 
iter 3166: loss 2.5274, time 5256.99ms 
iter 3167: loss 2.7893, time 5212.25ms 
iter 3168: loss 2.8582, time 5251.81ms 
iter 3169: loss 2.6277, time 5224.34ms 
iter 3170: loss 2.6671, time 5239.37ms 
iter 3171: loss 2.7652, time 5240.39ms 
iter 3172: loss 2.7512, time 5233.50ms 
iter 3173: loss 2.5931, time 5260.18ms 
iter 3174: loss 2.5612, time 5210.10ms 
iter 3175: loss 2.8021, time 5269.92ms 
iter 3176: loss 2.8910, time 5240.76ms 
iter 3177: loss 2.8047, time 5227.75ms 
iter 3178: loss 2.6749, time 5214.73ms 
iter 3179: loss 2.6488, time 5224.15ms 
iter 3180: loss 2.5793, time 5222.98ms 
iter 3181: loss 2.6205, time 5269.51ms 
iter 3182: loss 2.6570, time 5258.28ms 
iter 3183: loss 2.7639, time 5246.87ms 
iter 3184: loss 2.5713, time 5233.15ms 
iter 3185: loss 2.7819, time 5245.53ms 
iter 3186: loss 2.7994, time 5251.13ms 
iter 3187: loss 2.6546, time 5230.91ms 
iter 3188: loss 2.7323, time 5262.89ms 
iter 3189: loss 2.5702, time 5219.62ms 
iter 3190: loss 2.7168, time 5207.93ms 
iter 3191: loss 2.7268, time 5224.25ms 
iter 3192: loss 2.5856, time 5228.83ms 
iter 3193: loss 2.7793, time 5250.78ms 
iter 3194: loss 2.7421, time 5250.58ms 
iter 3195: loss 2.6093, time 5256.30ms 
iter 3196: loss 2.5671, time 5252.32ms 
iter 3197: loss 2.6742, time 5253.97ms 
iter 3198: loss 2.6411, time 5256.05ms 
iter 3199: loss 2.6417, time 5247.64ms 
step 3200: train loss 2.7022, val loss 2.8606
iter 3200: loss 2.5227, time 20069.68ms 
iter 3201: loss 2.6749, time 5262.69ms 
iter 3202: loss 2.7958, time 5257.42ms 
iter 3203: loss 2.5673, time 5254.20ms 
iter 3204: loss 2.6970, time 5260.75ms 
iter 3205: loss 2.7150, time 5236.49ms 
iter 3206: loss 2.8752, time 5238.51ms 
iter 3207: loss 2.8534, time 5247.03ms 
iter 3208: loss 2.7172, time 5256.50ms 
iter 3209: loss 2.6454, time 5262.45ms 
iter 3210: loss 2.7087, time 5267.32ms 
iter 3211: loss 2.7139, time 5254.18ms 
iter 3212: loss 2.7182, time 5257.18ms 
iter 3213: loss 2.6620, time 5251.18ms 
iter 3214: loss 2.6977, time 5263.88ms 
iter 3215: loss 2.5182, time 5254.47ms 
iter 3216: loss 2.6891, time 5248.81ms 
iter 3217: loss 2.6083, time 5258.84ms 
iter 3218: loss 2.7184, time 5254.49ms 
iter 3219: loss 2.5927, time 5269.55ms 
iter 3220: loss 2.7316, time 5267.65ms 
iter 3221: loss 2.7582, time 5260.73ms 
iter 3222: loss 2.6639, time 5252.78ms 
iter 3223: loss 2.8376, time 5262.54ms 
iter 3224: loss 2.5486, time 5217.37ms 
iter 3225: loss 2.9477, time 5215.76ms 
iter 3226: loss 2.7073, time 5234.36ms 
iter 3227: loss 2.6496, time 5252.32ms 
iter 3228: loss 2.6622, time 5219.39ms 
iter 3229: loss 2.6978, time 5224.33ms 
iter 3230: loss 2.5835, time 5257.94ms 
iter 3231: loss 2.8078, time 5249.03ms 
iter 3232: loss 2.6979, time 5213.56ms 
iter 3233: loss 2.6386, time 5224.38ms 
iter 3234: loss 2.8174, time 5223.09ms 
iter 3235: loss 2.9606, time 5251.09ms 
iter 3236: loss 2.7267, time 5228.13ms 
iter 3237: loss 2.6567, time 5258.85ms 
iter 3238: loss 2.6802, time 5225.02ms 
iter 3239: loss 2.5675, time 5202.16ms 
iter 3240: loss 2.7034, time 5255.81ms 
iter 3241: loss 2.8692, time 5214.13ms 
iter 3242: loss 2.7288, time 5239.44ms 
iter 3243: loss 2.6553, time 5268.12ms 
iter 3244: loss 2.6904, time 5247.66ms 
iter 3245: loss 2.6808, time 5222.12ms 
iter 3246: loss 2.6549, time 5210.13ms 
iter 3247: loss 2.8177, time 5214.85ms 
iter 3248: loss 2.5694, time 5221.89ms 
iter 3249: loss 2.7611, time 5229.47ms 
step 3250: train loss 2.6977, val loss 2.8595
iter 3250: loss 2.8200, time 20074.82ms 
iter 3251: loss 2.5604, time 5216.15ms 
iter 3252: loss 2.6421, time 5226.74ms 
iter 3253: loss 2.5202, time 5241.14ms 
iter 3254: loss 2.7411, time 5234.57ms 
iter 3255: loss 2.6785, time 5218.26ms 
iter 3256: loss 2.8880, time 5212.35ms 
iter 3257: loss 2.7042, time 5252.25ms 
iter 3258: loss 2.6479, time 5238.66ms 
iter 3259: loss 2.5361, time 5235.27ms 
iter 3260: loss 2.5950, time 5201.44ms 
iter 3261: loss 2.6252, time 5246.00ms 
iter 3262: loss 2.7838, time 5221.60ms 
iter 3263: loss 2.5106, time 5222.24ms 
iter 3264: loss 2.9460, time 5220.22ms 
iter 3265: loss 2.8349, time 5211.66ms 
iter 3266: loss 2.7525, time 5248.03ms 
iter 3267: loss 2.7495, time 5229.02ms 
iter 3268: loss 2.6572, time 5221.03ms 
iter 3269: loss 2.6387, time 5226.67ms 
iter 3270: loss 2.7293, time 5244.45ms 
iter 3271: loss 2.6824, time 5227.14ms 
iter 3272: loss 2.7552, time 5212.40ms 
iter 3273: loss 2.6838, time 5216.12ms 
iter 3274: loss 2.6807, time 5234.00ms 
iter 3275: loss 2.7268, time 5256.78ms 
iter 3276: loss 2.6911, time 5219.54ms 
iter 3277: loss 2.5963, time 5258.54ms 
iter 3278: loss 2.7716, time 5216.31ms 
iter 3279: loss 2.7102, time 5207.27ms 
iter 3280: loss 2.7156, time 5231.06ms 
iter 3281: loss 2.5780, time 5211.37ms 
iter 3282: loss 2.8000, time 5210.64ms 
iter 3283: loss 2.7393, time 5219.96ms 
iter 3284: loss 2.7921, time 5227.94ms 
iter 3285: loss 2.5697, time 5212.97ms 
iter 3286: loss 2.7346, time 5213.61ms 
iter 3287: loss 2.7164, time 5218.98ms 
iter 3288: loss 2.6211, time 5226.11ms 
iter 3289: loss 2.5903, time 5239.78ms 
iter 3290: loss 2.7463, time 5234.94ms 
iter 3291: loss 2.8552, time 5252.41ms 
iter 3292: loss 2.5710, time 5247.39ms 
iter 3293: loss 2.7485, time 5242.59ms 
iter 3294: loss 2.6025, time 5212.24ms 
iter 3295: loss 2.6282, time 5214.36ms 
iter 3296: loss 2.8879, time 5224.69ms 
iter 3297: loss 2.7067, time 5218.39ms 
iter 3298: loss 2.6036, time 5208.40ms 
iter 3299: loss 2.8049, time 5221.71ms 
step 3300: train loss 2.6930, val loss 2.8637
iter 3300: loss 2.8428, time 20039.57ms 
iter 3301: loss 2.7621, time 5219.44ms 
iter 3302: loss 2.6867, time 5225.78ms 
iter 3303: loss 2.4938, time 5250.87ms 
iter 3304: loss 2.6920, time 5230.43ms 
iter 3305: loss 2.8427, time 5211.77ms 
iter 3306: loss 2.6637, time 5213.42ms 
iter 3307: loss 2.6351, time 5226.43ms 
iter 3308: loss 2.7143, time 5221.23ms 
iter 3309: loss 2.5785, time 5207.68ms 
iter 3310: loss 2.6844, time 5215.69ms 
iter 3311: loss 2.6210, time 5227.27ms 
iter 3312: loss 2.5934, time 5236.00ms 
iter 3313: loss 2.8515, time 5213.97ms 
iter 3314: loss 2.7372, time 5242.35ms 
iter 3315: loss 2.7306, time 5210.30ms 
iter 3316: loss 2.6604, time 5239.12ms 
iter 3317: loss 2.8521, time 5208.15ms 
iter 3318: loss 2.8407, time 5261.73ms 
iter 3319: loss 2.6452, time 5208.11ms 
iter 3320: loss 2.9258, time 5220.25ms 
iter 3321: loss 2.8797, time 5228.83ms 
iter 3322: loss 2.6711, time 5210.54ms 
iter 3323: loss 2.7807, time 5267.70ms 
iter 3324: loss 2.8270, time 5333.41ms 
iter 3325: loss 2.8292, time 5395.58ms 
iter 3326: loss 2.5763, time 5270.54ms 
iter 3327: loss 2.5103, time 5252.73ms 
iter 3328: loss 2.7654, time 5217.67ms 
iter 3329: loss 2.7238, time 5216.75ms 
iter 3330: loss 2.8496, time 5218.05ms 
iter 3331: loss 2.6987, time 5238.92ms 
iter 3332: loss 2.7261, time 5252.80ms 
iter 3333: loss 2.7417, time 5215.79ms 
iter 3334: loss 2.7638, time 5221.37ms 
iter 3335: loss 2.7208, time 5266.79ms 
iter 3336: loss 2.6689, time 5273.41ms 
iter 3337: loss 2.5553, time 5307.91ms 
iter 3338: loss 2.6319, time 5262.92ms 
iter 3339: loss 2.5076, time 5258.69ms 
iter 3340: loss 2.7016, time 5265.20ms 
iter 3341: loss 2.6615, time 5261.48ms 
iter 3342: loss 2.6350, time 5259.00ms 
iter 3343: loss 2.5472, time 5259.79ms 
iter 3344: loss 2.5245, time 5253.48ms 
iter 3345: loss 2.6569, time 5250.79ms 
iter 3346: loss 2.6399, time 5247.78ms 
iter 3347: loss 2.6016, time 5245.69ms 
iter 3348: loss 2.4338, time 5248.95ms 
iter 3349: loss 2.6078, time 5264.51ms 
step 3350: train loss 2.7029, val loss 2.8568
iter 3350: loss 2.5557, time 20057.65ms 
iter 3351: loss 2.4708, time 5250.65ms 
iter 3352: loss 2.7173, time 5271.98ms 
iter 3353: loss 2.5374, time 5223.43ms 
iter 3354: loss 2.4646, time 5224.04ms 
iter 3355: loss 2.6287, time 5259.91ms 
iter 3356: loss 2.5478, time 5255.08ms 
iter 3357: loss 2.9892, time 5252.65ms 
iter 3358: loss 2.8182, time 5251.74ms 
iter 3359: loss 2.6969, time 5265.25ms 
iter 3360: loss 2.5610, time 5257.67ms 
iter 3361: loss 2.5471, time 5273.09ms 
iter 3362: loss 2.5635, time 5259.51ms 
iter 3363: loss 2.5992, time 5259.30ms 
iter 3364: loss 2.8748, time 5261.87ms 
iter 3365: loss 2.9228, time 5255.96ms 
iter 3366: loss 2.5360, time 5262.80ms 
iter 3367: loss 2.6958, time 5230.06ms 
iter 3368: loss 2.7254, time 5262.74ms 
iter 3369: loss 2.8075, time 5250.89ms 
iter 3370: loss 2.7566, time 5258.19ms 
iter 3371: loss 2.6697, time 5256.75ms 
iter 3372: loss 2.6519, time 5260.76ms 
iter 3373: loss 2.6949, time 5263.54ms 
iter 3374: loss 2.6044, time 5276.79ms 
iter 3375: loss 2.6549, time 5282.12ms 
iter 3376: loss 2.6029, time 5274.92ms 
iter 3377: loss 2.7189, time 5261.89ms 
iter 3378: loss 2.7500, time 5264.73ms 
iter 3379: loss 2.6468, time 5276.81ms 
iter 3380: loss 2.7748, time 5271.45ms 
iter 3381: loss 2.8840, time 5266.19ms 
iter 3382: loss 2.9871, time 5258.11ms 
iter 3383: loss 2.4400, time 5268.05ms 
iter 3384: loss 2.6437, time 5268.89ms 
iter 3385: loss 2.6677, time 5268.82ms 
iter 3386: loss 2.5591, time 5274.09ms 
iter 3387: loss 2.7433, time 5268.07ms 
iter 3388: loss 2.7235, time 5267.64ms 
iter 3389: loss 2.8213, time 5276.38ms 
iter 3390: loss 2.7002, time 5270.25ms 
iter 3391: loss 2.6164, time 5278.14ms 
iter 3392: loss 2.6962, time 5251.88ms 
iter 3393: loss 2.8246, time 5266.51ms 
iter 3394: loss 2.7057, time 5313.02ms 
iter 3395: loss 2.6309, time 5328.85ms 
iter 3396: loss 2.7599, time 5249.15ms 
iter 3397: loss 2.7430, time 5255.34ms 
iter 3398: loss 2.6321, time 5252.97ms 
iter 3399: loss 2.6240, time 5271.18ms 
step 3400: train loss 2.6791, val loss 2.8794
iter 3400: loss 2.6084, time 20052.55ms 
iter 3401: loss 2.7325, time 5265.88ms 
iter 3402: loss 2.8098, time 5242.92ms 
iter 3403: loss 2.6198, time 5259.16ms 
iter 3404: loss 2.6356, time 5255.30ms 
iter 3405: loss 2.6981, time 5253.97ms 
iter 3406: loss 2.5582, time 5271.01ms 
iter 3407: loss 2.7162, time 5256.50ms 
iter 3408: loss 2.5959, time 5261.63ms 
iter 3409: loss 2.8996, time 5257.86ms 
iter 3410: loss 2.6951, time 5251.33ms 
iter 3411: loss 2.6342, time 5257.29ms 
iter 3412: loss 2.7497, time 5250.39ms 
iter 3413: loss 2.4633, time 5258.35ms 
iter 3414: loss 2.6006, time 5265.08ms 
iter 3415: loss 2.8058, time 5263.63ms 
iter 3416: loss 2.6231, time 5255.73ms 
iter 3417: loss 2.7109, time 5257.17ms 
iter 3418: loss 2.8420, time 5255.38ms 
iter 3419: loss 2.7787, time 5271.83ms 
iter 3420: loss 2.5354, time 5260.73ms 
iter 3421: loss 2.8744, time 5255.07ms 
iter 3422: loss 2.7777, time 5250.75ms 
iter 3423: loss 2.5726, time 5254.13ms 
iter 3424: loss 2.6997, time 5259.42ms 
iter 3425: loss 2.7714, time 5209.80ms 
iter 3426: loss 2.7306, time 5257.25ms 
iter 3427: loss 2.8607, time 5266.16ms 
iter 3428: loss 2.8019, time 5225.42ms 
iter 3429: loss 2.6656, time 5260.66ms 
iter 3430: loss 2.8976, time 5251.69ms 
iter 3431: loss 2.4478, time 5251.15ms 
iter 3432: loss 2.5631, time 5256.92ms 
iter 3433: loss 2.9194, time 5257.99ms 
iter 3434: loss 2.8905, time 5257.60ms 
iter 3435: loss 2.6869, time 5250.88ms 
iter 3436: loss 2.6248, time 5251.17ms 
iter 3437: loss 2.6801, time 5251.94ms 
iter 3438: loss 2.7506, time 5250.20ms 
iter 3439: loss 2.5813, time 5250.37ms 
iter 3440: loss 2.7052, time 5260.11ms 
iter 3441: loss 2.8093, time 5252.88ms 
iter 3442: loss 2.8618, time 5252.57ms 
iter 3443: loss 2.8238, time 5252.07ms 
iter 3444: loss 2.6474, time 5251.68ms 
iter 3445: loss 2.7473, time 5251.72ms 
iter 3446: loss 2.5511, time 5262.23ms 
iter 3447: loss 2.6875, time 5262.43ms 
iter 3448: loss 2.7295, time 5256.73ms 
iter 3449: loss 2.5845, time 5237.16ms 
step 3450: train loss 2.6850, val loss 2.8717
iter 3450: loss 2.6459, time 20084.61ms 
iter 3451: loss 2.6901, time 5269.19ms 
iter 3452: loss 2.6763, time 5254.47ms 
iter 3453: loss 2.8413, time 5261.59ms 
iter 3454: loss 2.6570, time 5268.17ms 
iter 3455: loss 2.6129, time 5255.71ms 
iter 3456: loss 2.7453, time 5256.95ms 
iter 3457: loss 2.6510, time 5259.35ms 
iter 3458: loss 2.6598, time 5231.25ms 
iter 3459: loss 2.6801, time 5259.43ms 
iter 3460: loss 2.3542, time 5254.25ms 
iter 3461: loss 2.6827, time 5257.03ms 
iter 3462: loss 2.6638, time 5261.39ms 
iter 3463: loss 2.7330, time 5264.36ms 
iter 3464: loss 2.6134, time 5262.44ms 
iter 3465: loss 2.6380, time 5250.80ms 
iter 3466: loss 2.7579, time 5266.07ms 
iter 3467: loss 2.6998, time 5261.92ms 
iter 3468: loss 2.4672, time 5275.07ms 
iter 3469: loss 2.5483, time 5257.66ms 
iter 3470: loss 2.6608, time 5252.10ms 
iter 3471: loss 2.5897, time 5255.20ms 
iter 3472: loss 2.7692, time 5253.57ms 
iter 3473: loss 2.6177, time 5266.67ms 
iter 3474: loss 2.8522, time 5264.40ms 
iter 3475: loss 2.5233, time 5254.00ms 
iter 3476: loss 2.5534, time 5234.30ms 
iter 3477: loss 2.7653, time 5274.95ms 
iter 3478: loss 2.7737, time 5280.64ms 
iter 3479: loss 2.5953, time 5268.70ms 
iter 3480: loss 2.8497, time 5262.74ms 
iter 3481: loss 2.6573, time 5276.84ms 
iter 3482: loss 2.6363, time 5284.30ms 
iter 3483: loss 2.7075, time 5274.23ms 
iter 3484: loss 2.8449, time 5281.34ms 
iter 3485: loss 2.7355, time 5273.96ms 
iter 3486: loss 2.7360, time 5294.54ms 
iter 3487: loss 2.6279, time 5266.84ms 
iter 3488: loss 2.7532, time 5259.79ms 
iter 3489: loss 2.7496, time 5262.30ms 
iter 3490: loss 2.7411, time 5258.95ms 
iter 3491: loss 2.8346, time 5263.38ms 
iter 3492: loss 2.6326, time 5273.10ms 
iter 3493: loss 2.5310, time 5247.46ms 
iter 3494: loss 2.7258, time 5256.31ms 
iter 3495: loss 2.7740, time 5261.14ms 
iter 3496: loss 2.7161, time 5252.24ms 
iter 3497: loss 2.7781, time 5252.34ms 
iter 3498: loss 2.9094, time 5247.99ms 
iter 3499: loss 2.7516, time 5238.07ms 
step 3500: train loss 2.6763, val loss 2.8738
iter 3500: loss 2.8022, time 20086.32ms 
iter 3501: loss 2.7489, time 5248.28ms 
iter 3502: loss 2.7753, time 5260.33ms 
iter 3503: loss 2.5723, time 5263.17ms 
iter 3504: loss 2.8368, time 5264.13ms 
iter 3505: loss 2.5586, time 5274.27ms 
iter 3506: loss 2.8612, time 5258.64ms 
iter 3507: loss 2.4670, time 5303.99ms 
iter 3508: loss 2.9592, time 5272.69ms 
iter 3509: loss 2.5900, time 5276.59ms 
iter 3510: loss 2.6971, time 5298.34ms 
iter 3511: loss 2.8449, time 5302.23ms 
iter 3512: loss 2.5268, time 5278.50ms 
iter 3513: loss 2.6614, time 5282.55ms 
iter 3514: loss 2.7206, time 5265.76ms 
iter 3515: loss 2.7139, time 5276.92ms 
iter 3516: loss 2.6102, time 5298.74ms 
iter 3517: loss 2.6406, time 5265.92ms 
iter 3518: loss 2.6920, time 5202.05ms 
iter 3519: loss 2.5451, time 5326.99ms 
iter 3520: loss 2.7097, time 5223.45ms 
iter 3521: loss 2.4862, time 5260.02ms 
iter 3522: loss 2.7667, time 5249.03ms 
iter 3523: loss 2.7474, time 5248.61ms 
iter 3524: loss 2.8200, time 5264.20ms 
iter 3525: loss 2.5825, time 5247.78ms 
iter 3526: loss 2.6918, time 5243.51ms 
iter 3527: loss 2.7551, time 5304.53ms 
iter 3528: loss 2.9136, time 5270.51ms 
iter 3529: loss 2.4403, time 5314.37ms 
iter 3530: loss 2.6684, time 5307.58ms 
iter 3531: loss 2.6802, time 5250.99ms 
iter 3532: loss 2.6711, time 5337.42ms 
iter 3533: loss 2.7642, time 5219.94ms 
iter 3534: loss 2.5407, time 5297.02ms 
iter 3535: loss 2.6180, time 5219.29ms 
iter 3536: loss 2.8088, time 5257.93ms 
iter 3537: loss 2.7515, time 5250.12ms 
iter 3538: loss 2.8210, time 5255.93ms 
iter 3539: loss 2.7122, time 5258.66ms 
iter 3540: loss 2.7798, time 5260.49ms 
iter 3541: loss 2.3317, time 5230.74ms 
iter 3542: loss 2.6810, time 5256.61ms 
iter 3543: loss 2.4974, time 5263.78ms 
iter 3544: loss 2.7413, time 5258.33ms 
iter 3545: loss 2.6829, time 5248.27ms 
iter 3546: loss 2.6318, time 5265.65ms 
iter 3547: loss 2.6799, time 5257.57ms 
iter 3548: loss 2.6601, time 5259.63ms 
iter 3549: loss 2.5106, time 5263.40ms 
step 3550: train loss 2.6860, val loss 2.8591
iter 3550: loss 2.6124, time 20081.05ms 
iter 3551: loss 2.6330, time 5252.60ms 
iter 3552: loss 2.9064, time 5255.08ms 
iter 3553: loss 2.7742, time 5259.53ms 
iter 3554: loss 2.4033, time 5262.48ms 
iter 3555: loss 2.8339, time 5274.32ms 
iter 3556: loss 2.6273, time 5247.26ms 
iter 3557: loss 2.7907, time 5260.52ms 
iter 3558: loss 2.5789, time 5252.33ms 
iter 3559: loss 2.6673, time 5255.96ms 
iter 3560: loss 2.8112, time 5262.02ms 
iter 3561: loss 2.5932, time 5269.61ms 
iter 3562: loss 2.6619, time 5274.69ms 
iter 3563: loss 2.6377, time 5301.40ms 
iter 3564: loss 2.7914, time 5299.54ms 
iter 3565: loss 2.9283, time 5294.70ms 
iter 3566: loss 2.6574, time 5304.26ms 
iter 3567: loss 2.9299, time 5321.09ms 
iter 3568: loss 2.6434, time 5261.70ms 
iter 3569: loss 2.5794, time 5227.43ms 
iter 3570: loss 3.0604, time 5261.39ms 
iter 3571: loss 2.5756, time 5255.27ms 
iter 3572: loss 2.6809, time 5271.97ms 
iter 3573: loss 2.8790, time 5267.39ms 
iter 3574: loss 2.6447, time 5259.17ms 
iter 3575: loss 2.6836, time 5258.45ms 
iter 3576: loss 2.5711, time 5254.54ms 
iter 3577: loss 2.6057, time 5250.81ms 
iter 3578: loss 2.7388, time 5260.52ms 
iter 3579: loss 2.5335, time 5259.23ms 
iter 3580: loss 2.6180, time 5254.09ms 
iter 3581: loss 2.5247, time 5260.95ms 
iter 3582: loss 2.7481, time 5252.45ms 
iter 3583: loss 2.6429, time 5252.35ms 
iter 3584: loss 2.6520, time 5269.02ms 
iter 3585: loss 2.6684, time 5273.17ms 
iter 3586: loss 2.5302, time 5250.88ms 
iter 3587: loss 2.7025, time 5269.43ms 
iter 3588: loss 2.5903, time 5264.13ms 
iter 3589: loss 2.6786, time 5267.75ms 
iter 3590: loss 2.9021, time 5261.08ms 
iter 3591: loss 2.6173, time 5261.91ms 
iter 3592: loss 2.6679, time 5260.71ms 
iter 3593: loss 2.7019, time 5256.71ms 
iter 3594: loss 2.5453, time 5253.51ms 
iter 3595: loss 2.7463, time 5248.53ms 
iter 3596: loss 2.7857, time 5255.93ms 
iter 3597: loss 2.8295, time 5255.01ms 
iter 3598: loss 2.7167, time 5267.81ms 
iter 3599: loss 2.8785, time 5261.25ms 
step 3600: train loss 2.6687, val loss 2.8652
iter 3600: loss 2.4984, time 20082.43ms 
iter 3601: loss 2.6185, time 5264.06ms 
iter 3602: loss 2.6549, time 5263.98ms 
iter 3603: loss 2.6316, time 5261.06ms 
iter 3604: loss 2.5634, time 5257.54ms 
iter 3605: loss 2.8132, time 5249.42ms 
iter 3606: loss 2.8500, time 5254.39ms 
iter 3607: loss 2.5126, time 5261.48ms 
iter 3608: loss 2.8677, time 5270.34ms 
iter 3609: loss 2.5968, time 5254.61ms 
iter 3610: loss 2.5394, time 5257.91ms 
iter 3611: loss 2.6311, time 5263.74ms 
iter 3612: loss 2.7175, time 5264.39ms 
iter 3613: loss 2.6439, time 5267.08ms 
iter 3614: loss 2.6965, time 5260.17ms 
iter 3615: loss 2.5887, time 5257.99ms 
iter 3616: loss 2.8874, time 5270.34ms 
iter 3617: loss 2.6275, time 5261.50ms 
iter 3618: loss 2.7894, time 5266.65ms 
iter 3619: loss 2.6398, time 5270.77ms 
iter 3620: loss 2.5463, time 5249.58ms 
iter 3621: loss 2.7049, time 5256.51ms 
iter 3622: loss 2.7540, time 5259.56ms 
iter 3623: loss 2.8757, time 5262.83ms 
iter 3624: loss 2.6019, time 5248.15ms 
iter 3625: loss 2.7247, time 5256.99ms 
iter 3626: loss 2.7510, time 5269.02ms 
iter 3627: loss 2.5384, time 5272.47ms 
iter 3628: loss 2.7717, time 5272.52ms 
iter 3629: loss 2.6665, time 5261.65ms 
iter 3630: loss 2.7771, time 5256.96ms 
iter 3631: loss 2.3925, time 5258.83ms 
iter 3632: loss 2.5697, time 5263.09ms 
iter 3633: loss 2.5489, time 5268.58ms 
iter 3634: loss 2.5183, time 5261.86ms 
iter 3635: loss 2.6776, time 5265.13ms 
iter 3636: loss 2.5094, time 5252.24ms 
iter 3637: loss 2.8256, time 5258.57ms 
iter 3638: loss 2.7825, time 5264.62ms 
iter 3639: loss 2.8274, time 5270.83ms 
iter 3640: loss 2.7512, time 5317.28ms 
iter 3641: loss 2.6883, time 5315.02ms 
iter 3642: loss 2.6545, time 5300.40ms 
iter 3643: loss 2.4740, time 5281.10ms 
iter 3644: loss 2.7756, time 5280.58ms 
iter 3645: loss 2.6978, time 5276.20ms 
iter 3646: loss 2.6196, time 5268.23ms 
iter 3647: loss 2.5139, time 5268.43ms 
iter 3648: loss 2.6652, time 5253.12ms 
iter 3649: loss 2.5719, time 5258.89ms 
step 3650: train loss 2.6711, val loss 2.8630
iter 3650: loss 2.5925, time 20066.56ms 
iter 3651: loss 2.6884, time 5250.27ms 
iter 3652: loss 2.6643, time 5252.76ms 
iter 3653: loss 2.7469, time 5249.70ms 
iter 3654: loss 2.7115, time 5257.21ms 
iter 3655: loss 2.9156, time 5253.09ms 
iter 3656: loss 2.7498, time 5254.20ms 
iter 3657: loss 2.8076, time 5254.49ms 
iter 3658: loss 2.7476, time 5267.69ms 
iter 3659: loss 2.6085, time 5272.33ms 
iter 3660: loss 2.5878, time 5262.89ms 
iter 3661: loss 2.5480, time 5253.64ms 
iter 3662: loss 2.7336, time 5252.25ms 
iter 3663: loss 2.6173, time 5252.52ms 
iter 3664: loss 2.5815, time 5262.29ms 
iter 3665: loss 2.4749, time 5262.77ms 
iter 3666: loss 2.5349, time 5261.48ms 
iter 3667: loss 2.7645, time 5253.40ms 
iter 3668: loss 2.6906, time 5255.41ms 
iter 3669: loss 2.5899, time 5256.31ms 
iter 3670: loss 2.7934, time 5248.27ms 
iter 3671: loss 2.6301, time 5266.50ms 
iter 3672: loss 2.6314, time 5262.54ms 
iter 3673: loss 2.8399, time 5253.87ms 
iter 3674: loss 2.6076, time 5232.46ms 
iter 3675: loss 2.6224, time 5252.37ms 
iter 3676: loss 2.6905, time 5262.49ms 
iter 3677: loss 2.6046, time 5263.15ms 
iter 3678: loss 2.7869, time 5258.66ms 
iter 3679: loss 2.7309, time 5257.13ms 
iter 3680: loss 2.8340, time 5250.95ms 
iter 3681: loss 2.8128, time 5248.41ms 
iter 3682: loss 2.8168, time 5250.25ms 
iter 3683: loss 2.5683, time 5260.91ms 
iter 3684: loss 2.7068, time 5257.78ms 
iter 3685: loss 2.8507, time 5251.25ms 
iter 3686: loss 2.6412, time 5259.44ms 
iter 3687: loss 2.6134, time 5261.20ms 
iter 3688: loss 2.6588, time 5252.02ms 
iter 3689: loss 2.5765, time 5257.38ms 
iter 3690: loss 2.6556, time 5261.44ms 
iter 3691: loss 2.5081, time 5252.51ms 
iter 3692: loss 2.6591, time 5254.89ms 
iter 3693: loss 2.8033, time 5252.64ms 
iter 3694: loss 2.6581, time 5260.34ms 
iter 3695: loss 2.8256, time 5266.46ms 
iter 3696: loss 2.5355, time 5262.03ms 
iter 3697: loss 2.6191, time 5256.81ms 
iter 3698: loss 2.7810, time 5265.94ms 
iter 3699: loss 2.6199, time 5254.43ms 
step 3700: train loss 2.6664, val loss 2.8548
iter 3700: loss 2.6259, time 20164.04ms 
iter 3701: loss 2.6459, time 5260.23ms 
iter 3702: loss 2.6390, time 5270.00ms 
iter 3703: loss 2.8576, time 5276.77ms 
iter 3704: loss 2.8636, time 5286.30ms 
iter 3705: loss 2.6891, time 5298.24ms 
iter 3706: loss 2.6774, time 5251.54ms 
iter 3707: loss 2.5337, time 5251.74ms 
iter 3708: loss 2.6364, time 5258.54ms 
iter 3709: loss 2.5932, time 5263.29ms 
iter 3710: loss 2.5859, time 5252.72ms 
iter 3711: loss 2.5241, time 5247.51ms 
iter 3712: loss 2.7044, time 5252.62ms 
iter 3713: loss 2.7756, time 5253.38ms 
iter 3714: loss 2.5443, time 5254.52ms 
iter 3715: loss 2.7725, time 5259.47ms 
iter 3716: loss 2.6797, time 5254.53ms 
iter 3717: loss 2.7108, time 5250.90ms 
iter 3718: loss 2.6490, time 5254.80ms 
iter 3719: loss 2.7860, time 5247.48ms 
iter 3720: loss 2.8241, time 5249.37ms 
iter 3721: loss 2.6457, time 5255.66ms 
iter 3722: loss 2.6684, time 5314.75ms 
iter 3723: loss 2.7650, time 5246.37ms 
iter 3724: loss 2.8394, time 5250.54ms 
iter 3725: loss 2.5760, time 5249.79ms 
iter 3726: loss 2.5888, time 5254.29ms 
iter 3727: loss 2.7372, time 5263.72ms 
iter 3728: loss 2.6361, time 5219.76ms 
iter 3729: loss 2.7246, time 5253.89ms 
iter 3730: loss 2.7312, time 5230.68ms 
iter 3731: loss 2.4829, time 5253.24ms 
iter 3732: loss 2.5679, time 5256.26ms 
iter 3733: loss 2.7192, time 5256.70ms 
iter 3734: loss 2.5208, time 5254.29ms 
iter 3735: loss 2.5505, time 5250.57ms 
iter 3736: loss 2.6225, time 5264.55ms 
iter 3737: loss 2.7804, time 5257.48ms 
iter 3738: loss 2.6819, time 5254.50ms 
iter 3739: loss 2.4907, time 5269.55ms 
iter 3740: loss 2.5784, time 5258.34ms 
iter 3741: loss 2.7028, time 5276.23ms 
iter 3742: loss 2.7879, time 5271.84ms 
iter 3743: loss 2.6319, time 5268.96ms 
iter 3744: loss 2.6289, time 5290.74ms 
iter 3745: loss 2.9218, time 5295.60ms 
iter 3746: loss 2.5006, time 5284.91ms 
iter 3747: loss 2.7406, time 5251.16ms 
iter 3748: loss 2.6580, time 5262.34ms 
iter 3749: loss 2.6990, time 5270.08ms 
step 3750: train loss 2.6810, val loss 2.8482
iter 3750: loss 2.6207, time 20105.10ms 
iter 3751: loss 2.6566, time 5258.46ms 
iter 3752: loss 2.5638, time 5265.67ms 
iter 3753: loss 2.7295, time 5260.88ms 
iter 3754: loss 2.7746, time 5264.72ms 
iter 3755: loss 2.7506, time 5274.23ms 
iter 3756: loss 2.7091, time 5270.17ms 
iter 3757: loss 2.7603, time 5267.29ms 
iter 3758: loss 2.5747, time 5266.06ms 
iter 3759: loss 2.6759, time 5263.02ms 
iter 3760: loss 2.8044, time 5258.66ms 
iter 3761: loss 2.5981, time 5260.69ms 
iter 3762: loss 2.6849, time 5275.92ms 
iter 3763: loss 2.7354, time 5255.32ms 
iter 3764: loss 2.8444, time 5256.98ms 
iter 3765: loss 2.6209, time 5249.57ms 
iter 3766: loss 2.6025, time 5253.43ms 
iter 3767: loss 2.8664, time 5258.63ms 
iter 3768: loss 2.6767, time 5266.29ms 
iter 3769: loss 2.6040, time 5262.51ms 
iter 3770: loss 2.6536, time 5267.35ms 
iter 3771: loss 2.6649, time 5264.67ms 
iter 3772: loss 2.7420, time 5268.46ms 
iter 3773: loss 2.6679, time 5273.45ms 
iter 3774: loss 2.5732, time 5259.54ms 
iter 3775: loss 2.6474, time 5253.21ms 
iter 3776: loss 2.4979, time 5253.57ms 
iter 3777: loss 2.3657, time 5254.94ms 
iter 3778: loss 2.4632, time 5256.11ms 
iter 3779: loss 2.7491, time 5246.83ms 
iter 3780: loss 2.4297, time 5254.43ms 
iter 3781: loss 2.4204, time 5263.38ms 
iter 3782: loss 2.5692, time 5256.70ms 
iter 3783: loss 2.6072, time 5251.82ms 
iter 3784: loss 2.6019, time 5248.25ms 
iter 3785: loss 2.5369, time 5261.59ms 
iter 3786: loss 2.8308, time 5257.36ms 
iter 3787: loss 2.6913, time 5257.39ms 
iter 3788: loss 2.6286, time 5254.21ms 
iter 3789: loss 2.7724, time 5253.91ms 
iter 3790: loss 2.5863, time 5262.51ms 
iter 3791: loss 2.7261, time 5227.05ms 
iter 3792: loss 2.5936, time 5255.07ms 
iter 3793: loss 2.4578, time 5254.87ms 
iter 3794: loss 2.7114, time 5263.41ms 
iter 3795: loss 2.8051, time 5253.84ms 
iter 3796: loss 2.7400, time 5258.06ms 
iter 3797: loss 2.7189, time 5277.31ms 
iter 3798: loss 2.6704, time 5254.28ms 
iter 3799: loss 2.7946, time 5253.54ms 
step 3800: train loss 2.6661, val loss 2.8636
iter 3800: loss 2.6209, time 20038.45ms 
iter 3801: loss 2.6568, time 5246.06ms 
iter 3802: loss 2.7605, time 5254.21ms 
iter 3803: loss 2.6663, time 5259.32ms 
iter 3804: loss 2.4979, time 5255.93ms 
iter 3805: loss 2.6357, time 5264.48ms 
iter 3806: loss 2.8853, time 5254.26ms 
iter 3807: loss 2.6533, time 5225.28ms 
iter 3808: loss 2.8698, time 5270.17ms 
iter 3809: loss 2.5603, time 5253.23ms 
iter 3810: loss 2.7470, time 5258.79ms 
iter 3811: loss 2.7255, time 5263.33ms 
iter 3812: loss 2.8704, time 5261.23ms 
iter 3813: loss 2.6529, time 5262.36ms 
iter 3814: loss 2.4693, time 5251.96ms 
iter 3815: loss 2.5240, time 5231.08ms 
iter 3816: loss 2.7745, time 5332.19ms 
iter 3817: loss 2.8730, time 5275.64ms 
iter 3818: loss 2.6903, time 5234.54ms 
iter 3819: loss 2.7304, time 5193.97ms 
iter 3820: loss 2.6304, time 5236.93ms 
iter 3821: loss 2.6305, time 5220.58ms 
iter 3822: loss 2.4375, time 5185.39ms 
iter 3823: loss 2.7352, time 5236.74ms 
iter 3824: loss 2.3685, time 5285.11ms 
iter 3825: loss 2.6475, time 5242.15ms 
iter 3826: loss 2.5759, time 5297.81ms 
iter 3827: loss 2.5438, time 5253.87ms 
iter 3828: loss 2.5190, time 5253.13ms 
iter 3829: loss 2.4502, time 5253.54ms 
iter 3830: loss 2.6947, time 5257.97ms 
iter 3831: loss 2.6186, time 5249.67ms 
iter 3832: loss 2.5895, time 5246.12ms 
iter 3833: loss 2.4116, time 5247.35ms 
iter 3834: loss 2.7963, time 5259.27ms 
iter 3835: loss 2.5098, time 5261.48ms 
iter 3836: loss 2.8273, time 5236.50ms 
iter 3837: loss 2.5205, time 5245.57ms 
iter 3838: loss 2.7727, time 5250.56ms 
iter 3839: loss 2.6535, time 5242.96ms 
iter 3840: loss 2.6338, time 5247.86ms 
iter 3841: loss 2.6275, time 5260.72ms 
iter 3842: loss 2.6962, time 5254.03ms 
iter 3843: loss 2.5711, time 5253.38ms 
iter 3844: loss 2.7865, time 5247.87ms 
iter 3845: loss 2.6187, time 5290.69ms 
iter 3846: loss 2.7084, time 5342.62ms 
iter 3847: loss 2.5756, time 5342.29ms 
iter 3848: loss 2.7202, time 5254.65ms 
iter 3849: loss 2.7586, time 5298.68ms 
step 3850: train loss 2.6522, val loss 2.8431
iter 3850: loss 2.7051, time 20043.21ms 
iter 3851: loss 2.6933, time 5251.14ms 
iter 3852: loss 2.6538, time 5249.99ms 
iter 3853: loss 2.7021, time 5247.57ms 
iter 3854: loss 2.6605, time 5261.02ms 
iter 3855: loss 2.8813, time 5257.14ms 
iter 3856: loss 2.6567, time 5250.38ms 
iter 3857: loss 2.8135, time 5256.76ms 
iter 3858: loss 2.4997, time 5253.56ms 
iter 3859: loss 2.6071, time 5254.04ms 
iter 3860: loss 3.0165, time 5265.04ms 
iter 3861: loss 2.5779, time 5254.87ms 
iter 3862: loss 2.4685, time 5244.73ms 
iter 3863: loss 2.5806, time 5252.38ms 
iter 3864: loss 2.5986, time 5253.57ms 
iter 3865: loss 2.6510, time 5257.88ms 
iter 3866: loss 2.7508, time 5260.68ms 
iter 3867: loss 2.6033, time 5249.23ms 
iter 3868: loss 2.5142, time 5247.38ms 
iter 3869: loss 2.4977, time 5248.65ms 
iter 3870: loss 2.6912, time 5250.41ms 
iter 3871: loss 2.5405, time 5256.44ms 
iter 3872: loss 2.5968, time 5251.78ms 
iter 3873: loss 2.7802, time 5257.04ms 
iter 3874: loss 2.7658, time 5250.22ms 
iter 3875: loss 2.7483, time 5259.74ms 
iter 3876: loss 2.7308, time 5247.06ms 
iter 3877: loss 2.4518, time 5259.01ms 
iter 3878: loss 2.6387, time 5248.41ms 
iter 3879: loss 2.7495, time 5247.30ms 
iter 3880: loss 2.6369, time 5248.82ms 
iter 3881: loss 2.8102, time 5254.46ms 
iter 3882: loss 2.7541, time 5261.42ms 
iter 3883: loss 2.5767, time 5268.29ms 
iter 3884: loss 2.5964, time 5274.44ms 
iter 3885: loss 2.5773, time 5270.65ms 
iter 3886: loss 2.5970, time 5276.29ms 
iter 3887: loss 2.6105, time 5278.35ms 
iter 3888: loss 2.5024, time 5298.67ms 
iter 3889: loss 2.7604, time 5289.50ms 
iter 3890: loss 2.6528, time 5263.02ms 
iter 3891: loss 2.5471, time 5251.49ms 
iter 3892: loss 2.5978, time 5271.35ms 
iter 3893: loss 2.8575, time 5267.99ms 
iter 3894: loss 2.6438, time 5265.68ms 
iter 3895: loss 2.5515, time 5254.25ms 
iter 3896: loss 2.5775, time 5260.97ms 
iter 3897: loss 2.6547, time 5256.16ms 
iter 3898: loss 2.8149, time 5268.83ms 
iter 3899: loss 2.7569, time 5237.49ms 
step 3900: train loss 2.6497, val loss 2.8527
iter 3900: loss 2.6701, time 20058.50ms 
iter 3901: loss 2.6252, time 5265.02ms 
iter 3902: loss 2.8576, time 5261.70ms 
iter 3903: loss 2.9489, time 5250.25ms 
iter 3904: loss 2.7700, time 5253.18ms 
iter 3905: loss 2.4590, time 5261.48ms 
iter 3906: loss 2.8928, time 5258.75ms 
iter 3907: loss 2.6298, time 5256.27ms 
iter 3908: loss 2.6883, time 5258.29ms 
iter 3909: loss 2.7507, time 5256.75ms 
iter 3910: loss 2.5394, time 5255.69ms 
iter 3911: loss 2.6437, time 5260.55ms 
iter 3912: loss 2.3829, time 5248.75ms 
iter 3913: loss 2.5061, time 5260.37ms 
iter 3914: loss 2.7365, time 5266.40ms 
iter 3915: loss 2.5112, time 5252.12ms 
iter 3916: loss 2.6982, time 5250.76ms 
iter 3917: loss 2.4507, time 5250.91ms 
iter 3918: loss 2.5953, time 5258.38ms 
iter 3919: loss 2.6128, time 5261.60ms 
iter 3920: loss 2.6946, time 5261.00ms 
iter 3921: loss 2.7279, time 5250.03ms 
iter 3922: loss 2.7275, time 5250.14ms 
iter 3923: loss 2.4513, time 5252.66ms 
iter 3924: loss 2.5678, time 5266.24ms 
iter 3925: loss 2.5631, time 5256.28ms 
iter 3926: loss 2.6220, time 5248.51ms 
iter 3927: loss 2.9170, time 5247.94ms 
iter 3928: loss 2.7722, time 5250.72ms 
iter 3929: loss 2.6570, time 5261.21ms 
iter 3930: loss 2.6159, time 5254.60ms 
iter 3931: loss 2.5767, time 5251.12ms 
iter 3932: loss 2.3577, time 5252.97ms 
iter 3933: loss 2.5682, time 5249.78ms 
iter 3934: loss 2.5311, time 5270.01ms 
iter 3935: loss 2.6681, time 5270.10ms 
iter 3936: loss 2.6724, time 5270.34ms 
iter 3937: loss 2.5474, time 5267.47ms 
iter 3938: loss 2.5658, time 5276.24ms 
iter 3939: loss 2.7684, time 5263.88ms 
iter 3940: loss 2.7646, time 5263.07ms 
iter 3941: loss 2.6321, time 5263.47ms 
iter 3942: loss 2.6025, time 5258.90ms 
iter 3943: loss 2.7415, time 5257.69ms 
iter 3944: loss 2.5047, time 5266.08ms 
iter 3945: loss 2.6492, time 5254.87ms 
iter 3946: loss 2.4577, time 5269.71ms 
iter 3947: loss 2.6628, time 5277.93ms 
iter 3948: loss 2.6252, time 5266.23ms 
iter 3949: loss 2.6645, time 5269.74ms 
step 3950: train loss 2.6497, val loss 2.8735
iter 3950: loss 2.5260, time 20229.78ms 
iter 3951: loss 2.6062, time 5258.62ms 
iter 3952: loss 2.7968, time 5255.51ms 
iter 3953: loss 2.7923, time 5256.79ms 
iter 3954: loss 2.6646, time 5260.77ms 
iter 3955: loss 2.6788, time 5261.23ms 
iter 3956: loss 2.6625, time 5262.47ms 
iter 3957: loss 2.9246, time 5258.62ms 
iter 3958: loss 2.4475, time 5264.75ms 
iter 3959: loss 2.5598, time 5254.70ms 
iter 3960: loss 2.5775, time 5258.27ms 
iter 3961: loss 2.7267, time 5258.94ms 
iter 3962: loss 2.5806, time 5256.67ms 
iter 3963: loss 2.7178, time 5267.57ms 
iter 3964: loss 2.5002, time 5259.29ms 
iter 3965: loss 2.5821, time 5260.07ms 
iter 3966: loss 2.6848, time 5266.71ms 
iter 3967: loss 2.5375, time 5255.79ms 
iter 3968: loss 2.6433, time 5258.12ms 
iter 3969: loss 2.5424, time 5258.87ms 
iter 3970: loss 2.6988, time 5264.56ms 
iter 3971: loss 2.4125, time 5270.97ms 
iter 3972: loss 2.6597, time 5266.10ms 
iter 3973: loss 2.6681, time 5255.88ms 
iter 3974: loss 2.6400, time 5253.82ms 
iter 3975: loss 2.7053, time 5252.51ms 
iter 3976: loss 2.6296, time 5256.43ms 
iter 3977: loss 2.7128, time 5263.59ms 
iter 3978: loss 2.6951, time 5256.46ms 
iter 3979: loss 2.7162, time 5255.15ms 
iter 3980: loss 2.8218, time 5252.08ms 
iter 3981: loss 2.6912, time 5268.26ms 
iter 3982: loss 2.5180, time 5263.91ms 
iter 3983: loss 2.5810, time 5257.96ms 
iter 3984: loss 2.6369, time 5251.14ms 
iter 3985: loss 2.3881, time 5265.57ms 
iter 3986: loss 2.6020, time 5260.63ms 
iter 3987: loss 2.5816, time 5271.55ms 
iter 3988: loss 2.3295, time 5267.11ms 
iter 3989: loss 2.5887, time 5234.03ms 
iter 3990: loss 2.7674, time 5255.37ms 
iter 3991: loss 2.8202, time 5252.35ms 
iter 3992: loss 2.6135, time 5264.88ms 
iter 3993: loss 2.6229, time 5264.16ms 
iter 3994: loss 2.7002, time 5255.51ms 
iter 3995: loss 2.8220, time 5256.93ms 
iter 3996: loss 2.6469, time 5262.00ms 
iter 3997: loss 2.6070, time 5257.64ms 
iter 3998: loss 2.6377, time 5265.66ms 
iter 3999: loss 2.6742, time 5260.26ms 
step 4000: train loss 2.6653, val loss 2.8347
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 4000: loss 2.4191, time 21116.39ms 
iter 4001: loss 2.4986, time 5259.55ms 
iter 4002: loss 2.5775, time 5256.81ms 
iter 4003: loss 2.6428, time 5269.88ms 
iter 4004: loss 2.6062, time 5260.93ms 
iter 4005: loss 2.4336, time 5267.96ms 
iter 4006: loss 2.5749, time 5256.27ms 
iter 4007: loss 2.7676, time 5274.61ms 
iter 4008: loss 2.5736, time 5270.56ms 
iter 4009: loss 2.7475, time 5258.19ms 
iter 4010: loss 2.5424, time 5220.55ms 
iter 4011: loss 2.6677, time 5151.81ms 
iter 4012: loss 2.6262, time 5206.50ms 
iter 4013: loss 2.8394, time 5249.65ms 
iter 4014: loss 2.5471, time 5249.45ms 
iter 4015: loss 2.5448, time 5253.51ms 
iter 4016: loss 2.6037, time 5272.19ms 
iter 4017: loss 2.5318, time 5315.28ms 
iter 4018: loss 2.6848, time 5293.73ms 
iter 4019: loss 2.6614, time 5286.17ms 
iter 4020: loss 2.8168, time 5278.62ms 
iter 4021: loss 2.5551, time 5279.72ms 
iter 4022: loss 2.5714, time 5262.88ms 
iter 4023: loss 2.6305, time 5234.36ms 
iter 4024: loss 2.6625, time 5261.24ms 
iter 4025: loss 2.7558, time 5270.17ms 
iter 4026: loss 2.8248, time 5271.12ms 
iter 4027: loss 2.8056, time 5263.49ms 
iter 4028: loss 2.6223, time 5252.89ms 
iter 4029: loss 2.7240, time 5240.75ms 
iter 4030: loss 2.8030, time 5257.16ms 
iter 4031: loss 2.4699, time 5253.17ms 
iter 4032: loss 2.5025, time 5254.02ms 
iter 4033: loss 2.7898, time 5248.26ms 
iter 4034: loss 2.7037, time 5251.28ms 
iter 4035: loss 2.7102, time 5268.60ms 
iter 4036: loss 2.7236, time 5256.68ms 
iter 4037: loss 2.3994, time 5257.11ms 
iter 4038: loss 2.6429, time 5249.80ms 
iter 4039: loss 2.3900, time 5255.05ms 
iter 4040: loss 2.6471, time 5262.31ms 
iter 4041: loss 2.8119, time 5276.73ms 
iter 4042: loss 2.7126, time 5269.76ms 
iter 4043: loss 2.4991, time 5243.81ms 
iter 4044: loss 2.6965, time 5259.72ms 
iter 4045: loss 2.5286, time 5263.84ms 
iter 4046: loss 2.7751, time 5266.37ms 
iter 4047: loss 2.6080, time 5265.24ms 
iter 4048: loss 2.5807, time 5250.52ms 
iter 4049: loss 2.7243, time 5260.02ms 
step 4050: train loss 2.6470, val loss 2.8443
iter 4050: loss 2.5630, time 20139.83ms 
iter 4051: loss 2.4873, time 5248.52ms 
iter 4052: loss 2.6811, time 5250.97ms 
iter 4053: loss 2.5681, time 5254.00ms 
iter 4054: loss 2.5672, time 5260.33ms 
iter 4055: loss 2.6114, time 5261.85ms 
iter 4056: loss 2.5636, time 5261.96ms 
iter 4057: loss 2.8039, time 5262.59ms 
iter 4058: loss 2.4804, time 5261.11ms 
iter 4059: loss 2.6238, time 5292.79ms 
iter 4060: loss 2.7662, time 5357.40ms 
iter 4061: loss 2.6935, time 5276.84ms 
iter 4062: loss 2.5613, time 5259.85ms 
iter 4063: loss 2.5138, time 5254.55ms 
iter 4064: loss 2.6580, time 5274.47ms 
iter 4065: loss 2.7355, time 5268.66ms 
iter 4066: loss 2.7177, time 5286.50ms 
iter 4067: loss 2.6192, time 5282.42ms 
iter 4068: loss 2.7108, time 5239.91ms 
iter 4069: loss 2.5266, time 5273.23ms 
iter 4070: loss 2.8647, time 5269.39ms 
iter 4071: loss 2.6759, time 5262.94ms 
iter 4072: loss 2.5544, time 5258.69ms 
iter 4073: loss 2.6327, time 5251.44ms 
iter 4074: loss 2.6150, time 5250.71ms 
iter 4075: loss 2.6109, time 5283.58ms 
iter 4076: loss 2.7056, time 5410.13ms 
iter 4077: loss 2.4880, time 5420.67ms 
iter 4078: loss 2.6326, time 5423.27ms 
iter 4079: loss 2.6687, time 5414.90ms 
iter 4080: loss 2.5793, time 5262.16ms 
iter 4081: loss 2.6001, time 5225.64ms 
iter 4082: loss 2.5873, time 5265.51ms 
iter 4083: loss 2.5282, time 5261.09ms 
iter 4084: loss 2.6131, time 5265.56ms 
iter 4085: loss 2.5161, time 5265.19ms 
iter 4086: loss 2.7144, time 5257.54ms 
iter 4087: loss 2.3875, time 5263.96ms 
iter 4088: loss 2.7278, time 5269.11ms 
iter 4089: loss 2.5761, time 5272.35ms 
iter 4090: loss 2.6608, time 5240.96ms 
iter 4091: loss 2.6450, time 5260.80ms 
iter 4092: loss 2.7730, time 5251.65ms 
iter 4093: loss 2.6706, time 5257.54ms 
iter 4094: loss 2.3769, time 5270.16ms 
iter 4095: loss 2.7916, time 5216.20ms 
iter 4096: loss 2.5382, time 5257.19ms 
iter 4097: loss 2.5567, time 5255.71ms 
iter 4098: loss 2.7759, time 5265.41ms 
iter 4099: loss 2.6001, time 5248.95ms 
step 4100: train loss 2.6361, val loss 2.8434
iter 4100: loss 2.7276, time 20106.62ms 
iter 4101: loss 2.7317, time 5324.64ms 
iter 4102: loss 2.4469, time 5280.59ms 
iter 4103: loss 2.7976, time 5247.37ms 
iter 4104: loss 2.8058, time 5249.70ms 
iter 4105: loss 2.3985, time 5253.69ms 
iter 4106: loss 2.6061, time 5272.91ms 
iter 4107: loss 2.7887, time 5253.19ms 
iter 4108: loss 2.8279, time 5255.30ms 
iter 4109: loss 2.7272, time 5256.93ms 
iter 4110: loss 2.5467, time 5266.86ms 
iter 4111: loss 2.7864, time 5242.39ms 
iter 4112: loss 2.7586, time 5273.88ms 
iter 4113: loss 2.7545, time 5257.88ms 
iter 4114: loss 2.6348, time 5264.88ms 
iter 4115: loss 2.7834, time 5261.58ms 
iter 4116: loss 2.6351, time 5250.01ms 
iter 4117: loss 2.7016, time 5262.60ms 
iter 4118: loss 2.4861, time 5256.17ms 
iter 4119: loss 2.7175, time 5263.89ms 
iter 4120: loss 2.5317, time 5262.14ms 
iter 4121: loss 2.7234, time 5288.90ms 
iter 4122: loss 2.7817, time 5262.30ms 
iter 4123: loss 2.6938, time 5257.10ms 
iter 4124: loss 2.6438, time 5253.15ms 
iter 4125: loss 2.5572, time 5254.64ms 
iter 4126: loss 2.7064, time 5261.94ms 
iter 4127: loss 2.6321, time 5266.58ms 
iter 4128: loss 2.5868, time 5266.34ms 
iter 4129: loss 2.6685, time 5267.75ms 
iter 4130: loss 2.5594, time 5245.76ms 
iter 4131: loss 2.6240, time 5269.00ms 
iter 4132: loss 2.7307, time 5263.38ms 
iter 4133: loss 2.8332, time 5265.87ms 
iter 4134: loss 2.4963, time 5270.81ms 
iter 4135: loss 2.5283, time 5264.85ms 
iter 4136: loss 2.7684, time 5267.00ms 
iter 4137: loss 2.5672, time 5267.72ms 
iter 4138: loss 2.8516, time 5262.41ms 
iter 4139: loss 2.6651, time 5255.26ms 
iter 4140: loss 2.5563, time 5260.61ms 
iter 4141: loss 2.8722, time 5252.21ms 
iter 4142: loss 2.7247, time 5270.87ms 
iter 4143: loss 2.5769, time 5263.18ms 
iter 4144: loss 2.6175, time 5250.12ms 
iter 4145: loss 2.5968, time 5256.79ms 
iter 4146: loss 2.5665, time 5254.61ms 
iter 4147: loss 2.5975, time 5257.18ms 
iter 4148: loss 2.5332, time 5260.77ms 
iter 4149: loss 2.7127, time 5253.70ms 
step 4150: train loss 2.6283, val loss 2.8455
iter 4150: loss 2.6417, time 20037.67ms 
iter 4151: loss 2.7199, time 5239.99ms 
iter 4152: loss 2.5734, time 5265.88ms 
iter 4153: loss 2.6035, time 5259.22ms 
iter 4154: loss 2.5733, time 5270.42ms 
iter 4155: loss 2.5321, time 5258.63ms 
iter 4156: loss 2.5105, time 5258.49ms 
iter 4157: loss 2.6590, time 5253.13ms 
iter 4158: loss 2.6280, time 5263.78ms 
iter 4159: loss 2.5893, time 5263.26ms 
iter 4160: loss 2.4071, time 5264.08ms 
iter 4161: loss 2.5637, time 5259.73ms 
iter 4162: loss 2.4938, time 5246.89ms 
iter 4163: loss 2.7630, time 5248.85ms 
iter 4164: loss 2.5727, time 5259.32ms 
iter 4165: loss 2.8002, time 5250.18ms 
iter 4166: loss 2.5482, time 5251.91ms 
iter 4167: loss 2.4026, time 5255.18ms 
iter 4168: loss 2.5645, time 5249.54ms 
iter 4169: loss 2.4968, time 5257.67ms 
iter 4170: loss 2.8377, time 5246.29ms 
iter 4171: loss 2.7655, time 5251.93ms 
iter 4172: loss 2.4869, time 5249.89ms 
iter 4173: loss 2.4818, time 5249.71ms 
iter 4174: loss 2.6320, time 5266.04ms 
iter 4175: loss 2.6878, time 5252.88ms 
iter 4176: loss 2.4902, time 5259.75ms 
iter 4177: loss 2.6787, time 5252.71ms 
iter 4178: loss 2.8603, time 5251.93ms 
iter 4179: loss 2.6093, time 5254.25ms 
iter 4180: loss 2.4626, time 5260.31ms 
iter 4181: loss 2.3945, time 5255.72ms 
iter 4182: loss 2.6763, time 5253.38ms 
iter 4183: loss 2.6036, time 5260.73ms 
iter 4184: loss 2.6775, time 5253.60ms 
iter 4185: loss 2.6427, time 5252.94ms 
iter 4186: loss 2.7041, time 5250.34ms 
iter 4187: loss 2.7342, time 5252.18ms 
iter 4188: loss 2.4341, time 5282.15ms 
iter 4189: loss 2.6935, time 5262.40ms 
iter 4190: loss 2.5250, time 5292.13ms 
iter 4191: loss 2.6188, time 5263.01ms 
iter 4192: loss 2.5509, time 5259.67ms 
iter 4193: loss 2.7452, time 5260.30ms 
iter 4194: loss 2.7488, time 5255.72ms 
iter 4195: loss 2.6740, time 5262.21ms 
iter 4196: loss 2.5100, time 5268.10ms 
iter 4197: loss 2.5729, time 5265.35ms 
iter 4198: loss 2.6179, time 5265.35ms 
iter 4199: loss 2.6719, time 5266.36ms 
step 4200: train loss 2.6467, val loss 2.8390
iter 4200: loss 2.6745, time 20044.36ms 
iter 4201: loss 2.5944, time 5265.75ms 
iter 4202: loss 2.8155, time 5265.60ms 
iter 4203: loss 2.7000, time 5262.49ms 
iter 4204: loss 2.5070, time 5255.08ms 
iter 4205: loss 2.7017, time 5266.24ms 
iter 4206: loss 2.5720, time 5257.72ms 
iter 4207: loss 2.5087, time 5264.14ms 
iter 4208: loss 2.7740, time 5256.67ms 
iter 4209: loss 2.6153, time 5262.26ms 
iter 4210: loss 2.5737, time 5270.64ms 
iter 4211: loss 2.4503, time 5260.13ms 
iter 4212: loss 2.5009, time 5256.37ms 
iter 4213: loss 2.6712, time 5264.98ms 
iter 4214: loss 2.7020, time 5259.34ms 
iter 4215: loss 2.7008, time 5264.95ms 
iter 4216: loss 2.5549, time 5272.19ms 
iter 4217: loss 2.6033, time 5267.08ms 
iter 4218: loss 2.6473, time 5281.16ms 
iter 4219: loss 2.6826, time 5226.83ms 
iter 4220: loss 2.6285, time 5257.37ms 
iter 4221: loss 2.6342, time 5260.45ms 
iter 4222: loss 2.5933, time 5276.90ms 
iter 4223: loss 2.5167, time 5258.24ms 
iter 4224: loss 2.3672, time 5250.12ms 
iter 4225: loss 2.6191, time 5251.55ms 
iter 4226: loss 2.2672, time 5261.05ms 
iter 4227: loss 2.6439, time 5252.40ms 
iter 4228: loss 2.5741, time 5225.03ms 
iter 4229: loss 2.4924, time 5250.37ms 
iter 4230: loss 2.8208, time 5253.75ms 
iter 4231: loss 2.6401, time 5252.22ms 
iter 4232: loss 2.4330, time 5272.27ms 
iter 4233: loss 2.5582, time 5255.22ms 
iter 4234: loss 2.6871, time 5249.46ms 
iter 4235: loss 2.5593, time 5250.39ms 
iter 4236: loss 2.5187, time 5256.54ms 
iter 4237: loss 2.5775, time 5262.37ms 
iter 4238: loss 2.7897, time 5266.79ms 
iter 4239: loss 2.4708, time 5250.79ms 
iter 4240: loss 2.5949, time 5243.56ms 
iter 4241: loss 2.6242, time 5255.25ms 
iter 4242: loss 2.6752, time 5256.49ms 
iter 4243: loss 2.8075, time 5250.94ms 
iter 4244: loss 2.7200, time 5249.00ms 
iter 4245: loss 2.5612, time 5245.86ms 
iter 4246: loss 2.5113, time 5247.25ms 
iter 4247: loss 2.8151, time 5261.85ms 
iter 4248: loss 2.7432, time 5255.82ms 
iter 4249: loss 2.5542, time 5246.66ms 
step 4250: train loss 2.6219, val loss 2.8357
iter 4250: loss 2.5852, time 20054.81ms 
iter 4251: loss 2.6085, time 5253.59ms 
iter 4252: loss 2.6363, time 5262.04ms 
iter 4253: loss 2.4728, time 5273.82ms 
iter 4254: loss 2.5390, time 5286.28ms 
iter 4255: loss 2.7159, time 5274.69ms 
iter 4256: loss 2.7206, time 5279.33ms 
iter 4257: loss 2.6680, time 5276.10ms 
iter 4258: loss 2.7653, time 5262.79ms 
iter 4259: loss 2.3891, time 5261.44ms 
iter 4260: loss 2.8494, time 5259.00ms 
iter 4261: loss 2.5816, time 5250.91ms 
iter 4262: loss 2.6509, time 5253.55ms 
iter 4263: loss 2.5939, time 5252.14ms 
iter 4264: loss 2.7567, time 5260.07ms 
iter 4265: loss 2.5657, time 5267.64ms 
iter 4266: loss 2.4744, time 5251.66ms 
iter 4267: loss 2.7872, time 5323.73ms 
iter 4268: loss 2.5236, time 5273.05ms 
iter 4269: loss 2.5085, time 5249.83ms 
iter 4270: loss 2.5368, time 5267.83ms 
iter 4271: loss 2.8692, time 5263.76ms 
iter 4272: loss 2.7632, time 5260.89ms 
iter 4273: loss 2.8293, time 5258.78ms 
iter 4274: loss 2.5436, time 5272.19ms 
iter 4275: loss 2.5677, time 5262.13ms 
iter 4276: loss 2.6175, time 5248.90ms 
iter 4277: loss 2.6583, time 5251.65ms 
iter 4278: loss 2.7292, time 5252.24ms 
iter 4279: loss 2.7164, time 5256.59ms 
iter 4280: loss 2.8600, time 5262.85ms 
iter 4281: loss 2.4712, time 5268.04ms 
iter 4282: loss 2.8315, time 5267.82ms 
iter 4283: loss 2.5580, time 5262.75ms 
iter 4284: loss 2.5657, time 5274.83ms 
iter 4285: loss 2.5958, time 5265.89ms 
iter 4286: loss 2.7698, time 5266.26ms 
iter 4287: loss 2.4802, time 5262.82ms 
iter 4288: loss 2.7268, time 5257.23ms 
iter 4289: loss 2.6502, time 5257.75ms 
iter 4290: loss 2.4689, time 5171.58ms 
iter 4291: loss 2.6994, time 5100.20ms 
iter 4292: loss 2.9844, time 5129.37ms 
iter 4293: loss 2.5419, time 5225.86ms 
iter 4294: loss 2.5604, time 5254.07ms 
iter 4295: loss 2.8980, time 5307.85ms 
iter 4296: loss 2.4606, time 5265.57ms 
iter 4297: loss 2.4601, time 5287.00ms 
iter 4298: loss 2.5771, time 5264.43ms 
iter 4299: loss 2.6076, time 5326.87ms 
step 4300: train loss 2.6315, val loss 2.8339
iter 4300: loss 2.6777, time 20216.47ms 
iter 4301: loss 2.6276, time 5322.13ms 
iter 4302: loss 2.6138, time 5327.97ms 
iter 4303: loss 2.6239, time 5295.87ms 
iter 4304: loss 2.5861, time 5304.88ms 
iter 4305: loss 2.4316, time 5314.31ms 
iter 4306: loss 2.8873, time 5289.52ms 
iter 4307: loss 2.5305, time 5334.92ms 
iter 4308: loss 2.4367, time 5298.12ms 
iter 4309: loss 2.5709, time 5269.52ms 
iter 4310: loss 2.8072, time 5308.87ms 
iter 4311: loss 2.5711, time 5317.41ms 
iter 4312: loss 2.5847, time 5249.74ms 
iter 4313: loss 2.6384, time 5295.22ms 
iter 4314: loss 2.6797, time 5315.89ms 
iter 4315: loss 2.9202, time 5297.36ms 
iter 4316: loss 2.7946, time 5317.04ms 
iter 4317: loss 2.6272, time 5331.41ms 
iter 4318: loss 2.6866, time 5333.66ms 
iter 4319: loss 2.5332, time 5306.01ms 
iter 4320: loss 2.7181, time 5314.40ms 
iter 4321: loss 2.5786, time 5339.25ms 
iter 4322: loss 2.5790, time 5280.37ms 
iter 4323: loss 2.5572, time 5304.97ms 
iter 4324: loss 2.7763, time 5299.83ms 
iter 4325: loss 2.7203, time 5326.61ms 
iter 4326: loss 2.4730, time 5259.03ms 
iter 4327: loss 2.6220, time 5257.26ms 
iter 4328: loss 2.7120, time 5249.94ms 
iter 4329: loss 2.4692, time 5250.55ms 
iter 4330: loss 2.6675, time 5252.54ms 
iter 4331: loss 2.6744, time 5252.84ms 
iter 4332: loss 2.5054, time 5259.11ms 
iter 4333: loss 2.5994, time 5261.18ms 
iter 4334: loss 2.7118, time 5266.67ms 
iter 4335: loss 2.6089, time 5250.44ms 
iter 4336: loss 2.7595, time 5255.34ms 
iter 4337: loss 2.4782, time 5251.04ms 
iter 4338: loss 2.3312, time 5270.99ms 
iter 4339: loss 2.5114, time 5259.56ms 
iter 4340: loss 2.5527, time 5249.85ms 
iter 4341: loss 2.5583, time 5259.74ms 
iter 4342: loss 2.6215, time 5263.09ms 
iter 4343: loss 2.5084, time 5269.75ms 
iter 4344: loss 2.5960, time 5256.40ms 
iter 4345: loss 2.7613, time 5252.38ms 
iter 4346: loss 2.5198, time 5247.79ms 
iter 4347: loss 2.6271, time 5248.48ms 
iter 4348: loss 2.6431, time 5247.56ms 
iter 4349: loss 2.6235, time 5252.85ms 
step 4350: train loss 2.6100, val loss 2.8428
iter 4350: loss 2.6447, time 20188.16ms 
iter 4351: loss 2.5856, time 5263.28ms 
iter 4352: loss 2.6978, time 5256.62ms 
iter 4353: loss 2.8265, time 5253.08ms 
iter 4354: loss 2.7453, time 5254.20ms 
iter 4355: loss 2.4794, time 5256.55ms 
iter 4356: loss 2.5464, time 5269.33ms 
iter 4357: loss 2.5645, time 5275.95ms 
iter 4358: loss 2.6109, time 5208.16ms 
iter 4359: loss 2.4756, time 5260.23ms 
iter 4360: loss 2.6803, time 5302.55ms 
iter 4361: loss 2.6284, time 5237.35ms 
iter 4362: loss 2.5753, time 5282.61ms 
iter 4363: loss 2.6676, time 5307.60ms 
iter 4364: loss 2.6020, time 5261.60ms 
iter 4365: loss 2.5686, time 5259.02ms 
iter 4366: loss 2.8460, time 5260.56ms 
iter 4367: loss 2.6788, time 5271.48ms 
iter 4368: loss 2.4516, time 5274.52ms 
iter 4369: loss 2.5708, time 5291.44ms 
iter 4370: loss 2.8629, time 5282.35ms 
iter 4371: loss 2.4543, time 5327.43ms 
iter 4372: loss 2.5275, time 5279.51ms 
iter 4373: loss 2.6167, time 5277.41ms 
iter 4374: loss 2.5896, time 5271.33ms 
iter 4375: loss 2.4484, time 5258.32ms 
iter 4376: loss 2.5789, time 5259.62ms 
iter 4377: loss 2.6035, time 5266.82ms 
iter 4378: loss 2.6857, time 5267.60ms 
iter 4379: loss 2.5193, time 5279.89ms 
iter 4380: loss 2.7212, time 5280.61ms 
iter 4381: loss 2.4763, time 5266.79ms 
iter 4382: loss 2.7122, time 5258.25ms 
iter 4383: loss 2.6106, time 5266.35ms 
iter 4384: loss 2.5111, time 5274.62ms 
iter 4385: loss 2.6591, time 5289.73ms 
iter 4386: loss 2.8296, time 5274.71ms 
iter 4387: loss 2.5380, time 5265.26ms 
iter 4388: loss 2.6201, time 5274.68ms 
iter 4389: loss 2.5301, time 5266.16ms 
iter 4390: loss 2.5997, time 5274.17ms 
iter 4391: loss 2.5170, time 5281.59ms 
iter 4392: loss 2.8696, time 5258.21ms 
iter 4393: loss 2.4396, time 5261.26ms 
iter 4394: loss 2.6959, time 5262.86ms 
iter 4395: loss 2.5848, time 5269.16ms 
iter 4396: loss 2.6537, time 5264.63ms 
iter 4397: loss 2.6074, time 5260.77ms 
iter 4398: loss 2.6608, time 5258.79ms 
iter 4399: loss 2.5757, time 5272.15ms 
step 4400: train loss 2.6280, val loss 2.8504
iter 4400: loss 2.4697, time 20130.17ms 
iter 4401: loss 2.8325, time 5254.51ms 
iter 4402: loss 2.6415, time 5261.70ms 
iter 4403: loss 2.6974, time 5267.90ms 
iter 4404: loss 2.8016, time 5270.62ms 
iter 4405: loss 2.7730, time 5259.22ms 
iter 4406: loss 2.5196, time 5258.17ms 
iter 4407: loss 2.6515, time 5265.49ms 
iter 4408: loss 2.4861, time 5265.02ms 
iter 4409: loss 2.7872, time 5271.31ms 
iter 4410: loss 2.6831, time 5267.87ms 
iter 4411: loss 2.7436, time 5269.99ms 
iter 4412: loss 2.5241, time 5256.35ms 
iter 4413: loss 2.7661, time 5276.12ms 
iter 4414: loss 2.7674, time 5269.27ms 
iter 4415: loss 2.7313, time 5273.70ms 
iter 4416: loss 3.0004, time 5270.82ms 
iter 4417: loss 2.4459, time 5269.64ms 
iter 4418: loss 2.4418, time 5258.61ms 
iter 4419: loss 2.5295, time 5259.76ms 
iter 4420: loss 2.6171, time 5261.22ms 
iter 4421: loss 2.4945, time 5264.75ms 
iter 4422: loss 2.6420, time 5257.28ms 
iter 4423: loss 2.4435, time 5248.92ms 
iter 4424: loss 2.5299, time 5265.87ms 
iter 4425: loss 2.6491, time 5226.77ms 
iter 4426: loss 2.5910, time 5254.73ms 
iter 4427: loss 2.3772, time 5260.29ms 
iter 4428: loss 2.6338, time 5261.75ms 
iter 4429: loss 2.6010, time 5234.97ms 
iter 4430: loss 2.4443, time 5254.29ms 
iter 4431: loss 2.7180, time 5256.13ms 
iter 4432: loss 2.7425, time 5255.34ms 
iter 4433: loss 2.8325, time 5261.55ms 
iter 4434: loss 2.6651, time 5261.75ms 
iter 4435: loss 2.4527, time 5261.15ms 
iter 4436: loss 2.6945, time 5256.96ms 
iter 4437: loss 2.6400, time 5258.15ms 
iter 4438: loss 2.4922, time 5266.98ms 
iter 4439: loss 2.6880, time 5253.33ms 
iter 4440: loss 2.4448, time 5248.84ms 
iter 4441: loss 2.4688, time 5252.65ms 
iter 4442: loss 2.3532, time 5252.05ms 
iter 4443: loss 2.5698, time 5262.10ms 
iter 4444: loss 2.6574, time 5252.25ms 
iter 4445: loss 2.7434, time 5256.39ms 
iter 4446: loss 2.6589, time 5253.92ms 
iter 4447: loss 2.6708, time 5242.40ms 
iter 4448: loss 2.7531, time 5269.55ms 
iter 4449: loss 2.4167, time 5291.98ms 
step 4450: train loss 2.6297, val loss 2.8472
iter 4450: loss 2.6991, time 20056.00ms 
iter 4451: loss 2.5945, time 5377.68ms 
iter 4452: loss 2.7656, time 5303.01ms 
iter 4453: loss 2.8425, time 5265.50ms 
iter 4454: loss 2.7067, time 5270.01ms 
iter 4455: loss 2.6880, time 5258.43ms 
iter 4456: loss 2.6493, time 5253.85ms 
iter 4457: loss 2.4475, time 5297.44ms 
iter 4458: loss 2.5817, time 5376.97ms 
iter 4459: loss 2.7562, time 5340.97ms 
iter 4460: loss 2.7372, time 5421.77ms 
iter 4461: loss 2.7655, time 5407.32ms 
iter 4462: loss 2.6736, time 5243.16ms 
iter 4463: loss 2.6562, time 5251.50ms 
iter 4464: loss 2.6721, time 5252.76ms 
iter 4465: loss 2.6336, time 5251.83ms 
iter 4466: loss 2.5339, time 5261.40ms 
iter 4467: loss 2.5916, time 5255.46ms 
iter 4468: loss 2.4916, time 5233.04ms 
iter 4469: loss 2.6635, time 5258.08ms 
iter 4470: loss 2.8514, time 5263.63ms 
iter 4471: loss 2.7580, time 5256.25ms 
iter 4472: loss 2.7531, time 5253.74ms 
iter 4473: loss 2.6281, time 5256.19ms 
iter 4474: loss 2.6229, time 5260.58ms 
iter 4475: loss 2.5117, time 5263.89ms 
iter 4476: loss 2.6812, time 5249.03ms 
iter 4477: loss 2.5995, time 5252.86ms 
iter 4478: loss 2.7131, time 5252.22ms 
iter 4479: loss 2.5869, time 5252.94ms 
iter 4480: loss 2.6266, time 5256.09ms 
iter 4481: loss 2.4903, time 5266.67ms 
iter 4482: loss 2.5450, time 5263.51ms 
iter 4483: loss 2.6616, time 5237.03ms 
iter 4484: loss 2.6706, time 5257.40ms 
iter 4485: loss 2.6683, time 5258.08ms 
iter 4486: loss 2.4555, time 5267.33ms 
iter 4487: loss 2.7120, time 5256.50ms 
iter 4488: loss 2.7033, time 5263.40ms 
iter 4489: loss 2.6075, time 5264.35ms 
iter 4490: loss 2.6968, time 5272.40ms 
iter 4491: loss 2.4183, time 5274.81ms 
iter 4492: loss 2.6088, time 5271.75ms 
iter 4493: loss 2.5756, time 5270.17ms 
iter 4494: loss 2.5419, time 5259.09ms 
iter 4495: loss 2.6345, time 5270.58ms 
iter 4496: loss 2.8764, time 5270.04ms 
iter 4497: loss 2.6617, time 5264.82ms 
iter 4498: loss 2.6630, time 5253.63ms 
iter 4499: loss 2.6168, time 5255.76ms 
step 4500: train loss 2.6067, val loss 2.8398
iter 4500: loss 2.4680, time 20075.50ms 
iter 4501: loss 2.5827, time 5264.16ms 
iter 4502: loss 2.6667, time 5263.12ms 
iter 4503: loss 2.8334, time 5303.57ms 
iter 4504: loss 2.6760, time 5401.55ms 
iter 4505: loss 2.5671, time 5395.86ms 
iter 4506: loss 2.5155, time 5409.16ms 
iter 4507: loss 2.5773, time 5400.61ms 
iter 4508: loss 2.4689, time 5286.30ms 
iter 4509: loss 2.5928, time 5259.89ms 
iter 4510: loss 2.6984, time 5253.28ms 
iter 4511: loss 2.6267, time 5334.31ms 
iter 4512: loss 2.5530, time 5363.48ms 
iter 4513: loss 2.4176, time 5260.20ms 
iter 4514: loss 2.8636, time 5255.70ms 
iter 4515: loss 2.6088, time 5355.49ms 
iter 4516: loss 2.5447, time 5392.70ms 
iter 4517: loss 2.5381, time 5401.76ms 
iter 4518: loss 2.7110, time 5370.42ms 
iter 4519: loss 2.7582, time 5307.78ms 
iter 4520: loss 2.8303, time 5358.51ms 
iter 4521: loss 2.5913, time 5409.18ms 
iter 4522: loss 2.6386, time 5256.12ms 
iter 4523: loss 2.6912, time 5266.34ms 
iter 4524: loss 2.5226, time 5268.91ms 
iter 4525: loss 2.5567, time 5264.57ms 
iter 4526: loss 2.5950, time 5272.16ms 
iter 4527: loss 2.6036, time 5264.10ms 
iter 4528: loss 2.7205, time 5262.36ms 
iter 4529: loss 2.4723, time 5264.75ms 
iter 4530: loss 2.4708, time 5288.99ms 
iter 4531: loss 2.6395, time 5283.38ms 
iter 4532: loss 2.5110, time 5282.65ms 
iter 4533: loss 2.7564, time 5273.64ms 
iter 4534: loss 2.7314, time 5261.21ms 
iter 4535: loss 2.4202, time 5260.81ms 
iter 4536: loss 2.7966, time 5275.07ms 
iter 4537: loss 2.7574, time 5260.01ms 
iter 4538: loss 2.5666, time 5260.35ms 
iter 4539: loss 2.6906, time 5262.17ms 
iter 4540: loss 2.6471, time 5261.65ms 
iter 4541: loss 2.5203, time 5229.15ms 
iter 4542: loss 2.6599, time 5268.25ms 
iter 4543: loss 2.6674, time 5254.63ms 
iter 4544: loss 2.7087, time 5259.11ms 
iter 4545: loss 2.5915, time 5262.95ms 
iter 4546: loss 2.4112, time 5258.32ms 
iter 4547: loss 2.6179, time 5222.94ms 
iter 4548: loss 2.7525, time 5202.52ms 
iter 4549: loss 2.7187, time 5210.92ms 
step 4550: train loss 2.6239, val loss 2.8251
iter 4550: loss 2.6814, time 19748.63ms 
iter 4551: loss 2.7928, time 5209.06ms 
iter 4552: loss 2.4002, time 5261.25ms 
iter 4553: loss 2.6478, time 5238.47ms 
iter 4554: loss 2.5032, time 5273.51ms 
iter 4555: loss 2.5785, time 5236.59ms 
iter 4556: loss 2.6100, time 5250.29ms 
iter 4557: loss 2.7662, time 5237.63ms 
iter 4558: loss 2.7190, time 5271.20ms 
iter 4559: loss 2.4287, time 5256.43ms 
iter 4560: loss 2.5484, time 5240.94ms 
iter 4561: loss 2.7083, time 5233.33ms 
iter 4562: loss 2.6408, time 5258.27ms 
iter 4563: loss 2.6065, time 5253.82ms 
iter 4564: loss 2.6178, time 5269.94ms 
iter 4565: loss 2.6108, time 5262.47ms 
iter 4566: loss 2.6751, time 5260.33ms 
iter 4567: loss 2.5776, time 5269.00ms 
iter 4568: loss 2.6143, time 5261.46ms 
iter 4569: loss 2.8334, time 5276.83ms 
iter 4570: loss 2.5883, time 5269.47ms 
iter 4571: loss 2.6429, time 5268.60ms 
iter 4572: loss 2.5690, time 5270.98ms 
iter 4573: loss 2.7339, time 5263.33ms 
iter 4574: loss 2.7044, time 5272.27ms 
iter 4575: loss 2.8390, time 5264.77ms 
iter 4576: loss 2.7197, time 5265.85ms 
iter 4577: loss 2.5238, time 5267.24ms 
iter 4578: loss 2.5423, time 5263.29ms 
iter 4579: loss 2.7538, time 5258.65ms 
iter 4580: loss 2.6226, time 5255.53ms 
iter 4581: loss 2.7109, time 5251.22ms 
iter 4582: loss 2.3461, time 5256.48ms 
iter 4583: loss 2.4871, time 5244.16ms 
iter 4584: loss 2.6301, time 5255.53ms 
iter 4585: loss 2.6625, time 5260.77ms 
iter 4586: loss 2.4949, time 5260.36ms 
iter 4587: loss 2.7166, time 5246.30ms 
iter 4588: loss 2.5027, time 5247.57ms 
iter 4589: loss 2.7409, time 5257.92ms 
iter 4590: loss 2.3889, time 5247.56ms 
iter 4591: loss 2.7702, time 5274.50ms 
iter 4592: loss 2.6059, time 5252.83ms 
iter 4593: loss 2.6663, time 5251.92ms 
iter 4594: loss 2.6844, time 5252.60ms 
iter 4595: loss 2.5720, time 5268.55ms 
iter 4596: loss 2.4669, time 5239.71ms 
iter 4597: loss 2.4449, time 5248.13ms 
iter 4598: loss 2.7423, time 5216.60ms 
iter 4599: loss 2.6819, time 5264.56ms 
step 4600: train loss 2.6137, val loss 2.8468
iter 4600: loss 2.5964, time 20093.44ms 
iter 4601: loss 2.7428, time 5263.13ms 
iter 4602: loss 2.5167, time 5183.77ms 
iter 4603: loss 2.8386, time 5244.27ms 
iter 4604: loss 2.7157, time 5250.81ms 
iter 4605: loss 2.4389, time 5240.84ms 
iter 4606: loss 2.6407, time 5249.00ms 
iter 4607: loss 2.8555, time 5257.13ms 
iter 4608: loss 2.6530, time 5252.91ms 
iter 4609: loss 2.5682, time 5275.12ms 
iter 4610: loss 2.6315, time 5276.53ms 
iter 4611: loss 2.5121, time 5263.24ms 
iter 4612: loss 2.5755, time 5224.83ms 
iter 4613: loss 2.9099, time 5264.75ms 
iter 4614: loss 2.7250, time 5274.52ms 
iter 4615: loss 2.7540, time 5263.69ms 
iter 4616: loss 2.4472, time 5248.40ms 
iter 4617: loss 2.5017, time 5249.22ms 
iter 4618: loss 2.6432, time 5237.56ms 
iter 4619: loss 2.5610, time 5247.79ms 
iter 4620: loss 2.4428, time 5244.09ms 
iter 4621: loss 2.4836, time 5245.87ms 
iter 4622: loss 2.4654, time 5256.25ms 
iter 4623: loss 2.6470, time 5251.89ms 
iter 4624: loss 2.7343, time 5247.67ms 
iter 4625: loss 2.4935, time 5260.43ms 
iter 4626: loss 2.6359, time 5251.20ms 
iter 4627: loss 2.4422, time 5260.13ms 
iter 4628: loss 2.5711, time 5245.89ms 
iter 4629: loss 2.5844, time 5267.01ms 
iter 4630: loss 2.6287, time 5265.35ms 
iter 4631: loss 2.5304, time 5248.63ms 
iter 4632: loss 2.5025, time 5259.98ms 
iter 4633: loss 2.5552, time 5259.72ms 
iter 4634: loss 2.5683, time 5242.56ms 
iter 4635: loss 2.6467, time 5258.10ms 
iter 4636: loss 2.6185, time 5427.25ms 
iter 4637: loss 2.7439, time 5402.41ms 
iter 4638: loss 2.6275, time 5409.51ms 
iter 4639: loss 2.6570, time 5350.60ms 
iter 4640: loss 2.6020, time 5253.32ms 
iter 4641: loss 2.6203, time 5258.99ms 
iter 4642: loss 2.5077, time 5404.05ms 
iter 4643: loss 2.5494, time 5307.72ms 
iter 4644: loss 2.7872, time 5252.35ms 
iter 4645: loss 2.6628, time 5260.09ms 
iter 4646: loss 2.8311, time 5263.22ms 
iter 4647: loss 2.6741, time 5257.42ms 
iter 4648: loss 2.6758, time 5262.55ms 
iter 4649: loss 2.6294, time 5253.78ms 
step 4650: train loss 2.6122, val loss 2.8469
iter 4650: loss 2.7980, time 20191.95ms 
iter 4651: loss 2.3837, time 5216.36ms 
iter 4652: loss 2.6214, time 5235.40ms 
iter 4653: loss 2.2926, time 5268.46ms 
iter 4654: loss 2.6486, time 5258.09ms 
iter 4655: loss 2.5133, time 5263.74ms 
iter 4656: loss 2.5963, time 5266.66ms 
iter 4657: loss 2.5694, time 5263.91ms 
iter 4658: loss 2.6304, time 5271.32ms 
iter 4659: loss 2.6242, time 5273.68ms 
iter 4660: loss 2.5983, time 5267.85ms 
iter 4661: loss 2.7559, time 5281.40ms 
iter 4662: loss 2.7476, time 5259.36ms 
iter 4663: loss 2.6746, time 5244.65ms 
iter 4664: loss 2.5195, time 5286.29ms 
iter 4665: loss 2.5888, time 5289.43ms 
iter 4666: loss 2.6414, time 5262.72ms 
iter 4667: loss 2.5764, time 5252.91ms 
iter 4668: loss 2.6251, time 5245.80ms 
iter 4669: loss 2.6971, time 5258.90ms 
iter 4670: loss 2.6145, time 5267.68ms 
iter 4671: loss 2.8274, time 5260.48ms 
iter 4672: loss 2.7076, time 5252.97ms 
iter 4673: loss 2.6147, time 5255.21ms 
iter 4674: loss 2.7540, time 5257.36ms 
iter 4675: loss 2.5444, time 5260.26ms 
iter 4676: loss 2.8262, time 5253.18ms 
iter 4677: loss 2.6446, time 5258.72ms 
iter 4678: loss 2.5095, time 5257.26ms 
iter 4679: loss 2.6622, time 5259.40ms 
iter 4680: loss 2.6484, time 5258.20ms 
iter 4681: loss 2.4963, time 5257.21ms 
iter 4682: loss 2.6174, time 5252.91ms 
iter 4683: loss 2.7736, time 5251.11ms 
iter 4684: loss 2.3913, time 5258.49ms 
iter 4685: loss 2.5769, time 5256.98ms 
iter 4686: loss 2.5955, time 5255.65ms 
iter 4687: loss 2.6937, time 5249.70ms 
iter 4688: loss 2.6017, time 5251.46ms 
iter 4689: loss 2.5647, time 5256.48ms 
iter 4690: loss 2.5226, time 5268.78ms 
iter 4691: loss 2.9476, time 5249.81ms 
iter 4692: loss 2.4541, time 5251.34ms 
iter 4693: loss 2.7702, time 5262.88ms 
iter 4694: loss 2.6004, time 5262.38ms 
iter 4695: loss 2.5285, time 5263.53ms 
iter 4696: loss 2.6895, time 5264.24ms 
iter 4697: loss 2.7260, time 5263.04ms 
iter 4698: loss 2.6163, time 5270.70ms 
iter 4699: loss 2.4874, time 5262.90ms 
step 4700: train loss 2.5966, val loss 2.8488
iter 4700: loss 2.5434, time 20085.97ms 
iter 4701: loss 2.7173, time 5284.92ms 
iter 4702: loss 2.6091, time 5258.76ms 
iter 4703: loss 2.5743, time 5263.65ms 
iter 4704: loss 2.5308, time 5265.25ms 
iter 4705: loss 2.3780, time 5254.42ms 
iter 4706: loss 2.5591, time 5262.43ms 
iter 4707: loss 2.5371, time 5252.74ms 
iter 4708: loss 2.5749, time 5255.02ms 
iter 4709: loss 2.8545, time 5253.27ms 
iter 4710: loss 2.5532, time 5235.70ms 
iter 4711: loss 2.5388, time 5257.48ms 
iter 4712: loss 2.5517, time 5259.86ms 
iter 4713: loss 2.5577, time 5245.33ms 
iter 4714: loss 2.4647, time 5255.79ms 
iter 4715: loss 2.6310, time 5255.16ms 
iter 4716: loss 2.6728, time 5249.56ms 
iter 4717: loss 2.6466, time 5257.66ms 
iter 4718: loss 2.5515, time 5251.14ms 
iter 4719: loss 2.6980, time 5258.38ms 
iter 4720: loss 2.4742, time 5251.58ms 
iter 4721: loss 2.4614, time 5251.33ms 
iter 4722: loss 2.2513, time 5261.52ms 
iter 4723: loss 2.5507, time 5265.86ms 
iter 4724: loss 2.4697, time 5261.97ms 
iter 4725: loss 2.7394, time 5268.59ms 
iter 4726: loss 2.7647, time 5255.73ms 
iter 4727: loss 2.5070, time 5259.64ms 
iter 4728: loss 2.7389, time 5257.47ms 
iter 4729: loss 2.6081, time 5261.71ms 
iter 4730: loss 2.5174, time 5254.66ms 
iter 4731: loss 2.4911, time 5255.82ms 
iter 4732: loss 2.6043, time 5256.47ms 
iter 4733: loss 2.6165, time 5263.75ms 
iter 4734: loss 2.6523, time 5257.74ms 
iter 4735: loss 2.7682, time 5249.33ms 
iter 4736: loss 2.6055, time 5248.98ms 
iter 4737: loss 2.5827, time 5258.27ms 
iter 4738: loss 2.7997, time 5261.96ms 
iter 4739: loss 2.3901, time 5259.59ms 
iter 4740: loss 2.5971, time 5293.49ms 
iter 4741: loss 2.6614, time 5415.59ms 
iter 4742: loss 2.5226, time 5408.83ms 
iter 4743: loss 2.5551, time 5393.33ms 
iter 4744: loss 2.7484, time 5397.99ms 
iter 4745: loss 2.8154, time 5400.57ms 
iter 4746: loss 2.6069, time 5407.73ms 
iter 4747: loss 2.6918, time 5420.34ms 
iter 4748: loss 2.5940, time 5265.39ms 
iter 4749: loss 2.5920, time 5249.11ms 
step 4750: train loss 2.5869, val loss 2.8581
iter 4750: loss 2.6155, time 20062.30ms 
iter 4751: loss 2.6331, time 5257.89ms 
iter 4752: loss 2.5425, time 5239.81ms 
iter 4753: loss 2.7714, time 5249.09ms 
iter 4754: loss 2.5596, time 5254.37ms 
iter 4755: loss 2.6745, time 5260.48ms 
iter 4756: loss 2.8235, time 5250.00ms 
iter 4757: loss 2.3825, time 5261.44ms 
iter 4758: loss 2.6217, time 5253.70ms 
iter 4759: loss 2.7036, time 5257.91ms 
iter 4760: loss 2.6726, time 5258.02ms 
iter 4761: loss 2.6216, time 5260.03ms 
iter 4762: loss 2.6192, time 5286.00ms 
iter 4763: loss 2.4970, time 5367.86ms 
iter 4764: loss 2.5464, time 5375.47ms 
iter 4765: loss 2.6897, time 5416.37ms 
iter 4766: loss 2.6225, time 5405.66ms 
iter 4767: loss 2.5986, time 5400.07ms 
iter 4768: loss 2.5045, time 5334.42ms 
iter 4769: loss 2.6314, time 5402.93ms 
iter 4770: loss 2.4944, time 5273.97ms 
iter 4771: loss 2.4713, time 5259.33ms 
iter 4772: loss 2.6593, time 5256.26ms 
iter 4773: loss 2.6741, time 5259.20ms 
iter 4774: loss 2.3129, time 5258.67ms 
iter 4775: loss 2.6787, time 5257.00ms 
iter 4776: loss 2.5981, time 5275.25ms 
iter 4777: loss 2.6660, time 5255.21ms 
iter 4778: loss 2.5478, time 5266.98ms 
iter 4779: loss 2.5512, time 5251.48ms 
iter 4780: loss 2.6470, time 5280.73ms 
iter 4781: loss 2.5910, time 5270.49ms 
iter 4782: loss 2.6203, time 5276.91ms 
iter 4783: loss 2.6065, time 5265.12ms 
iter 4784: loss 2.6742, time 5261.55ms 
iter 4785: loss 2.5795, time 5253.87ms 
iter 4786: loss 2.5589, time 5266.31ms 
iter 4787: loss 2.3627, time 5266.89ms 
iter 4788: loss 2.5854, time 5261.05ms 
iter 4789: loss 2.6264, time 5256.63ms 
iter 4790: loss 2.5429, time 5263.80ms 
iter 4791: loss 2.6443, time 5269.46ms 
iter 4792: loss 2.5661, time 5266.34ms 
iter 4793: loss 2.4761, time 5257.41ms 
iter 4794: loss 2.5510, time 5254.65ms 
iter 4795: loss 2.6240, time 5249.15ms 
iter 4796: loss 2.6482, time 5252.15ms 
iter 4797: loss 2.7175, time 5261.15ms 
iter 4798: loss 2.6471, time 5252.89ms 
iter 4799: loss 2.7636, time 5248.11ms 
step 4800: train loss 2.5983, val loss 2.8368
iter 4800: loss 2.3366, time 20068.90ms 
iter 4801: loss 2.4970, time 5245.06ms 
iter 4802: loss 2.6205, time 5251.52ms 
iter 4803: loss 2.6855, time 5254.78ms 
iter 4804: loss 2.5922, time 5277.81ms 
iter 4805: loss 2.6368, time 5246.66ms 
iter 4806: loss 2.7294, time 5251.79ms 
iter 4807: loss 2.7980, time 5266.04ms 
iter 4808: loss 2.4421, time 5259.04ms 
iter 4809: loss 2.6099, time 5266.82ms 
iter 4810: loss 2.6685, time 5266.06ms 
iter 4811: loss 2.6254, time 5259.22ms 
iter 4812: loss 2.7712, time 5255.14ms 
iter 4813: loss 2.7212, time 5252.30ms 
iter 4814: loss 2.5068, time 5258.39ms 
iter 4815: loss 2.5408, time 5274.56ms 
iter 4816: loss 2.5167, time 5267.69ms 
iter 4817: loss 2.5946, time 5261.68ms 
iter 4818: loss 2.5754, time 5264.68ms 
iter 4819: loss 2.4134, time 5261.01ms 
iter 4820: loss 2.7890, time 5279.70ms 
iter 4821: loss 2.6872, time 5270.26ms 
iter 4822: loss 2.4638, time 5261.17ms 
iter 4823: loss 2.6713, time 5261.87ms 
iter 4824: loss 2.4408, time 5248.36ms 
iter 4825: loss 2.3577, time 5261.71ms 
iter 4826: loss 2.5792, time 5272.74ms 
iter 4827: loss 2.5111, time 5265.41ms 
iter 4828: loss 2.5764, time 5272.41ms 
iter 4829: loss 2.8329, time 5261.07ms 
iter 4830: loss 2.6275, time 5279.85ms 
iter 4831: loss 2.5331, time 5283.05ms 
iter 4832: loss 2.3831, time 5276.44ms 
iter 4833: loss 2.5655, time 5259.91ms 
iter 4834: loss 2.5296, time 5261.90ms 
iter 4835: loss 2.6713, time 5278.61ms 
iter 4836: loss 2.6144, time 5279.24ms 
iter 4837: loss 2.6428, time 5261.91ms 
iter 4838: loss 2.4925, time 5269.85ms 
iter 4839: loss 2.7100, time 5259.30ms 
iter 4840: loss 2.8829, time 5282.82ms 
iter 4841: loss 2.4903, time 5278.62ms 
iter 4842: loss 2.3895, time 5272.81ms 
iter 4843: loss 2.7084, time 5264.13ms 
iter 4844: loss 2.4926, time 5261.68ms 
iter 4845: loss 2.6311, time 5275.10ms 
iter 4846: loss 2.5966, time 5273.04ms 
iter 4847: loss 2.7371, time 5254.58ms 
iter 4848: loss 2.4534, time 5257.32ms 
iter 4849: loss 2.8181, time 5254.32ms 
step 4850: train loss 2.6096, val loss 2.8372
iter 4850: loss 2.5924, time 20084.96ms 
iter 4851: loss 2.4700, time 5274.46ms 
iter 4852: loss 2.4950, time 5272.05ms 
iter 4853: loss 2.6133, time 5275.20ms 
iter 4854: loss 2.4650, time 5278.50ms 
iter 4855: loss 2.6392, time 5291.65ms 
iter 4856: loss 2.6340, time 5286.94ms 
iter 4857: loss 2.6485, time 5273.78ms 
iter 4858: loss 2.7040, time 5248.89ms 
iter 4859: loss 2.6372, time 5254.12ms 
iter 4860: loss 2.4901, time 5249.08ms 
iter 4861: loss 2.5874, time 5251.64ms 
iter 4862: loss 2.6022, time 5264.19ms 
iter 4863: loss 2.5637, time 5250.76ms 
iter 4864: loss 2.6104, time 5251.32ms 
iter 4865: loss 2.6461, time 5244.74ms 
iter 4866: loss 2.6336, time 5234.23ms 
iter 4867: loss 2.3776, time 5249.62ms 
iter 4868: loss 2.6032, time 5262.90ms 
iter 4869: loss 2.7202, time 5244.91ms 
iter 4870: loss 2.4664, time 5242.00ms 
iter 4871: loss 2.4854, time 5253.99ms 
iter 4872: loss 2.5812, time 5264.54ms 
iter 4873: loss 2.7267, time 5253.42ms 
iter 4874: loss 2.4821, time 5252.77ms 
iter 4875: loss 2.6298, time 5263.82ms 
iter 4876: loss 2.5377, time 5266.77ms 
iter 4877: loss 2.6303, time 5255.50ms 
iter 4878: loss 2.4281, time 5258.91ms 
iter 4879: loss 2.7078, time 5249.44ms 
iter 4880: loss 2.4471, time 5255.51ms 
iter 4881: loss 2.8173, time 5249.78ms 
iter 4882: loss 2.7629, time 5254.78ms 
iter 4883: loss 2.5896, time 5259.06ms 
iter 4884: loss 2.6749, time 5274.98ms 
iter 4885: loss 2.7013, time 5253.06ms 
iter 4886: loss 2.3241, time 5256.67ms 
iter 4887: loss 2.5889, time 5256.82ms 
iter 4888: loss 2.5626, time 5251.95ms 
iter 4889: loss 2.5416, time 5265.78ms 
iter 4890: loss 2.5486, time 5259.04ms 
iter 4891: loss 2.6309, time 5271.64ms 
iter 4892: loss 2.6367, time 5265.37ms 
iter 4893: loss 2.8767, time 5276.12ms 
iter 4894: loss 2.4466, time 5272.31ms 
iter 4895: loss 2.4738, time 5271.95ms 
iter 4896: loss 2.5687, time 5270.27ms 
iter 4897: loss 2.5955, time 5258.62ms 
iter 4898: loss 2.6138, time 5275.66ms 
iter 4899: loss 2.4122, time 5266.87ms 
step 4900: train loss 2.5941, val loss 2.8661
iter 4900: loss 2.4758, time 19961.59ms 
iter 4901: loss 2.5312, time 5280.10ms 
iter 4902: loss 2.5258, time 5235.42ms 
iter 4903: loss 2.7709, time 5269.74ms 
iter 4904: loss 2.7023, time 5266.21ms 
iter 4905: loss 2.8198, time 5279.74ms 
iter 4906: loss 2.4775, time 5271.96ms 
iter 4907: loss 2.6963, time 5265.27ms 
iter 4908: loss 2.6070, time 5260.06ms 
iter 4909: loss 2.5546, time 5270.35ms 
iter 4910: loss 2.5409, time 5266.30ms 
iter 4911: loss 2.7521, time 5261.04ms 
iter 4912: loss 2.5950, time 5256.78ms 
iter 4913: loss 2.5346, time 5254.75ms 
iter 4914: loss 2.5684, time 5257.41ms 
iter 4915: loss 2.7838, time 5317.24ms 
iter 4916: loss 2.4758, time 5393.52ms 
iter 4917: loss 2.5385, time 5287.07ms 
iter 4918: loss 2.6104, time 5252.00ms 
iter 4919: loss 2.5755, time 5263.73ms 
iter 4920: loss 2.6454, time 5268.63ms 
iter 4921: loss 2.7367, time 5246.32ms 
iter 4922: loss 2.5332, time 5245.31ms 
iter 4923: loss 2.6329, time 5250.67ms 
iter 4924: loss 2.5040, time 5258.44ms 
iter 4925: loss 2.6973, time 5240.88ms 
iter 4926: loss 2.4836, time 5256.96ms 
iter 4927: loss 2.5042, time 5261.18ms 
iter 4928: loss 2.6175, time 5257.72ms 
iter 4929: loss 2.5732, time 5253.00ms 
iter 4930: loss 2.5376, time 5270.80ms 
iter 4931: loss 2.3790, time 5258.13ms 
iter 4932: loss 2.3492, time 5254.42ms 
iter 4933: loss 2.9369, time 5258.15ms 
iter 4934: loss 2.6437, time 5257.40ms 
iter 4935: loss 2.5988, time 5268.31ms 
iter 4936: loss 2.3466, time 5254.00ms 
iter 4937: loss 2.7187, time 5250.89ms 
iter 4938: loss 2.6471, time 5264.99ms 
iter 4939: loss 2.5311, time 5247.91ms 
iter 4940: loss 2.6058, time 5257.90ms 
iter 4941: loss 2.5137, time 5254.74ms 
iter 4942: loss 2.5275, time 5245.51ms 
iter 4943: loss 2.4491, time 5252.76ms 
iter 4944: loss 2.6256, time 5254.19ms 
iter 4945: loss 2.3904, time 5260.00ms 
iter 4946: loss 2.4944, time 5253.44ms 
iter 4947: loss 2.8959, time 5253.18ms 
iter 4948: loss 2.5452, time 5251.78ms 
iter 4949: loss 2.5175, time 5245.17ms 
step 4950: train loss 2.5924, val loss 2.8439
iter 4950: loss 2.6083, time 20050.03ms 
iter 4951: loss 2.5347, time 5249.15ms 
iter 4952: loss 2.7460, time 5258.41ms 
iter 4953: loss 2.6888, time 5250.23ms 
iter 4954: loss 2.5329, time 5246.82ms 
iter 4955: loss 2.6787, time 5252.07ms 
iter 4956: loss 2.3971, time 5243.22ms 
iter 4957: loss 2.7061, time 5267.57ms 
iter 4958: loss 2.5967, time 5259.62ms 
iter 4959: loss 2.5595, time 5261.58ms 
iter 4960: loss 2.5960, time 5261.10ms 
iter 4961: loss 2.6586, time 5261.78ms 
iter 4962: loss 2.5906, time 5257.79ms 
iter 4963: loss 2.8126, time 5255.52ms 
iter 4964: loss 2.5710, time 5258.51ms 
iter 4965: loss 2.7541, time 5259.86ms 
iter 4966: loss 2.5513, time 5267.12ms 
iter 4967: loss 2.6704, time 5264.85ms 
iter 4968: loss 2.4699, time 5254.97ms 
iter 4969: loss 2.4238, time 5250.51ms 
iter 4970: loss 2.6741, time 5253.19ms 
iter 4971: loss 2.4804, time 5258.07ms 
iter 4972: loss 2.2804, time 5257.95ms 
iter 4973: loss 2.6821, time 5248.84ms 
iter 4974: loss 2.4975, time 5253.68ms 
iter 4975: loss 2.6428, time 5257.55ms 
iter 4976: loss 2.6354, time 5259.39ms 
iter 4977: loss 2.6619, time 5253.80ms 
iter 4978: loss 2.6618, time 5250.62ms 
iter 4979: loss 2.6159, time 5255.24ms 
iter 4980: loss 2.5631, time 5253.03ms 
iter 4981: loss 2.4467, time 5270.60ms 
iter 4982: loss 2.5253, time 5253.29ms 
iter 4983: loss 2.4823, time 5249.36ms 
iter 4984: loss 2.6537, time 5250.34ms 
iter 4985: loss 2.6792, time 5257.38ms 
iter 4986: loss 2.5953, time 5267.18ms 
iter 4987: loss 2.5556, time 5251.58ms 
iter 4988: loss 2.5352, time 5251.26ms 
iter 4989: loss 2.6154, time 5250.72ms 
iter 4990: loss 2.6620, time 5259.34ms 
iter 4991: loss 2.7124, time 5258.37ms 
iter 4992: loss 2.3972, time 5260.57ms 
iter 4993: loss 2.4155, time 5257.94ms 
iter 4994: loss 2.5659, time 5250.77ms 
iter 4995: loss 2.6074, time 5255.87ms 
iter 4996: loss 2.5702, time 5253.97ms 
iter 4997: loss 2.6598, time 5249.38ms 
iter 4998: loss 2.6896, time 5249.37ms 
iter 4999: loss 2.6920, time 5246.74ms 
step 5000: train loss 2.5773, val loss 2.8407
iter 5000: loss 2.5643, time 20060.28ms 
iter 5001: loss 2.8762, time 5258.94ms 
iter 5002: loss 2.2809, time 5255.20ms 
iter 5003: loss 2.5056, time 5253.14ms 
iter 5004: loss 2.5396, time 5256.38ms 
iter 5005: loss 2.5582, time 5255.06ms 
iter 5006: loss 2.6373, time 5250.70ms 
iter 5007: loss 2.5309, time 5258.29ms 
iter 5008: loss 2.5813, time 5255.90ms 
iter 5009: loss 2.5773, time 5236.24ms 
iter 5010: loss 2.7159, time 5246.37ms 
iter 5011: loss 2.5236, time 5248.23ms 
iter 5012: loss 2.4751, time 5259.73ms 
iter 5013: loss 2.4351, time 5237.73ms 
iter 5014: loss 2.8715, time 5262.60ms 
iter 5015: loss 2.3782, time 5272.86ms 
iter 5016: loss 2.5150, time 5262.29ms 
iter 5017: loss 2.3961, time 5270.76ms 
iter 5018: loss 2.6389, time 5264.10ms 
iter 5019: loss 2.5699, time 5259.87ms 
iter 5020: loss 2.5196, time 5259.24ms 
iter 5021: loss 2.5279, time 5316.40ms 
iter 5022: loss 2.4896, time 5263.70ms 
iter 5023: loss 2.3648, time 5352.08ms 
iter 5024: loss 2.4425, time 5350.74ms 
iter 5025: loss 2.6381, time 5268.75ms 
iter 5026: loss 2.5327, time 5265.54ms 
iter 5027: loss 2.6739, time 5261.15ms 
iter 5028: loss 2.5481, time 5269.67ms 
iter 5029: loss 2.6446, time 5270.60ms 
iter 5030: loss 2.5014, time 5250.48ms 
iter 5031: loss 2.5789, time 5253.17ms 
iter 5032: loss 2.5096, time 5246.36ms 
iter 5033: loss 2.4983, time 5273.19ms 
iter 5034: loss 2.5419, time 5279.72ms 
iter 5035: loss 2.7120, time 5266.74ms 
iter 5036: loss 2.6410, time 5275.68ms 
iter 5037: loss 2.5457, time 5284.22ms 
iter 5038: loss 2.6003, time 5283.18ms 
iter 5039: loss 2.5993, time 5271.22ms 
iter 5040: loss 2.6387, time 5270.20ms 
iter 5041: loss 2.5467, time 5258.98ms 
iter 5042: loss 2.6364, time 5343.91ms 
iter 5043: loss 2.6211, time 5347.44ms 
iter 5044: loss 2.6241, time 5304.31ms 
iter 5045: loss 2.4274, time 5249.69ms 
iter 5046: loss 2.3136, time 5247.58ms 
iter 5047: loss 2.5327, time 5256.19ms 
iter 5048: loss 2.5501, time 5247.96ms 
iter 5049: loss 2.7206, time 5260.93ms 
step 5050: train loss 2.5754, val loss 2.8421
iter 5050: loss 2.4300, time 20048.05ms 
iter 5051: loss 2.6399, time 5255.85ms 
iter 5052: loss 2.5666, time 5249.87ms 
iter 5053: loss 2.5088, time 5252.29ms 
iter 5054: loss 2.5619, time 5261.02ms 
iter 5055: loss 2.5378, time 5398.45ms 
iter 5056: loss 2.6684, time 5261.32ms 
iter 5057: loss 2.6569, time 5250.20ms 
iter 5058: loss 2.5759, time 5252.15ms 
iter 5059: loss 2.7320, time 5247.83ms 
iter 5060: loss 2.5615, time 5265.96ms 
iter 5061: loss 2.6169, time 5252.41ms 
iter 5062: loss 2.6073, time 5251.04ms 
iter 5063: loss 2.5842, time 5264.92ms 
iter 5064: loss 2.5301, time 5268.38ms 
iter 5065: loss 2.3977, time 5254.80ms 
iter 5066: loss 2.7804, time 5255.75ms 
iter 5067: loss 2.4946, time 5257.00ms 
iter 5068: loss 2.5002, time 5255.54ms 
iter 5069: loss 2.5626, time 5254.91ms 
iter 5070: loss 2.7133, time 5266.44ms 
iter 5071: loss 2.4720, time 5253.57ms 
iter 5072: loss 2.4221, time 5247.86ms 
iter 5073: loss 2.6890, time 5257.93ms 
iter 5074: loss 2.4976, time 5265.94ms 
iter 5075: loss 2.4554, time 5268.19ms 
iter 5076: loss 2.5237, time 5272.93ms 
iter 5077: loss 2.4574, time 5256.94ms 
iter 5078: loss 2.6857, time 5252.32ms 
iter 5079: loss 2.5450, time 5253.30ms 
iter 5080: loss 2.5571, time 5279.64ms 
iter 5081: loss 2.6020, time 5240.27ms 
iter 5082: loss 2.7064, time 5252.19ms 
iter 5083: loss 2.7589, time 5252.69ms 
iter 5084: loss 2.6313, time 5254.83ms 
iter 5085: loss 2.5107, time 5267.48ms 
iter 5086: loss 2.4630, time 5257.93ms 
iter 5087: loss 2.6345, time 5257.00ms 
iter 5088: loss 2.4745, time 5250.83ms 
iter 5089: loss 2.4694, time 5266.23ms 
iter 5090: loss 2.6739, time 5252.23ms 
iter 5091: loss 2.4744, time 5259.03ms 
iter 5092: loss 2.6110, time 5276.55ms 
iter 5093: loss 2.5514, time 5271.74ms 
iter 5094: loss 2.6455, time 5264.03ms 
iter 5095: loss 2.6231, time 5279.03ms 
iter 5096: loss 2.5341, time 5264.68ms 
iter 5097: loss 2.7317, time 5291.05ms 
iter 5098: loss 2.6061, time 5348.99ms 
iter 5099: loss 2.7136, time 5222.18ms 
step 5100: train loss 2.6002, val loss 2.8457
iter 5100: loss 2.5250, time 20032.26ms 
iter 5101: loss 2.7321, time 5267.23ms 
iter 5102: loss 2.5495, time 5261.72ms 
iter 5103: loss 2.7516, time 5276.54ms 
iter 5104: loss 2.5568, time 5260.56ms 
iter 5105: loss 2.5686, time 5266.84ms 
iter 5106: loss 2.5350, time 5254.58ms 
iter 5107: loss 2.7365, time 5282.03ms 
iter 5108: loss 2.5934, time 5322.15ms 
iter 5109: loss 2.8173, time 5396.76ms 
iter 5110: loss 2.6838, time 5385.95ms 
iter 5111: loss 2.6504, time 5365.32ms 
iter 5112: loss 2.6303, time 5255.44ms 
iter 5113: loss 2.5428, time 5254.88ms 
iter 5114: loss 2.5707, time 5266.25ms 
iter 5115: loss 2.5907, time 5274.18ms 
iter 5116: loss 2.7038, time 5268.54ms 
iter 5117: loss 2.5745, time 5268.00ms 
iter 5118: loss 2.7527, time 5252.46ms 
iter 5119: loss 2.6117, time 5262.38ms 
iter 5120: loss 2.4928, time 5263.06ms 
iter 5121: loss 2.5125, time 5253.42ms 
iter 5122: loss 2.5385, time 5257.80ms 
iter 5123: loss 2.6132, time 5250.50ms 
iter 5124: loss 2.5662, time 5263.65ms 
iter 5125: loss 2.8046, time 5260.24ms 
iter 5126: loss 2.5029, time 5252.74ms 
iter 5127: loss 2.6780, time 5256.35ms 
iter 5128: loss 2.4751, time 5256.88ms 
iter 5129: loss 2.5884, time 5258.17ms 
iter 5130: loss 2.5889, time 5259.80ms 
iter 5131: loss 2.5712, time 5257.80ms 
iter 5132: loss 2.5562, time 5254.09ms 
iter 5133: loss 2.6318, time 5262.48ms 
iter 5134: loss 2.6169, time 5249.26ms 
iter 5135: loss 2.4427, time 5235.04ms 
iter 5136: loss 2.6364, time 5259.92ms 
iter 5137: loss 2.6790, time 5246.77ms 
iter 5138: loss 2.4260, time 5261.66ms 
iter 5139: loss 2.5649, time 5251.68ms 
iter 5140: loss 2.6256, time 5249.71ms 
iter 5141: loss 2.5591, time 5253.51ms 
iter 5142: loss 2.6312, time 5253.76ms 
iter 5143: loss 2.7006, time 5262.39ms 
iter 5144: loss 2.5664, time 5252.85ms 
iter 5145: loss 2.5153, time 5250.54ms 
iter 5146: loss 2.4200, time 5253.55ms 
iter 5147: loss 2.3874, time 5257.66ms 
iter 5148: loss 2.5959, time 5260.18ms 
iter 5149: loss 2.6006, time 5262.54ms 
step 5150: train loss 2.5935, val loss 2.8360
iter 5150: loss 2.7215, time 20112.25ms 
iter 5151: loss 2.7247, time 5249.81ms 
iter 5152: loss 2.6158, time 5253.82ms 
iter 5153: loss 2.6588, time 5251.05ms 
iter 5154: loss 2.6965, time 5271.83ms 
iter 5155: loss 2.8071, time 5258.35ms 
iter 5156: loss 2.8577, time 5254.38ms 
iter 5157: loss 2.5289, time 5252.10ms 
iter 5158: loss 2.6832, time 5258.25ms 
iter 5159: loss 2.6966, time 5257.47ms 
iter 5160: loss 2.6570, time 5254.03ms 
iter 5161: loss 2.6724, time 5255.30ms 
iter 5162: loss 2.7288, time 5247.38ms 
iter 5163: loss 2.7731, time 5262.52ms 
iter 5164: loss 2.5043, time 5264.73ms 
iter 5165: loss 2.5088, time 5258.14ms 
iter 5166: loss 2.6542, time 5260.35ms 
iter 5167: loss 2.6129, time 5268.23ms 
iter 5168: loss 2.5703, time 5254.94ms 
iter 5169: loss 2.6209, time 5255.26ms 
iter 5170: loss 2.4847, time 5261.36ms 
iter 5171: loss 2.6551, time 5262.85ms 
iter 5172: loss 2.5957, time 5263.71ms 
iter 5173: loss 2.4231, time 5260.16ms 
iter 5174: loss 2.5504, time 5265.50ms 
iter 5175: loss 2.7237, time 5268.78ms 
iter 5176: loss 2.6677, time 5261.16ms 
iter 5177: loss 2.8330, time 5261.62ms 
iter 5178: loss 2.6601, time 5267.59ms 
iter 5179: loss 2.5739, time 5252.73ms 
iter 5180: loss 2.5181, time 5257.21ms 
iter 5181: loss 2.4784, time 5254.79ms 
iter 5182: loss 2.7581, time 5260.76ms 
iter 5183: loss 2.7483, time 5274.79ms 
iter 5184: loss 2.6534, time 5268.75ms 
iter 5185: loss 2.6549, time 5244.62ms 
iter 5186: loss 2.4963, time 5253.99ms 
iter 5187: loss 2.6406, time 5251.18ms 
iter 5188: loss 2.4275, time 5260.10ms 
iter 5189: loss 2.6648, time 5251.76ms 
iter 5190: loss 2.7601, time 5249.15ms 
iter 5191: loss 2.6537, time 5253.43ms 
iter 5192: loss 2.5833, time 5256.14ms 
iter 5193: loss 2.7769, time 5262.91ms 
iter 5194: loss 2.6725, time 5252.91ms 
iter 5195: loss 2.7411, time 5251.97ms 
iter 5196: loss 2.6313, time 5255.53ms 
iter 5197: loss 2.6775, time 5258.50ms 
iter 5198: loss 2.6698, time 5229.21ms 
iter 5199: loss 2.4620, time 5259.79ms 
step 5200: train loss 2.5774, val loss 2.8374
iter 5200: loss 2.5063, time 20037.79ms 
iter 5201: loss 2.6087, time 5220.99ms 
iter 5202: loss 2.3777, time 5254.75ms 
iter 5203: loss 2.5914, time 5264.87ms 
iter 5204: loss 2.5468, time 5265.08ms 
iter 5205: loss 2.7578, time 5266.26ms 
iter 5206: loss 2.6650, time 5256.56ms 
iter 5207: loss 2.4383, time 5243.25ms 
iter 5208: loss 2.5498, time 5258.90ms 
iter 5209: loss 2.6371, time 5262.93ms 
iter 5210: loss 2.5086, time 5257.34ms 
iter 5211: loss 2.8015, time 5259.00ms 
iter 5212: loss 2.4272, time 5270.38ms 
iter 5213: loss 2.6948, time 5270.64ms 
iter 5214: loss 2.6275, time 5274.93ms 
iter 5215: loss 2.5755, time 5270.71ms 
iter 5216: loss 2.6268, time 5261.51ms 
iter 5217: loss 2.4930, time 5260.93ms 
iter 5218: loss 2.4926, time 5266.61ms 
iter 5219: loss 2.6101, time 5285.56ms 
iter 5220: loss 2.3909, time 5272.20ms 
iter 5221: loss 2.4611, time 5290.06ms 
iter 5222: loss 2.4809, time 5284.91ms 
iter 5223: loss 2.3770, time 5279.72ms 
iter 5224: loss 2.2524, time 5276.90ms 
iter 5225: loss 2.7437, time 5257.57ms 
iter 5226: loss 2.6696, time 5265.66ms 
iter 5227: loss 2.4970, time 5264.90ms 
iter 5228: loss 2.4883, time 5267.70ms 
iter 5229: loss 2.4285, time 5259.43ms 
iter 5230: loss 2.5223, time 5259.34ms 
iter 5231: loss 2.5265, time 5257.42ms 
iter 5232: loss 2.5105, time 5261.83ms 
iter 5233: loss 2.7792, time 5268.85ms 
iter 5234: loss 2.3089, time 5255.22ms 
iter 5235: loss 2.3778, time 5259.84ms 
iter 5236: loss 2.5129, time 5245.49ms 
iter 5237: loss 2.6051, time 5246.64ms 
iter 5238: loss 2.5874, time 5258.44ms 
iter 5239: loss 2.4949, time 5257.71ms 
iter 5240: loss 2.2534, time 5253.11ms 
iter 5241: loss 2.5804, time 5244.74ms 
iter 5242: loss 2.6304, time 5253.82ms 
iter 5243: loss 2.5323, time 5251.77ms 
iter 5244: loss 2.7454, time 5254.68ms 
iter 5245: loss 2.5489, time 5235.50ms 
iter 5246: loss 2.5419, time 5250.21ms 
iter 5247: loss 2.3611, time 5263.83ms 
iter 5248: loss 2.6161, time 5262.45ms 
iter 5249: loss 2.4727, time 5249.93ms 
step 5250: train loss 2.5696, val loss 2.8481
iter 5250: loss 2.5350, time 20036.62ms 
iter 5251: loss 2.4830, time 5247.90ms 
iter 5252: loss 2.6430, time 5241.86ms 
iter 5253: loss 2.5838, time 5272.09ms 
iter 5254: loss 2.6585, time 5257.05ms 
iter 5255: loss 2.4338, time 5260.26ms 
iter 5256: loss 2.5570, time 5250.04ms 
iter 5257: loss 2.5115, time 5249.82ms 
iter 5258: loss 2.4710, time 5273.91ms 
iter 5259: loss 2.7715, time 5267.77ms 
iter 5260: loss 2.3594, time 5263.72ms 
iter 5261: loss 2.6347, time 5241.94ms 
iter 5262: loss 2.5480, time 5266.84ms 
iter 5263: loss 2.4941, time 5250.73ms 
iter 5264: loss 2.5073, time 5245.63ms 
iter 5265: loss 2.3351, time 5255.47ms 
iter 5266: loss 2.3232, time 5262.43ms 
iter 5267: loss 2.4785, time 5264.23ms 
iter 5268: loss 2.4940, time 5261.12ms 
iter 5269: loss 2.7493, time 5274.03ms 
iter 5270: loss 2.6252, time 5265.98ms 
iter 5271: loss 2.4659, time 5249.72ms 
iter 5272: loss 2.6029, time 5098.71ms 
iter 5273: loss 2.5180, time 5100.21ms 
iter 5274: loss 2.5000, time 5113.73ms 
iter 5275: loss 2.5315, time 5113.13ms 
iter 5276: loss 2.8189, time 5109.28ms 
iter 5277: loss 2.5308, time 5239.35ms 
iter 5278: loss 2.5794, time 5260.95ms 
iter 5279: loss 2.2485, time 5252.94ms 
iter 5280: loss 2.7075, time 5258.60ms 
iter 5281: loss 2.4690, time 5265.57ms 
iter 5282: loss 2.7421, time 5253.71ms 
iter 5283: loss 2.4913, time 5257.24ms 
iter 5284: loss 2.5225, time 5261.99ms 
iter 5285: loss 2.7507, time 5264.15ms 
iter 5286: loss 2.5970, time 5259.63ms 
iter 5287: loss 2.5018, time 5257.32ms 
iter 5288: loss 2.6608, time 5258.40ms 
iter 5289: loss 2.7909, time 5261.29ms 
iter 5290: loss 2.5264, time 5255.91ms 
iter 5291: loss 2.8481, time 5266.58ms 
iter 5292: loss 2.6520, time 5256.47ms 
iter 5293: loss 2.7466, time 5251.92ms 
iter 5294: loss 2.6721, time 5251.69ms 
iter 5295: loss 2.7083, time 5262.66ms 
iter 5296: loss 2.7965, time 5256.84ms 
iter 5297: loss 2.7333, time 5263.86ms 
iter 5298: loss 2.6784, time 5261.20ms 
iter 5299: loss 2.4541, time 5253.27ms 
step 5300: train loss 2.5839, val loss 2.8272
iter 5300: loss 2.4379, time 20156.69ms 
iter 5301: loss 2.6433, time 5260.99ms 
iter 5302: loss 2.5323, time 5276.38ms 
iter 5303: loss 2.5545, time 5280.01ms 
iter 5304: loss 2.6358, time 5260.45ms 
iter 5305: loss 2.5629, time 5273.80ms 
iter 5306: loss 2.5108, time 5268.60ms 
iter 5307: loss 2.6326, time 5257.29ms 
iter 5308: loss 2.7936, time 5254.52ms 
iter 5309: loss 2.6160, time 5252.25ms 
iter 5310: loss 2.3926, time 5254.52ms 
iter 5311: loss 2.7043, time 5261.82ms 
iter 5312: loss 2.6383, time 5277.67ms 
iter 5313: loss 2.4347, time 5255.55ms 
iter 5314: loss 2.3959, time 5254.78ms 
iter 5315: loss 2.5355, time 5259.10ms 
iter 5316: loss 2.7072, time 5261.84ms 
iter 5317: loss 2.4724, time 5261.28ms 
iter 5318: loss 2.2492, time 5253.06ms 
iter 5319: loss 2.5984, time 5251.36ms 
iter 5320: loss 2.5493, time 5250.28ms 
iter 5321: loss 2.5084, time 5262.93ms 
iter 5322: loss 2.6468, time 5261.69ms 
iter 5323: loss 2.5506, time 5254.72ms 
iter 5324: loss 2.6095, time 5251.08ms 
iter 5325: loss 2.8570, time 5257.18ms 
iter 5326: loss 2.5676, time 5261.43ms 
iter 5327: loss 2.6573, time 5383.96ms 
iter 5328: loss 2.5792, time 5409.04ms 
iter 5329: loss 2.6533, time 5425.94ms 
iter 5330: loss 2.5211, time 5246.79ms 
iter 5331: loss 2.7030, time 5247.09ms 
iter 5332: loss 2.6424, time 5214.25ms 
iter 5333: loss 2.5077, time 5248.36ms 
iter 5334: loss 2.4668, time 5234.98ms 
iter 5335: loss 2.6481, time 5242.37ms 
iter 5336: loss 2.7510, time 5247.13ms 
iter 5337: loss 2.8259, time 5218.54ms 
iter 5338: loss 2.3393, time 5208.00ms 
iter 5339: loss 2.6682, time 5089.49ms 
iter 5340: loss 2.6851, time 5104.49ms 
iter 5341: loss 2.4421, time 5081.13ms 
iter 5342: loss 2.6638, time 5253.78ms 
iter 5343: loss 2.2892, time 5252.53ms 
iter 5344: loss 2.6067, time 5256.30ms 
iter 5345: loss 2.4555, time 5264.72ms 
iter 5346: loss 2.6444, time 5234.23ms 
iter 5347: loss 2.5815, time 5253.23ms 
iter 5348: loss 2.6139, time 5222.55ms 
iter 5349: loss 2.6323, time 5130.04ms 
step 5350: train loss 2.5801, val loss 2.8405
iter 5350: loss 2.6154, time 19896.68ms 
iter 5351: loss 2.3822, time 5262.74ms 
iter 5352: loss 2.8670, time 5290.03ms 
iter 5353: loss 2.6774, time 5315.92ms 
iter 5354: loss 2.5847, time 5307.93ms 
iter 5355: loss 2.4466, time 5286.13ms 
iter 5356: loss 2.4838, time 5268.66ms 
iter 5357: loss 2.6114, time 5240.27ms 
iter 5358: loss 2.7542, time 5272.68ms 
iter 5359: loss 2.4857, time 5264.91ms 
iter 5360: loss 2.5271, time 5273.07ms 
iter 5361: loss 2.7714, time 5261.17ms 
iter 5362: loss 2.7937, time 5263.35ms 
iter 5363: loss 2.4857, time 5263.73ms 
iter 5364: loss 2.5314, time 5259.13ms 
iter 5365: loss 2.6378, time 5270.10ms 
iter 5366: loss 2.5465, time 5254.45ms 
iter 5367: loss 2.5875, time 5250.25ms 
iter 5368: loss 2.6135, time 5244.04ms 
iter 5369: loss 2.6815, time 5272.96ms 
iter 5370: loss 2.3977, time 5274.98ms 
iter 5371: loss 2.4706, time 5159.83ms 
iter 5372: loss 2.5757, time 5066.92ms 
iter 5373: loss 2.6192, time 5080.58ms 
iter 5374: loss 2.5183, time 5089.85ms 
iter 5375: loss 2.4208, time 5083.80ms 
iter 5376: loss 2.3709, time 5066.05ms 
iter 5377: loss 2.5659, time 5092.87ms 
iter 5378: loss 2.6371, time 5134.16ms 
iter 5379: loss 2.5274, time 5267.32ms 
iter 5380: loss 2.6207, time 5209.35ms 
iter 5381: loss 2.5266, time 5235.84ms 
iter 5382: loss 2.6871, time 5200.37ms 
iter 5383: loss 2.6254, time 5256.03ms 
iter 5384: loss 2.6198, time 5250.70ms 
iter 5385: loss 2.5477, time 5248.97ms 
iter 5386: loss 2.5505, time 5252.81ms 
iter 5387: loss 2.6639, time 5263.71ms 
iter 5388: loss 2.4911, time 5252.40ms 
iter 5389: loss 2.5531, time 5256.51ms 
iter 5390: loss 2.6384, time 5269.68ms 
iter 5391: loss 2.4928, time 5271.94ms 
iter 5392: loss 2.5050, time 5307.79ms 
iter 5393: loss 2.8158, time 5279.22ms 
iter 5394: loss 2.5893, time 5279.45ms 
iter 5395: loss 2.5178, time 5288.86ms 
iter 5396: loss 2.4683, time 5281.31ms 
iter 5397: loss 2.3481, time 5260.54ms 
iter 5398: loss 2.5905, time 5256.45ms 
iter 5399: loss 2.7168, time 5261.28ms 
step 5400: train loss 2.5744, val loss 2.8441
iter 5400: loss 2.6714, time 20101.51ms 
iter 5401: loss 2.6825, time 5260.36ms 
iter 5402: loss 2.4365, time 5270.82ms 
iter 5403: loss 2.6035, time 5271.69ms 
iter 5404: loss 2.6817, time 5258.01ms 
iter 5405: loss 2.6176, time 5257.30ms 
iter 5406: loss 2.6484, time 5253.58ms 
iter 5407: loss 2.4608, time 5252.52ms 
iter 5408: loss 2.7213, time 5260.94ms 
iter 5409: loss 2.5927, time 5262.36ms 
iter 5410: loss 2.6165, time 5258.55ms 
iter 5411: loss 2.6654, time 5261.65ms 
iter 5412: loss 2.7218, time 5275.26ms 
iter 5413: loss 2.7133, time 5256.32ms 
iter 5414: loss 2.3406, time 5257.80ms 
iter 5415: loss 2.7458, time 5242.21ms 
iter 5416: loss 2.5424, time 5277.89ms 
iter 5417: loss 2.7066, time 5267.93ms 
iter 5418: loss 2.6416, time 5266.49ms 
iter 5419: loss 2.5506, time 5269.34ms 
iter 5420: loss 2.5956, time 5287.50ms 
iter 5421: loss 2.5290, time 5279.74ms 
iter 5422: loss 2.5506, time 5185.29ms 
iter 5423: loss 2.7364, time 5240.97ms 
iter 5424: loss 2.7442, time 5254.95ms 
iter 5425: loss 2.4302, time 5267.99ms 
iter 5426: loss 2.3803, time 5272.74ms 
iter 5427: loss 2.5137, time 5268.68ms 
iter 5428: loss 2.3935, time 5257.98ms 
iter 5429: loss 2.6138, time 5256.17ms 
iter 5430: loss 2.3603, time 5246.49ms 
iter 5431: loss 2.8400, time 5259.24ms 
iter 5432: loss 2.7458, time 5256.60ms 
iter 5433: loss 2.4990, time 5252.88ms 
iter 5434: loss 2.5486, time 5248.52ms 
iter 5435: loss 2.5137, time 5253.59ms 
iter 5436: loss 2.4287, time 5275.32ms 
iter 5437: loss 2.6271, time 5278.14ms 
iter 5438: loss 2.5312, time 5255.71ms 
iter 5439: loss 2.3483, time 5249.92ms 
iter 5440: loss 2.7051, time 5258.93ms 
iter 5441: loss 2.6041, time 5271.54ms 
iter 5442: loss 2.6179, time 5265.43ms 
iter 5443: loss 2.6724, time 5262.74ms 
iter 5444: loss 2.7003, time 5255.80ms 
iter 5445: loss 2.5893, time 5254.38ms 
iter 5446: loss 2.4240, time 5271.68ms 
iter 5447: loss 2.4221, time 5269.93ms 
iter 5448: loss 2.6341, time 5259.92ms 
iter 5449: loss 2.5643, time 5265.38ms 
step 5450: train loss 2.5733, val loss 2.8513
iter 5450: loss 2.2213, time 20102.86ms 
iter 5451: loss 2.6655, time 5266.78ms 
iter 5452: loss 2.4967, time 5253.29ms 
iter 5453: loss 2.5872, time 5267.86ms 
iter 5454: loss 2.5688, time 5268.23ms 
iter 5455: loss 2.5481, time 5250.33ms 
iter 5456: loss 2.7186, time 5247.56ms 
iter 5457: loss 2.5730, time 5258.65ms 
iter 5458: loss 2.6913, time 5264.58ms 
iter 5459: loss 2.6035, time 5258.77ms 
iter 5460: loss 2.4980, time 5250.98ms 
iter 5461: loss 2.5399, time 5261.66ms 
iter 5462: loss 2.3694, time 5260.95ms 
iter 5463: loss 2.5963, time 5259.83ms 
iter 5464: loss 2.5554, time 5265.66ms 
iter 5465: loss 2.4465, time 5254.93ms 
iter 5466: loss 2.5888, time 5254.36ms 
iter 5467: loss 2.7061, time 5250.30ms 
iter 5468: loss 2.5779, time 5256.58ms 
iter 5469: loss 2.4788, time 5256.87ms 
iter 5470: loss 2.4449, time 5282.20ms 
iter 5471: loss 2.4959, time 5282.02ms 
iter 5472: loss 2.6779, time 5254.59ms 
iter 5473: loss 2.5559, time 5261.13ms 
iter 5474: loss 2.9268, time 5362.56ms 
iter 5475: loss 2.7444, time 5352.73ms 
iter 5476: loss 2.6875, time 5329.35ms 
iter 5477: loss 2.5041, time 5322.51ms 
iter 5478: loss 2.2946, time 5376.94ms 
iter 5479: loss 2.5987, time 5418.03ms 
iter 5480: loss 2.6785, time 5241.80ms 
iter 5481: loss 2.6544, time 5251.56ms 
iter 5482: loss 2.5212, time 5261.02ms 
iter 5483: loss 2.4621, time 5273.00ms 
iter 5484: loss 2.6686, time 5266.38ms 
iter 5485: loss 2.4566, time 5262.98ms 
iter 5486: loss 2.4997, time 5265.35ms 
iter 5487: loss 2.4541, time 5256.39ms 
iter 5488: loss 2.6623, time 5257.20ms 
iter 5489: loss 2.6150, time 5251.24ms 
iter 5490: loss 2.5749, time 5251.89ms 
iter 5491: loss 2.6794, time 5253.95ms 
iter 5492: loss 2.4862, time 5248.37ms 
iter 5493: loss 2.3878, time 5278.12ms 
iter 5494: loss 2.8329, time 5262.72ms 
iter 5495: loss 2.8842, time 5248.15ms 
iter 5496: loss 2.7785, time 5248.06ms 
iter 5497: loss 2.4839, time 5249.66ms 
iter 5498: loss 2.5444, time 5261.59ms 
iter 5499: loss 2.5973, time 5259.55ms 
step 5500: train loss 2.5654, val loss 2.8265
iter 5500: loss 2.5560, time 20059.27ms 
iter 5501: loss 2.5724, time 5253.87ms 
iter 5502: loss 2.6436, time 5249.81ms 
iter 5503: loss 2.6428, time 5247.32ms 
iter 5504: loss 2.5336, time 5255.17ms 
iter 5505: loss 2.6515, time 5267.40ms 
iter 5506: loss 2.5978, time 5256.85ms 
iter 5507: loss 2.3969, time 5261.40ms 
iter 5508: loss 2.6304, time 5256.61ms 
iter 5509: loss 2.4479, time 5260.49ms 
iter 5510: loss 2.4542, time 5257.31ms 
iter 5511: loss 2.5638, time 5253.32ms 
iter 5512: loss 2.5631, time 5254.19ms 
iter 5513: loss 2.2228, time 5268.35ms 
iter 5514: loss 2.6094, time 5275.84ms 
iter 5515: loss 2.4032, time 5254.14ms 
iter 5516: loss 2.5999, time 5212.15ms 
iter 5517: loss 2.6294, time 5214.08ms 
iter 5518: loss 2.5473, time 5217.24ms 
iter 5519: loss 2.5015, time 5235.53ms 
iter 5520: loss 2.6547, time 5243.92ms 
iter 5521: loss 2.5079, time 5221.06ms 
iter 5522: loss 2.7298, time 5254.42ms 
iter 5523: loss 2.5210, time 5234.46ms 
iter 5524: loss 2.4643, time 5266.71ms 
iter 5525: loss 2.4823, time 5265.74ms 
iter 5526: loss 2.5537, time 5258.10ms 
iter 5527: loss 2.6673, time 5257.72ms 
iter 5528: loss 2.4123, time 5245.97ms 
iter 5529: loss 2.6527, time 5260.09ms 
iter 5530: loss 2.7212, time 5242.85ms 
iter 5531: loss 2.5213, time 5262.62ms 
iter 5532: loss 2.2669, time 5242.74ms 
iter 5533: loss 2.6079, time 5257.01ms 
iter 5534: loss 2.6610, time 5289.07ms 
iter 5535: loss 2.5684, time 5233.75ms 
iter 5536: loss 2.5557, time 5277.85ms 
iter 5537: loss 2.5740, time 5242.50ms 
iter 5538: loss 2.7954, time 5210.30ms 
iter 5539: loss 2.7158, time 5249.13ms 
iter 5540: loss 2.6136, time 5242.11ms 
iter 5541: loss 2.6310, time 5229.86ms 
iter 5542: loss 2.6349, time 5253.29ms 
iter 5543: loss 2.4328, time 5254.70ms 
iter 5544: loss 2.7622, time 5249.26ms 
iter 5545: loss 2.5343, time 5261.53ms 
iter 5546: loss 2.7054, time 5244.42ms 
iter 5547: loss 2.7597, time 5264.78ms 
iter 5548: loss 2.6782, time 5263.07ms 
iter 5549: loss 2.5748, time 5238.28ms 
step 5550: train loss 2.5565, val loss 2.8371
iter 5550: loss 2.7532, time 20030.57ms 
iter 5551: loss 2.6048, time 5244.47ms 
iter 5552: loss 2.4761, time 5234.22ms 
iter 5553: loss 2.5524, time 5254.51ms 
iter 5554: loss 2.6234, time 5275.26ms 
iter 5555: loss 2.3907, time 5251.33ms 
iter 5556: loss 2.6259, time 5242.44ms 
iter 5557: loss 2.3982, time 5246.37ms 
iter 5558: loss 2.4621, time 5264.44ms 
iter 5559: loss 2.3969, time 5266.34ms 
iter 5560: loss 2.4419, time 5268.91ms 
iter 5561: loss 2.4505, time 5272.07ms 
iter 5562: loss 2.5150, time 5259.70ms 
iter 5563: loss 2.6311, time 5274.44ms 
iter 5564: loss 2.6118, time 5266.98ms 
iter 5565: loss 2.4446, time 5258.17ms 
iter 5566: loss 2.5385, time 5263.81ms 
iter 5567: loss 2.6093, time 5273.49ms 
iter 5568: loss 2.6540, time 5267.69ms 
iter 5569: loss 2.5833, time 5263.97ms 
iter 5570: loss 2.4357, time 5258.85ms 
iter 5571: loss 2.4904, time 5246.80ms 
iter 5572: loss 2.4848, time 5262.55ms 
iter 5573: loss 2.6019, time 5287.48ms 
iter 5574: loss 2.6465, time 5272.92ms 
iter 5575: loss 2.4885, time 5268.48ms 
iter 5576: loss 2.6059, time 5289.92ms 
iter 5577: loss 2.4789, time 5293.50ms 
iter 5578: loss 2.5241, time 5270.67ms 
iter 5579: loss 2.3215, time 5262.22ms 
iter 5580: loss 2.6902, time 5257.43ms 
iter 5581: loss 2.5738, time 5265.38ms 
iter 5582: loss 2.6922, time 5255.36ms 
iter 5583: loss 2.5270, time 5274.56ms 
iter 5584: loss 2.8396, time 5265.47ms 
iter 5585: loss 2.5220, time 5255.45ms 
iter 5586: loss 2.7421, time 5265.13ms 
iter 5587: loss 2.6004, time 5246.71ms 
iter 5588: loss 2.5594, time 5253.13ms 
iter 5589: loss 2.4049, time 5214.83ms 
iter 5590: loss 2.5711, time 5251.95ms 
iter 5591: loss 2.6876, time 5260.70ms 
iter 5592: loss 2.3004, time 5272.10ms 
iter 5593: loss 2.6340, time 5261.25ms 
iter 5594: loss 2.3964, time 5256.90ms 
iter 5595: loss 2.4196, time 5259.00ms 
iter 5596: loss 2.4534, time 5259.37ms 
iter 5597: loss 2.4496, time 5260.94ms 
iter 5598: loss 2.5346, time 5261.55ms 
iter 5599: loss 2.5847, time 5250.53ms 
step 5600: train loss 2.5578, val loss 2.8425
iter 5600: loss 2.6561, time 20072.95ms 
iter 5601: loss 2.5331, time 5250.81ms 
iter 5602: loss 2.6504, time 5255.09ms 
iter 5603: loss 2.5679, time 5261.81ms 
iter 5604: loss 2.4896, time 5263.32ms 
iter 5605: loss 2.6140, time 5256.39ms 
iter 5606: loss 2.6971, time 5258.66ms 
iter 5607: loss 2.6267, time 5251.64ms 
iter 5608: loss 2.6332, time 5263.17ms 
iter 5609: loss 2.6239, time 5262.96ms 
iter 5610: loss 2.6233, time 5259.77ms 
iter 5611: loss 2.5927, time 5259.07ms 
iter 5612: loss 2.6243, time 5273.73ms 
iter 5613: loss 2.5885, time 5259.37ms 
iter 5614: loss 2.4527, time 5254.84ms 
iter 5615: loss 2.5261, time 5253.63ms 
iter 5616: loss 2.7323, time 5250.35ms 
iter 5617: loss 2.6596, time 5257.66ms 
iter 5618: loss 2.6706, time 5267.61ms 
iter 5619: loss 2.6769, time 5250.29ms 
iter 5620: loss 2.7618, time 5256.38ms 
iter 5621: loss 2.4800, time 5254.68ms 
iter 5622: loss 2.7257, time 5258.77ms 
iter 5623: loss 2.7833, time 5262.84ms 
iter 5624: loss 2.4547, time 5262.66ms 
iter 5625: loss 2.6642, time 5250.94ms 
iter 5626: loss 2.5912, time 5249.66ms 
iter 5627: loss 2.7808, time 5259.61ms 
iter 5628: loss 2.6175, time 5262.47ms 
iter 5629: loss 2.6978, time 5258.12ms 
iter 5630: loss 2.7368, time 5244.69ms 
iter 5631: loss 2.6312, time 5270.62ms 
iter 5632: loss 2.4079, time 5284.90ms 
iter 5633: loss 2.4458, time 5265.96ms 
iter 5634: loss 2.6272, time 5260.20ms 
iter 5635: loss 2.6336, time 5257.59ms 
iter 5636: loss 2.7572, time 5258.91ms 
iter 5637: loss 2.7005, time 5270.41ms 
iter 5638: loss 2.5682, time 5267.39ms 
iter 5639: loss 2.5972, time 5258.97ms 
iter 5640: loss 2.5155, time 5259.69ms 
iter 5641: loss 2.4970, time 5266.53ms 
iter 5642: loss 2.5077, time 5276.10ms 
iter 5643: loss 2.6237, time 5262.92ms 
iter 5644: loss 2.5544, time 5262.39ms 
iter 5645: loss 2.4098, time 5262.77ms 
iter 5646: loss 2.5349, time 5263.79ms 
iter 5647: loss 2.5876, time 5272.43ms 
iter 5648: loss 2.6507, time 5262.18ms 
iter 5649: loss 2.7164, time 5257.96ms 
step 5650: train loss 2.5739, val loss 2.8302
iter 5650: loss 2.7998, time 20102.57ms 
iter 5651: loss 2.7717, time 5268.04ms 
iter 5652: loss 2.4955, time 5317.98ms 
iter 5653: loss 2.5122, time 5421.60ms 
iter 5654: loss 2.3669, time 5420.07ms 
iter 5655: loss 2.6984, time 5312.21ms 
iter 5656: loss 2.3624, time 5240.40ms 
iter 5657: loss 2.6051, time 5245.82ms 
iter 5658: loss 2.6056, time 5254.36ms 
iter 5659: loss 2.5481, time 5255.12ms 
iter 5660: loss 2.5125, time 5256.29ms 
iter 5661: loss 2.4826, time 5256.35ms 
iter 5662: loss 2.4260, time 5263.90ms 
iter 5663: loss 2.5793, time 5273.89ms 
iter 5664: loss 2.5461, time 5256.03ms 
iter 5665: loss 2.5288, time 5253.93ms 
iter 5666: loss 2.4395, time 5252.44ms 
iter 5667: loss 2.5026, time 5265.89ms 
iter 5668: loss 2.5285, time 5376.08ms 
iter 5669: loss 2.4030, time 5393.96ms 
iter 5670: loss 2.5627, time 5368.88ms 
iter 5671: loss 2.6639, time 5262.25ms 
iter 5672: loss 2.5381, time 5328.14ms 
iter 5673: loss 2.6672, time 5418.52ms 
iter 5674: loss 2.5846, time 5380.44ms 
iter 5675: loss 2.5619, time 5417.81ms 
iter 5676: loss 2.5767, time 5368.36ms 
iter 5677: loss 2.6689, time 5390.64ms 
iter 5678: loss 2.4428, time 5305.86ms 
iter 5679: loss 2.6095, time 5253.74ms 
iter 5680: loss 2.4027, time 5265.57ms 
iter 5681: loss 2.5480, time 5272.63ms 
iter 5682: loss 2.5572, time 5256.53ms 
iter 5683: loss 2.5513, time 5267.13ms 
iter 5684: loss 2.4807, time 5262.35ms 
iter 5685: loss 2.5960, time 5274.09ms 
iter 5686: loss 2.5391, time 5272.37ms 
iter 5687: loss 2.6900, time 5260.90ms 
iter 5688: loss 2.7233, time 5261.26ms 
iter 5689: loss 2.6279, time 5266.25ms 
iter 5690: loss 2.4049, time 5275.72ms 
iter 5691: loss 2.6196, time 5252.48ms 
iter 5692: loss 2.4706, time 5286.11ms 
iter 5693: loss 2.4397, time 5283.13ms 
iter 5694: loss 2.8024, time 5401.13ms 
iter 5695: loss 2.5149, time 5347.08ms 
iter 5696: loss 2.6257, time 5263.37ms 
iter 5697: loss 2.6231, time 5264.70ms 
iter 5698: loss 2.5760, time 5251.02ms 
iter 5699: loss 2.4836, time 5269.51ms 
step 5700: train loss 2.5529, val loss 2.8308
iter 5700: loss 2.5482, time 20252.35ms 
iter 5701: loss 2.4873, time 5270.25ms 
iter 5702: loss 2.4272, time 5236.97ms 
iter 5703: loss 2.6639, time 5254.96ms 
iter 5704: loss 2.6638, time 5253.96ms 
iter 5705: loss 2.6023, time 5262.34ms 
iter 5706: loss 2.4351, time 5261.04ms 
iter 5707: loss 2.5021, time 5220.26ms 
iter 5708: loss 2.5979, time 5253.44ms 
iter 5709: loss 2.6015, time 5253.02ms 
iter 5710: loss 2.4861, time 5268.96ms 
iter 5711: loss 2.4233, time 5263.00ms 
iter 5712: loss 2.4677, time 5223.93ms 
iter 5713: loss 2.6515, time 5255.73ms 
iter 5714: loss 2.6275, time 5252.34ms 
iter 5715: loss 2.6190, time 5263.22ms 
iter 5716: loss 2.5854, time 5270.48ms 
iter 5717: loss 2.4192, time 5252.06ms 
iter 5718: loss 2.7441, time 5252.75ms 
iter 5719: loss 2.4400, time 5252.02ms 
iter 5720: loss 2.6243, time 5270.09ms 
iter 5721: loss 2.5370, time 5262.96ms 
iter 5722: loss 2.7328, time 5254.65ms 
iter 5723: loss 2.4677, time 5251.85ms 
iter 5724: loss 2.6254, time 5258.26ms 
iter 5725: loss 2.3148, time 5268.74ms 
iter 5726: loss 2.4857, time 5272.74ms 
iter 5727: loss 2.5377, time 5264.09ms 
iter 5728: loss 2.6305, time 5264.63ms 
iter 5729: loss 2.3670, time 5271.49ms 
iter 5730: loss 2.4209, time 5259.92ms 
iter 5731: loss 2.9006, time 5219.85ms 
iter 5732: loss 2.4760, time 5221.23ms 
iter 5733: loss 2.4407, time 5350.89ms 
iter 5734: loss 2.5312, time 5318.59ms 
iter 5735: loss 2.5174, time 5275.83ms 
iter 5736: loss 2.7321, time 5271.86ms 
iter 5737: loss 2.5462, time 5261.84ms 
iter 5738: loss 2.5979, time 5255.56ms 
iter 5739: loss 2.5765, time 5284.65ms 
iter 5740: loss 2.4207, time 5271.89ms 
iter 5741: loss 2.5357, time 5252.96ms 
iter 5742: loss 2.5350, time 5262.01ms 
iter 5743: loss 2.5455, time 5254.33ms 
iter 5744: loss 2.2815, time 5264.13ms 
iter 5745: loss 2.5491, time 5259.32ms 
iter 5746: loss 2.5404, time 5261.99ms 
iter 5747: loss 2.4443, time 5252.78ms 
iter 5748: loss 2.7437, time 5258.59ms 
iter 5749: loss 2.6771, time 5278.33ms 
step 5750: train loss 2.5560, val loss 2.8421
iter 5750: loss 2.3692, time 20031.56ms 
iter 5751: loss 2.6453, time 5274.93ms 
iter 5752: loss 2.7051, time 5276.53ms 
iter 5753: loss 2.4968, time 5277.74ms 
iter 5754: loss 2.4097, time 5271.74ms 
iter 5755: loss 2.6187, time 5282.14ms 
iter 5756: loss 2.7905, time 5288.99ms 
iter 5757: loss 2.7842, time 5271.66ms 
iter 5758: loss 2.3163, time 5440.01ms 
iter 5759: loss 2.4055, time 5439.34ms 
iter 5760: loss 2.6126, time 5447.30ms 
iter 5761: loss 2.5928, time 5433.29ms 
iter 5762: loss 2.6225, time 5444.37ms 
iter 5763: loss 2.4867, time 5434.91ms 
iter 5764: loss 2.8180, time 5302.73ms 
iter 5765: loss 2.5115, time 5272.83ms 
iter 5766: loss 2.6114, time 5359.49ms 
iter 5767: loss 2.4788, time 5356.44ms 
iter 5768: loss 2.4953, time 5262.96ms 
iter 5769: loss 2.4315, time 5261.49ms 
iter 5770: loss 2.5468, time 5255.45ms 
iter 5771: loss 2.5315, time 5273.08ms 
iter 5772: loss 2.6351, time 5253.05ms 
iter 5773: loss 2.4756, time 5256.80ms 
iter 5774: loss 2.4722, time 5263.37ms 
iter 5775: loss 2.5191, time 5254.40ms 
iter 5776: loss 2.5797, time 5257.81ms 
iter 5777: loss 2.5044, time 5252.68ms 
iter 5778: loss 2.4855, time 5262.37ms 
iter 5779: loss 2.8309, time 5272.56ms 
iter 5780: loss 2.7675, time 5225.29ms 
iter 5781: loss 2.5620, time 5253.73ms 
iter 5782: loss 2.6273, time 5249.57ms 
iter 5783: loss 2.4797, time 5260.88ms 
iter 5784: loss 2.5928, time 5262.14ms 
iter 5785: loss 2.2686, time 5254.99ms 
iter 5786: loss 2.7525, time 5254.60ms 
iter 5787: loss 2.6288, time 5286.89ms 
iter 5788: loss 2.8042, time 5442.88ms 
iter 5789: loss 2.4405, time 5325.49ms 
iter 5790: loss 2.5959, time 5241.67ms 
iter 5791: loss 2.5706, time 5365.06ms 
iter 5792: loss 2.5084, time 5309.30ms 
iter 5793: loss 2.8379, time 5280.63ms 
iter 5794: loss 2.4698, time 5290.91ms 
iter 5795: loss 2.7507, time 5268.47ms 
iter 5796: loss 2.5544, time 5258.69ms 
iter 5797: loss 2.5882, time 5255.40ms 
iter 5798: loss 2.8269, time 5261.63ms 
iter 5799: loss 2.5061, time 5255.45ms 
step 5800: train loss 2.5491, val loss 2.8194
iter 5800: loss 2.5887, time 20050.13ms 
iter 5801: loss 2.4177, time 5249.50ms 
iter 5802: loss 2.5561, time 5250.43ms 
iter 5803: loss 2.5390, time 5252.31ms 
iter 5804: loss 2.7679, time 5265.27ms 
iter 5805: loss 2.7359, time 5251.39ms 
iter 5806: loss 2.5924, time 5254.75ms 
iter 5807: loss 2.4731, time 5254.83ms 
iter 5808: loss 2.7643, time 5253.40ms 
iter 5809: loss 2.4778, time 5265.18ms 
iter 5810: loss 2.4810, time 5254.67ms 
iter 5811: loss 2.5442, time 5271.01ms 
iter 5812: loss 2.6788, time 5269.04ms 
iter 5813: loss 2.4652, time 5276.36ms 
iter 5814: loss 2.6609, time 5292.97ms 
iter 5815: loss 2.3060, time 5280.66ms 
iter 5816: loss 2.5742, time 5267.83ms 
iter 5817: loss 2.6328, time 5264.44ms 
iter 5818: loss 2.6331, time 5270.48ms 
iter 5819: loss 2.5333, time 5277.07ms 
iter 5820: loss 2.5268, time 5270.01ms 
iter 5821: loss 2.7935, time 5262.92ms 
iter 5822: loss 2.4826, time 5260.51ms 
iter 5823: loss 2.6088, time 5272.72ms 
iter 5824: loss 2.1997, time 5275.37ms 
iter 5825: loss 2.4197, time 5267.64ms 
iter 5826: loss 2.5573, time 5265.61ms 
iter 5827: loss 2.7226, time 5260.89ms 
iter 5828: loss 2.6357, time 5267.00ms 
iter 5829: loss 2.5410, time 5263.36ms 
iter 5830: loss 2.4180, time 5275.26ms 
iter 5831: loss 2.4077, time 5252.57ms 
iter 5832: loss 2.3510, time 5269.04ms 
iter 5833: loss 2.5114, time 5326.31ms 
iter 5834: loss 2.3021, time 5436.47ms 
iter 5835: loss 2.7018, time 5431.95ms 
iter 5836: loss 2.4858, time 5432.13ms 
iter 5837: loss 2.7205, time 5452.17ms 
iter 5838: loss 2.4012, time 5442.93ms 
iter 5839: loss 2.5126, time 5424.14ms 
iter 5840: loss 2.5036, time 5425.78ms 
iter 5841: loss 2.5975, time 5434.69ms 
iter 5842: loss 2.6171, time 5435.85ms 
iter 5843: loss 2.4282, time 5434.14ms 
iter 5844: loss 2.4180, time 5434.95ms 
iter 5845: loss 2.6485, time 5429.30ms 
iter 5846: loss 2.4395, time 5437.81ms 
iter 5847: loss 2.4457, time 5440.75ms 
iter 5848: loss 2.7468, time 5248.05ms 
iter 5849: loss 2.6773, time 5261.82ms 
step 5850: train loss 2.5628, val loss 2.8436
iter 5850: loss 2.5151, time 20192.16ms 
iter 5851: loss 2.6628, time 5267.40ms 
iter 5852: loss 2.5373, time 5262.76ms 
iter 5853: loss 2.4530, time 5266.14ms 
iter 5854: loss 2.5670, time 5260.72ms 
iter 5855: loss 2.6404, time 5254.26ms 
iter 5856: loss 2.6322, time 5250.03ms 
iter 5857: loss 2.6192, time 5256.22ms 
iter 5858: loss 2.5447, time 5262.74ms 
iter 5859: loss 2.7003, time 5231.32ms 
iter 5860: loss 2.4028, time 5249.97ms 
iter 5861: loss 2.6667, time 5242.54ms 
iter 5862: loss 2.5955, time 5252.25ms 
iter 5863: loss 2.8174, time 5276.38ms 
iter 5864: loss 2.3780, time 5277.93ms 
iter 5865: loss 2.7133, time 5268.96ms 
iter 5866: loss 2.5125, time 5266.70ms 
iter 5867: loss 2.4961, time 5265.42ms 
iter 5868: loss 2.3710, time 5263.69ms 
iter 5869: loss 2.5118, time 5253.98ms 
iter 5870: loss 2.4483, time 5255.26ms 
iter 5871: loss 2.5370, time 5258.16ms 
iter 5872: loss 2.6335, time 5262.55ms 
iter 5873: loss 2.5798, time 5265.77ms 
iter 5874: loss 2.5924, time 5256.22ms 
iter 5875: loss 2.7033, time 5252.21ms 
iter 5876: loss 2.4522, time 5253.81ms 
iter 5877: loss 2.5351, time 5259.22ms 
iter 5878: loss 2.6299, time 5262.50ms 
iter 5879: loss 2.4084, time 5253.33ms 
iter 5880: loss 2.6536, time 5273.78ms 
iter 5881: loss 2.5289, time 5269.59ms 
iter 5882: loss 2.4365, time 5276.51ms 
iter 5883: loss 2.3726, time 5263.19ms 
iter 5884: loss 2.5851, time 5263.50ms 
iter 5885: loss 2.5912, time 5260.31ms 
iter 5886: loss 2.5880, time 5265.94ms 
iter 5887: loss 2.6239, time 5266.21ms 
iter 5888: loss 2.6593, time 5269.36ms 
iter 5889: loss 2.6298, time 5265.91ms 
iter 5890: loss 2.6092, time 5260.26ms 
iter 5891: loss 2.5976, time 5268.51ms 
iter 5892: loss 2.5642, time 5268.84ms 
iter 5893: loss 2.6419, time 5263.36ms 
iter 5894: loss 2.7144, time 5264.38ms 
iter 5895: loss 2.4901, time 5258.37ms 
iter 5896: loss 2.6152, time 5271.61ms 
iter 5897: loss 2.7702, time 5251.64ms 
iter 5898: loss 2.6157, time 5256.94ms 
iter 5899: loss 2.4876, time 5258.62ms 
step 5900: train loss 2.5407, val loss 2.8520
iter 5900: loss 2.5173, time 20127.97ms 
iter 5901: loss 2.5394, time 5258.93ms 
iter 5902: loss 2.5816, time 5261.96ms 
iter 5903: loss 2.5033, time 5260.73ms 
iter 5904: loss 2.4423, time 5260.37ms 
iter 5905: loss 2.4919, time 5258.30ms 
iter 5906: loss 2.3896, time 5273.23ms 
iter 5907: loss 2.5499, time 5265.33ms 
iter 5908: loss 2.6872, time 5256.17ms 
iter 5909: loss 2.5999, time 5257.94ms 
iter 5910: loss 2.6748, time 5222.91ms 
iter 5911: loss 2.7388, time 5311.05ms 
iter 5912: loss 2.6350, time 5341.27ms 
iter 5913: loss 2.5553, time 5248.38ms 
iter 5914: loss 2.5506, time 5240.96ms 
iter 5915: loss 2.4561, time 5277.51ms 
iter 5916: loss 2.6544, time 5269.96ms 
iter 5917: loss 2.5127, time 5255.25ms 
iter 5918: loss 2.5020, time 5270.72ms 
iter 5919: loss 2.4162, time 5262.24ms 
iter 5920: loss 2.6597, time 5266.58ms 
iter 5921: loss 2.4564, time 5267.03ms 
iter 5922: loss 2.6348, time 5259.60ms 
iter 5923: loss 2.5771, time 5261.68ms 
iter 5924: loss 2.5908, time 5267.24ms 
iter 5925: loss 2.6330, time 5276.66ms 
iter 5926: loss 2.4521, time 5262.77ms 
iter 5927: loss 2.5795, time 5256.36ms 
iter 5928: loss 2.6233, time 5249.80ms 
iter 5929: loss 2.3836, time 5266.05ms 
iter 5930: loss 2.5243, time 5272.44ms 
iter 5931: loss 2.5863, time 5252.45ms 
iter 5932: loss 2.4909, time 5284.99ms 
iter 5933: loss 2.4280, time 5269.88ms 
iter 5934: loss 2.5728, time 5270.41ms 
iter 5935: loss 2.5301, time 5274.34ms 
iter 5936: loss 2.2375, time 5288.32ms 
iter 5937: loss 2.5919, time 5268.87ms 
iter 5938: loss 2.4256, time 5267.92ms 
iter 5939: loss 2.5116, time 5266.66ms 
iter 5940: loss 2.5799, time 5255.87ms 
iter 5941: loss 2.6639, time 5257.63ms 
iter 5942: loss 2.5188, time 5258.33ms 
iter 5943: loss 2.5428, time 5268.98ms 
iter 5944: loss 2.7742, time 5263.52ms 
iter 5945: loss 2.5962, time 5250.64ms 
iter 5946: loss 2.6251, time 5253.55ms 
iter 5947: loss 2.6212, time 5262.02ms 
iter 5948: loss 2.5366, time 5255.45ms 
iter 5949: loss 2.4325, time 5258.33ms 
step 5950: train loss 2.5579, val loss 2.8316
iter 5950: loss 2.3994, time 20103.41ms 
iter 5951: loss 2.7071, time 5256.30ms 
iter 5952: loss 2.5092, time 5266.93ms 
iter 5953: loss 2.6902, time 5273.59ms 
iter 5954: loss 2.5860, time 5262.41ms 
iter 5955: loss 2.3861, time 5262.28ms 
iter 5956: loss 2.6319, time 5259.35ms 
iter 5957: loss 2.6105, time 5271.50ms 
iter 5958: loss 2.5583, time 5274.82ms 
iter 5959: loss 2.4887, time 5261.26ms 
iter 5960: loss 2.6075, time 5266.99ms 
iter 5961: loss 2.3810, time 5257.99ms 
iter 5962: loss 2.5178, time 5266.42ms 
iter 5963: loss 2.6128, time 5253.97ms 
iter 5964: loss 2.5089, time 5255.28ms 
iter 5965: loss 2.5801, time 5236.28ms 
iter 5966: loss 2.7117, time 5263.54ms 
iter 5967: loss 2.4711, time 5225.72ms 
iter 5968: loss 2.5036, time 5249.75ms 
iter 5969: loss 2.5864, time 5261.57ms 
iter 5970: loss 2.6445, time 5253.64ms 
iter 5971: loss 2.6151, time 5263.54ms 
iter 5972: loss 2.6001, time 5254.52ms 
iter 5973: loss 2.5137, time 5254.36ms 
iter 5974: loss 2.6104, time 5259.09ms 
iter 5975: loss 2.4959, time 5268.86ms 
iter 5976: loss 2.8639, time 5193.96ms 
iter 5977: loss 2.4591, time 5258.09ms 
iter 5978: loss 2.5342, time 5261.51ms 
iter 5979: loss 2.5359, time 5265.27ms 
iter 5980: loss 2.8192, time 5264.90ms 
iter 5981: loss 2.6191, time 5257.00ms 
iter 5982: loss 2.6101, time 5304.03ms 
iter 5983: loss 2.4411, time 5279.64ms 
iter 5984: loss 2.5891, time 5300.10ms 
iter 5985: loss 2.5363, time 5263.28ms 
iter 5986: loss 2.3464, time 5251.89ms 
iter 5987: loss 2.6092, time 5260.87ms 
iter 5988: loss 2.5487, time 5270.64ms 
iter 5989: loss 2.5541, time 5260.92ms 
iter 5990: loss 2.6321, time 5224.21ms 
iter 5991: loss 2.4028, time 5243.99ms 
iter 5992: loss 2.5605, time 5260.11ms 
iter 5993: loss 2.3091, time 5260.58ms 
iter 5994: loss 2.4711, time 5263.27ms 
iter 5995: loss 2.6828, time 5309.16ms 
iter 5996: loss 2.5738, time 5316.80ms 
iter 5997: loss 2.5689, time 5306.05ms 
iter 5998: loss 2.4378, time 5255.76ms 
iter 5999: loss 2.5435, time 5292.81ms 
step 6000: train loss 2.5409, val loss 2.8439
iter 6000: loss 2.4460, time 20150.44ms 
iter 6001: loss 2.4612, time 5391.31ms 
iter 6002: loss 2.5360, time 5406.76ms 
iter 6003: loss 2.4542, time 5323.87ms 
iter 6004: loss 2.6508, time 5282.67ms 
iter 6005: loss 2.6166, time 5252.40ms 
iter 6006: loss 2.5225, time 5261.84ms 
iter 6007: loss 2.6714, time 5255.47ms 
iter 6008: loss 2.5539, time 5258.77ms 
iter 6009: loss 2.4982, time 5259.00ms 
iter 6010: loss 2.4532, time 5258.81ms 
iter 6011: loss 2.6855, time 5256.75ms 
iter 6012: loss 2.6551, time 5272.81ms 
iter 6013: loss 2.3959, time 5259.96ms 
iter 6014: loss 2.2771, time 5259.18ms 
iter 6015: loss 2.3132, time 5256.65ms 
iter 6016: loss 2.4856, time 5253.90ms 
iter 6017: loss 2.5948, time 5256.61ms 
iter 6018: loss 2.5607, time 5260.96ms 
iter 6019: loss 2.5131, time 5252.59ms 
iter 6020: loss 2.6162, time 5253.23ms 
iter 6021: loss 2.5137, time 5250.55ms 
iter 6022: loss 2.4413, time 5263.30ms 
iter 6023: loss 2.5595, time 5261.87ms 
iter 6024: loss 2.5971, time 5356.15ms 
iter 6025: loss 2.5458, time 5395.02ms 
iter 6026: loss 2.4918, time 5409.65ms 
iter 6027: loss 2.6176, time 5402.98ms 
iter 6028: loss 2.5572, time 5401.12ms 
iter 6029: loss 2.5169, time 5253.32ms 
iter 6030: loss 2.2294, time 5257.20ms 
iter 6031: loss 2.5928, time 5262.51ms 
iter 6032: loss 2.5826, time 5263.42ms 
iter 6033: loss 2.6799, time 5261.62ms 
iter 6034: loss 2.5456, time 5258.36ms 
iter 6035: loss 2.4188, time 5268.88ms 
iter 6036: loss 2.6277, time 5257.29ms 
iter 6037: loss 2.4318, time 5259.14ms 
iter 6038: loss 2.4300, time 5267.44ms 
iter 6039: loss 2.7025, time 5265.31ms 
iter 6040: loss 2.4981, time 5264.34ms 
iter 6041: loss 2.6941, time 5272.87ms 
iter 6042: loss 2.7376, time 5266.45ms 
iter 6043: loss 2.5979, time 5261.52ms 
iter 6044: loss 2.7265, time 5269.75ms 
iter 6045: loss 2.6210, time 5280.26ms 
iter 6046: loss 2.6466, time 5267.35ms 
iter 6047: loss 2.6113, time 5308.51ms 
iter 6048: loss 2.4644, time 5307.81ms 
iter 6049: loss 2.5992, time 5310.14ms 
step 6050: train loss 2.5514, val loss 2.8345
iter 6050: loss 2.6640, time 20111.47ms 
iter 6051: loss 2.5840, time 5254.53ms 
iter 6052: loss 2.5637, time 5252.54ms 
iter 6053: loss 2.3070, time 5247.43ms 
iter 6054: loss 2.5917, time 5259.12ms 
iter 6055: loss 2.5448, time 5258.36ms 
iter 6056: loss 2.3593, time 5251.43ms 
iter 6057: loss 2.4351, time 5243.54ms 
iter 6058: loss 2.5264, time 5266.50ms 
iter 6059: loss 2.7967, time 5265.71ms 
iter 6060: loss 2.3729, time 5274.21ms 
iter 6061: loss 2.4929, time 5251.29ms 
iter 6062: loss 2.7125, time 5276.03ms 
iter 6063: loss 2.5150, time 5250.23ms 
iter 6064: loss 2.4502, time 5259.32ms 
iter 6065: loss 2.4063, time 5252.59ms 
iter 6066: loss 2.4101, time 5261.89ms 
iter 6067: loss 2.6145, time 5294.93ms 
iter 6068: loss 2.2319, time 5266.29ms 
iter 6069: loss 2.5721, time 5274.51ms 
iter 6070: loss 2.4854, time 5257.94ms 
iter 6071: loss 2.5083, time 5265.39ms 
iter 6072: loss 2.6544, time 5264.78ms 
iter 6073: loss 2.4919, time 5266.70ms 
iter 6074: loss 2.5939, time 5269.79ms 
iter 6075: loss 2.5739, time 5265.59ms 
iter 6076: loss 2.5090, time 5260.68ms 
iter 6077: loss 2.4000, time 5277.72ms 
iter 6078: loss 2.4148, time 5271.89ms 
iter 6079: loss 2.6396, time 5255.50ms 
iter 6080: loss 2.4397, time 5269.16ms 
iter 6081: loss 2.4955, time 5265.04ms 
iter 6082: loss 2.7302, time 5267.98ms 
iter 6083: loss 2.5432, time 5274.88ms 
iter 6084: loss 2.5267, time 5266.74ms 
iter 6085: loss 2.4245, time 5257.70ms 
iter 6086: loss 2.4778, time 5276.99ms 
iter 6087: loss 2.4929, time 5269.20ms 
iter 6088: loss 2.5318, time 5264.77ms 
iter 6089: loss 2.4613, time 5265.99ms 
iter 6090: loss 2.6692, time 5266.18ms 
iter 6091: loss 2.5853, time 5283.01ms 
iter 6092: loss 2.4009, time 5267.74ms 
iter 6093: loss 2.4012, time 5263.41ms 
iter 6094: loss 2.6520, time 5257.72ms 
iter 6095: loss 2.3615, time 5267.32ms 
iter 6096: loss 2.6812, time 5262.70ms 
iter 6097: loss 2.5616, time 5267.12ms 
iter 6098: loss 2.6469, time 5263.72ms 
iter 6099: loss 2.5212, time 5273.01ms 
step 6100: train loss 2.5459, val loss 2.8368
iter 6100: loss 2.5433, time 20126.98ms 
iter 6101: loss 2.4840, time 5267.36ms 
iter 6102: loss 2.6710, time 5255.10ms 
iter 6103: loss 2.5621, time 5261.63ms 
iter 6104: loss 2.6159, time 5269.02ms 
iter 6105: loss 2.3651, time 5283.35ms 
iter 6106: loss 2.4076, time 5270.63ms 
iter 6107: loss 2.5921, time 5272.75ms 
iter 6108: loss 2.6471, time 5259.98ms 
iter 6109: loss 2.6632, time 5297.12ms 
iter 6110: loss 2.5747, time 5259.96ms 
iter 6111: loss 2.5992, time 5254.45ms 
iter 6112: loss 2.3590, time 5230.46ms 
iter 6113: loss 2.5450, time 5263.84ms 
iter 6114: loss 2.4093, time 5268.13ms 
iter 6115: loss 2.4681, time 5266.18ms 
iter 6116: loss 2.5265, time 5269.16ms 
iter 6117: loss 2.5804, time 5368.40ms 
iter 6118: loss 2.7357, time 5304.46ms 
iter 6119: loss 2.5192, time 5253.74ms 
iter 6120: loss 2.5786, time 5257.29ms 
iter 6121: loss 2.5612, time 5260.53ms 
iter 6122: loss 2.7090, time 5263.45ms 
iter 6123: loss 2.7546, time 5285.92ms 
iter 6124: loss 2.3502, time 5287.44ms 
iter 6125: loss 2.5536, time 5273.33ms 
iter 6126: loss 2.5469, time 5274.80ms 
iter 6127: loss 2.6178, time 5281.23ms 
iter 6128: loss 2.6741, time 5266.53ms 
iter 6129: loss 2.5555, time 5252.51ms 
iter 6130: loss 2.6060, time 5251.40ms 
iter 6131: loss 2.5953, time 5268.39ms 
iter 6132: loss 2.6449, time 5262.17ms 
iter 6133: loss 2.4058, time 5263.77ms 
iter 6134: loss 2.4849, time 5252.43ms 
iter 6135: loss 2.6392, time 5261.80ms 
iter 6136: loss 2.6352, time 5244.49ms 
iter 6137: loss 2.3673, time 5261.97ms 
iter 6138: loss 2.5602, time 5263.53ms 
iter 6139: loss 2.6048, time 5271.79ms 
iter 6140: loss 2.4538, time 5272.35ms 
iter 6141: loss 2.5621, time 5261.35ms 
iter 6142: loss 2.5959, time 5228.37ms 
iter 6143: loss 2.5152, time 5273.06ms 
iter 6144: loss 2.5841, time 5272.49ms 
iter 6145: loss 2.6890, time 5258.65ms 
iter 6146: loss 2.6269, time 5267.73ms 
iter 6147: loss 2.4890, time 5261.75ms 
iter 6148: loss 2.7739, time 5278.51ms 
iter 6149: loss 2.6525, time 5274.58ms 
step 6150: train loss 2.5402, val loss 2.8405
iter 6150: loss 2.3983, time 20115.20ms 
iter 6151: loss 2.6838, time 5257.17ms 
iter 6152: loss 2.5845, time 5261.28ms 
iter 6153: loss 2.5657, time 5272.02ms 
iter 6154: loss 2.6008, time 5268.85ms 
iter 6155: loss 2.5233, time 5262.49ms 
iter 6156: loss 2.5924, time 5261.66ms 
iter 6157: loss 2.5211, time 5264.98ms 
iter 6158: loss 2.2035, time 5273.99ms 
iter 6159: loss 2.5627, time 5268.99ms 
iter 6160: loss 2.6007, time 5270.26ms 
iter 6161: loss 2.5134, time 5274.31ms 
iter 6162: loss 2.5068, time 5277.18ms 
iter 6163: loss 2.6916, time 5276.41ms 
iter 6164: loss 2.7336, time 5280.29ms 
iter 6165: loss 2.5692, time 5269.72ms 
iter 6166: loss 2.3920, time 5318.36ms 
iter 6167: loss 2.6530, time 5402.01ms 
iter 6168: loss 2.4582, time 5412.25ms 
iter 6169: loss 2.6632, time 5427.94ms 
iter 6170: loss 2.7218, time 5429.97ms 
iter 6171: loss 2.5712, time 5405.48ms 
iter 6172: loss 2.5945, time 5435.72ms 
iter 6173: loss 2.6507, time 5425.94ms 
iter 6174: loss 2.4606, time 5437.12ms 
iter 6175: loss 2.4457, time 5424.63ms 
iter 6176: loss 2.4643, time 5425.05ms 
iter 6177: loss 2.5101, time 5429.13ms 
iter 6178: loss 2.3882, time 5443.77ms 
iter 6179: loss 2.6439, time 5420.77ms 
iter 6180: loss 2.6831, time 5253.02ms 
iter 6181: loss 2.3519, time 5271.69ms 
iter 6182: loss 2.6285, time 5271.01ms 
iter 6183: loss 2.6892, time 5262.51ms 
iter 6184: loss 2.6876, time 5253.94ms 
iter 6185: loss 2.5336, time 5273.07ms 
iter 6186: loss 2.6185, time 5283.46ms 
iter 6187: loss 2.4963, time 5279.44ms 
iter 6188: loss 2.2918, time 5266.69ms 
iter 6189: loss 2.6045, time 5268.25ms 
iter 6190: loss 2.5184, time 5271.90ms 
iter 6191: loss 2.5776, time 5321.06ms 
iter 6192: loss 2.4789, time 5277.40ms 
iter 6193: loss 2.4832, time 5430.37ms 
iter 6194: loss 2.5033, time 5276.33ms 
iter 6195: loss 2.5756, time 5277.34ms 
iter 6196: loss 2.6176, time 5263.92ms 
iter 6197: loss 2.5416, time 5268.39ms 
iter 6198: loss 2.3860, time 5252.17ms 
iter 6199: loss 2.5629, time 5258.92ms 
step 6200: train loss 2.5385, val loss 2.8409
iter 6200: loss 2.6202, time 20124.86ms 
iter 6201: loss 2.5159, time 5261.97ms 
iter 6202: loss 2.4851, time 5259.02ms 
iter 6203: loss 2.7800, time 5258.12ms 
iter 6204: loss 2.6199, time 5256.01ms 
iter 6205: loss 2.5634, time 5269.90ms 
iter 6206: loss 2.6053, time 5260.05ms 
iter 6207: loss 2.4915, time 5254.12ms 
iter 6208: loss 2.6849, time 5253.26ms 
iter 6209: loss 2.4750, time 5263.74ms 
iter 6210: loss 2.5967, time 5266.70ms 
iter 6211: loss 2.6623, time 5258.75ms 
iter 6212: loss 2.7384, time 5268.65ms 
iter 6213: loss 2.4583, time 5261.65ms 
iter 6214: loss 2.4946, time 5274.78ms 
iter 6215: loss 2.3044, time 5262.06ms 
iter 6216: loss 2.5090, time 5265.84ms 
iter 6217: loss 2.6095, time 5252.10ms 
iter 6218: loss 2.7136, time 5233.03ms 
iter 6219: loss 2.5222, time 5254.94ms 
iter 6220: loss 2.6058, time 5254.28ms 
iter 6221: loss 2.3942, time 5253.28ms 
iter 6222: loss 2.5807, time 5258.40ms 
iter 6223: loss 2.6528, time 5263.31ms 
iter 6224: loss 2.5203, time 5259.60ms 
iter 6225: loss 2.3324, time 5267.48ms 
iter 6226: loss 2.5600, time 5267.99ms 
iter 6227: loss 2.6020, time 5281.21ms 
iter 6228: loss 2.5582, time 5283.58ms 
iter 6229: loss 2.5962, time 5259.65ms 
iter 6230: loss 2.8183, time 5267.26ms 
iter 6231: loss 2.5607, time 5264.70ms 
iter 6232: loss 2.5937, time 5264.77ms 
iter 6233: loss 2.6721, time 5261.31ms 
iter 6234: loss 2.3491, time 5262.31ms 
iter 6235: loss 2.6884, time 5262.71ms 
iter 6236: loss 2.7445, time 5258.37ms 
iter 6237: loss 2.4380, time 5252.38ms 
iter 6238: loss 2.4044, time 5253.45ms 
iter 6239: loss 2.6851, time 5256.94ms 
iter 6240: loss 2.4684, time 5274.51ms 
iter 6241: loss 2.1979, time 5262.03ms 
iter 6242: loss 2.6239, time 5258.36ms 
iter 6243: loss 2.4202, time 5273.46ms 
iter 6244: loss 2.6179, time 5275.09ms 
iter 6245: loss 2.4727, time 5333.75ms 
iter 6246: loss 2.7221, time 5286.42ms 
iter 6247: loss 2.4226, time 5299.83ms 
iter 6248: loss 2.5179, time 5304.55ms 
iter 6249: loss 2.2394, time 5269.37ms 
step 6250: train loss 2.5386, val loss 2.8460
iter 6250: loss 2.5214, time 20159.38ms 
iter 6251: loss 2.5509, time 5255.35ms 
iter 6252: loss 2.6103, time 5257.66ms 
iter 6253: loss 2.7381, time 5265.93ms 
iter 6254: loss 2.5423, time 5257.26ms 
iter 6255: loss 2.4876, time 5258.54ms 
iter 6256: loss 2.3651, time 5263.98ms 
iter 6257: loss 2.4700, time 5264.23ms 
iter 6258: loss 2.6039, time 5255.96ms 
iter 6259: loss 2.4936, time 5252.34ms 
iter 6260: loss 2.3197, time 5263.40ms 
iter 6261: loss 2.6017, time 5266.89ms 
iter 6262: loss 2.4863, time 5269.61ms 
iter 6263: loss 2.8419, time 5261.32ms 
iter 6264: loss 2.5906, time 5269.02ms 
iter 6265: loss 2.4478, time 5267.06ms 
iter 6266: loss 2.7115, time 5273.03ms 
iter 6267: loss 2.4901, time 5266.08ms 
iter 6268: loss 2.7268, time 5264.61ms 
iter 6269: loss 2.4806, time 5268.14ms 
iter 6270: loss 2.4263, time 5254.46ms 
iter 6271: loss 2.5168, time 5263.99ms 
iter 6272: loss 2.4697, time 5255.92ms 
iter 6273: loss 2.4664, time 5249.95ms 
iter 6274: loss 2.4656, time 5225.78ms 
iter 6275: loss 2.7719, time 5265.07ms 
iter 6276: loss 2.5919, time 5269.50ms 
iter 6277: loss 2.5006, time 5249.57ms 
iter 6278: loss 2.5338, time 5262.48ms 
iter 6279: loss 2.4025, time 5230.34ms 
iter 6280: loss 2.4949, time 5265.16ms 
iter 6281: loss 2.6319, time 5257.94ms 
iter 6282: loss 2.3885, time 5245.92ms 
iter 6283: loss 2.3958, time 5262.88ms 
iter 6284: loss 2.3273, time 5262.41ms 
iter 6285: loss 2.2215, time 5259.34ms 
iter 6286: loss 2.4150, time 5250.28ms 
iter 6287: loss 2.4189, time 5259.81ms 
iter 6288: loss 2.5207, time 5258.10ms 
iter 6289: loss 2.4281, time 5268.74ms 
iter 6290: loss 2.4360, time 5257.19ms 
iter 6291: loss 2.3247, time 5256.78ms 
iter 6292: loss 2.7460, time 5253.26ms 
iter 6293: loss 2.5590, time 5278.21ms 
iter 6294: loss 2.4288, time 5283.95ms 
iter 6295: loss 2.2426, time 5274.04ms 
iter 6296: loss 2.5646, time 5262.32ms 
iter 6297: loss 2.5089, time 5267.19ms 
iter 6298: loss 2.7044, time 5267.50ms 
iter 6299: loss 2.4422, time 5262.58ms 
step 6300: train loss 2.5347, val loss 2.8286
iter 6300: loss 2.6546, time 20089.59ms 
iter 6301: loss 2.5076, time 5213.44ms 
iter 6302: loss 2.6342, time 5215.11ms 
iter 6303: loss 2.5730, time 5232.75ms 
iter 6304: loss 2.5085, time 5276.77ms 
iter 6305: loss 2.6494, time 5237.05ms 
iter 6306: loss 2.6319, time 5249.19ms 
iter 6307: loss 2.6295, time 5237.77ms 
iter 6308: loss 2.5630, time 5253.66ms 
iter 6309: loss 2.6164, time 5248.30ms 
iter 6310: loss 2.5361, time 5239.94ms 
iter 6311: loss 2.6680, time 5249.78ms 
iter 6312: loss 2.4401, time 5271.94ms 
iter 6313: loss 2.5065, time 5276.60ms 
iter 6314: loss 2.4250, time 5274.43ms 
iter 6315: loss 2.4704, time 5289.80ms 
iter 6316: loss 2.6081, time 5290.92ms 
iter 6317: loss 2.4519, time 5290.30ms 
iter 6318: loss 2.5487, time 5265.03ms 
iter 6319: loss 2.6613, time 5271.03ms 
iter 6320: loss 2.3080, time 5257.21ms 
iter 6321: loss 2.6794, time 5268.52ms 
iter 6322: loss 2.5248, time 5278.10ms 
iter 6323: loss 2.4833, time 5263.14ms 
iter 6324: loss 2.4808, time 5274.46ms 
iter 6325: loss 2.4365, time 5285.01ms 
iter 6326: loss 2.5373, time 5254.18ms 
iter 6327: loss 2.6286, time 5263.34ms 
iter 6328: loss 2.6697, time 5253.86ms 
iter 6329: loss 2.6267, time 5254.78ms 
iter 6330: loss 2.3849, time 5257.47ms 
iter 6331: loss 2.5694, time 5248.24ms 
iter 6332: loss 2.6333, time 5252.26ms 
iter 6333: loss 2.5891, time 5248.02ms 
iter 6334: loss 2.3450, time 5266.52ms 
iter 6335: loss 2.4061, time 5251.20ms 
iter 6336: loss 2.5261, time 5253.55ms 
iter 6337: loss 2.3896, time 5257.94ms 
iter 6338: loss 2.5826, time 5266.02ms 
iter 6339: loss 2.5458, time 5264.80ms 
iter 6340: loss 2.7690, time 5258.28ms 
iter 6341: loss 2.4607, time 5353.78ms 
iter 6342: loss 2.4241, time 5398.98ms 
iter 6343: loss 2.5033, time 5361.75ms 
iter 6344: loss 2.4300, time 5262.61ms 
iter 6345: loss 2.4811, time 5268.46ms 
iter 6346: loss 2.6677, time 5358.27ms 
iter 6347: loss 2.4624, time 5424.92ms 
iter 6348: loss 2.6260, time 5367.71ms 
iter 6349: loss 2.5979, time 5255.26ms 
step 6350: train loss 2.5372, val loss 2.8473
iter 6350: loss 2.4989, time 20076.37ms 
iter 6351: loss 2.4962, time 5263.04ms 
iter 6352: loss 2.4377, time 5273.53ms 
iter 6353: loss 2.5592, time 5272.17ms 
iter 6354: loss 2.3664, time 5261.38ms 
iter 6355: loss 2.7975, time 5256.52ms 
iter 6356: loss 2.4646, time 5288.27ms 
iter 6357: loss 2.5378, time 5325.89ms 
iter 6358: loss 2.3721, time 5288.60ms 
iter 6359: loss 2.3425, time 5312.03ms 
iter 6360: loss 2.4051, time 5271.00ms 
iter 6361: loss 2.8253, time 5257.92ms 
iter 6362: loss 2.7153, time 5235.57ms 
iter 6363: loss 2.4236, time 5256.70ms 
iter 6364: loss 2.6363, time 5252.37ms 
iter 6365: loss 2.5305, time 5262.22ms 
iter 6366: loss 2.7444, time 5267.84ms 
iter 6367: loss 2.4559, time 5261.68ms 
iter 6368: loss 2.5074, time 5312.33ms 
iter 6369: loss 2.4624, time 5322.50ms 
iter 6370: loss 2.6357, time 5298.09ms 
iter 6371: loss 2.5797, time 5263.40ms 
iter 6372: loss 2.4786, time 5257.05ms 
iter 6373: loss 2.8177, time 5265.22ms 
iter 6374: loss 2.6046, time 5263.05ms 
iter 6375: loss 2.5142, time 5253.03ms 
iter 6376: loss 2.2879, time 5254.36ms 
iter 6377: loss 2.5651, time 5267.04ms 
iter 6378: loss 2.4009, time 5270.89ms 
iter 6379: loss 2.4204, time 5256.82ms 
iter 6380: loss 2.4585, time 5266.94ms 
iter 6381: loss 2.5560, time 5245.13ms 
iter 6382: loss 2.7506, time 5269.90ms 
iter 6383: loss 2.6643, time 5257.65ms 
iter 6384: loss 2.3690, time 5268.42ms 
iter 6385: loss 2.4858, time 5265.65ms 
iter 6386: loss 2.6724, time 5294.07ms 
iter 6387: loss 2.4142, time 5273.44ms 
iter 6388: loss 2.4872, time 5260.04ms 
iter 6389: loss 2.6779, time 5267.72ms 
iter 6390: loss 2.7590, time 5272.13ms 
iter 6391: loss 2.3154, time 5266.00ms 
iter 6392: loss 2.7151, time 5258.73ms 
iter 6393: loss 2.3006, time 5262.40ms 
iter 6394: loss 2.3969, time 5266.26ms 
iter 6395: loss 2.4564, time 5273.28ms 
iter 6396: loss 2.5010, time 5231.62ms 
iter 6397: loss 2.6095, time 5269.04ms 
iter 6398: loss 2.6059, time 5265.21ms 
iter 6399: loss 2.6376, time 5265.51ms 
step 6400: train loss 2.5510, val loss 2.8349
iter 6400: loss 2.4412, time 20001.37ms 
iter 6401: loss 2.5681, time 5257.87ms 
iter 6402: loss 2.5689, time 5254.33ms 
iter 6403: loss 2.3818, time 5326.04ms 
iter 6404: loss 2.6759, time 5332.36ms 
iter 6405: loss 2.5139, time 5313.14ms 
iter 6406: loss 2.2722, time 5295.88ms 
iter 6407: loss 2.2585, time 5274.84ms 
iter 6408: loss 2.5469, time 5330.70ms 
iter 6409: loss 2.6183, time 5307.22ms 
iter 6410: loss 2.5104, time 5255.41ms 
iter 6411: loss 2.3965, time 5260.51ms 
iter 6412: loss 2.5388, time 5277.20ms 
iter 6413: loss 2.6780, time 5272.62ms 
iter 6414: loss 2.6173, time 5262.51ms 
iter 6415: loss 2.8352, time 5258.93ms 
iter 6416: loss 2.5363, time 5264.48ms 
iter 6417: loss 2.5961, time 5272.82ms 
iter 6418: loss 2.3793, time 5264.62ms 
iter 6419: loss 2.2546, time 5265.29ms 
iter 6420: loss 2.4635, time 5260.19ms 
iter 6421: loss 2.6158, time 5260.96ms 
iter 6422: loss 2.6629, time 5275.15ms 
iter 6423: loss 2.5749, time 5255.71ms 
iter 6424: loss 2.5071, time 5259.23ms 
iter 6425: loss 2.3560, time 5257.82ms 
iter 6426: loss 2.5510, time 5263.33ms 
iter 6427: loss 2.5896, time 5263.58ms 
iter 6428: loss 2.5437, time 5252.60ms 
iter 6429: loss 2.2325, time 5252.13ms 
iter 6430: loss 2.3825, time 5264.48ms 
iter 6431: loss 2.5112, time 5264.58ms 
iter 6432: loss 2.1994, time 5259.94ms 
iter 6433: loss 2.4364, time 5262.50ms 
iter 6434: loss 2.5101, time 5260.66ms 
iter 6435: loss 2.4971, time 5263.95ms 
iter 6436: loss 2.4112, time 5264.11ms 
iter 6437: loss 2.3795, time 5262.94ms 
iter 6438: loss 2.6556, time 5263.86ms 
iter 6439: loss 2.5566, time 5263.10ms 
iter 6440: loss 2.4111, time 5268.04ms 
iter 6441: loss 2.5874, time 5268.67ms 
iter 6442: loss 2.4538, time 5264.20ms 
iter 6443: loss 2.4458, time 5258.34ms 
iter 6444: loss 2.5274, time 5275.74ms 
iter 6445: loss 2.6419, time 5273.08ms 
iter 6446: loss 2.4292, time 5267.00ms 
iter 6447: loss 2.4978, time 5270.78ms 
iter 6448: loss 2.5927, time 5262.00ms 
iter 6449: loss 2.4748, time 5270.08ms 
step 6450: train loss 2.5218, val loss 2.8298
iter 6450: loss 2.5378, time 20086.23ms 
iter 6451: loss 2.5639, time 5257.42ms 
iter 6452: loss 2.5983, time 5266.76ms 
iter 6453: loss 2.6854, time 5252.99ms 
iter 6454: loss 2.6434, time 5262.13ms 
iter 6455: loss 2.5183, time 5259.20ms 
iter 6456: loss 2.5164, time 5268.55ms 
iter 6457: loss 2.5738, time 5266.25ms 
iter 6458: loss 3.0494, time 5276.19ms 
iter 6459: loss 2.4423, time 5265.85ms 
iter 6460: loss 2.6599, time 5259.84ms 
iter 6461: loss 2.4802, time 5268.93ms 
iter 6462: loss 2.4587, time 5276.31ms 
iter 6463: loss 2.5777, time 5260.12ms 
iter 6464: loss 2.3850, time 5259.50ms 
iter 6465: loss 2.6101, time 5255.02ms 
iter 6466: loss 2.5057, time 5267.43ms 
iter 6467: loss 2.4525, time 5270.16ms 
iter 6468: loss 2.2959, time 5253.87ms 
iter 6469: loss 2.4690, time 5253.24ms 
iter 6470: loss 2.4259, time 5263.61ms 
iter 6471: loss 2.3815, time 5259.22ms 
iter 6472: loss 2.6390, time 5250.00ms 
iter 6473: loss 2.6959, time 5252.63ms 
iter 6474: loss 2.7455, time 5260.82ms 
iter 6475: loss 2.3298, time 5261.74ms 
iter 6476: loss 2.5556, time 5254.28ms 
iter 6477: loss 2.5657, time 5259.89ms 
iter 6478: loss 2.3845, time 5258.92ms 
iter 6479: loss 2.4854, time 5268.95ms 
iter 6480: loss 2.5756, time 5268.28ms 
iter 6481: loss 2.6456, time 5236.70ms 
iter 6482: loss 2.5116, time 5258.28ms 
iter 6483: loss 2.5688, time 5225.37ms 
iter 6484: loss 2.2970, time 5266.68ms 
iter 6485: loss 2.3678, time 5259.15ms 
iter 6486: loss 2.5288, time 5260.91ms 
iter 6487: loss 2.6665, time 5258.71ms 
iter 6488: loss 2.5467, time 5271.79ms 
iter 6489: loss 2.4986, time 5264.85ms 
iter 6490: loss 2.4085, time 5254.44ms 
iter 6491: loss 2.3432, time 5259.26ms 
iter 6492: loss 2.5511, time 5275.21ms 
iter 6493: loss 2.4257, time 5254.54ms 
iter 6494: loss 2.5875, time 5267.90ms 
iter 6495: loss 2.4983, time 5272.82ms 
iter 6496: loss 2.4544, time 5273.42ms 
iter 6497: loss 2.6202, time 5262.88ms 
iter 6498: loss 2.5744, time 5265.96ms 
iter 6499: loss 2.5911, time 5271.50ms 
step 6500: train loss 2.5206, val loss 2.8296
iter 6500: loss 2.4159, time 20108.59ms 
iter 6501: loss 2.4286, time 5275.77ms 
iter 6502: loss 2.4437, time 5265.89ms 
iter 6503: loss 2.7169, time 5266.07ms 
iter 6504: loss 2.5913, time 5265.82ms 
iter 6505: loss 2.5138, time 5284.07ms 
iter 6506: loss 2.5338, time 5273.85ms 
iter 6507: loss 2.6841, time 5272.41ms 
iter 6508: loss 2.7025, time 5424.87ms 
iter 6509: loss 2.6449, time 5363.50ms 
iter 6510: loss 2.6447, time 5281.48ms 
iter 6511: loss 2.4601, time 5267.44ms 
iter 6512: loss 2.3728, time 5178.56ms 
iter 6513: loss 2.4312, time 5359.18ms 
iter 6514: loss 2.6633, time 5249.82ms 
iter 6515: loss 2.5181, time 5395.41ms 
iter 6516: loss 2.4511, time 5301.58ms 
iter 6517: loss 2.7525, time 5252.20ms 
iter 6518: loss 2.2842, time 5279.44ms 
iter 6519: loss 2.4067, time 5256.99ms 
iter 6520: loss 2.6167, time 5432.84ms 
iter 6521: loss 2.4777, time 5335.07ms 
iter 6522: loss 2.5209, time 5256.30ms 
iter 6523: loss 2.5221, time 5246.25ms 
iter 6524: loss 2.3922, time 5244.76ms 
iter 6525: loss 2.0278, time 5258.98ms 
iter 6526: loss 2.4383, time 5267.29ms 
iter 6527: loss 2.6475, time 5269.25ms 
iter 6528: loss 2.6230, time 5255.96ms 
iter 6529: loss 2.5974, time 5255.16ms 
iter 6530: loss 2.6373, time 5265.72ms 
iter 6531: loss 2.6091, time 5257.91ms 
iter 6532: loss 2.5561, time 5254.97ms 
iter 6533: loss 2.4059, time 5258.13ms 
iter 6534: loss 2.5689, time 5267.44ms 
iter 6535: loss 2.5611, time 5254.68ms 
iter 6536: loss 2.7298, time 5256.87ms 
iter 6537: loss 2.5468, time 5248.61ms 
iter 6538: loss 2.4256, time 5243.03ms 
iter 6539: loss 2.3740, time 5250.29ms 
iter 6540: loss 2.5514, time 5247.26ms 
iter 6541: loss 2.2553, time 5248.34ms 
iter 6542: loss 2.4839, time 5265.96ms 
iter 6543: loss 2.5235, time 5264.95ms 
iter 6544: loss 2.6739, time 5255.91ms 
iter 6545: loss 2.6279, time 5259.98ms 
iter 6546: loss 2.5261, time 5276.04ms 
iter 6547: loss 2.7157, time 5273.50ms 
iter 6548: loss 2.4602, time 5250.56ms 
iter 6549: loss 2.6403, time 5252.21ms 
step 6550: train loss 2.5377, val loss 2.8442
iter 6550: loss 2.4493, time 20080.53ms 
iter 6551: loss 2.6605, time 5266.67ms 
iter 6552: loss 2.4177, time 5259.24ms 
iter 6553: loss 2.5784, time 5255.14ms 
iter 6554: loss 2.6824, time 5269.17ms 
iter 6555: loss 2.5866, time 5263.47ms 
iter 6556: loss 2.5329, time 5259.72ms 
iter 6557: loss 2.4487, time 5250.69ms 
iter 6558: loss 2.5029, time 5262.15ms 
iter 6559: loss 2.5843, time 5271.34ms 
iter 6560: loss 2.5175, time 5261.58ms 
iter 6561: loss 2.6341, time 5254.07ms 
iter 6562: loss 2.6411, time 5247.85ms 
iter 6563: loss 2.5940, time 5261.10ms 
iter 6564: loss 2.5275, time 5264.06ms 
iter 6565: loss 2.6305, time 5248.34ms 
iter 6566: loss 2.6552, time 5252.54ms 
iter 6567: loss 2.5687, time 5262.67ms 
iter 6568: loss 2.4529, time 5239.55ms 
iter 6569: loss 2.3072, time 5232.72ms 
iter 6570: loss 2.5192, time 5255.62ms 
iter 6571: loss 2.5282, time 5254.99ms 
iter 6572: loss 2.6434, time 5259.15ms 
iter 6573: loss 2.3423, time 5265.74ms 
iter 6574: loss 2.5318, time 5255.63ms 
iter 6575: loss 2.6368, time 5257.02ms 
iter 6576: loss 2.4292, time 5258.64ms 
iter 6577: loss 2.4291, time 5253.88ms 
iter 6578: loss 2.5118, time 5248.99ms 
iter 6579: loss 2.6137, time 5254.19ms 
iter 6580: loss 2.6475, time 5256.40ms 
iter 6581: loss 2.4200, time 5261.26ms 
iter 6582: loss 2.5081, time 5272.51ms 
iter 6583: loss 2.5100, time 5262.45ms 
iter 6584: loss 2.4585, time 5251.67ms 
iter 6585: loss 2.3684, time 5257.79ms 
iter 6586: loss 2.6489, time 5264.21ms 
iter 6587: loss 2.7785, time 5259.19ms 
iter 6588: loss 2.6860, time 5251.16ms 
iter 6589: loss 2.7187, time 5250.35ms 
iter 6590: loss 2.5959, time 5257.38ms 
iter 6591: loss 2.6153, time 5271.11ms 
iter 6592: loss 2.5632, time 5247.79ms 
iter 6593: loss 2.5242, time 5257.68ms 
iter 6594: loss 2.4676, time 5247.94ms 
iter 6595: loss 2.4523, time 5254.71ms 
iter 6596: loss 2.6266, time 5250.40ms 
iter 6597: loss 2.4740, time 5251.84ms 
iter 6598: loss 2.5302, time 5245.57ms 
iter 6599: loss 2.5596, time 5253.23ms 
step 6600: train loss 2.5263, val loss 2.8398
iter 6600: loss 2.7398, time 20066.14ms 
iter 6601: loss 2.5348, time 5255.98ms 
iter 6602: loss 2.5102, time 5247.19ms 
iter 6603: loss 2.5384, time 5252.53ms 
iter 6604: loss 2.6721, time 5251.33ms 
iter 6605: loss 2.7033, time 5273.93ms 
iter 6606: loss 2.5387, time 5262.47ms 
iter 6607: loss 2.4593, time 5261.00ms 
iter 6608: loss 2.5176, time 5267.47ms 
iter 6609: loss 2.4386, time 5323.00ms 
iter 6610: loss 2.4903, time 5342.70ms 
iter 6611: loss 2.4433, time 5248.15ms 
iter 6612: loss 2.7680, time 5251.28ms 
iter 6613: loss 2.3870, time 5256.78ms 
iter 6614: loss 2.3863, time 5263.37ms 
iter 6615: loss 2.5182, time 5297.03ms 
iter 6616: loss 2.6135, time 5333.04ms 
iter 6617: loss 2.4675, time 5324.23ms 
iter 6618: loss 2.5738, time 5314.36ms 
iter 6619: loss 2.3940, time 5317.25ms 
iter 6620: loss 2.2937, time 5252.52ms 
iter 6621: loss 2.7139, time 5248.71ms 
iter 6622: loss 2.4978, time 5249.26ms 
iter 6623: loss 2.6396, time 5265.18ms 
iter 6624: loss 2.5553, time 5251.75ms 
iter 6625: loss 2.7024, time 5254.29ms 
iter 6626: loss 2.6043, time 5260.86ms 
iter 6627: loss 2.5347, time 5264.63ms 
iter 6628: loss 2.5067, time 5266.14ms 
iter 6629: loss 2.5844, time 5249.11ms 
iter 6630: loss 2.6498, time 5246.55ms 
iter 6631: loss 2.7904, time 5228.79ms 
iter 6632: loss 2.5106, time 5228.42ms 
iter 6633: loss 2.6256, time 5244.56ms 
iter 6634: loss 2.4861, time 5250.67ms 
iter 6635: loss 2.4832, time 5238.78ms 
iter 6636: loss 2.5300, time 5242.04ms 
iter 6637: loss 2.6925, time 5254.43ms 
iter 6638: loss 2.6145, time 5249.53ms 
iter 6639: loss 2.6470, time 5252.15ms 
iter 6640: loss 2.4466, time 5261.58ms 
iter 6641: loss 2.6647, time 5269.71ms 
iter 6642: loss 2.6292, time 5254.16ms 
iter 6643: loss 2.4649, time 5259.61ms 
iter 6644: loss 2.4048, time 5259.28ms 
iter 6645: loss 2.5388, time 5263.04ms 
iter 6646: loss 2.8160, time 5263.56ms 
iter 6647: loss 2.2697, time 5255.88ms 
iter 6648: loss 2.5801, time 5249.16ms 
iter 6649: loss 2.6169, time 5259.44ms 
step 6650: train loss 2.5220, val loss 2.8226
iter 6650: loss 2.5046, time 20072.37ms 
iter 6651: loss 2.4875, time 5241.91ms 
iter 6652: loss 2.6524, time 5245.60ms 
iter 6653: loss 2.6445, time 5255.89ms 
iter 6654: loss 2.4330, time 5261.03ms 
iter 6655: loss 2.7103, time 5258.32ms 
iter 6656: loss 2.5128, time 5256.13ms 
iter 6657: loss 2.6665, time 5243.92ms 
iter 6658: loss 2.5462, time 5252.91ms 
iter 6659: loss 2.6239, time 5265.92ms 
iter 6660: loss 2.7257, time 5245.29ms 
iter 6661: loss 2.4282, time 5234.92ms 
iter 6662: loss 2.4829, time 5248.06ms 
iter 6663: loss 2.4644, time 5256.15ms 
iter 6664: loss 2.4353, time 5252.02ms 
iter 6665: loss 2.7315, time 5253.80ms 
iter 6666: loss 2.7205, time 5224.67ms 
iter 6667: loss 2.4195, time 5262.62ms 
iter 6668: loss 2.4673, time 5254.15ms 
iter 6669: loss 2.4654, time 5245.65ms 
iter 6670: loss 2.5573, time 5253.93ms 
iter 6671: loss 2.7400, time 5261.06ms 
iter 6672: loss 2.5810, time 5266.57ms 
iter 6673: loss 2.3904, time 5268.14ms 
iter 6674: loss 2.4194, time 5238.66ms 
iter 6675: loss 2.6786, time 5259.51ms 
iter 6676: loss 2.6078, time 5205.85ms 
iter 6677: loss 2.7674, time 5332.36ms 
iter 6678: loss 2.6572, time 5312.55ms 
iter 6679: loss 2.6331, time 5265.80ms 
iter 6680: loss 2.7119, time 5272.65ms 
iter 6681: loss 2.6343, time 5265.90ms 
iter 6682: loss 2.4573, time 5257.59ms 
iter 6683: loss 2.4390, time 5262.92ms 
iter 6684: loss 2.5355, time 5260.72ms 
iter 6685: loss 2.4238, time 5277.40ms 
iter 6686: loss 2.7807, time 5262.19ms 
iter 6687: loss 2.5213, time 5247.46ms 
iter 6688: loss 2.6418, time 5262.97ms 
iter 6689: loss 2.2865, time 5268.18ms 
iter 6690: loss 2.4077, time 5262.22ms 
iter 6691: loss 2.4613, time 5270.38ms 
iter 6692: loss 2.3246, time 5273.77ms 
iter 6693: loss 2.4840, time 5273.34ms 
iter 6694: loss 2.5300, time 5281.81ms 
iter 6695: loss 2.4962, time 5277.57ms 
iter 6696: loss 2.5058, time 5294.27ms 
iter 6697: loss 2.5602, time 5262.27ms 
iter 6698: loss 2.7832, time 5262.14ms 
iter 6699: loss 2.5370, time 5268.19ms 
step 6700: train loss 2.5243, val loss 2.8236
iter 6700: loss 2.7890, time 20089.24ms 
iter 6701: loss 2.4496, time 5267.81ms 
iter 6702: loss 2.6261, time 5260.44ms 
iter 6703: loss 2.3951, time 5246.31ms 
iter 6704: loss 2.4323, time 5269.92ms 
iter 6705: loss 2.5054, time 5272.97ms 
iter 6706: loss 2.3189, time 5269.50ms 
iter 6707: loss 2.6015, time 5285.59ms 
iter 6708: loss 2.4919, time 5272.57ms 
iter 6709: loss 2.5246, time 5271.88ms 
iter 6710: loss 2.3749, time 5273.45ms 
iter 6711: loss 2.3460, time 5267.52ms 
iter 6712: loss 2.6493, time 5265.09ms 
iter 6713: loss 2.4333, time 5260.23ms 
iter 6714: loss 2.2096, time 5264.02ms 
iter 6715: loss 2.5970, time 5265.35ms 
iter 6716: loss 2.5685, time 5266.18ms 
iter 6717: loss 2.6112, time 5267.94ms 
iter 6718: loss 2.5187, time 5269.21ms 
iter 6719: loss 2.4299, time 5261.58ms 
iter 6720: loss 2.4731, time 5261.50ms 
iter 6721: loss 2.5851, time 5279.24ms 
iter 6722: loss 2.3810, time 5277.82ms 
iter 6723: loss 2.5516, time 5268.90ms 
iter 6724: loss 2.4313, time 5260.50ms 
iter 6725: loss 2.5621, time 5261.31ms 
iter 6726: loss 2.3009, time 5254.82ms 
iter 6727: loss 2.5263, time 5278.50ms 
iter 6728: loss 2.6568, time 5259.34ms 
iter 6729: loss 2.5269, time 5241.42ms 
iter 6730: loss 2.5322, time 5263.24ms 
iter 6731: loss 2.6115, time 5268.54ms 
iter 6732: loss 2.6066, time 5264.86ms 
iter 6733: loss 2.5300, time 5256.08ms 
iter 6734: loss 2.4339, time 5255.32ms 
iter 6735: loss 2.5097, time 5261.87ms 
iter 6736: loss 2.6251, time 5257.26ms 
iter 6737: loss 2.4898, time 5255.90ms 
iter 6738: loss 2.6172, time 5262.04ms 
iter 6739: loss 2.4468, time 5255.91ms 
iter 6740: loss 2.6040, time 5268.21ms 
iter 6741: loss 2.3345, time 5277.38ms 
iter 6742: loss 2.7756, time 5272.52ms 
iter 6743: loss 2.5991, time 5265.89ms 
iter 6744: loss 2.5164, time 5250.61ms 
iter 6745: loss 2.5064, time 5270.63ms 
iter 6746: loss 2.4001, time 5262.74ms 
iter 6747: loss 2.7030, time 5257.73ms 
iter 6748: loss 2.5482, time 5256.49ms 
iter 6749: loss 2.4324, time 5268.50ms 
step 6750: train loss 2.5182, val loss 2.8573
iter 6750: loss 2.3833, time 19993.37ms 
iter 6751: loss 2.5833, time 5257.55ms 
iter 6752: loss 2.4821, time 5260.15ms 
iter 6753: loss 2.3050, time 5259.63ms 
iter 6754: loss 2.5935, time 5266.43ms 
iter 6755: loss 2.7195, time 5272.77ms 
iter 6756: loss 2.7102, time 5266.74ms 
iter 6757: loss 2.6550, time 5261.67ms 
iter 6758: loss 2.8544, time 5264.13ms 
iter 6759: loss 2.5611, time 5273.15ms 
iter 6760: loss 2.3157, time 5265.86ms 
iter 6761: loss 2.4920, time 5253.17ms 
iter 6762: loss 2.6433, time 5256.77ms 
iter 6763: loss 2.2305, time 5266.02ms 
iter 6764: loss 2.4426, time 5272.41ms 
iter 6765: loss 2.8332, time 5261.10ms 
iter 6766: loss 2.4171, time 5260.04ms 
iter 6767: loss 2.5972, time 5254.94ms 
iter 6768: loss 2.3261, time 5265.52ms 
iter 6769: loss 2.5715, time 5264.19ms 
iter 6770: loss 2.4749, time 5260.85ms 
iter 6771: loss 2.6240, time 5268.17ms 
iter 6772: loss 2.4526, time 5264.09ms 
iter 6773: loss 2.2880, time 5267.15ms 
iter 6774: loss 2.4102, time 5259.54ms 
iter 6775: loss 2.6638, time 5269.83ms 
iter 6776: loss 2.5189, time 5266.58ms 
iter 6777: loss 2.6235, time 5273.75ms 
iter 6778: loss 2.4249, time 5259.65ms 
iter 6779: loss 2.7119, time 5254.08ms 
iter 6780: loss 2.5321, time 5256.05ms 
iter 6781: loss 2.4991, time 5268.63ms 
iter 6782: loss 2.6096, time 5250.55ms 
iter 6783: loss 2.5733, time 5255.46ms 
iter 6784: loss 2.4162, time 5257.93ms 
iter 6785: loss 2.3267, time 5264.17ms 
iter 6786: loss 2.7139, time 5266.65ms 
iter 6787: loss 2.5965, time 5261.45ms 
iter 6788: loss 2.7831, time 5251.65ms 
iter 6789: loss 2.7247, time 5235.23ms 
iter 6790: loss 2.6482, time 5252.01ms 
iter 6791: loss 2.6425, time 5262.10ms 
iter 6792: loss 2.4103, time 5249.86ms 
iter 6793: loss 2.6621, time 5263.09ms 
iter 6794: loss 2.5874, time 5264.78ms 
iter 6795: loss 2.4618, time 5250.74ms 
iter 6796: loss 2.6760, time 5248.69ms 
iter 6797: loss 2.4429, time 5265.82ms 
iter 6798: loss 2.6764, time 5273.85ms 
iter 6799: loss 2.6085, time 5279.56ms 
step 6800: train loss 2.5177, val loss 2.8549
iter 6800: loss 2.7277, time 19907.73ms 
iter 6801: loss 2.5615, time 5255.13ms 
iter 6802: loss 2.4860, time 5261.16ms 
iter 6803: loss 2.6330, time 5307.24ms 
iter 6804: loss 2.4272, time 5259.91ms 
iter 6805: loss 2.5003, time 5257.25ms 
iter 6806: loss 2.6350, time 5256.39ms 
iter 6807: loss 2.5746, time 5263.84ms 
iter 6808: loss 2.4505, time 5265.67ms 
iter 6809: loss 2.3083, time 5280.96ms 
iter 6810: loss 2.8062, time 5272.38ms 
iter 6811: loss 2.5343, time 5258.85ms 
iter 6812: loss 2.6712, time 5264.57ms 
iter 6813: loss 2.5404, time 5272.34ms 
iter 6814: loss 2.5004, time 5269.16ms 
iter 6815: loss 2.4140, time 5268.80ms 
iter 6816: loss 2.3597, time 5269.75ms 
iter 6817: loss 2.4054, time 5264.48ms 
iter 6818: loss 2.3818, time 5276.24ms 
iter 6819: loss 2.2727, time 5262.53ms 
iter 6820: loss 2.6214, time 5269.05ms 
iter 6821: loss 2.3369, time 5259.49ms 
iter 6822: loss 2.6402, time 5263.79ms 
iter 6823: loss 2.2878, time 5275.91ms 
iter 6824: loss 2.5711, time 5258.76ms 
iter 6825: loss 2.4670, time 5259.23ms 
iter 6826: loss 2.5532, time 5268.05ms 
iter 6827: loss 2.5908, time 5268.96ms 
iter 6828: loss 2.6480, time 5255.29ms 
iter 6829: loss 2.4558, time 5261.56ms 
iter 6830: loss 2.1607, time 5252.38ms 
iter 6831: loss 2.4705, time 5266.81ms 
iter 6832: loss 2.4600, time 5260.81ms 
iter 6833: loss 2.5299, time 5260.10ms 
iter 6834: loss 2.5982, time 5259.87ms 
iter 6835: loss 2.4809, time 5258.94ms 
iter 6836: loss 2.4713, time 5269.52ms 
iter 6837: loss 2.5434, time 5259.33ms 
iter 6838: loss 2.6059, time 5258.62ms 
iter 6839: loss 2.6333, time 5240.83ms 
iter 6840: loss 2.3964, time 5262.17ms 
iter 6841: loss 2.6395, time 5265.55ms 
iter 6842: loss 2.6030, time 5253.60ms 
iter 6843: loss 2.5121, time 5254.01ms 
iter 6844: loss 2.5584, time 5261.98ms 
iter 6845: loss 2.2418, time 5258.78ms 
iter 6846: loss 2.4615, time 5257.38ms 
iter 6847: loss 2.6214, time 5260.02ms 
iter 6848: loss 2.4242, time 5226.98ms 
iter 6849: loss 2.6144, time 5267.07ms 
step 6850: train loss 2.5050, val loss 2.8496
iter 6850: loss 2.4346, time 20107.49ms 
iter 6851: loss 2.3899, time 5262.39ms 
iter 6852: loss 2.6242, time 5265.79ms 
iter 6853: loss 2.5073, time 5259.21ms 
iter 6854: loss 2.5056, time 5237.37ms 
iter 6855: loss 2.4217, time 5261.65ms 
iter 6856: loss 2.5289, time 5253.89ms 
iter 6857: loss 2.6344, time 5259.10ms 
iter 6858: loss 2.6254, time 5261.36ms 
iter 6859: loss 2.6280, time 5272.67ms 
iter 6860: loss 2.5556, time 5258.93ms 
iter 6861: loss 2.4166, time 5255.74ms 
iter 6862: loss 2.5121, time 5262.90ms 
iter 6863: loss 2.6627, time 5260.60ms 
iter 6864: loss 2.5280, time 5234.04ms 
iter 6865: loss 2.3926, time 5259.15ms 
iter 6866: loss 2.3638, time 5251.95ms 
iter 6867: loss 2.3679, time 5255.72ms 
iter 6868: loss 2.5901, time 5283.14ms 
iter 6869: loss 2.5083, time 5277.49ms 
iter 6870: loss 2.4833, time 5268.51ms 
iter 6871: loss 2.5250, time 5271.21ms 
iter 6872: loss 2.3656, time 5303.76ms 
iter 6873: loss 2.6403, time 5273.86ms 
iter 6874: loss 2.4649, time 5259.34ms 
iter 6875: loss 2.8426, time 5255.77ms 
iter 6876: loss 2.5229, time 5253.67ms 
iter 6877: loss 2.4788, time 5272.44ms 
iter 6878: loss 2.5863, time 5261.87ms 
iter 6879: loss 2.4802, time 5271.83ms 
iter 6880: loss 2.6904, time 5267.35ms 
iter 6881: loss 2.5617, time 5271.36ms 
iter 6882: loss 2.5762, time 5259.06ms 
iter 6883: loss 2.6136, time 5254.87ms 
iter 6884: loss 2.6770, time 5254.04ms 
iter 6885: loss 2.6099, time 5258.56ms 
iter 6886: loss 2.5797, time 5270.69ms 
iter 6887: loss 2.3945, time 5268.07ms 
iter 6888: loss 2.3452, time 5267.13ms 
iter 6889: loss 2.6823, time 5250.64ms 
iter 6890: loss 2.6362, time 5265.80ms 
iter 6891: loss 2.5661, time 5252.27ms 
iter 6892: loss 2.4799, time 5257.01ms 
iter 6893: loss 2.4962, time 5253.43ms 
iter 6894: loss 2.3982, time 5257.18ms 
iter 6895: loss 2.5558, time 5255.15ms 
iter 6896: loss 2.5630, time 5234.47ms 
iter 6897: loss 2.3787, time 5237.04ms 
iter 6898: loss 2.3643, time 5258.96ms 
iter 6899: loss 2.4190, time 5255.23ms 
step 6900: train loss 2.5069, val loss 2.8362
iter 6900: loss 2.5510, time 20075.70ms 
iter 6901: loss 2.4714, time 5262.19ms 
iter 6902: loss 2.1918, time 5258.53ms 
iter 6903: loss 2.8214, time 5254.54ms 
iter 6904: loss 2.4499, time 5273.86ms 
iter 6905: loss 2.7139, time 5251.13ms 
iter 6906: loss 2.4654, time 5253.93ms 
iter 6907: loss 2.3057, time 5251.16ms 
iter 6908: loss 2.4863, time 5257.15ms 
iter 6909: loss 2.7714, time 5252.15ms 
iter 6910: loss 2.4668, time 5245.78ms 
iter 6911: loss 2.5523, time 5250.06ms 
iter 6912: loss 2.5482, time 5257.59ms 
iter 6913: loss 2.4303, time 5265.08ms 
iter 6914: loss 2.7021, time 5249.25ms 
iter 6915: loss 2.5656, time 5254.31ms 
iter 6916: loss 2.3961, time 5256.40ms 
iter 6917: loss 2.5650, time 5273.89ms 
iter 6918: loss 2.5070, time 5251.82ms 
iter 6919: loss 2.4673, time 5250.78ms 
iter 6920: loss 2.5655, time 5259.90ms 
iter 6921: loss 2.5551, time 5266.98ms 
iter 6922: loss 2.4929, time 5261.78ms 
iter 6923: loss 2.6463, time 5252.81ms 
iter 6924: loss 2.5151, time 5253.17ms 
iter 6925: loss 2.5760, time 5269.38ms 
iter 6926: loss 2.2366, time 5262.97ms 
iter 6927: loss 2.5560, time 5263.19ms 
iter 6928: loss 2.5189, time 5263.72ms 
iter 6929: loss 2.6687, time 5259.84ms 
iter 6930: loss 2.5117, time 5267.35ms 
iter 6931: loss 2.5314, time 5260.49ms 
iter 6932: loss 2.7294, time 5257.29ms 
iter 6933: loss 2.4678, time 5261.23ms 
iter 6934: loss 2.4004, time 5271.39ms 
iter 6935: loss 2.3885, time 5263.70ms 
iter 6936: loss 2.5569, time 5260.72ms 
iter 6937: loss 2.4789, time 5261.43ms 
iter 6938: loss 2.5838, time 5277.06ms 
iter 6939: loss 2.6096, time 5279.94ms 
iter 6940: loss 2.6006, time 5271.64ms 
iter 6941: loss 2.6345, time 5280.19ms 
iter 6942: loss 2.5004, time 5285.11ms 
iter 6943: loss 2.4286, time 5276.82ms 
iter 6944: loss 2.3775, time 5272.21ms 
iter 6945: loss 2.5411, time 5265.48ms 
iter 6946: loss 2.4744, time 5282.04ms 
iter 6947: loss 2.3471, time 5277.17ms 
iter 6948: loss 2.4093, time 5260.90ms 
iter 6949: loss 2.5486, time 5262.25ms 
step 6950: train loss 2.5184, val loss 2.8395
iter 6950: loss 2.4486, time 20110.01ms 
iter 6951: loss 2.5294, time 5273.21ms 
iter 6952: loss 2.5264, time 5267.88ms 
iter 6953: loss 2.4451, time 5267.08ms 
iter 6954: loss 2.5463, time 5257.09ms 
iter 6955: loss 2.4213, time 5270.14ms 
iter 6956: loss 2.6909, time 5268.01ms 
iter 6957: loss 2.5223, time 5260.58ms 
iter 6958: loss 2.4026, time 5253.84ms 
iter 6959: loss 2.4010, time 5276.59ms 
iter 6960: loss 2.6771, time 5267.37ms 
iter 6961: loss 2.8159, time 5266.22ms 
iter 6962: loss 2.5289, time 5259.22ms 
iter 6963: loss 2.6580, time 5265.36ms 
iter 6964: loss 2.5568, time 5262.32ms 
iter 6965: loss 2.4929, time 5259.56ms 
iter 6966: loss 2.7964, time 5260.82ms 
iter 6967: loss 2.5759, time 5274.93ms 
iter 6968: loss 2.4964, time 5282.66ms 
iter 6969: loss 2.2757, time 5261.98ms 
iter 6970: loss 2.5725, time 5258.76ms 
iter 6971: loss 2.7435, time 5232.19ms 
iter 6972: loss 2.4767, time 5263.50ms 
iter 6973: loss 2.5306, time 5262.88ms 
iter 6974: loss 2.4212, time 5272.11ms 
iter 6975: loss 2.4731, time 5272.38ms 
iter 6976: loss 2.3858, time 5283.75ms 
iter 6977: loss 2.6359, time 5268.43ms 
iter 6978: loss 2.4182, time 5269.64ms 
iter 6979: loss 2.5146, time 5272.41ms 
iter 6980: loss 2.4524, time 5284.23ms 
iter 6981: loss 2.6269, time 5259.58ms 
iter 6982: loss 2.6588, time 5251.03ms 
iter 6983: loss 2.3975, time 5254.76ms 
iter 6984: loss 2.4657, time 5266.93ms 
iter 6985: loss 2.5121, time 5263.83ms 
iter 6986: loss 2.5029, time 5251.78ms 
iter 6987: loss 2.4540, time 5256.91ms 
iter 6988: loss 2.5924, time 5261.50ms 
iter 6989: loss 2.4985, time 5258.38ms 
iter 6990: loss 2.3572, time 5261.69ms 
iter 6991: loss 2.3584, time 5274.92ms 
iter 6992: loss 2.5547, time 5273.10ms 
iter 6993: loss 2.3419, time 5262.60ms 
iter 6994: loss 2.5340, time 5267.10ms 
iter 6995: loss 2.5494, time 5272.76ms 
iter 6996: loss 2.4615, time 5266.81ms 
iter 6997: loss 2.6933, time 5262.08ms 
iter 6998: loss 2.3192, time 5269.35ms 
iter 6999: loss 2.5509, time 5262.71ms 
step 7000: train loss 2.5181, val loss 2.8373
iter 7000: loss 2.4539, time 19938.60ms 
iter 7001: loss 2.6457, time 5276.68ms 
iter 7002: loss 2.4331, time 5288.55ms 
iter 7003: loss 2.5984, time 5269.19ms 
iter 7004: loss 2.5816, time 5274.76ms 
iter 7005: loss 2.5255, time 5262.58ms 
iter 7006: loss 2.3579, time 5265.61ms 
iter 7007: loss 2.4059, time 5260.12ms 
iter 7008: loss 2.4380, time 5264.82ms 
iter 7009: loss 2.7127, time 5263.70ms 
iter 7010: loss 2.2236, time 5263.20ms 
iter 7011: loss 2.4080, time 5277.19ms 
iter 7012: loss 2.4543, time 5272.69ms 
iter 7013: loss 2.4932, time 5268.61ms 
iter 7014: loss 2.3040, time 5268.36ms 
iter 7015: loss 2.4837, time 5266.90ms 
iter 7016: loss 2.6004, time 5240.04ms 
iter 7017: loss 2.3976, time 5263.00ms 
iter 7018: loss 2.6075, time 5262.99ms 
iter 7019: loss 2.4583, time 5277.78ms 
iter 7020: loss 2.4927, time 5266.21ms 
iter 7021: loss 2.3870, time 5256.88ms 
iter 7022: loss 2.4874, time 5251.74ms 
iter 7023: loss 2.3554, time 5253.14ms 
iter 7024: loss 2.7384, time 5250.94ms 
iter 7025: loss 2.4137, time 5246.70ms 
iter 7026: loss 2.3842, time 5258.45ms 
iter 7027: loss 2.3369, time 5268.59ms 
iter 7028: loss 2.5500, time 5270.60ms 
iter 7029: loss 2.4201, time 5259.66ms 
iter 7030: loss 2.6129, time 5263.83ms 
iter 7031: loss 2.2980, time 5269.33ms 
iter 7032: loss 2.4341, time 5262.65ms 
iter 7033: loss 2.4541, time 5272.65ms 
iter 7034: loss 2.7438, time 5278.09ms 
iter 7035: loss 2.5565, time 5256.53ms 
iter 7036: loss 2.5928, time 5277.67ms 
iter 7037: loss 2.6657, time 5264.63ms 
iter 7038: loss 2.6404, time 5260.56ms 
iter 7039: loss 2.5254, time 5252.04ms 
iter 7040: loss 2.3981, time 5262.03ms 
iter 7041: loss 2.3951, time 5274.70ms 
iter 7042: loss 2.4917, time 5256.28ms 
iter 7043: loss 2.4570, time 5262.58ms 
iter 7044: loss 2.3999, time 5267.38ms 
iter 7045: loss 2.3604, time 5266.81ms 
iter 7046: loss 2.3789, time 5255.31ms 
iter 7047: loss 2.5377, time 5278.63ms 
iter 7048: loss 2.6457, time 5269.43ms 
iter 7049: loss 2.5633, time 5282.25ms 
step 7050: train loss 2.5126, val loss 2.8448
iter 7050: loss 2.5135, time 20057.08ms 
iter 7051: loss 2.6404, time 5296.07ms 
iter 7052: loss 2.5153, time 5289.64ms 
iter 7053: loss 2.5916, time 5281.26ms 
iter 7054: loss 2.4997, time 5299.65ms 
iter 7055: loss 2.5165, time 5305.31ms 
iter 7056: loss 2.4803, time 5294.24ms 
iter 7057: loss 2.5297, time 5259.30ms 
iter 7058: loss 2.5062, time 5272.43ms 
iter 7059: loss 2.5462, time 5265.22ms 
iter 7060: loss 2.5762, time 5247.43ms 
iter 7061: loss 2.5084, time 5257.72ms 
iter 7062: loss 2.7610, time 5264.11ms 
iter 7063: loss 2.6265, time 5259.78ms 
iter 7064: loss 2.5742, time 5261.67ms 
iter 7065: loss 2.4692, time 5260.66ms 
iter 7066: loss 2.4812, time 5254.97ms 
iter 7067: loss 2.5901, time 5268.07ms 
iter 7068: loss 2.3537, time 5259.56ms 
iter 7069: loss 2.5009, time 5259.60ms 
iter 7070: loss 2.5988, time 5262.95ms 
iter 7071: loss 2.4554, time 5274.20ms 
iter 7072: loss 2.5659, time 5255.50ms 
iter 7073: loss 2.6162, time 5257.70ms 
iter 7074: loss 2.4197, time 5259.57ms 
iter 7075: loss 2.4266, time 5267.55ms 
iter 7076: loss 2.3852, time 5265.97ms 
iter 7077: loss 2.4539, time 5260.17ms 
iter 7078: loss 2.4246, time 5269.62ms 
iter 7079: loss 2.2964, time 5278.02ms 
iter 7080: loss 2.4507, time 5264.61ms 
iter 7081: loss 2.5454, time 5227.50ms 
iter 7082: loss 2.5007, time 5309.39ms 
iter 7083: loss 2.3454, time 5283.42ms 
iter 7084: loss 2.5834, time 5234.29ms 
iter 7085: loss 2.5303, time 5241.98ms 
iter 7086: loss 2.7205, time 5291.56ms 
iter 7087: loss 2.5760, time 5280.26ms 
iter 7088: loss 2.4963, time 5272.05ms 
iter 7089: loss 2.3012, time 5315.29ms 
iter 7090: loss 2.3411, time 5319.42ms 
iter 7091: loss 2.6181, time 5243.96ms 
iter 7092: loss 2.5075, time 5259.37ms 
iter 7093: loss 2.2014, time 5255.51ms 
iter 7094: loss 2.2204, time 5238.88ms 
iter 7095: loss 2.6754, time 5282.26ms 
iter 7096: loss 2.5511, time 5282.91ms 
iter 7097: loss 2.5753, time 5349.33ms 
iter 7098: loss 2.4425, time 5210.28ms 
iter 7099: loss 2.4035, time 5264.01ms 
step 7100: train loss 2.5199, val loss 2.8534
iter 7100: loss 2.3495, time 20234.42ms 
iter 7101: loss 2.6160, time 5247.13ms 
iter 7102: loss 2.6431, time 5248.20ms 
iter 7103: loss 2.4475, time 5248.34ms 
iter 7104: loss 2.7063, time 5267.02ms 
iter 7105: loss 2.6949, time 5250.07ms 
iter 7106: loss 2.5675, time 5287.37ms 
iter 7107: loss 2.6742, time 5313.76ms 
iter 7108: loss 2.3987, time 5254.42ms 
iter 7109: loss 2.4846, time 5264.49ms 
iter 7110: loss 2.6386, time 5259.31ms 
iter 7111: loss 2.4052, time 5258.14ms 
iter 7112: loss 2.6163, time 5260.87ms 
iter 7113: loss 2.7223, time 5268.42ms 
iter 7114: loss 2.3665, time 5254.98ms 
iter 7115: loss 2.4741, time 5263.59ms 
iter 7116: loss 2.6398, time 5260.80ms 
iter 7117: loss 2.2565, time 5275.02ms 
iter 7118: loss 2.5085, time 5391.57ms 
iter 7119: loss 2.4135, time 5400.39ms 
iter 7120: loss 2.4138, time 5389.99ms 
iter 7121: loss 2.4250, time 5406.36ms 
iter 7122: loss 2.4943, time 5432.56ms 
iter 7123: loss 2.4156, time 5409.24ms 
iter 7124: loss 2.3943, time 5409.26ms 
iter 7125: loss 2.3786, time 5401.69ms 
iter 7126: loss 2.5830, time 5401.81ms 
iter 7127: loss 2.3918, time 5416.51ms 
iter 7128: loss 2.6489, time 5410.99ms 
iter 7129: loss 2.3515, time 5420.24ms 
iter 7130: loss 2.5885, time 5389.61ms 
iter 7131: loss 2.5715, time 5254.21ms 
iter 7132: loss 2.2708, time 5255.61ms 
iter 7133: loss 2.4231, time 5270.74ms 
iter 7134: loss 2.5588, time 5258.79ms 
iter 7135: loss 2.5254, time 5257.45ms 
iter 7136: loss 2.5312, time 5265.77ms 
iter 7137: loss 2.2238, time 5240.06ms 
iter 7138: loss 2.5738, time 5272.47ms 
iter 7139: loss 2.7068, time 5262.15ms 
iter 7140: loss 2.4925, time 5224.65ms 
iter 7141: loss 2.3912, time 5250.87ms 
iter 7142: loss 2.5343, time 5274.15ms 
iter 7143: loss 2.4512, time 5256.85ms 
iter 7144: loss 2.4998, time 5247.52ms 
iter 7145: loss 2.4904, time 5264.55ms 
iter 7146: loss 2.5661, time 5276.01ms 
iter 7147: loss 2.2871, time 5274.28ms 
iter 7148: loss 2.5195, time 5258.60ms 
iter 7149: loss 2.5607, time 5266.27ms 
step 7150: train loss 2.5095, val loss 2.8228
iter 7150: loss 2.4556, time 20081.17ms 
iter 7151: loss 2.4996, time 5266.12ms 
iter 7152: loss 2.5319, time 5252.82ms 
iter 7153: loss 2.4623, time 5272.72ms 
iter 7154: loss 2.5883, time 5205.57ms 
iter 7155: loss 2.5043, time 5300.14ms 
iter 7156: loss 2.5976, time 5239.48ms 
iter 7157: loss 2.4673, time 5262.48ms 
iter 7158: loss 2.7629, time 5270.56ms 
iter 7159: loss 2.4854, time 5229.16ms 
iter 7160: loss 2.6060, time 5332.39ms 
iter 7161: loss 2.5784, time 5243.00ms 
iter 7162: loss 2.5075, time 5243.64ms 
iter 7163: loss 2.3675, time 5341.37ms 
iter 7164: loss 2.5041, time 5419.00ms 
iter 7165: loss 2.4870, time 5432.82ms 
iter 7166: loss 2.6852, time 5430.38ms 
iter 7167: loss 2.5872, time 5359.04ms 
iter 7168: loss 2.5844, time 5271.76ms 
iter 7169: loss 2.5125, time 5251.44ms 
iter 7170: loss 2.6119, time 5251.70ms 
iter 7171: loss 2.2858, time 5259.75ms 
iter 7172: loss 2.5957, time 5266.71ms 
iter 7173: loss 2.6970, time 5264.90ms 
iter 7174: loss 2.1620, time 5252.70ms 
iter 7175: loss 2.4892, time 5261.09ms 
iter 7176: loss 2.5405, time 5271.09ms 
iter 7177: loss 2.6141, time 5252.32ms 
iter 7178: loss 2.4240, time 5248.75ms 
iter 7179: loss 2.1567, time 5265.82ms 
iter 7180: loss 2.3803, time 5260.91ms 
iter 7181: loss 2.2634, time 5256.69ms 
iter 7182: loss 2.6935, time 5254.08ms 
iter 7183: loss 2.4923, time 5262.79ms 
iter 7184: loss 2.3391, time 5290.53ms 
iter 7185: loss 2.5274, time 5279.25ms 
iter 7186: loss 2.4936, time 5388.25ms 
iter 7187: loss 2.4467, time 5349.24ms 
iter 7188: loss 2.6617, time 5270.26ms 
iter 7189: loss 2.6018, time 5266.71ms 
iter 7190: loss 2.6244, time 5265.33ms 
iter 7191: loss 2.6427, time 5267.41ms 
iter 7192: loss 2.5021, time 5267.84ms 
iter 7193: loss 2.7540, time 5263.48ms 
iter 7194: loss 2.4235, time 5262.69ms 
iter 7195: loss 2.7798, time 5268.50ms 
iter 7196: loss 2.5303, time 5278.23ms 
iter 7197: loss 2.4810, time 5272.07ms 
iter 7198: loss 2.5264, time 5414.62ms 
iter 7199: loss 2.5766, time 5426.00ms 
step 7200: train loss 2.5134, val loss 2.8506
iter 7200: loss 2.3739, time 20114.04ms 
iter 7201: loss 2.4694, time 5272.77ms 
iter 7202: loss 2.5021, time 5263.10ms 
iter 7203: loss 2.3228, time 5398.42ms 
iter 7204: loss 2.4851, time 5438.37ms 
iter 7205: loss 2.4715, time 5415.85ms 
iter 7206: loss 2.6357, time 5396.21ms 
iter 7207: loss 2.5085, time 5400.40ms 
iter 7208: loss 2.6475, time 5405.80ms 
iter 7209: loss 2.5022, time 5416.91ms 
iter 7210: loss 2.6660, time 5281.17ms 
iter 7211: loss 2.4837, time 5262.39ms 
iter 7212: loss 2.2519, time 5253.36ms 
iter 7213: loss 2.6276, time 5252.03ms 
iter 7214: loss 2.2624, time 5281.13ms 
iter 7215: loss 2.6856, time 5253.18ms 
iter 7216: loss 2.4397, time 5252.70ms 
iter 7217: loss 2.4521, time 5260.64ms 
iter 7218: loss 2.5497, time 5323.20ms 
iter 7219: loss 2.5169, time 5421.74ms 
iter 7220: loss 2.3882, time 5414.45ms 
iter 7221: loss 2.4204, time 5420.38ms 
iter 7222: loss 2.4855, time 5432.05ms 
iter 7223: loss 2.4062, time 5446.69ms 
iter 7224: loss 2.5047, time 5429.56ms 
iter 7225: loss 2.6608, time 5422.99ms 
iter 7226: loss 2.6985, time 5254.49ms 
iter 7227: loss 2.3516, time 5262.42ms 
iter 7228: loss 2.5738, time 5257.94ms 
iter 7229: loss 2.7495, time 5236.65ms 
iter 7230: loss 2.6446, time 5233.60ms 
iter 7231: loss 2.7066, time 5256.71ms 
iter 7232: loss 2.4005, time 5253.09ms 
iter 7233: loss 2.6439, time 5252.15ms 
iter 7234: loss 2.4266, time 5258.51ms 
iter 7235: loss 2.5199, time 5259.80ms 
iter 7236: loss 2.4176, time 5256.96ms 
iter 7237: loss 2.4966, time 5255.02ms 
iter 7238: loss 2.8436, time 5268.84ms 
iter 7239: loss 2.6038, time 5260.31ms 
iter 7240: loss 2.5321, time 5253.47ms 
iter 7241: loss 2.4230, time 5252.05ms 
iter 7242: loss 2.4827, time 5267.67ms 
iter 7243: loss 2.5820, time 5249.13ms 
iter 7244: loss 2.5544, time 5256.90ms 
iter 7245: loss 2.3494, time 5237.63ms 
iter 7246: loss 2.3533, time 5259.06ms 
iter 7247: loss 2.3114, time 5328.92ms 
iter 7248: loss 2.4933, time 5245.90ms 
iter 7249: loss 2.7450, time 5261.49ms 
step 7250: train loss 2.5063, val loss 2.8363
iter 7250: loss 2.4489, time 20072.70ms 
iter 7251: loss 2.4354, time 5271.56ms 
iter 7252: loss 2.5072, time 5262.47ms 
iter 7253: loss 2.4765, time 5256.18ms 
iter 7254: loss 2.5227, time 5260.59ms 
iter 7255: loss 2.5820, time 5280.11ms 
iter 7256: loss 2.3211, time 5255.85ms 
iter 7257: loss 2.3684, time 5256.50ms 
iter 7258: loss 2.6347, time 5258.13ms 
iter 7259: loss 2.6658, time 5279.29ms 
iter 7260: loss 2.5398, time 5266.49ms 
iter 7261: loss 2.4252, time 5264.76ms 
iter 7262: loss 2.5088, time 5258.41ms 
iter 7263: loss 2.2529, time 5286.87ms 
iter 7264: loss 2.5192, time 5304.84ms 
iter 7265: loss 2.6732, time 5290.17ms 
iter 7266: loss 2.4224, time 5274.13ms 
iter 7267: loss 2.2537, time 5297.96ms 
iter 7268: loss 2.4910, time 5285.70ms 
iter 7269: loss 2.4634, time 5272.10ms 
iter 7270: loss 2.4134, time 5277.18ms 
iter 7271: loss 2.7033, time 5267.83ms 
iter 7272: loss 2.6939, time 5260.21ms 
iter 7273: loss 2.6468, time 5352.95ms 
iter 7274: loss 2.3660, time 5267.92ms 
iter 7275: loss 2.5351, time 5252.35ms 
iter 7276: loss 2.1645, time 5258.06ms 
iter 7277: loss 2.3090, time 5262.62ms 
iter 7278: loss 2.3963, time 5254.98ms 
iter 7279: loss 2.3716, time 5263.97ms 
iter 7280: loss 2.6119, time 5229.56ms 
iter 7281: loss 2.4139, time 5251.82ms 
iter 7282: loss 2.4658, time 5256.72ms 
iter 7283: loss 2.4312, time 5253.96ms 
iter 7284: loss 2.6732, time 5273.99ms 
iter 7285: loss 2.6584, time 5258.64ms 
iter 7286: loss 2.5899, time 5253.88ms 
iter 7287: loss 2.5556, time 5254.84ms 
iter 7288: loss 2.6491, time 5273.48ms 
iter 7289: loss 2.3964, time 5277.57ms 
iter 7290: loss 2.5321, time 5271.56ms 
iter 7291: loss 2.5219, time 5265.56ms 
iter 7292: loss 2.4853, time 5266.50ms 
iter 7293: loss 2.6272, time 5230.67ms 
iter 7294: loss 2.3660, time 5267.24ms 
iter 7295: loss 2.6329, time 5254.95ms 
iter 7296: loss 2.3110, time 5267.97ms 
iter 7297: loss 2.5267, time 5303.72ms 
iter 7298: loss 2.5465, time 5297.74ms 
iter 7299: loss 2.3749, time 5347.52ms 
step 7300: train loss 2.5000, val loss 2.8365
iter 7300: loss 2.3912, time 19953.24ms 
iter 7301: loss 2.5057, time 5255.81ms 
iter 7302: loss 2.5613, time 5263.58ms 
iter 7303: loss 2.5568, time 5271.76ms 
iter 7304: loss 2.5497, time 5264.75ms 
iter 7305: loss 2.5226, time 5229.08ms 
iter 7306: loss 2.4782, time 5268.65ms 
iter 7307: loss 2.7302, time 5261.69ms 
iter 7308: loss 2.4889, time 5258.13ms 
iter 7309: loss 2.6191, time 5262.45ms 
iter 7310: loss 2.5891, time 5270.32ms 
iter 7311: loss 2.6655, time 5255.98ms 
iter 7312: loss 2.8104, time 5263.06ms 
iter 7313: loss 2.6842, time 5260.07ms 
iter 7314: loss 2.5907, time 5268.60ms 
iter 7315: loss 2.4207, time 5262.22ms 
iter 7316: loss 2.5829, time 5250.27ms 
iter 7317: loss 2.3284, time 5255.62ms 
iter 7318: loss 2.4327, time 5271.40ms 
iter 7319: loss 2.3406, time 5268.13ms 
iter 7320: loss 2.3287, time 5255.31ms 
iter 7321: loss 2.4360, time 5252.92ms 
iter 7322: loss 2.6201, time 5263.32ms 
iter 7323: loss 2.6689, time 5261.52ms 
iter 7324: loss 2.3197, time 5261.03ms 
iter 7325: loss 2.3470, time 5251.66ms 
iter 7326: loss 2.6128, time 5257.80ms 
iter 7327: loss 2.6123, time 5269.06ms 
iter 7328: loss 2.5423, time 5263.57ms 
iter 7329: loss 2.4262, time 5269.52ms 
iter 7330: loss 2.7216, time 5269.42ms 
iter 7331: loss 2.4896, time 5270.12ms 
iter 7332: loss 2.5986, time 5266.99ms 
iter 7333: loss 2.4978, time 5260.70ms 
iter 7334: loss 2.4705, time 5275.48ms 
iter 7335: loss 2.5482, time 5282.17ms 
iter 7336: loss 2.6027, time 5260.84ms 
iter 7337: loss 2.4272, time 5257.33ms 
iter 7338: loss 2.6962, time 5261.53ms 
iter 7339: loss 2.2810, time 5265.59ms 
iter 7340: loss 2.3865, time 5252.81ms 
iter 7341: loss 2.5653, time 5255.23ms 
iter 7342: loss 2.5053, time 5259.60ms 
iter 7343: loss 2.6364, time 5266.30ms 
iter 7344: loss 2.6428, time 5262.03ms 
iter 7345: loss 2.4944, time 5252.22ms 
iter 7346: loss 2.7692, time 5266.62ms 
iter 7347: loss 2.4919, time 5270.84ms 
iter 7348: loss 2.5902, time 5261.21ms 
iter 7349: loss 2.4608, time 5248.12ms 
step 7350: train loss 2.4912, val loss 2.8549
iter 7350: loss 2.4287, time 20118.93ms 
iter 7351: loss 2.5065, time 5266.33ms 
iter 7352: loss 2.5938, time 5250.20ms 
iter 7353: loss 2.6033, time 5273.87ms 
iter 7354: loss 2.6158, time 5261.33ms 
iter 7355: loss 2.4524, time 5273.11ms 
iter 7356: loss 2.2490, time 5260.32ms 
iter 7357: loss 2.4706, time 5258.81ms 
iter 7358: loss 2.6537, time 5255.25ms 
iter 7359: loss 2.3325, time 5276.40ms 
iter 7360: loss 2.2720, time 5237.26ms 
iter 7361: loss 2.4215, time 5248.00ms 
iter 7362: loss 2.2677, time 5264.01ms 
iter 7363: loss 2.6121, time 5276.86ms 
iter 7364: loss 2.4595, time 5244.73ms 
iter 7365: loss 2.4162, time 5265.44ms 
iter 7366: loss 2.5953, time 5273.86ms 
iter 7367: loss 2.3995, time 5277.11ms 
iter 7368: loss 2.6012, time 5274.59ms 
iter 7369: loss 2.6126, time 5259.03ms 
iter 7370: loss 2.5647, time 5264.05ms 
iter 7371: loss 2.6427, time 5280.69ms 
iter 7372: loss 2.4736, time 5266.88ms 
iter 7373: loss 2.5682, time 5267.27ms 
iter 7374: loss 2.3827, time 5263.39ms 
iter 7375: loss 2.4948, time 5273.05ms 
iter 7376: loss 2.4214, time 5263.81ms 
iter 7377: loss 2.2237, time 5259.68ms 
iter 7378: loss 2.6936, time 5266.28ms 
iter 7379: loss 2.6870, time 5298.01ms 
iter 7380: loss 2.5748, time 5277.32ms 
iter 7381: loss 2.3711, time 5263.03ms 
iter 7382: loss 2.6897, time 5236.37ms 
iter 7383: loss 2.5565, time 5258.18ms 
iter 7384: loss 2.7475, time 5254.56ms 
iter 7385: loss 2.5060, time 5425.31ms 
iter 7386: loss 2.3356, time 5417.12ms 
iter 7387: loss 2.3221, time 5234.12ms 
iter 7388: loss 2.3148, time 5364.63ms 
iter 7389: loss 2.4490, time 5423.90ms 
iter 7390: loss 2.5483, time 5412.41ms 
iter 7391: loss 2.4871, time 5286.40ms 
iter 7392: loss 2.5464, time 5273.45ms 
iter 7393: loss 2.4782, time 5407.90ms 
iter 7394: loss 2.4998, time 5409.71ms 
iter 7395: loss 2.6969, time 5412.61ms 
iter 7396: loss 2.4346, time 5416.78ms 
iter 7397: loss 2.4174, time 5418.56ms 
iter 7398: loss 2.2089, time 5416.71ms 
iter 7399: loss 2.3116, time 5340.04ms 
step 7400: train loss 2.5016, val loss 2.8406
iter 7400: loss 2.4753, time 20409.66ms 
iter 7401: loss 2.4439, time 5261.85ms 
iter 7402: loss 2.7201, time 5261.93ms 
iter 7403: loss 2.5966, time 5273.01ms 
iter 7404: loss 2.2903, time 5268.20ms 
iter 7405: loss 2.5832, time 5255.30ms 
iter 7406: loss 2.3240, time 5262.74ms 
iter 7407: loss 2.8040, time 5264.31ms 
iter 7408: loss 2.5193, time 5254.00ms 
iter 7409: loss 2.5945, time 5252.50ms 
iter 7410: loss 2.4486, time 5254.88ms 
iter 7411: loss 2.3013, time 5435.07ms 
iter 7412: loss 2.4835, time 5421.59ms 
iter 7413: loss 2.4634, time 5239.40ms 
iter 7414: loss 2.8025, time 5428.63ms 
iter 7415: loss 2.2653, time 5416.89ms 
iter 7416: loss 2.4999, time 5363.01ms 
iter 7417: loss 2.3186, time 5255.50ms 
iter 7418: loss 2.5098, time 5253.84ms 
iter 7419: loss 2.5255, time 5255.51ms 
iter 7420: loss 2.4975, time 5271.90ms 
iter 7421: loss 2.3956, time 5223.85ms 
iter 7422: loss 2.2162, time 5228.29ms 
iter 7423: loss 2.5546, time 5253.71ms 
iter 7424: loss 2.3644, time 5264.12ms 
iter 7425: loss 2.6128, time 5261.75ms 
iter 7426: loss 2.3784, time 5260.97ms 
iter 7427: loss 2.3682, time 5238.08ms 
iter 7428: loss 2.3384, time 5264.78ms 
iter 7429: loss 2.4862, time 5261.74ms 
iter 7430: loss 2.6276, time 5259.20ms 
iter 7431: loss 2.5883, time 5253.84ms 
iter 7432: loss 2.6722, time 5267.87ms 
iter 7433: loss 2.6857, time 5262.78ms 
iter 7434: loss 2.3568, time 5252.15ms 
iter 7435: loss 2.3975, time 5254.96ms 
iter 7436: loss 2.4092, time 5230.99ms 
iter 7437: loss 2.2233, time 5250.41ms 
iter 7438: loss 2.5797, time 5254.30ms 
iter 7439: loss 2.3012, time 5239.16ms 
iter 7440: loss 2.4877, time 5247.11ms 
iter 7441: loss 2.5650, time 5252.87ms 
iter 7442: loss 2.5531, time 5250.25ms 
iter 7443: loss 2.5240, time 5247.72ms 
iter 7444: loss 2.4903, time 5252.91ms 
iter 7445: loss 2.6732, time 5268.62ms 
iter 7446: loss 2.6580, time 5207.97ms 
iter 7447: loss 2.5013, time 5261.44ms 
iter 7448: loss 2.5115, time 5265.48ms 
iter 7449: loss 2.6705, time 5265.29ms 
step 7450: train loss 2.4989, val loss 2.8467
iter 7450: loss 2.4649, time 20084.34ms 
iter 7451: loss 2.6672, time 5260.86ms 
iter 7452: loss 2.4534, time 5253.79ms 
iter 7453: loss 2.4374, time 5273.41ms 
iter 7454: loss 2.4454, time 5257.44ms 
iter 7455: loss 2.6132, time 5259.73ms 
iter 7456: loss 2.2464, time 5258.13ms 
iter 7457: loss 2.5913, time 5272.48ms 
iter 7458: loss 2.4030, time 5270.95ms 
iter 7459: loss 2.4408, time 5258.60ms 
iter 7460: loss 2.5484, time 5258.30ms 
iter 7461: loss 2.3691, time 5244.45ms 
iter 7462: loss 2.4534, time 5257.79ms 
iter 7463: loss 2.5759, time 5256.23ms 
iter 7464: loss 2.4866, time 5265.57ms 
iter 7465: loss 2.4701, time 5268.56ms 
iter 7466: loss 2.4587, time 5263.78ms 
iter 7467: loss 2.5578, time 5258.39ms 
iter 7468: loss 2.4034, time 5250.07ms 
iter 7469: loss 2.3772, time 5269.44ms 
iter 7470: loss 2.6317, time 5253.29ms 
iter 7471: loss 2.6064, time 5254.40ms 
iter 7472: loss 2.4434, time 5253.33ms 
iter 7473: loss 2.3883, time 5277.90ms 
iter 7474: loss 2.7040, time 5252.91ms 
iter 7475: loss 2.5276, time 5263.68ms 
iter 7476: loss 2.3543, time 5265.15ms 
iter 7477: loss 2.6350, time 5268.88ms 
iter 7478: loss 2.4735, time 5263.37ms 
iter 7479: loss 2.5864, time 5276.78ms 
iter 7480: loss 2.4305, time 5266.81ms 
iter 7481: loss 2.3044, time 5285.00ms 
iter 7482: loss 2.5579, time 5286.73ms 
iter 7483: loss 2.4331, time 5268.56ms 
iter 7484: loss 2.2675, time 5255.46ms 
iter 7485: loss 2.6030, time 5262.14ms 
iter 7486: loss 2.6463, time 5262.13ms 
iter 7487: loss 2.3954, time 5272.14ms 
iter 7488: loss 2.6070, time 5264.44ms 
iter 7489: loss 2.4031, time 5266.00ms 
iter 7490: loss 2.5425, time 5254.15ms 
iter 7491: loss 2.3993, time 5266.38ms 
iter 7492: loss 2.4451, time 5262.05ms 
iter 7493: loss 2.5821, time 5276.43ms 
iter 7494: loss 2.4338, time 5260.41ms 
iter 7495: loss 2.3787, time 5264.80ms 
iter 7496: loss 2.3260, time 5252.65ms 
iter 7497: loss 2.5587, time 5267.42ms 
iter 7498: loss 2.5781, time 5253.40ms 
iter 7499: loss 2.4900, time 5257.69ms 
step 7500: train loss 2.5075, val loss 2.8373
iter 7500: loss 2.6051, time 20057.59ms 
iter 7501: loss 2.3996, time 5270.19ms 
iter 7502: loss 2.6520, time 5265.31ms 
iter 7503: loss 2.4717, time 5259.11ms 
iter 7504: loss 2.5255, time 5270.34ms 
iter 7505: loss 2.3836, time 5261.31ms 
iter 7506: loss 2.6234, time 5259.95ms 
iter 7507: loss 2.6526, time 5252.07ms 
iter 7508: loss 2.4538, time 5234.35ms 
iter 7509: loss 2.3895, time 5265.58ms 
iter 7510: loss 2.5526, time 5246.99ms 
iter 7511: loss 2.3372, time 5260.07ms 
iter 7512: loss 2.4167, time 5267.49ms 
iter 7513: loss 2.5895, time 5265.91ms 
iter 7514: loss 2.2853, time 5254.76ms 
iter 7515: loss 2.4280, time 5257.28ms 
iter 7516: loss 2.4815, time 5266.27ms 
iter 7517: loss 2.4852, time 5250.93ms 
iter 7518: loss 2.4143, time 5253.75ms 
iter 7519: loss 2.4639, time 5258.09ms 
iter 7520: loss 2.6276, time 5263.47ms 
iter 7521: loss 2.2473, time 5253.83ms 
iter 7522: loss 2.5696, time 5251.19ms 
iter 7523: loss 2.6722, time 5260.54ms 
iter 7524: loss 2.6230, time 5263.34ms 
iter 7525: loss 2.4892, time 5256.42ms 
iter 7526: loss 2.3555, time 5250.40ms 
iter 7527: loss 2.4880, time 5262.03ms 
iter 7528: loss 2.2290, time 5261.13ms 
iter 7529: loss 2.5796, time 5253.95ms 
iter 7530: loss 2.6448, time 5252.62ms 
iter 7531: loss 2.6087, time 5261.02ms 
iter 7532: loss 2.5214, time 5262.18ms 
iter 7533: loss 2.4634, time 5251.87ms 
iter 7534: loss 2.5963, time 5251.14ms 
iter 7535: loss 2.4766, time 5263.18ms 
iter 7536: loss 2.2493, time 5256.36ms 
iter 7537: loss 2.5456, time 5275.58ms 
iter 7538: loss 2.3443, time 5280.49ms 
iter 7539: loss 2.4850, time 5286.73ms 
iter 7540: loss 2.4605, time 5270.04ms 
iter 7541: loss 2.4461, time 5269.00ms 
iter 7542: loss 2.5820, time 5280.34ms 
iter 7543: loss 2.4453, time 5285.02ms 
iter 7544: loss 2.6085, time 5283.23ms 
iter 7545: loss 2.5779, time 5259.53ms 
iter 7546: loss 2.4173, time 5266.36ms 
iter 7547: loss 2.5886, time 5274.10ms 
iter 7548: loss 2.4809, time 5266.32ms 
iter 7549: loss 2.5720, time 5261.90ms 
step 7550: train loss 2.4965, val loss 2.8466
iter 7550: loss 2.3569, time 20017.66ms 
iter 7551: loss 2.3563, time 5249.77ms 
iter 7552: loss 2.6209, time 5262.02ms 
iter 7553: loss 2.4901, time 5252.61ms 
iter 7554: loss 2.4279, time 5258.84ms 
iter 7555: loss 2.6589, time 5277.42ms 
iter 7556: loss 2.6298, time 5258.75ms 
iter 7557: loss 2.5722, time 5252.83ms 
iter 7558: loss 2.3829, time 5257.52ms 
iter 7559: loss 2.4652, time 5269.69ms 
iter 7560: loss 2.6036, time 5260.58ms 
iter 7561: loss 2.4978, time 5248.41ms 
iter 7562: loss 2.4876, time 5255.82ms 
iter 7563: loss 2.6089, time 5258.81ms 
iter 7564: loss 2.5727, time 5257.38ms 
iter 7565: loss 2.5525, time 5260.93ms 
iter 7566: loss 2.7595, time 5267.97ms 
iter 7567: loss 2.4289, time 5280.02ms 
iter 7568: loss 2.4932, time 5271.10ms 
iter 7569: loss 2.5441, time 5261.82ms 
iter 7570: loss 2.4530, time 5262.05ms 
iter 7571: loss 2.4491, time 5262.82ms 
iter 7572: loss 2.4409, time 5270.52ms 
iter 7573: loss 2.5818, time 5257.57ms 
iter 7574: loss 2.4260, time 5262.27ms 
iter 7575: loss 2.2479, time 5268.71ms 
iter 7576: loss 2.5282, time 5264.11ms 
iter 7577: loss 2.5480, time 5262.86ms 
iter 7578: loss 2.2260, time 5258.58ms 
iter 7579: loss 2.5204, time 5269.83ms 
iter 7580: loss 2.4903, time 5263.61ms 
iter 7581: loss 2.5598, time 5258.51ms 
iter 7582: loss 2.7576, time 5249.08ms 
iter 7583: loss 2.3785, time 5258.77ms 
iter 7584: loss 2.4107, time 5258.64ms 
iter 7585: loss 2.2059, time 5251.44ms 
iter 7586: loss 2.5735, time 5255.94ms 
iter 7587: loss 2.5067, time 5253.33ms 
iter 7588: loss 2.5132, time 5260.45ms 
iter 7589: loss 2.6278, time 5258.32ms 
iter 7590: loss 2.2436, time 5250.89ms 
iter 7591: loss 2.5341, time 5260.40ms 
iter 7592: loss 2.4579, time 5264.78ms 
iter 7593: loss 2.5437, time 5256.61ms 
iter 7594: loss 2.4392, time 5255.81ms 
iter 7595: loss 2.3575, time 5264.83ms 
iter 7596: loss 2.5919, time 5257.15ms 
iter 7597: loss 2.4776, time 5259.24ms 
iter 7598: loss 2.1601, time 5246.87ms 
iter 7599: loss 2.5028, time 5261.34ms 
step 7600: train loss 2.4952, val loss 2.8580
iter 7600: loss 2.5204, time 20069.82ms 
iter 7601: loss 2.4740, time 5252.23ms 
iter 7602: loss 2.4215, time 5251.78ms 
iter 7603: loss 2.6059, time 5261.51ms 
iter 7604: loss 2.6065, time 5266.31ms 
iter 7605: loss 2.6032, time 5226.73ms 
iter 7606: loss 2.5759, time 5235.86ms 
iter 7607: loss 2.3694, time 5259.49ms 
iter 7608: loss 2.5288, time 5269.00ms 
iter 7609: loss 2.4894, time 5257.46ms 
iter 7610: loss 2.5566, time 5260.20ms 
iter 7611: loss 2.6398, time 5255.63ms 
iter 7612: loss 2.5226, time 5272.87ms 
iter 7613: loss 2.5306, time 5260.14ms 
iter 7614: loss 2.5883, time 5257.40ms 
iter 7615: loss 2.6525, time 5258.80ms 
iter 7616: loss 2.5569, time 5280.63ms 
iter 7617: loss 2.6427, time 5263.49ms 
iter 7618: loss 2.7455, time 5258.29ms 
iter 7619: loss 2.3897, time 5253.09ms 
iter 7620: loss 2.4583, time 5273.53ms 
iter 7621: loss 2.5776, time 5267.54ms 
iter 7622: loss 2.4821, time 5259.31ms 
iter 7623: loss 2.6348, time 5270.57ms 
iter 7624: loss 2.5772, time 5268.40ms 
iter 7625: loss 2.4960, time 5257.88ms 
iter 7626: loss 2.4992, time 5262.55ms 
iter 7627: loss 2.5258, time 5259.00ms 
iter 7628: loss 2.5161, time 5244.74ms 
iter 7629: loss 2.4594, time 5248.99ms 
iter 7630: loss 2.8153, time 5253.92ms 
iter 7631: loss 2.3987, time 5259.81ms 
iter 7632: loss 2.6016, time 5257.49ms 
iter 7633: loss 2.5531, time 5250.15ms 
iter 7634: loss 2.6388, time 5263.29ms 
iter 7635: loss 2.4008, time 5259.99ms 
iter 7636: loss 2.3536, time 5260.19ms 
iter 7637: loss 2.5193, time 5256.36ms 
iter 7638: loss 2.4903, time 5257.33ms 
iter 7639: loss 2.6842, time 5271.73ms 
iter 7640: loss 2.4065, time 5260.83ms 
iter 7641: loss 2.5486, time 5260.62ms 
iter 7642: loss 2.3935, time 5265.76ms 
iter 7643: loss 2.5987, time 5273.45ms 
iter 7644: loss 2.2540, time 5247.80ms 
iter 7645: loss 2.4611, time 5253.25ms 
iter 7646: loss 2.3439, time 5255.63ms 
iter 7647: loss 2.3438, time 5271.37ms 
iter 7648: loss 2.4460, time 5271.79ms 
iter 7649: loss 2.3884, time 5266.70ms 
step 7650: train loss 2.4919, val loss 2.8588
iter 7650: loss 2.4555, time 20092.26ms 
iter 7651: loss 2.6327, time 5274.65ms 
iter 7652: loss 2.3837, time 5259.55ms 
iter 7653: loss 2.4189, time 5290.52ms 
iter 7654: loss 2.2482, time 5241.63ms 
iter 7655: loss 2.5019, time 5397.85ms 
iter 7656: loss 2.7493, time 5404.87ms 
iter 7657: loss 2.4316, time 5313.56ms 
iter 7658: loss 2.4365, time 5273.87ms 
iter 7659: loss 2.5309, time 5266.02ms 
iter 7660: loss 2.3915, time 5267.64ms 
iter 7661: loss 2.3681, time 5271.84ms 
iter 7662: loss 2.6889, time 5276.30ms 
iter 7663: loss 2.6804, time 5251.03ms 
iter 7664: loss 2.4723, time 5262.60ms 
iter 7665: loss 2.6008, time 5263.35ms 
iter 7666: loss 2.6306, time 5279.64ms 
iter 7667: loss 2.5167, time 5261.35ms 
iter 7668: loss 2.6026, time 5252.33ms 
iter 7669: loss 2.2477, time 5253.87ms 
iter 7670: loss 2.3383, time 5274.88ms 
iter 7671: loss 2.6070, time 5254.87ms 
iter 7672: loss 2.4957, time 5266.80ms 
iter 7673: loss 2.5539, time 5330.39ms 
iter 7674: loss 2.4784, time 5412.17ms 
iter 7675: loss 2.4568, time 5342.57ms 
iter 7676: loss 2.6343, time 5262.41ms 
iter 7677: loss 2.4075, time 5250.87ms 
iter 7678: loss 2.5562, time 5271.06ms 
iter 7679: loss 2.5185, time 5257.61ms 
iter 7680: loss 2.4716, time 5256.41ms 
iter 7681: loss 2.4339, time 5257.20ms 
iter 7682: loss 2.6337, time 5264.37ms 
iter 7683: loss 2.4975, time 5253.14ms 
iter 7684: loss 2.4265, time 5256.58ms 
iter 7685: loss 2.5492, time 5248.51ms 
iter 7686: loss 2.3900, time 5289.03ms 
iter 7687: loss 2.5206, time 5282.24ms 
iter 7688: loss 2.3707, time 5286.99ms 
iter 7689: loss 2.4432, time 5302.91ms 
iter 7690: loss 2.2711, time 5309.10ms 
iter 7691: loss 2.5721, time 5276.21ms 
iter 7692: loss 2.6777, time 5272.26ms 
iter 7693: loss 2.4841, time 5270.16ms 
iter 7694: loss 2.4930, time 5324.67ms 
iter 7695: loss 2.4449, time 5345.20ms 
iter 7696: loss 2.3840, time 5284.57ms 
iter 7697: loss 2.5574, time 5269.38ms 
iter 7698: loss 2.3654, time 5274.39ms 
iter 7699: loss 2.5092, time 5275.53ms 
step 7700: train loss 2.5047, val loss 2.8456
iter 7700: loss 2.5726, time 19968.04ms 
iter 7701: loss 2.5236, time 5259.55ms 
iter 7702: loss 2.6165, time 5259.99ms 
iter 7703: loss 2.2417, time 5254.86ms 
iter 7704: loss 2.4779, time 5263.13ms 
iter 7705: loss 2.5767, time 5247.50ms 
iter 7706: loss 2.6314, time 5235.04ms 
iter 7707: loss 2.5062, time 5251.72ms 
iter 7708: loss 2.4799, time 5259.91ms 
iter 7709: loss 2.5065, time 5259.30ms 
iter 7710: loss 2.6782, time 5214.66ms 
iter 7711: loss 2.4665, time 5256.28ms 
iter 7712: loss 2.4627, time 5260.13ms 
iter 7713: loss 2.5320, time 5258.19ms 
iter 7714: loss 2.4556, time 5249.87ms 
iter 7715: loss 2.4119, time 5248.66ms 
iter 7716: loss 2.7022, time 5258.39ms 
iter 7717: loss 2.4735, time 5273.60ms 
iter 7718: loss 2.6084, time 5260.35ms 
iter 7719: loss 2.6512, time 5263.49ms 
iter 7720: loss 2.5660, time 5270.86ms 
iter 7721: loss 2.4757, time 5245.51ms 
iter 7722: loss 2.7120, time 5261.75ms 
iter 7723: loss 2.5123, time 5262.69ms 
iter 7724: loss 2.4201, time 5262.60ms 
iter 7725: loss 2.6705, time 5265.22ms 
iter 7726: loss 2.3952, time 5271.31ms 
iter 7727: loss 2.4674, time 5256.68ms 
iter 7728: loss 2.3521, time 5253.30ms 
iter 7729: loss 2.5116, time 5262.76ms 
iter 7730: loss 2.1159, time 5250.32ms 
iter 7731: loss 2.3275, time 5260.57ms 
iter 7732: loss 2.5757, time 5257.88ms 
iter 7733: loss 2.5272, time 5264.58ms 
iter 7734: loss 2.5327, time 5265.76ms 
iter 7735: loss 2.8549, time 5257.19ms 
iter 7736: loss 2.5026, time 5260.46ms 
iter 7737: loss 2.6427, time 5273.53ms 
iter 7738: loss 2.2061, time 5264.64ms 
iter 7739: loss 2.2137, time 5259.49ms 
iter 7740: loss 2.3822, time 5257.63ms 
iter 7741: loss 2.4549, time 5263.14ms 
iter 7742: loss 2.6195, time 5266.53ms 
iter 7743: loss 2.5759, time 5255.91ms 
iter 7744: loss 2.3603, time 5264.96ms 
iter 7745: loss 2.4722, time 5263.17ms 
iter 7746: loss 2.4020, time 5263.91ms 
iter 7747: loss 2.7433, time 5267.63ms 
iter 7748: loss 2.4727, time 5270.74ms 
iter 7749: loss 2.3433, time 5260.98ms 
step 7750: train loss 2.4898, val loss 2.8560
iter 7750: loss 2.2627, time 20070.05ms 
iter 7751: loss 2.4124, time 5263.40ms 
iter 7752: loss 2.4481, time 5262.19ms 
iter 7753: loss 2.4514, time 5251.96ms 
iter 7754: loss 2.4640, time 5254.27ms 
iter 7755: loss 2.4487, time 5267.92ms 
iter 7756: loss 2.5848, time 5253.14ms 
iter 7757: loss 2.6155, time 5250.77ms 
iter 7758: loss 2.4145, time 5314.08ms 
iter 7759: loss 2.3515, time 5268.55ms 
iter 7760: loss 2.5293, time 5262.89ms 
iter 7761: loss 2.5603, time 5260.65ms 
iter 7762: loss 2.4984, time 5265.43ms 
iter 7763: loss 2.4213, time 5248.69ms 
iter 7764: loss 2.5468, time 5246.14ms 
iter 7765: loss 2.6513, time 5201.56ms 
iter 7766: loss 2.6063, time 5248.24ms 
iter 7767: loss 2.5160, time 5242.74ms 
iter 7768: loss 2.3523, time 5246.42ms 
iter 7769: loss 2.4422, time 5308.87ms 
iter 7770: loss 2.5337, time 5248.42ms 
iter 7771: loss 2.6191, time 5245.98ms 
iter 7772: loss 2.4661, time 5243.27ms 
iter 7773: loss 2.5034, time 5205.66ms 
iter 7774: loss 2.4277, time 5256.44ms 
iter 7775: loss 2.4813, time 5267.55ms 
iter 7776: loss 2.5520, time 5251.73ms 
iter 7777: loss 2.5046, time 5253.08ms 
iter 7778: loss 2.0411, time 5269.60ms 
iter 7779: loss 2.3830, time 5218.00ms 
iter 7780: loss 2.6667, time 5253.56ms 
iter 7781: loss 2.4559, time 5253.98ms 
iter 7782: loss 2.5164, time 5261.01ms 
iter 7783: loss 2.3629, time 5265.74ms 
iter 7784: loss 2.5674, time 5258.19ms 
iter 7785: loss 2.4509, time 5253.74ms 
iter 7786: loss 2.4332, time 5252.09ms 
iter 7787: loss 2.5819, time 5264.88ms 
iter 7788: loss 2.5027, time 5248.37ms 
iter 7789: loss 2.5620, time 5225.26ms 
iter 7790: loss 2.8260, time 5262.52ms 
iter 7791: loss 2.5046, time 5267.55ms 
iter 7792: loss 2.4260, time 5256.28ms 
iter 7793: loss 2.5272, time 5250.38ms 
iter 7794: loss 2.3684, time 5251.01ms 
iter 7795: loss 2.6716, time 5272.09ms 
iter 7796: loss 2.6020, time 5252.42ms 
iter 7797: loss 2.4684, time 5247.28ms 
iter 7798: loss 2.6179, time 5259.94ms 
iter 7799: loss 2.3335, time 5271.98ms 
step 7800: train loss 2.4785, val loss 2.8250
iter 7800: loss 2.6282, time 19853.36ms 
iter 7801: loss 2.2199, time 5141.24ms 
iter 7802: loss 2.2303, time 5092.93ms 
iter 7803: loss 2.7105, time 5140.15ms 
iter 7804: loss 2.3891, time 5256.69ms 
iter 7805: loss 2.4227, time 5248.88ms 
iter 7806: loss 2.5243, time 5246.64ms 
iter 7807: loss 2.3567, time 5256.00ms 
iter 7808: loss 2.6027, time 5244.40ms 
iter 7809: loss 2.5344, time 5252.09ms 
iter 7810: loss 2.7716, time 5250.94ms 
iter 7811: loss 2.4689, time 5254.32ms 
iter 7812: loss 2.3952, time 5265.77ms 
iter 7813: loss 2.3018, time 5249.89ms 
iter 7814: loss 2.5478, time 5250.97ms 
iter 7815: loss 2.7795, time 5253.20ms 
iter 7816: loss 2.3539, time 5266.18ms 
iter 7817: loss 2.7181, time 5248.48ms 
iter 7818: loss 2.2713, time 5155.03ms 
iter 7819: loss 2.3877, time 5087.85ms 
iter 7820: loss 2.6425, time 5100.59ms 
iter 7821: loss 2.4509, time 5234.28ms 
iter 7822: loss 2.5825, time 5256.00ms 
iter 7823: loss 2.8914, time 5261.14ms 
iter 7824: loss 2.4537, time 5266.09ms 
iter 7825: loss 2.4719, time 5267.98ms 
iter 7826: loss 2.4481, time 5255.81ms 
iter 7827: loss 2.6340, time 5255.79ms 
iter 7828: loss 2.4459, time 5255.77ms 
iter 7829: loss 2.6746, time 5265.57ms 
iter 7830: loss 2.4521, time 5250.31ms 
iter 7831: loss 2.3378, time 5249.18ms 
iter 7832: loss 2.3725, time 5249.35ms 
iter 7833: loss 2.4736, time 5245.01ms 
iter 7834: loss 2.5837, time 5277.26ms 
iter 7835: loss 2.5507, time 5301.68ms 
iter 7836: loss 2.4083, time 5296.04ms 
iter 7837: loss 2.4681, time 5258.17ms 
iter 7838: loss 2.4755, time 5254.60ms 
iter 7839: loss 2.7688, time 5254.43ms 
iter 7840: loss 2.5488, time 5247.53ms 
iter 7841: loss 2.4471, time 5257.39ms 
iter 7842: loss 2.5809, time 5262.81ms 
iter 7843: loss 2.5031, time 5272.00ms 
iter 7844: loss 2.3643, time 5268.88ms 
iter 7845: loss 2.6321, time 5265.62ms 
iter 7846: loss 2.6098, time 5269.94ms 
iter 7847: loss 2.5423, time 5261.96ms 
iter 7848: loss 2.4053, time 5268.31ms 
iter 7849: loss 2.5107, time 5261.68ms 
step 7850: train loss 2.4901, val loss 2.8413
iter 7850: loss 2.3823, time 20081.66ms 
iter 7851: loss 2.4325, time 5264.01ms 
iter 7852: loss 2.7041, time 5265.81ms 
iter 7853: loss 2.5777, time 5260.16ms 
iter 7854: loss 2.5094, time 5250.96ms 
iter 7855: loss 2.6547, time 5261.90ms 
iter 7856: loss 2.4002, time 5250.20ms 
iter 7857: loss 2.4982, time 5251.64ms 
iter 7858: loss 2.4805, time 5252.60ms 
iter 7859: loss 2.5955, time 5270.49ms 
iter 7860: loss 2.5039, time 5253.41ms 
iter 7861: loss 2.5654, time 5249.04ms 
iter 7862: loss 2.4427, time 5262.13ms 
iter 7863: loss 2.3759, time 5272.63ms 
iter 7864: loss 2.4996, time 5246.30ms 
iter 7865: loss 2.3681, time 5250.62ms 
iter 7866: loss 2.5805, time 5261.23ms 
iter 7867: loss 2.6660, time 5268.55ms 
iter 7868: loss 2.4114, time 5277.54ms 
iter 7869: loss 2.4569, time 5254.67ms 
iter 7870: loss 2.5670, time 5246.46ms 
iter 7871: loss 2.4281, time 5274.10ms 
iter 7872: loss 2.7358, time 5284.59ms 
iter 7873: loss 2.3098, time 5275.95ms 
iter 7874: loss 2.4866, time 5273.82ms 
iter 7875: loss 2.4903, time 5287.47ms 
iter 7876: loss 2.5480, time 5288.90ms 
iter 7877: loss 2.2504, time 5264.93ms 
iter 7878: loss 2.2725, time 5264.45ms 
iter 7879: loss 2.5038, time 5272.88ms 
iter 7880: loss 2.2983, time 5268.57ms 
iter 7881: loss 2.4627, time 5257.48ms 
iter 7882: loss 2.6123, time 5233.23ms 
iter 7883: loss 2.6470, time 5276.12ms 
iter 7884: loss 2.4658, time 5261.14ms 
iter 7885: loss 2.4376, time 5261.76ms 
iter 7886: loss 2.3843, time 5257.46ms 
iter 7887: loss 2.6237, time 5264.93ms 
iter 7888: loss 2.3852, time 5272.77ms 
iter 7889: loss 2.4758, time 5261.25ms 
iter 7890: loss 2.5693, time 5262.61ms 
iter 7891: loss 2.5214, time 5258.85ms 
iter 7892: loss 2.5641, time 5268.06ms 
iter 7893: loss 2.5887, time 5259.79ms 
iter 7894: loss 2.5276, time 5252.68ms 
iter 7895: loss 2.3957, time 5263.62ms 
iter 7896: loss 2.5387, time 5270.00ms 
iter 7897: loss 2.5082, time 5251.61ms 
iter 7898: loss 2.8233, time 5260.37ms 
iter 7899: loss 2.2555, time 5263.40ms 
step 7900: train loss 2.4811, val loss 2.8496
iter 7900: loss 2.4842, time 20094.96ms 
iter 7901: loss 2.5453, time 5260.57ms 
iter 7902: loss 2.5832, time 5259.73ms 
iter 7903: loss 2.7439, time 5263.54ms 
iter 7904: loss 2.6479, time 5271.11ms 
iter 7905: loss 2.3377, time 5273.78ms 
iter 7906: loss 2.4040, time 5261.77ms 
iter 7907: loss 2.4363, time 5302.67ms 
iter 7908: loss 2.8273, time 5435.67ms 
iter 7909: loss 2.3968, time 5426.96ms 
iter 7910: loss 2.7150, time 5424.21ms 
iter 7911: loss 2.3956, time 5313.85ms 
iter 7912: loss 2.5685, time 5274.80ms 
iter 7913: loss 2.5010, time 5270.94ms 
iter 7914: loss 2.5683, time 5271.48ms 
iter 7915: loss 2.4732, time 5263.52ms 
iter 7916: loss 2.3657, time 5262.86ms 
iter 7917: loss 2.5428, time 5269.95ms 
iter 7918: loss 2.5896, time 5258.05ms 
iter 7919: loss 2.6011, time 5248.46ms 
iter 7920: loss 2.6091, time 5257.04ms 
iter 7921: loss 2.5069, time 5263.82ms 
iter 7922: loss 2.4875, time 5253.86ms 
iter 7923: loss 2.4370, time 5254.14ms 
iter 7924: loss 2.4651, time 5261.97ms 
iter 7925: loss 2.4338, time 5260.60ms 
iter 7926: loss 2.4615, time 5266.27ms 
iter 7927: loss 2.5757, time 5264.17ms 
iter 7928: loss 2.5166, time 5269.40ms 
iter 7929: loss 2.8054, time 5266.60ms 
iter 7930: loss 2.4882, time 5257.40ms 
iter 7931: loss 2.3426, time 5256.70ms 
iter 7932: loss 2.2977, time 5264.74ms 
iter 7933: loss 2.2684, time 5247.57ms 
iter 7934: loss 2.4667, time 5253.66ms 
iter 7935: loss 2.3503, time 5258.55ms 
iter 7936: loss 2.4662, time 5268.11ms 
iter 7937: loss 2.4540, time 5265.59ms 
iter 7938: loss 2.5222, time 5258.30ms 
iter 7939: loss 2.5557, time 5256.88ms 
iter 7940: loss 2.6023, time 5267.70ms 
iter 7941: loss 2.5444, time 5266.13ms 
iter 7942: loss 2.3076, time 5259.58ms 
iter 7943: loss 2.1228, time 5256.69ms 
iter 7944: loss 2.4425, time 5271.27ms 
iter 7945: loss 2.8196, time 5266.54ms 
iter 7946: loss 2.6307, time 5254.02ms 
iter 7947: loss 2.7240, time 5261.36ms 
iter 7948: loss 2.6857, time 5282.57ms 
iter 7949: loss 2.5334, time 5269.74ms 
step 7950: train loss 2.4864, val loss 2.8436
iter 7950: loss 2.5794, time 20066.11ms 
iter 7951: loss 2.3824, time 5255.97ms 
iter 7952: loss 2.6771, time 5274.92ms 
iter 7953: loss 2.5702, time 5277.25ms 
iter 7954: loss 2.6546, time 5262.63ms 
iter 7955: loss 2.3979, time 5264.16ms 
iter 7956: loss 2.3969, time 5270.87ms 
iter 7957: loss 2.4644, time 5274.15ms 
iter 7958: loss 2.3978, time 5274.59ms 
iter 7959: loss 2.2583, time 5267.06ms 
iter 7960: loss 2.4817, time 5277.25ms 
iter 7961: loss 2.5883, time 5280.83ms 
iter 7962: loss 2.6161, time 5266.24ms 
iter 7963: loss 2.5794, time 5271.23ms 
iter 7964: loss 2.5156, time 5260.86ms 
iter 7965: loss 2.3490, time 5259.61ms 
iter 7966: loss 2.4921, time 5261.45ms 
iter 7967: loss 2.5493, time 5267.88ms 
iter 7968: loss 2.9126, time 5263.48ms 
iter 7969: loss 2.5379, time 5266.39ms 
iter 7970: loss 2.4563, time 5260.23ms 
iter 7971: loss 2.2929, time 5259.36ms 
iter 7972: loss 2.4983, time 5262.78ms 
iter 7973: loss 2.7224, time 5257.62ms 
iter 7974: loss 2.5099, time 5254.44ms 
iter 7975: loss 2.3521, time 5255.55ms 
iter 7976: loss 2.4867, time 5257.09ms 
iter 7977: loss 2.4164, time 5252.32ms 
iter 7978: loss 2.5220, time 5259.42ms 
iter 7979: loss 2.6413, time 5225.53ms 
iter 7980: loss 2.5057, time 5216.07ms 
iter 7981: loss 2.7252, time 5209.70ms 
iter 7982: loss 2.4947, time 5211.32ms 
iter 7983: loss 2.4724, time 5210.57ms 
iter 7984: loss 2.4581, time 5221.59ms 
iter 7985: loss 2.3865, time 5212.14ms 
iter 7986: loss 2.7326, time 5212.13ms 
iter 7987: loss 2.8585, time 5214.32ms 
iter 7988: loss 2.4718, time 5282.29ms 
iter 7989: loss 2.4631, time 5267.63ms 
iter 7990: loss 2.4874, time 5262.25ms 
iter 7991: loss 2.5553, time 5278.88ms 
iter 7992: loss 2.5366, time 5258.96ms 
iter 7993: loss 2.6916, time 5257.19ms 
iter 7994: loss 2.3377, time 5262.17ms 
iter 7995: loss 2.5466, time 5264.47ms 
iter 7996: loss 2.5047, time 5254.23ms 
iter 7997: loss 2.6168, time 5251.95ms 
iter 7998: loss 2.5491, time 5259.22ms 
iter 7999: loss 2.5739, time 5262.22ms 
step 8000: train loss 2.4816, val loss 2.8350
iter 8000: loss 2.4987, time 20090.70ms 
iter 8001: loss 2.4554, time 5283.85ms 
iter 8002: loss 2.4873, time 5269.28ms 
iter 8003: loss 2.5847, time 5249.95ms 
iter 8004: loss 2.5721, time 5257.55ms 
iter 8005: loss 2.3111, time 5278.12ms 
iter 8006: loss 2.5306, time 5263.84ms 
iter 8007: loss 2.4103, time 5262.50ms 
iter 8008: loss 2.4305, time 5264.83ms 
iter 8009: loss 2.3783, time 5272.18ms 
iter 8010: loss 2.4429, time 5279.45ms 
iter 8011: loss 2.7735, time 5270.07ms 
iter 8012: loss 2.6703, time 5283.11ms 
iter 8013: loss 2.6003, time 5276.52ms 
iter 8014: loss 2.3862, time 5223.35ms 
iter 8015: loss 2.5362, time 5269.30ms 
iter 8016: loss 2.4345, time 5266.67ms 
iter 8017: loss 2.5378, time 5274.44ms 
iter 8018: loss 2.4222, time 5267.37ms 
iter 8019: loss 2.5511, time 5269.79ms 
iter 8020: loss 2.4302, time 5268.69ms 
iter 8021: loss 2.6582, time 5279.01ms 
iter 8022: loss 2.3822, time 5250.89ms 
iter 8023: loss 2.6529, time 5256.46ms 
iter 8024: loss 2.6395, time 5271.22ms 
iter 8025: loss 2.5248, time 5275.18ms 
iter 8026: loss 2.4236, time 5262.48ms 
iter 8027: loss 2.3282, time 5263.29ms 
iter 8028: loss 2.4605, time 5266.62ms 
iter 8029: loss 2.3246, time 5281.97ms 
iter 8030: loss 2.6536, time 5262.62ms 
iter 8031: loss 2.4808, time 5271.52ms 
iter 8032: loss 2.4778, time 5267.61ms 
iter 8033: loss 2.6401, time 5273.43ms 
iter 8034: loss 2.5683, time 5268.48ms 
iter 8035: loss 2.1195, time 5262.49ms 
iter 8036: loss 2.3965, time 5264.58ms 
iter 8037: loss 2.4760, time 5273.83ms 
iter 8038: loss 2.3808, time 5258.67ms 
iter 8039: loss 2.5563, time 5259.10ms 
iter 8040: loss 2.4190, time 5257.17ms 
iter 8041: loss 2.4857, time 5262.10ms 
iter 8042: loss 2.4479, time 5237.80ms 
iter 8043: loss 2.5464, time 5258.35ms 
iter 8044: loss 2.6248, time 5256.68ms 
iter 8045: loss 2.7242, time 5262.31ms 
iter 8046: loss 2.4204, time 5257.11ms 
iter 8047: loss 2.5214, time 5254.67ms 
iter 8048: loss 2.6028, time 5258.47ms 
iter 8049: loss 2.3783, time 5280.59ms 
step 8050: train loss 2.4961, val loss 2.8310
iter 8050: loss 2.3894, time 20059.78ms 
iter 8051: loss 2.4814, time 5263.19ms 
iter 8052: loss 2.5055, time 5254.59ms 
iter 8053: loss 2.4350, time 5267.25ms 
iter 8054: loss 2.5705, time 5264.21ms 
iter 8055: loss 2.4355, time 5254.02ms 
iter 8056: loss 2.4968, time 5251.93ms 
iter 8057: loss 2.5272, time 5266.97ms 
iter 8058: loss 2.4408, time 5257.28ms 
iter 8059: loss 2.6387, time 5246.68ms 
iter 8060: loss 2.4779, time 5258.41ms 
iter 8061: loss 2.5119, time 5265.47ms 
iter 8062: loss 2.5566, time 5257.22ms 
iter 8063: loss 2.2132, time 5248.91ms 
iter 8064: loss 2.9170, time 5254.31ms 
iter 8065: loss 2.5529, time 5274.60ms 
iter 8066: loss 2.4648, time 5236.42ms 
iter 8067: loss 2.6297, time 5257.87ms 
iter 8068: loss 2.4082, time 5259.99ms 
iter 8069: loss 2.4730, time 5265.89ms 
iter 8070: loss 2.5008, time 5259.71ms 
iter 8071: loss 2.5255, time 5253.56ms 
iter 8072: loss 2.5571, time 5251.90ms 
iter 8073: loss 2.6257, time 5271.39ms 
iter 8074: loss 2.5944, time 5274.26ms 
iter 8075: loss 2.3925, time 5266.62ms 
iter 8076: loss 2.7679, time 5232.82ms 
iter 8077: loss 2.5181, time 5247.38ms 
iter 8078: loss 2.3575, time 5219.34ms 
iter 8079: loss 2.3276, time 5250.62ms 
iter 8080: loss 2.4085, time 5251.97ms 
iter 8081: loss 2.6063, time 5265.26ms 
iter 8082: loss 2.5925, time 5246.81ms 
iter 8083: loss 2.4498, time 5265.33ms 
iter 8084: loss 2.5257, time 5269.65ms 
iter 8085: loss 2.4692, time 5265.34ms 
iter 8086: loss 2.3508, time 5260.45ms 
iter 8087: loss 2.7690, time 5276.28ms 
iter 8088: loss 2.4829, time 5276.51ms 
iter 8089: loss 2.3209, time 5285.17ms 
iter 8090: loss 2.4205, time 5260.15ms 
iter 8091: loss 2.4022, time 5263.22ms 
iter 8092: loss 2.3301, time 5281.11ms 
iter 8093: loss 2.5123, time 5259.31ms 
iter 8094: loss 2.3428, time 5263.62ms 
iter 8095: loss 2.3309, time 5221.30ms 
iter 8096: loss 2.3416, time 5232.18ms 
iter 8097: loss 2.5032, time 5216.23ms 
iter 8098: loss 2.3777, time 5248.46ms 
iter 8099: loss 2.5338, time 5246.91ms 
step 8100: train loss 2.4931, val loss 2.8401
iter 8100: loss 2.5848, time 20041.01ms 
iter 8101: loss 2.6212, time 5267.03ms 
iter 8102: loss 2.3187, time 5256.96ms 
iter 8103: loss 2.5834, time 5251.22ms 
iter 8104: loss 2.5444, time 5253.79ms 
iter 8105: loss 2.4434, time 5257.00ms 
iter 8106: loss 2.5854, time 5254.33ms 
iter 8107: loss 2.4185, time 5250.21ms 
iter 8108: loss 2.2996, time 5259.78ms 
iter 8109: loss 2.6819, time 5268.10ms 
iter 8110: loss 2.7628, time 5260.33ms 
iter 8111: loss 2.5893, time 5259.31ms 
iter 8112: loss 2.3664, time 5260.86ms 
iter 8113: loss 2.4754, time 5261.87ms 
iter 8114: loss 2.4741, time 5249.28ms 
iter 8115: loss 2.5605, time 5254.33ms 
iter 8116: loss 2.5012, time 5246.93ms 
iter 8117: loss 2.4654, time 5259.02ms 
iter 8118: loss 2.7606, time 5257.43ms 
iter 8119: loss 2.5778, time 5265.11ms 
iter 8120: loss 2.5155, time 5256.20ms 
iter 8121: loss 2.3843, time 5267.02ms 
iter 8122: loss 2.5015, time 5276.61ms 
iter 8123: loss 2.3260, time 5258.75ms 
iter 8124: loss 2.1976, time 5245.30ms 
iter 8125: loss 2.4607, time 5268.66ms 
iter 8126: loss 2.5322, time 5268.83ms 
iter 8127: loss 2.5213, time 5265.00ms 
iter 8128: loss 2.6925, time 5256.22ms 
iter 8129: loss 2.5167, time 5275.91ms 
iter 8130: loss 2.5441, time 5256.26ms 
iter 8131: loss 2.5385, time 5261.86ms 
iter 8132: loss 2.4229, time 5266.13ms 
iter 8133: loss 2.5355, time 5277.73ms 
iter 8134: loss 2.3230, time 5265.65ms 
iter 8135: loss 2.5722, time 5262.68ms 
iter 8136: loss 2.3201, time 5250.16ms 
iter 8137: loss 2.3712, time 5263.41ms 
iter 8138: loss 2.5640, time 5262.34ms 
iter 8139: loss 2.7539, time 5251.57ms 
iter 8140: loss 2.4406, time 5251.26ms 
iter 8141: loss 2.5232, time 5272.75ms 
iter 8142: loss 2.4215, time 5256.61ms 
iter 8143: loss 2.4660, time 5262.23ms 
iter 8144: loss 2.2432, time 5258.75ms 
iter 8145: loss 2.4419, time 5270.08ms 
iter 8146: loss 2.2845, time 5259.15ms 
iter 8147: loss 2.4537, time 5270.07ms 
iter 8148: loss 2.3899, time 5261.29ms 
iter 8149: loss 2.5035, time 5269.80ms 
step 8150: train loss 2.4763, val loss 2.8600
iter 8150: loss 2.4965, time 20143.35ms 
iter 8151: loss 2.5998, time 5263.15ms 
iter 8152: loss 2.5697, time 5268.02ms 
iter 8153: loss 2.3648, time 5279.30ms 
iter 8154: loss 2.5233, time 5270.97ms 
iter 8155: loss 2.4844, time 5258.92ms 
iter 8156: loss 2.2379, time 5256.17ms 
iter 8157: loss 2.4819, time 5261.46ms 
iter 8158: loss 2.4087, time 5274.85ms 
iter 8159: loss 2.6093, time 5281.00ms 
iter 8160: loss 2.1172, time 5262.99ms 
iter 8161: loss 2.5372, time 5275.36ms 
iter 8162: loss 2.5638, time 5270.69ms 
iter 8163: loss 2.4260, time 5257.31ms 
iter 8164: loss 2.7511, time 5257.33ms 
iter 8165: loss 2.4118, time 5269.78ms 
iter 8166: loss 2.6379, time 5255.08ms 
iter 8167: loss 2.3371, time 5262.73ms 
iter 8168: loss 2.3887, time 5267.54ms 
iter 8169: loss 2.3741, time 5272.29ms 
iter 8170: loss 2.4292, time 5260.18ms 
iter 8171: loss 2.5153, time 5263.96ms 
iter 8172: loss 2.4416, time 5218.15ms 
iter 8173: loss 2.5013, time 5274.63ms 
iter 8174: loss 2.3149, time 5247.67ms 
iter 8175: loss 2.5615, time 5257.05ms 
iter 8176: loss 2.4397, time 5255.28ms 
iter 8177: loss 2.5295, time 5268.60ms 
iter 8178: loss 2.5653, time 5257.72ms 
iter 8179: loss 2.2924, time 5257.11ms 
iter 8180: loss 2.5448, time 5254.04ms 
iter 8181: loss 2.4366, time 5275.79ms 
iter 8182: loss 2.4258, time 5255.61ms 
iter 8183: loss 2.4805, time 5257.63ms 
iter 8184: loss 2.5750, time 5251.14ms 
iter 8185: loss 2.6805, time 5273.04ms 
iter 8186: loss 2.3680, time 5239.29ms 
iter 8187: loss 2.4678, time 5258.74ms 
iter 8188: loss 2.3324, time 5265.65ms 
iter 8189: loss 2.1935, time 5267.08ms 
iter 8190: loss 2.6863, time 5250.40ms 
iter 8191: loss 2.5379, time 5259.66ms 
iter 8192: loss 2.5302, time 5258.69ms 
iter 8193: loss 2.4341, time 5260.43ms 
iter 8194: loss 2.4198, time 5254.62ms 
iter 8195: loss 2.3819, time 5251.34ms 
iter 8196: loss 2.4982, time 5228.41ms 
iter 8197: loss 2.5077, time 5273.09ms 
iter 8198: loss 2.4608, time 5259.91ms 
iter 8199: loss 2.4085, time 5251.90ms 
step 8200: train loss 2.4837, val loss 2.8332
iter 8200: loss 2.5059, time 20175.74ms 
iter 8201: loss 2.5343, time 5271.60ms 
iter 8202: loss 2.2515, time 5250.95ms 
iter 8203: loss 2.3855, time 5259.32ms 
iter 8204: loss 2.5681, time 5270.82ms 
iter 8205: loss 2.5914, time 5270.79ms 
iter 8206: loss 2.6742, time 5252.73ms 
iter 8207: loss 2.2133, time 5253.44ms 
iter 8208: loss 2.5176, time 5263.13ms 
iter 8209: loss 2.2624, time 5266.83ms 
iter 8210: loss 2.5604, time 5252.03ms 
iter 8211: loss 2.6120, time 5251.62ms 
iter 8212: loss 2.4482, time 5258.58ms 
iter 8213: loss 2.5255, time 5259.98ms 
iter 8214: loss 2.4677, time 5256.17ms 
iter 8215: loss 2.5862, time 5249.77ms 
iter 8216: loss 2.6770, time 5264.33ms 
iter 8217: loss 2.3929, time 5272.18ms 
iter 8218: loss 2.6043, time 5262.59ms 
iter 8219: loss 2.4323, time 5253.46ms 
iter 8220: loss 2.7380, time 5256.10ms 
iter 8221: loss 2.3490, time 5267.52ms 
iter 8222: loss 2.3814, time 5264.09ms 
iter 8223: loss 2.5057, time 5254.19ms 
iter 8224: loss 2.5764, time 5256.72ms 
iter 8225: loss 2.6513, time 5272.60ms 
iter 8226: loss 2.5212, time 5267.17ms 
iter 8227: loss 2.4245, time 5249.55ms 
iter 8228: loss 2.6329, time 5253.07ms 
iter 8229: loss 2.3860, time 5271.47ms 
iter 8230: loss 2.3812, time 5252.36ms 
iter 8231: loss 2.7190, time 5261.68ms 
iter 8232: loss 2.4361, time 5261.41ms 
iter 8233: loss 2.5603, time 5276.82ms 
iter 8234: loss 2.4486, time 5267.80ms 
iter 8235: loss 2.2955, time 5327.01ms 
iter 8236: loss 2.4087, time 5321.69ms 
iter 8237: loss 2.5195, time 5296.21ms 
iter 8238: loss 2.4235, time 5262.71ms 
iter 8239: loss 2.7809, time 5261.78ms 
iter 8240: loss 2.3989, time 5262.70ms 
iter 8241: loss 2.5496, time 5278.94ms 
iter 8242: loss 2.6446, time 5260.18ms 
iter 8243: loss 2.5402, time 5262.95ms 
iter 8244: loss 2.5490, time 5269.70ms 
iter 8245: loss 2.4090, time 5271.57ms 
iter 8246: loss 2.4667, time 5262.21ms 
iter 8247: loss 2.5798, time 5270.89ms 
iter 8248: loss 2.6014, time 5254.30ms 
iter 8249: loss 2.5173, time 5259.41ms 
step 8250: train loss 2.4736, val loss 2.8509
iter 8250: loss 2.5348, time 20224.70ms 
iter 8251: loss 2.3708, time 5249.27ms 
iter 8252: loss 2.4909, time 5259.17ms 
iter 8253: loss 2.3038, time 5261.31ms 
iter 8254: loss 2.4890, time 5259.62ms 
iter 8255: loss 2.3784, time 5252.36ms 
iter 8256: loss 2.4063, time 5251.79ms 
iter 8257: loss 2.5926, time 5258.67ms 
iter 8258: loss 2.3180, time 5260.14ms 
iter 8259: loss 2.5451, time 5246.85ms 
iter 8260: loss 2.4130, time 5250.18ms 
iter 8261: loss 2.5608, time 5275.61ms 
iter 8262: loss 2.4582, time 5276.50ms 
iter 8263: loss 2.4300, time 5262.54ms 
iter 8264: loss 2.2670, time 5262.71ms 
iter 8265: loss 2.4013, time 5255.81ms 
iter 8266: loss 2.3593, time 5269.79ms 
iter 8267: loss 2.3613, time 5256.84ms 
iter 8268: loss 2.3710, time 5255.89ms 
iter 8269: loss 2.5712, time 5260.82ms 
iter 8270: loss 2.3187, time 5272.11ms 
iter 8271: loss 2.6057, time 5282.14ms 
iter 8272: loss 2.4488, time 5256.43ms 
iter 8273: loss 2.4791, time 5256.81ms 
iter 8274: loss 2.4660, time 5262.08ms 
iter 8275: loss 2.1892, time 5250.74ms 
iter 8276: loss 2.3071, time 5259.38ms 
iter 8277: loss 2.5774, time 5258.08ms 
iter 8278: loss 2.4213, time 5256.60ms 
iter 8279: loss 2.2668, time 5249.59ms 
iter 8280: loss 2.5729, time 5259.85ms 
iter 8281: loss 2.4493, time 5260.09ms 
iter 8282: loss 2.1912, time 5248.81ms 
iter 8283: loss 2.6628, time 5236.47ms 
iter 8284: loss 2.1904, time 5245.27ms 
iter 8285: loss 2.3535, time 5379.88ms 
iter 8286: loss 2.5033, time 5414.79ms 
iter 8287: loss 2.4555, time 5393.92ms 
iter 8288: loss 2.3459, time 5388.91ms 
iter 8289: loss 2.4945, time 5402.99ms 
iter 8290: loss 2.5446, time 5398.19ms 
iter 8291: loss 2.5332, time 5378.42ms 
iter 8292: loss 2.5220, time 5379.96ms 
iter 8293: loss 2.6795, time 5381.44ms 
iter 8294: loss 2.4021, time 5419.07ms 
iter 8295: loss 2.4341, time 5384.91ms 
iter 8296: loss 2.5771, time 5253.41ms 
iter 8297: loss 2.5937, time 5271.52ms 
iter 8298: loss 2.5854, time 5264.90ms 
iter 8299: loss 2.4157, time 5320.54ms 
step 8300: train loss 2.4777, val loss 2.8371
iter 8300: loss 2.6705, time 20514.10ms 
iter 8301: loss 2.5076, time 5382.28ms 
iter 8302: loss 2.5006, time 5398.39ms 
iter 8303: loss 2.4297, time 5352.08ms 
iter 8304: loss 2.5595, time 5262.29ms 
iter 8305: loss 2.4757, time 5268.20ms 
iter 8306: loss 2.2260, time 5246.53ms 
iter 8307: loss 2.8892, time 5259.98ms 
iter 8308: loss 2.5700, time 5272.56ms 
iter 8309: loss 2.3851, time 5263.59ms 
iter 8310: loss 2.6406, time 5257.44ms 
iter 8311: loss 2.4428, time 5256.17ms 
iter 8312: loss 2.5042, time 5258.67ms 
iter 8313: loss 2.3596, time 5256.46ms 
iter 8314: loss 2.4855, time 5267.11ms 
iter 8315: loss 2.3048, time 5269.98ms 
iter 8316: loss 2.4236, time 5298.64ms 
iter 8317: loss 2.6048, time 5279.77ms 
iter 8318: loss 2.6705, time 5271.12ms 
iter 8319: loss 2.7121, time 5271.78ms 
iter 8320: loss 2.3099, time 5259.53ms 
iter 8321: loss 2.2818, time 5259.45ms 
iter 8322: loss 2.1764, time 5251.46ms 
iter 8323: loss 2.3496, time 5255.23ms 
iter 8324: loss 2.4322, time 5266.16ms 
iter 8325: loss 2.4701, time 5254.99ms 
iter 8326: loss 2.4591, time 5276.32ms 
iter 8327: loss 2.5042, time 5265.11ms 
iter 8328: loss 2.4563, time 5265.31ms 
iter 8329: loss 2.3330, time 5253.19ms 
iter 8330: loss 2.1733, time 5272.70ms 
iter 8331: loss 2.3259, time 5230.51ms 
iter 8332: loss 2.4884, time 5279.18ms 
iter 8333: loss 2.5505, time 5263.42ms 
iter 8334: loss 2.4177, time 5257.12ms 
iter 8335: loss 2.6199, time 5220.60ms 
iter 8336: loss 2.1027, time 5274.19ms 
iter 8337: loss 2.5216, time 5248.76ms 
iter 8338: loss 2.3603, time 5272.63ms 
iter 8339: loss 2.6587, time 5280.01ms 
iter 8340: loss 2.5010, time 5272.48ms 
iter 8341: loss 2.5657, time 5266.56ms 
iter 8342: loss 2.4379, time 5263.06ms 
iter 8343: loss 2.6291, time 5269.59ms 
iter 8344: loss 2.3148, time 5263.29ms 
iter 8345: loss 2.3537, time 5258.22ms 
iter 8346: loss 2.5425, time 5255.50ms 
iter 8347: loss 2.4384, time 5259.23ms 
iter 8348: loss 2.5555, time 5250.93ms 
iter 8349: loss 2.3527, time 5232.92ms 
step 8350: train loss 2.4849, val loss 2.8395
iter 8350: loss 2.5037, time 20042.54ms 
iter 8351: loss 2.5672, time 5261.11ms 
iter 8352: loss 2.4805, time 5246.15ms 
iter 8353: loss 2.6081, time 5257.35ms 
iter 8354: loss 2.1877, time 5256.25ms 
iter 8355: loss 2.4079, time 5255.56ms 
iter 8356: loss 2.6061, time 5243.66ms 
iter 8357: loss 2.3245, time 5247.57ms 
iter 8358: loss 2.4470, time 5267.76ms 
iter 8359: loss 2.4184, time 5257.86ms 
iter 8360: loss 2.5284, time 5247.38ms 
iter 8361: loss 2.3957, time 5244.61ms 
iter 8362: loss 2.6735, time 5265.66ms 
iter 8363: loss 2.4317, time 5254.36ms 
iter 8364: loss 2.5487, time 5250.22ms 
iter 8365: loss 2.6777, time 5250.70ms 
iter 8366: loss 2.4445, time 5262.74ms 
iter 8367: loss 2.5875, time 5244.90ms 
iter 8368: loss 2.5316, time 5247.71ms 
iter 8369: loss 2.3898, time 5259.06ms 
iter 8370: loss 2.4477, time 5252.12ms 
iter 8371: loss 2.6452, time 5250.09ms 
iter 8372: loss 2.4260, time 5254.72ms 
iter 8373: loss 2.3452, time 5257.82ms 
iter 8374: loss 2.4248, time 5248.02ms 
iter 8375: loss 2.4797, time 5245.48ms 
iter 8376: loss 2.2611, time 5247.44ms 
iter 8377: loss 2.4162, time 5265.60ms 
iter 8378: loss 2.5630, time 5244.82ms 
iter 8379: loss 2.4622, time 5247.62ms 
iter 8380: loss 2.4126, time 5263.63ms 
iter 8381: loss 2.6171, time 5251.33ms 
iter 8382: loss 2.5200, time 5258.65ms 
iter 8383: loss 2.4416, time 5268.82ms 
iter 8384: loss 2.5945, time 5270.15ms 
iter 8385: loss 2.6438, time 5265.57ms 
iter 8386: loss 2.2993, time 5263.99ms 
iter 8387: loss 2.4257, time 5263.55ms 
iter 8388: loss 2.5865, time 5280.92ms 
iter 8389: loss 2.3360, time 5249.75ms 
iter 8390: loss 2.4349, time 5246.36ms 
iter 8391: loss 2.6082, time 5254.23ms 
iter 8392: loss 2.3801, time 5255.99ms 
iter 8393: loss 2.6727, time 5247.72ms 
iter 8394: loss 2.3906, time 5245.89ms 
iter 8395: loss 2.4390, time 5261.06ms 
iter 8396: loss 2.4470, time 5253.82ms 
iter 8397: loss 2.6461, time 5245.75ms 
iter 8398: loss 2.5236, time 5253.18ms 
iter 8399: loss 2.3329, time 5250.74ms 
step 8400: train loss 2.4872, val loss 2.8292
iter 8400: loss 2.6454, time 20067.29ms 
iter 8401: loss 2.3044, time 5246.25ms 
iter 8402: loss 2.3551, time 5244.63ms 
iter 8403: loss 2.5237, time 5242.49ms 
iter 8404: loss 2.6221, time 5273.96ms 
iter 8405: loss 2.4736, time 5258.21ms 
iter 8406: loss 2.4006, time 5257.78ms 
iter 8407: loss 2.6183, time 5264.32ms 
iter 8408: loss 2.4685, time 5277.06ms 
iter 8409: loss 2.5126, time 5253.45ms 
iter 8410: loss 2.6700, time 5248.18ms 
iter 8411: loss 2.6853, time 5254.44ms 
iter 8412: loss 2.6033, time 5262.09ms 
iter 8413: loss 2.5815, time 5252.68ms 
iter 8414: loss 2.4014, time 5252.07ms 
iter 8415: loss 2.6037, time 5255.43ms 
iter 8416: loss 2.5382, time 5259.55ms 
iter 8417: loss 2.5573, time 5264.09ms 
iter 8418: loss 2.6798, time 5257.60ms 
iter 8419: loss 2.6360, time 5269.14ms 
iter 8420: loss 2.4988, time 5279.09ms 
iter 8421: loss 2.2380, time 5263.07ms 
iter 8422: loss 2.5586, time 5265.71ms 
iter 8423: loss 2.4786, time 5271.17ms 
iter 8424: loss 2.3554, time 5281.97ms 
iter 8425: loss 2.2422, time 5260.75ms 
iter 8426: loss 2.4765, time 5257.27ms 
iter 8427: loss 2.3667, time 5257.73ms 
iter 8428: loss 2.3150, time 5268.49ms 
iter 8429: loss 2.5968, time 5263.43ms 
iter 8430: loss 2.5080, time 5246.17ms 
iter 8431: loss 2.2200, time 5260.48ms 
iter 8432: loss 2.4714, time 5268.69ms 
iter 8433: loss 2.2778, time 5254.51ms 
iter 8434: loss 2.3083, time 5252.62ms 
iter 8435: loss 2.3819, time 5269.06ms 
iter 8436: loss 2.7675, time 5260.37ms 
iter 8437: loss 2.4371, time 5265.36ms 
iter 8438: loss 2.4390, time 5261.06ms 
iter 8439: loss 2.4451, time 5355.33ms 
iter 8440: loss 2.2962, time 5399.09ms 
iter 8441: loss 2.5204, time 5383.44ms 
iter 8442: loss 2.4899, time 5264.61ms 
iter 8443: loss 2.4098, time 5263.20ms 
iter 8444: loss 2.5105, time 5270.38ms 
iter 8445: loss 2.5585, time 5271.50ms 
iter 8446: loss 2.5528, time 5279.04ms 
iter 8447: loss 2.4600, time 5409.39ms 
iter 8448: loss 2.3056, time 5383.89ms 
iter 8449: loss 2.3564, time 5390.60ms 
step 8450: train loss 2.4868, val loss 2.8580
iter 8450: loss 2.5759, time 20409.02ms 
iter 8451: loss 2.2416, time 5260.93ms 
iter 8452: loss 2.6866, time 5269.82ms 
iter 8453: loss 2.6912, time 5269.20ms 
iter 8454: loss 2.3616, time 5286.77ms 
iter 8455: loss 2.6358, time 5277.00ms 
iter 8456: loss 2.4127, time 5274.70ms 
iter 8457: loss 2.3989, time 5255.89ms 
iter 8458: loss 2.6731, time 5288.23ms 
iter 8459: loss 2.3725, time 5272.08ms 
iter 8460: loss 2.5220, time 5261.82ms 
iter 8461: loss 2.2750, time 5253.76ms 
iter 8462: loss 2.7338, time 5264.87ms 
iter 8463: loss 2.5316, time 5273.06ms 
iter 8464: loss 2.5526, time 5264.95ms 
iter 8465: loss 2.6579, time 5266.40ms 
iter 8466: loss 2.1002, time 5275.64ms 
iter 8467: loss 2.5988, time 5234.62ms 
iter 8468: loss 2.6258, time 5266.50ms 
iter 8469: loss 2.5979, time 5255.43ms 
iter 8470: loss 2.4619, time 5277.72ms 
iter 8471: loss 2.5266, time 5257.98ms 
iter 8472: loss 2.6783, time 5249.41ms 
iter 8473: loss 2.5872, time 5241.02ms 
iter 8474: loss 2.5105, time 5233.26ms 
iter 8475: loss 2.5862, time 5255.11ms 
iter 8476: loss 2.2315, time 5265.87ms 
iter 8477: loss 2.5817, time 5265.59ms 
iter 8478: loss 2.5800, time 5261.27ms 
iter 8479: loss 2.6239, time 5255.84ms 
iter 8480: loss 2.3752, time 5257.38ms 
iter 8481: loss 2.6750, time 5272.24ms 
iter 8482: loss 2.4566, time 5256.78ms 
iter 8483: loss 2.4624, time 5261.83ms 
iter 8484: loss 2.5395, time 5266.61ms 
iter 8485: loss 2.4435, time 5271.83ms 
iter 8486: loss 2.5122, time 5248.91ms 
iter 8487: loss 2.4003, time 5234.94ms 
iter 8488: loss 2.5715, time 5265.30ms 
iter 8489: loss 2.5165, time 5264.86ms 
iter 8490: loss 2.5167, time 5261.89ms 
iter 8491: loss 2.4111, time 5216.62ms 
iter 8492: loss 2.4228, time 5256.44ms 
iter 8493: loss 2.4605, time 5266.15ms 
iter 8494: loss 2.4326, time 5259.96ms 
iter 8495: loss 2.3496, time 5259.50ms 
iter 8496: loss 2.4648, time 5277.89ms 
iter 8497: loss 2.2710, time 5265.67ms 
iter 8498: loss 2.4611, time 5263.96ms 
iter 8499: loss 2.6306, time 5268.28ms 
step 8500: train loss 2.4768, val loss 2.8366
iter 8500: loss 2.4749, time 20111.42ms 
iter 8501: loss 2.2673, time 5247.37ms 
iter 8502: loss 2.3102, time 5256.26ms 
iter 8503: loss 2.6789, time 5274.64ms 
iter 8504: loss 2.4084, time 5264.52ms 
iter 8505: loss 2.3144, time 5334.96ms 
iter 8506: loss 2.5334, time 5290.55ms 
iter 8507: loss 2.4185, time 5312.90ms 
iter 8508: loss 2.2321, time 5260.33ms 
iter 8509: loss 2.5784, time 5258.60ms 
iter 8510: loss 2.4510, time 5253.10ms 
iter 8511: loss 2.3966, time 5274.78ms 
iter 8512: loss 2.5434, time 5335.80ms 
iter 8513: loss 2.4055, time 5299.45ms 
iter 8514: loss 2.3436, time 5298.46ms 
iter 8515: loss 2.5333, time 5271.57ms 
iter 8516: loss 2.4147, time 5267.10ms 
iter 8517: loss 2.3750, time 5263.21ms 
iter 8518: loss 2.4470, time 5270.28ms 
iter 8519: loss 2.3391, time 5256.91ms 
iter 8520: loss 2.6946, time 5262.02ms 
iter 8521: loss 2.4932, time 5259.20ms 
iter 8522: loss 2.3394, time 5238.24ms 
iter 8523: loss 2.3692, time 5247.15ms 
iter 8524: loss 2.4462, time 5260.97ms 
iter 8525: loss 2.3379, time 5257.31ms 
iter 8526: loss 2.5750, time 5270.29ms 
iter 8527: loss 2.4859, time 5254.48ms 
iter 8528: loss 2.2953, time 5258.96ms 
iter 8529: loss 2.6716, time 5265.41ms 
iter 8530: loss 2.4596, time 5256.11ms 
iter 8531: loss 2.4673, time 5251.88ms 
iter 8532: loss 2.5449, time 5248.15ms 
iter 8533: loss 2.2832, time 5264.23ms 
iter 8534: loss 2.4226, time 5300.18ms 
iter 8535: loss 2.7582, time 5252.00ms 
iter 8536: loss 2.4246, time 5253.19ms 
iter 8537: loss 2.4131, time 5259.86ms 
iter 8538: loss 2.5934, time 5250.50ms 
iter 8539: loss 2.4249, time 5259.24ms 
iter 8540: loss 2.3297, time 5261.85ms 
iter 8541: loss 2.5179, time 5252.03ms 
iter 8542: loss 2.5252, time 5253.70ms 
iter 8543: loss 2.4250, time 5254.62ms 
iter 8544: loss 2.4067, time 5255.57ms 
iter 8545: loss 2.3733, time 5255.54ms 
iter 8546: loss 2.3964, time 5257.30ms 
iter 8547: loss 2.5434, time 5254.63ms 
iter 8548: loss 2.3639, time 5304.68ms 
iter 8549: loss 2.5193, time 5251.44ms 
step 8550: train loss 2.4735, val loss 2.8410
iter 8550: loss 2.5720, time 20053.74ms 
iter 8551: loss 2.4033, time 5253.40ms 
iter 8552: loss 2.2960, time 5267.79ms 
iter 8553: loss 2.6784, time 5226.62ms 
iter 8554: loss 2.3714, time 5250.30ms 
iter 8555: loss 2.5750, time 5260.49ms 
iter 8556: loss 2.3695, time 5269.59ms 
iter 8557: loss 2.6038, time 5263.18ms 
iter 8558: loss 2.3528, time 5256.37ms 
iter 8559: loss 2.4238, time 5262.28ms 
iter 8560: loss 2.5503, time 5273.09ms 
iter 8561: loss 2.4428, time 5273.82ms 
iter 8562: loss 2.3264, time 5270.37ms 
iter 8563: loss 2.2650, time 5264.43ms 
iter 8564: loss 2.6853, time 5269.97ms 
iter 8565: loss 2.3357, time 5288.40ms 
iter 8566: loss 2.5800, time 5266.88ms 
iter 8567: loss 2.6584, time 5261.38ms 
iter 8568: loss 2.4942, time 5258.86ms 
iter 8569: loss 2.5887, time 5258.21ms 
iter 8570: loss 2.4485, time 5258.37ms 
iter 8571: loss 2.4909, time 5255.13ms 
iter 8572: loss 2.5356, time 5274.99ms 
iter 8573: loss 2.4461, time 5268.14ms 
iter 8574: loss 2.5434, time 5303.41ms 
iter 8575: loss 2.4855, time 5298.50ms 
iter 8576: loss 2.4025, time 5290.63ms 
iter 8577: loss 2.5368, time 5295.20ms 
iter 8578: loss 2.3844, time 5277.79ms 
iter 8579: loss 2.6514, time 5263.70ms 
iter 8580: loss 2.2749, time 5296.06ms 
iter 8581: loss 2.4039, time 5280.52ms 
iter 8582: loss 2.6110, time 5271.78ms 
iter 8583: loss 2.5604, time 5286.86ms 
iter 8584: loss 2.4242, time 5258.59ms 
iter 8585: loss 2.2623, time 5255.75ms 
iter 8586: loss 2.3835, time 5257.53ms 
iter 8587: loss 2.4191, time 5259.41ms 
iter 8588: loss 2.2269, time 5260.78ms 
iter 8589: loss 2.4368, time 5313.58ms 
iter 8590: loss 2.6751, time 5327.93ms 
iter 8591: loss 2.4353, time 5268.69ms 
iter 8592: loss 2.2612, time 5233.97ms 
iter 8593: loss 2.5490, time 5259.53ms 
iter 8594: loss 2.3796, time 5270.22ms 
iter 8595: loss 2.5837, time 5251.42ms 
iter 8596: loss 2.2861, time 5257.90ms 
iter 8597: loss 2.4213, time 5286.02ms 
iter 8598: loss 2.5824, time 5303.72ms 
iter 8599: loss 2.5752, time 5269.01ms 
step 8600: train loss 2.4740, val loss 2.8547
iter 8600: loss 2.2382, time 20097.52ms 
iter 8601: loss 2.7578, time 5249.67ms 
iter 8602: loss 2.4456, time 5257.04ms 
iter 8603: loss 2.5775, time 5268.85ms 
iter 8604: loss 2.4527, time 5256.39ms 
iter 8605: loss 2.3521, time 5317.18ms 
iter 8606: loss 2.4632, time 5322.08ms 
iter 8607: loss 2.5890, time 5315.88ms 
iter 8608: loss 2.5780, time 5303.00ms 
iter 8609: loss 2.8187, time 5276.79ms 
iter 8610: loss 2.4212, time 5249.10ms 
iter 8611: loss 2.4657, time 5262.96ms 
iter 8612: loss 2.2533, time 5253.67ms 
iter 8613: loss 2.6484, time 5259.26ms 
iter 8614: loss 2.5908, time 5266.54ms 
iter 8615: loss 2.5419, time 5228.97ms 
iter 8616: loss 2.4516, time 5257.14ms 
iter 8617: loss 2.7139, time 5256.47ms 
iter 8618: loss 2.3968, time 5263.90ms 
iter 8619: loss 2.4653, time 5248.22ms 
iter 8620: loss 2.6068, time 5251.76ms 
iter 8621: loss 2.6470, time 5254.01ms 
iter 8622: loss 2.5632, time 5266.51ms 
iter 8623: loss 2.3727, time 5247.25ms 
iter 8624: loss 2.4060, time 5254.89ms 
iter 8625: loss 2.3324, time 5261.94ms 
iter 8626: loss 2.5669, time 5256.76ms 
iter 8627: loss 2.4404, time 5255.65ms 
iter 8628: loss 2.5491, time 5252.33ms 
iter 8629: loss 2.5286, time 5268.17ms 
iter 8630: loss 2.3561, time 5255.74ms 
iter 8631: loss 2.4266, time 5257.32ms 
iter 8632: loss 2.5741, time 5273.04ms 
iter 8633: loss 2.4636, time 5282.49ms 
iter 8634: loss 2.5654, time 5248.92ms 
iter 8635: loss 2.5105, time 5255.68ms 
iter 8636: loss 2.8216, time 5271.44ms 
iter 8637: loss 2.5185, time 5258.89ms 
iter 8638: loss 2.4531, time 5256.36ms 
iter 8639: loss 2.4068, time 5250.39ms 
iter 8640: loss 2.4184, time 5268.99ms 
iter 8641: loss 2.3570, time 5257.01ms 
iter 8642: loss 2.3641, time 5259.69ms 
iter 8643: loss 2.3198, time 5254.96ms 
iter 8644: loss 2.5311, time 5267.67ms 
iter 8645: loss 2.6081, time 5241.44ms 
iter 8646: loss 2.5521, time 5258.59ms 
iter 8647: loss 2.2987, time 5282.04ms 
iter 8648: loss 2.5915, time 5264.67ms 
iter 8649: loss 2.6654, time 5258.16ms 
step 8650: train loss 2.4786, val loss 2.8425
iter 8650: loss 2.5374, time 20002.62ms 
iter 8651: loss 2.4421, time 5267.99ms 
iter 8652: loss 2.5542, time 5260.48ms 
iter 8653: loss 2.4956, time 5267.86ms 
iter 8654: loss 2.4595, time 5281.58ms 
iter 8655: loss 2.4791, time 5270.22ms 
iter 8656: loss 2.6051, time 5289.78ms 
iter 8657: loss 2.4264, time 5273.00ms 
iter 8658: loss 2.5032, time 5289.44ms 
iter 8659: loss 2.4806, time 5258.51ms 
iter 8660: loss 2.5078, time 5258.66ms 
iter 8661: loss 2.3956, time 5277.84ms 
iter 8662: loss 2.7588, time 5262.11ms 
iter 8663: loss 2.4941, time 5255.60ms 
iter 8664: loss 2.3664, time 5269.34ms 
iter 8665: loss 2.6818, time 5260.39ms 
iter 8666: loss 2.5554, time 5265.39ms 
iter 8667: loss 2.5056, time 5263.26ms 
iter 8668: loss 2.3220, time 5260.33ms 
iter 8669: loss 2.3865, time 5269.32ms 
iter 8670: loss 2.3257, time 5262.76ms 
iter 8671: loss 2.6168, time 5255.92ms 
iter 8672: loss 2.4614, time 5251.92ms 
iter 8673: loss 2.3466, time 5272.52ms 
iter 8674: loss 2.2806, time 5259.79ms 
iter 8675: loss 2.5701, time 5250.91ms 
iter 8676: loss 2.3460, time 5254.62ms 
iter 8677: loss 2.4129, time 5220.08ms 
iter 8678: loss 2.5239, time 5246.02ms 
iter 8679: loss 2.4030, time 5256.79ms 
iter 8680: loss 2.6164, time 5252.89ms 
iter 8681: loss 2.3887, time 5250.65ms 
iter 8682: loss 2.6238, time 5254.01ms 
iter 8683: loss 2.3824, time 5256.80ms 
iter 8684: loss 2.1540, time 5253.67ms 
iter 8685: loss 2.6560, time 5253.84ms 
iter 8686: loss 2.4470, time 5253.40ms 
iter 8687: loss 2.6063, time 5263.34ms 
iter 8688: loss 2.4054, time 5267.95ms 
iter 8689: loss 2.4833, time 5262.20ms 
iter 8690: loss 2.5396, time 5258.77ms 
iter 8691: loss 2.6038, time 5232.09ms 
iter 8692: loss 2.3595, time 5261.36ms 
iter 8693: loss 2.5132, time 5262.71ms 
iter 8694: loss 2.3539, time 5248.16ms 
iter 8695: loss 2.6973, time 5264.38ms 
iter 8696: loss 2.2916, time 5276.96ms 
iter 8697: loss 2.5251, time 5278.45ms 
iter 8698: loss 2.4582, time 5259.21ms 
iter 8699: loss 2.4060, time 5246.45ms 
step 8700: train loss 2.4773, val loss 2.8502
iter 8700: loss 2.4502, time 20057.70ms 
iter 8701: loss 2.6437, time 5250.18ms 
iter 8702: loss 2.6327, time 5195.77ms 
iter 8703: loss 2.4147, time 5076.80ms 
iter 8704: loss 2.5179, time 5167.05ms 
iter 8705: loss 2.3181, time 5269.44ms 
iter 8706: loss 2.4977, time 5266.95ms 
iter 8707: loss 2.4561, time 5261.25ms 
iter 8708: loss 2.6385, time 5269.26ms 
iter 8709: loss 2.6106, time 5257.62ms 
iter 8710: loss 2.3553, time 5256.15ms 
iter 8711: loss 2.3898, time 5264.69ms 
iter 8712: loss 2.3933, time 5266.79ms 
iter 8713: loss 2.2853, time 5205.49ms 
iter 8714: loss 2.5015, time 5070.55ms 
iter 8715: loss 2.3752, time 5181.30ms 
iter 8716: loss 2.6068, time 5271.23ms 
iter 8717: loss 2.3678, time 5255.10ms 
iter 8718: loss 2.3463, time 5221.12ms 
iter 8719: loss 2.4749, time 5255.33ms 
iter 8720: loss 2.6221, time 5283.47ms 
iter 8721: loss 2.2545, time 5279.99ms 
iter 8722: loss 2.6324, time 5292.40ms 
iter 8723: loss 2.6043, time 5287.46ms 
iter 8724: loss 2.5108, time 5290.63ms 
iter 8725: loss 2.1476, time 5251.60ms 
iter 8726: loss 2.5328, time 5282.68ms 
iter 8727: loss 2.4250, time 5259.98ms 
iter 8728: loss 2.4554, time 5266.57ms 
iter 8729: loss 2.5599, time 5261.90ms 
iter 8730: loss 2.4340, time 5257.52ms 
iter 8731: loss 2.5288, time 5262.37ms 
iter 8732: loss 2.3963, time 5256.59ms 
iter 8733: loss 2.3687, time 5261.58ms 
iter 8734: loss 2.6083, time 5243.73ms 
iter 8735: loss 2.3447, time 5275.76ms 
iter 8736: loss 2.7990, time 5120.30ms 
iter 8737: loss 2.2440, time 5117.29ms 
iter 8738: loss 2.2671, time 5085.17ms 
iter 8739: loss 2.3403, time 5087.80ms 
iter 8740: loss 2.5185, time 5070.37ms 
iter 8741: loss 2.5611, time 5110.37ms 
iter 8742: loss 2.2835, time 5085.21ms 
iter 8743: loss 2.4130, time 5176.28ms 
iter 8744: loss 2.4475, time 5275.51ms 
iter 8745: loss 2.4089, time 5265.45ms 
iter 8746: loss 2.4077, time 5249.91ms 
iter 8747: loss 2.5606, time 5257.07ms 
iter 8748: loss 2.5715, time 5270.87ms 
iter 8749: loss 2.2739, time 5244.54ms 
step 8750: train loss 2.4650, val loss 2.8593
iter 8750: loss 2.4243, time 20014.24ms 
iter 8751: loss 2.5573, time 5259.24ms 
iter 8752: loss 2.4743, time 5250.63ms 
iter 8753: loss 2.5067, time 5254.10ms 
iter 8754: loss 2.1172, time 5256.63ms 
iter 8755: loss 2.3981, time 5263.31ms 
iter 8756: loss 2.5694, time 5272.84ms 
iter 8757: loss 2.2858, time 5256.57ms 
iter 8758: loss 2.5915, time 5252.96ms 
iter 8759: loss 2.4432, time 5270.63ms 
iter 8760: loss 2.2350, time 5274.83ms 
iter 8761: loss 2.4546, time 5263.60ms 
iter 8762: loss 2.5293, time 5256.01ms 
iter 8763: loss 2.5957, time 5273.97ms 
iter 8764: loss 2.4163, time 5258.31ms 
iter 8765: loss 2.5474, time 5261.22ms 
iter 8766: loss 2.4012, time 5256.29ms 
iter 8767: loss 2.5512, time 5269.70ms 
iter 8768: loss 2.7562, time 5259.70ms 
iter 8769: loss 2.2724, time 5257.86ms 
iter 8770: loss 2.5065, time 5272.18ms 
iter 8771: loss 2.6221, time 5262.08ms 
iter 8772: loss 2.5333, time 5251.67ms 
iter 8773: loss 2.6423, time 5259.99ms 
iter 8774: loss 2.4021, time 5273.87ms 
iter 8775: loss 2.3606, time 5261.97ms 
iter 8776: loss 2.4081, time 5263.57ms 
iter 8777: loss 2.4509, time 5265.97ms 
iter 8778: loss 2.3854, time 5241.25ms 
iter 8779: loss 2.5200, time 5224.13ms 
iter 8780: loss 2.5525, time 5261.42ms 
iter 8781: loss 2.4255, time 5274.75ms 
iter 8782: loss 2.4312, time 5251.73ms 
iter 8783: loss 2.5827, time 5263.00ms 
iter 8784: loss 2.5202, time 5255.73ms 
iter 8785: loss 2.4811, time 5282.68ms 
iter 8786: loss 2.5347, time 5248.54ms 
iter 8787: loss 2.6449, time 5268.09ms 
iter 8788: loss 2.4384, time 5234.86ms 
iter 8789: loss 2.4691, time 5258.71ms 
iter 8790: loss 2.3428, time 5259.13ms 
iter 8791: loss 2.4456, time 5251.57ms 
iter 8792: loss 2.6799, time 5268.89ms 
iter 8793: loss 2.5043, time 5252.10ms 
iter 8794: loss 2.5077, time 5191.04ms 
iter 8795: loss 2.4179, time 5259.92ms 
iter 8796: loss 2.4522, time 5263.24ms 
iter 8797: loss 2.5371, time 5256.77ms 
iter 8798: loss 2.7900, time 5188.13ms 
iter 8799: loss 2.4749, time 5266.80ms 
step 8800: train loss 2.4622, val loss 2.8379
iter 8800: loss 2.4628, time 20049.22ms 
iter 8801: loss 2.4173, time 5073.68ms 
iter 8802: loss 2.2730, time 5023.49ms 
iter 8803: loss 2.3062, time 5037.30ms 
iter 8804: loss 2.5501, time 5111.14ms 
iter 8805: loss 2.5193, time 5261.28ms 
iter 8806: loss 2.6619, time 5226.07ms 
iter 8807: loss 2.2905, time 5237.37ms 
iter 8808: loss 2.2885, time 5248.24ms 
iter 8809: loss 2.4818, time 5263.12ms 
iter 8810: loss 2.4332, time 5269.22ms 
iter 8811: loss 2.4113, time 5233.04ms 
iter 8812: loss 2.2115, time 5189.74ms 
iter 8813: loss 2.6613, time 5262.55ms 
iter 8814: loss 2.3774, time 5265.91ms 
iter 8815: loss 2.4517, time 5253.28ms 
iter 8816: loss 2.6825, time 5260.60ms 
iter 8817: loss 2.2439, time 5176.32ms 
iter 8818: loss 2.3415, time 5286.02ms 
iter 8819: loss 2.6231, time 5268.43ms 
iter 8820: loss 2.4968, time 5272.03ms 
iter 8821: loss 2.4713, time 5260.36ms 
iter 8822: loss 2.4509, time 5281.24ms 
iter 8823: loss 2.3082, time 5261.13ms 
iter 8824: loss 2.4318, time 5237.57ms 
iter 8825: loss 2.4636, time 5256.57ms 
iter 8826: loss 2.4500, time 5259.62ms 
iter 8827: loss 2.4063, time 5250.37ms 
iter 8828: loss 2.5320, time 5248.03ms 
iter 8829: loss 2.5369, time 5191.52ms 
iter 8830: loss 2.4502, time 5264.11ms 
iter 8831: loss 2.5364, time 5253.37ms 
iter 8832: loss 2.3489, time 5247.97ms 
iter 8833: loss 2.3740, time 5226.12ms 
iter 8834: loss 2.4404, time 5239.26ms 
iter 8835: loss 2.1811, time 5228.12ms 
iter 8836: loss 2.3879, time 5258.46ms 
iter 8837: loss 2.5310, time 5260.49ms 
iter 8838: loss 2.7606, time 5258.62ms 
iter 8839: loss 2.5534, time 5170.56ms 
iter 8840: loss 2.5196, time 5259.68ms 
iter 8841: loss 2.4607, time 5255.04ms 
iter 8842: loss 2.4081, time 5268.78ms 
iter 8843: loss 2.4574, time 5265.10ms 
iter 8844: loss 2.5776, time 5257.78ms 
iter 8845: loss 2.5792, time 5260.38ms 
iter 8846: loss 2.6334, time 5265.48ms 
iter 8847: loss 2.2667, time 5257.36ms 
iter 8848: loss 2.5806, time 5251.22ms 
iter 8849: loss 2.4189, time 5251.05ms 
step 8850: train loss 2.4700, val loss 2.8413
iter 8850: loss 2.5814, time 19984.23ms 
iter 8851: loss 2.5470, time 5238.65ms 
iter 8852: loss 2.4071, time 5261.74ms 
iter 8853: loss 2.4809, time 5269.62ms 
iter 8854: loss 2.6182, time 5274.32ms 
iter 8855: loss 2.6031, time 5222.99ms 
iter 8856: loss 2.3929, time 5255.34ms 
iter 8857: loss 2.5801, time 5250.01ms 
iter 8858: loss 2.4324, time 5262.40ms 
iter 8859: loss 2.5797, time 5262.48ms 
iter 8860: loss 2.2598, time 5248.97ms 
iter 8861: loss 2.5496, time 5254.89ms 
iter 8862: loss 2.5384, time 5266.89ms 
iter 8863: loss 2.4020, time 5260.38ms 
iter 8864: loss 2.3962, time 5255.66ms 
iter 8865: loss 2.7229, time 5255.13ms 
iter 8866: loss 2.3990, time 5253.45ms 
iter 8867: loss 2.1056, time 5262.76ms 
iter 8868: loss 2.4406, time 5262.42ms 
iter 8869: loss 2.5178, time 5262.76ms 
iter 8870: loss 2.4588, time 5260.57ms 
iter 8871: loss 2.3287, time 5263.85ms 
iter 8872: loss 2.4912, time 5255.97ms 
iter 8873: loss 2.6406, time 5289.95ms 
iter 8874: loss 2.3994, time 5397.94ms 
iter 8875: loss 2.3388, time 5380.24ms 
iter 8876: loss 2.2909, time 5362.46ms 
iter 8877: loss 2.5632, time 5374.51ms 
iter 8878: loss 2.7343, time 5393.65ms 
iter 8879: loss 2.4972, time 5310.94ms 
iter 8880: loss 2.5117, time 5367.60ms 
iter 8881: loss 2.3989, time 5383.43ms 
iter 8882: loss 2.3610, time 5386.88ms 
iter 8883: loss 2.4843, time 5308.96ms 
iter 8884: loss 2.4121, time 5241.75ms 
iter 8885: loss 2.4945, time 5303.87ms 
iter 8886: loss 2.5112, time 5316.53ms 
iter 8887: loss 2.3901, time 5384.30ms 
iter 8888: loss 2.3176, time 5334.63ms 
iter 8889: loss 2.4225, time 5261.45ms 
iter 8890: loss 2.4864, time 5261.44ms 
iter 8891: loss 2.2727, time 5217.01ms 
iter 8892: loss 2.5534, time 5237.39ms 
iter 8893: loss 2.5331, time 5259.81ms 
iter 8894: loss 2.5458, time 5256.17ms 
iter 8895: loss 2.1903, time 5256.45ms 
iter 8896: loss 2.6604, time 5255.83ms 
iter 8897: loss 2.5384, time 5313.98ms 
iter 8898: loss 2.3627, time 5360.36ms 
iter 8899: loss 2.5081, time 5395.16ms 
step 8900: train loss 2.4675, val loss 2.8494
iter 8900: loss 2.6133, time 20251.41ms 
iter 8901: loss 2.6060, time 5277.98ms 
iter 8902: loss 2.4638, time 5249.80ms 
iter 8903: loss 2.3691, time 5253.36ms 
iter 8904: loss 2.5095, time 5254.81ms 
iter 8905: loss 2.2990, time 5267.32ms 
iter 8906: loss 2.5041, time 5254.17ms 
iter 8907: loss 2.5591, time 5261.48ms 
iter 8908: loss 2.5874, time 5272.90ms 
iter 8909: loss 2.4688, time 5265.87ms 
iter 8910: loss 2.5674, time 5317.88ms 
iter 8911: loss 2.2577, time 5419.83ms 
iter 8912: loss 2.6592, time 5264.23ms 
iter 8913: loss 2.5262, time 5253.72ms 
iter 8914: loss 2.2976, time 5245.51ms 
iter 8915: loss 2.4943, time 5251.65ms 
iter 8916: loss 2.5256, time 5276.42ms 
iter 8917: loss 2.4886, time 5253.35ms 
iter 8918: loss 2.5321, time 5235.21ms 
iter 8919: loss 2.3113, time 5260.14ms 
iter 8920: loss 2.5343, time 5273.66ms 
iter 8921: loss 2.2727, time 5254.74ms 
iter 8922: loss 2.3268, time 5261.77ms 
iter 8923: loss 2.5164, time 5265.52ms 
iter 8924: loss 2.5909, time 5263.96ms 
iter 8925: loss 2.4377, time 5245.01ms 
iter 8926: loss 2.4568, time 5248.17ms 
iter 8927: loss 2.2899, time 5267.30ms 
iter 8928: loss 2.6538, time 5265.07ms 
iter 8929: loss 2.6528, time 5228.23ms 
iter 8930: loss 2.6154, time 5017.50ms 
iter 8931: loss 2.6038, time 5266.26ms 
iter 8932: loss 2.4733, time 5260.95ms 
iter 8933: loss 2.6042, time 5253.21ms 
iter 8934: loss 2.5022, time 5259.15ms 
iter 8935: loss 2.3412, time 5271.92ms 
iter 8936: loss 2.4472, time 5249.45ms 
iter 8937: loss 2.5935, time 5251.65ms 
iter 8938: loss 2.5097, time 5254.65ms 
iter 8939: loss 2.3429, time 5249.12ms 
iter 8940: loss 2.5155, time 5245.82ms 
iter 8941: loss 2.3274, time 5246.91ms 
iter 8942: loss 2.3759, time 5257.51ms 
iter 8943: loss 2.5914, time 5255.58ms 
iter 8944: loss 2.5165, time 5256.73ms 
iter 8945: loss 2.3166, time 5263.77ms 
iter 8946: loss 2.4536, time 5281.47ms 
iter 8947: loss 2.6590, time 5253.89ms 
iter 8948: loss 2.5072, time 5256.48ms 
iter 8949: loss 2.4664, time 5257.27ms 
step 8950: train loss 2.4464, val loss 2.8442
iter 8950: loss 2.2452, time 20031.72ms 
iter 8951: loss 2.5588, time 5265.74ms 
iter 8952: loss 2.5200, time 5259.72ms 
iter 8953: loss 2.3419, time 5269.08ms 
iter 8954: loss 2.4480, time 5271.69ms 
iter 8955: loss 2.5237, time 5259.51ms 
iter 8956: loss 2.4199, time 5253.52ms 
iter 8957: loss 2.4195, time 5265.66ms 
iter 8958: loss 2.3958, time 5269.28ms 
iter 8959: loss 2.4299, time 5260.17ms 
iter 8960: loss 2.6871, time 5259.12ms 
iter 8961: loss 2.4385, time 5273.72ms 
iter 8962: loss 2.5503, time 5265.00ms 
iter 8963: loss 2.5463, time 5262.58ms 
iter 8964: loss 2.3346, time 5262.36ms 
iter 8965: loss 2.4048, time 5274.91ms 
iter 8966: loss 2.3354, time 5265.77ms 
iter 8967: loss 2.4191, time 5234.35ms 
iter 8968: loss 2.4061, time 5277.33ms 
iter 8969: loss 2.3593, time 5269.55ms 
iter 8970: loss 2.4046, time 5262.00ms 
iter 8971: loss 2.3482, time 5268.84ms 
iter 8972: loss 2.3842, time 5275.74ms 
iter 8973: loss 2.2307, time 5266.22ms 
iter 8974: loss 2.3771, time 5254.01ms 
iter 8975: loss 2.7261, time 5266.81ms 
iter 8976: loss 2.2796, time 5268.11ms 
iter 8977: loss 2.5234, time 5254.62ms 
iter 8978: loss 2.5949, time 5266.71ms 
iter 8979: loss 2.3013, time 5265.96ms 
iter 8980: loss 2.3478, time 5273.65ms 
iter 8981: loss 2.5526, time 5269.93ms 
iter 8982: loss 2.4935, time 5267.46ms 
iter 8983: loss 2.2856, time 5254.79ms 
iter 8984: loss 2.5901, time 5272.07ms 
iter 8985: loss 2.3476, time 5282.18ms 
iter 8986: loss 2.4735, time 5273.73ms 
iter 8987: loss 2.7293, time 5265.37ms 
iter 8988: loss 2.2773, time 5298.80ms 
iter 8989: loss 2.4290, time 5271.77ms 
iter 8990: loss 2.2451, time 5287.81ms 
iter 8991: loss 2.2991, time 5279.44ms 
iter 8992: loss 2.6672, time 5288.12ms 
iter 8993: loss 2.6378, time 5274.70ms 
iter 8994: loss 2.3468, time 5298.63ms 
iter 8995: loss 2.4181, time 5266.81ms 
iter 8996: loss 2.3746, time 5278.35ms 
iter 8997: loss 2.7494, time 5267.48ms 
iter 8998: loss 2.5415, time 5262.69ms 
iter 8999: loss 2.3312, time 5269.79ms 
step 9000: train loss 2.4568, val loss 2.8563
iter 9000: loss 2.6257, time 19999.90ms 
iter 9001: loss 2.5746, time 5256.85ms 
iter 9002: loss 2.5876, time 5247.81ms 
iter 9003: loss 2.4641, time 5240.82ms 
iter 9004: loss 2.5166, time 5267.66ms 
iter 9005: loss 2.5155, time 5263.69ms 
iter 9006: loss 2.4211, time 5254.25ms 
iter 9007: loss 2.4658, time 5255.97ms 
iter 9008: loss 2.5621, time 5256.37ms 
iter 9009: loss 2.4071, time 5250.59ms 
iter 9010: loss 2.2865, time 5251.51ms 
iter 9011: loss 2.5697, time 5271.69ms 
iter 9012: loss 2.4170, time 5267.02ms 
iter 9013: loss 2.3840, time 5259.37ms 
iter 9014: loss 2.5273, time 5254.38ms 
iter 9015: loss 2.1900, time 5271.09ms 
iter 9016: loss 2.5693, time 5253.59ms 
iter 9017: loss 2.3799, time 5259.42ms 
iter 9018: loss 2.5143, time 5260.11ms 
iter 9019: loss 2.8150, time 5266.71ms 
iter 9020: loss 2.3325, time 5280.06ms 
iter 9021: loss 2.3086, time 5266.47ms 
iter 9022: loss 2.4789, time 5264.68ms 
iter 9023: loss 2.3225, time 5276.55ms 
iter 9024: loss 2.5471, time 5268.67ms 
iter 9025: loss 2.4006, time 5293.86ms 
iter 9026: loss 2.1660, time 5277.76ms 
iter 9027: loss 2.4712, time 5275.38ms 
iter 9028: loss 2.5654, time 5259.63ms 
iter 9029: loss 2.4633, time 5254.50ms 
iter 9030: loss 2.5059, time 5264.14ms 
iter 9031: loss 2.4436, time 5278.93ms 
iter 9032: loss 2.3376, time 5260.14ms 
iter 9033: loss 2.3376, time 5276.30ms 
iter 9034: loss 2.3132, time 5263.84ms 
iter 9035: loss 2.4138, time 5273.24ms 
iter 9036: loss 2.4959, time 5265.93ms 
iter 9037: loss 2.5665, time 5258.22ms 
iter 9038: loss 2.3751, time 5275.24ms 
iter 9039: loss 2.5701, time 5265.87ms 
iter 9040: loss 2.3820, time 5260.63ms 
iter 9041: loss 2.5880, time 5251.68ms 
iter 9042: loss 2.3142, time 5281.88ms 
iter 9043: loss 2.4908, time 5258.34ms 
iter 9044: loss 2.4932, time 5266.76ms 
iter 9045: loss 2.3610, time 5255.04ms 
iter 9046: loss 2.5352, time 5263.20ms 
iter 9047: loss 2.5903, time 5252.87ms 
iter 9048: loss 2.4726, time 5252.56ms 
iter 9049: loss 2.4373, time 5259.86ms 
step 9050: train loss 2.4591, val loss 2.8417
iter 9050: loss 2.5664, time 20018.94ms 
iter 9051: loss 2.4581, time 5261.37ms 
iter 9052: loss 2.8245, time 5257.51ms 
iter 9053: loss 2.2635, time 5274.79ms 
iter 9054: loss 2.3077, time 5262.18ms 
iter 9055: loss 2.4768, time 5253.59ms 
iter 9056: loss 2.2490, time 5262.13ms 
iter 9057: loss 2.2369, time 5271.56ms 
iter 9058: loss 2.4962, time 5268.19ms 
iter 9059: loss 2.3646, time 5260.07ms 
iter 9060: loss 2.5526, time 5256.43ms 
iter 9061: loss 2.3508, time 5279.40ms 
iter 9062: loss 2.5704, time 5254.24ms 
iter 9063: loss 2.3745, time 5257.57ms 
iter 9064: loss 2.6249, time 5261.11ms 
iter 9065: loss 2.5694, time 5262.68ms 
iter 9066: loss 2.5625, time 5250.29ms 
iter 9067: loss 2.6360, time 5250.47ms 
iter 9068: loss 2.5349, time 5260.64ms 
iter 9069: loss 2.5611, time 5258.67ms 
iter 9070: loss 2.6675, time 5275.32ms 
iter 9071: loss 2.1262, time 5251.31ms 
iter 9072: loss 2.4291, time 5270.70ms 
iter 9073: loss 2.2893, time 5251.82ms 
iter 9074: loss 2.5611, time 5261.12ms 
iter 9075: loss 2.6638, time 5266.08ms 
iter 9076: loss 2.4169, time 5266.49ms 
iter 9077: loss 2.2403, time 5252.41ms 
iter 9078: loss 2.6799, time 5250.79ms 
iter 9079: loss 2.7599, time 5261.54ms 
iter 9080: loss 2.3720, time 5257.99ms 
iter 9081: loss 2.4676, time 5248.01ms 
iter 9082: loss 2.3287, time 5246.50ms 
iter 9083: loss 2.3196, time 5254.89ms 
iter 9084: loss 2.5485, time 5256.92ms 
iter 9085: loss 2.4300, time 5237.94ms 
iter 9086: loss 2.4345, time 5257.29ms 
iter 9087: loss 2.4073, time 5272.66ms 
iter 9088: loss 2.4493, time 5256.05ms 
iter 9089: loss 2.5194, time 5264.75ms 
iter 9090: loss 2.4301, time 5256.58ms 
iter 9091: loss 2.5039, time 5279.27ms 
iter 9092: loss 2.4694, time 5255.33ms 
iter 9093: loss 2.3481, time 5266.02ms 
iter 9094: loss 2.5002, time 5260.06ms 
iter 9095: loss 2.6046, time 5270.29ms 
iter 9096: loss 2.5805, time 5265.44ms 
iter 9097: loss 2.6378, time 5308.79ms 
iter 9098: loss 2.3941, time 5311.16ms 
iter 9099: loss 2.5955, time 5246.42ms 
step 9100: train loss 2.4635, val loss 2.8339
iter 9100: loss 2.3431, time 19923.96ms 
iter 9101: loss 2.3985, time 5256.75ms 
iter 9102: loss 2.5629, time 5259.36ms 
iter 9103: loss 2.4121, time 5238.03ms 
iter 9104: loss 2.3951, time 5269.97ms 
iter 9105: loss 2.4894, time 5276.39ms 
iter 9106: loss 2.4151, time 5273.28ms 
iter 9107: loss 2.2339, time 5271.75ms 
iter 9108: loss 2.3094, time 5272.92ms 
iter 9109: loss 2.6236, time 5262.30ms 
iter 9110: loss 2.3611, time 5253.85ms 
iter 9111: loss 2.5890, time 5262.06ms 
iter 9112: loss 2.6049, time 5256.99ms 
iter 9113: loss 2.5987, time 5259.16ms 
iter 9114: loss 2.4973, time 5262.81ms 
iter 9115: loss 2.3546, time 5260.80ms 
iter 9116: loss 2.4718, time 5277.41ms 
iter 9117: loss 2.3557, time 5259.36ms 
iter 9118: loss 2.3763, time 5257.17ms 
iter 9119: loss 2.2772, time 5268.70ms 
iter 9120: loss 2.3054, time 5263.71ms 
iter 9121: loss 2.4112, time 5262.50ms 
iter 9122: loss 2.7256, time 5256.23ms 
iter 9123: loss 2.5776, time 5256.55ms 
iter 9124: loss 2.4743, time 5259.64ms 
iter 9125: loss 2.3106, time 5251.21ms 
iter 9126: loss 2.3694, time 5250.96ms 
iter 9127: loss 2.4334, time 5273.43ms 
iter 9128: loss 2.5386, time 5251.22ms 
iter 9129: loss 2.5771, time 5251.54ms 
iter 9130: loss 2.3529, time 5253.23ms 
iter 9131: loss 2.2643, time 5260.43ms 
iter 9132: loss 2.3799, time 5253.13ms 
iter 9133: loss 2.4791, time 5253.45ms 
iter 9134: loss 2.3110, time 5247.61ms 
iter 9135: loss 2.4431, time 5263.05ms 
iter 9136: loss 2.3820, time 5262.89ms 
iter 9137: loss 2.0880, time 5248.61ms 
iter 9138: loss 2.3496, time 5248.14ms 
iter 9139: loss 2.4154, time 5235.61ms 
iter 9140: loss 2.3753, time 5260.98ms 
iter 9141: loss 2.4003, time 5260.95ms 
iter 9142: loss 2.4232, time 5254.69ms 
iter 9143: loss 2.5096, time 5259.59ms 
iter 9144: loss 2.4378, time 5256.18ms 
iter 9145: loss 2.1552, time 5253.58ms 
iter 9146: loss 2.5553, time 5261.20ms 
iter 9147: loss 2.4213, time 5264.39ms 
iter 9148: loss 2.5018, time 5261.76ms 
iter 9149: loss 2.4981, time 5257.72ms 
step 9150: train loss 2.4405, val loss 2.8409
iter 9150: loss 2.4659, time 20010.42ms 
iter 9151: loss 2.4707, time 5248.35ms 
iter 9152: loss 2.3586, time 5248.90ms 
iter 9153: loss 2.3314, time 5257.91ms 
iter 9154: loss 2.4658, time 5258.96ms 
iter 9155: loss 2.5442, time 5254.35ms 
iter 9156: loss 2.4001, time 5251.27ms 
iter 9157: loss 2.3770, time 5267.56ms 
iter 9158: loss 2.3339, time 5268.51ms 
iter 9159: loss 2.5352, time 5261.87ms 
iter 9160: loss 2.4888, time 5262.37ms 
iter 9161: loss 2.4195, time 5265.50ms 
iter 9162: loss 2.3695, time 5269.35ms 
iter 9163: loss 2.6163, time 5242.36ms 
iter 9164: loss 2.4058, time 5242.06ms 
iter 9165: loss 2.4671, time 5266.18ms 
iter 9166: loss 2.4671, time 5260.58ms 
iter 9167: loss 2.6428, time 5252.72ms 
iter 9168: loss 2.3115, time 5247.15ms 
iter 9169: loss 2.4957, time 5261.80ms 
iter 9170: loss 2.5249, time 5249.32ms 
iter 9171: loss 2.4410, time 5254.07ms 
iter 9172: loss 2.4677, time 5254.92ms 
iter 9173: loss 2.4105, time 5265.23ms 
iter 9174: loss 2.5757, time 5247.42ms 
iter 9175: loss 2.4487, time 5253.45ms 
iter 9176: loss 2.5106, time 5253.41ms 
iter 9177: loss 2.4986, time 5261.25ms 
iter 9178: loss 2.3873, time 5260.93ms 
iter 9179: loss 2.5190, time 5256.78ms 
iter 9180: loss 2.3699, time 5268.47ms 
iter 9181: loss 2.3850, time 5262.42ms 
iter 9182: loss 2.2553, time 5226.86ms 
iter 9183: loss 2.7077, time 5259.17ms 
iter 9184: loss 2.6193, time 5271.67ms 
iter 9185: loss 2.3872, time 5263.21ms 
iter 9186: loss 2.3733, time 5266.05ms 
iter 9187: loss 2.5239, time 5263.30ms 
iter 9188: loss 2.4781, time 5273.43ms 
iter 9189: loss 2.5669, time 5257.06ms 
iter 9190: loss 2.4732, time 5266.44ms 
iter 9191: loss 2.5523, time 5272.40ms 
iter 9192: loss 2.3375, time 5262.71ms 
iter 9193: loss 2.4127, time 5265.78ms 
iter 9194: loss 2.6091, time 5257.64ms 
iter 9195: loss 2.4609, time 5270.65ms 
iter 9196: loss 2.5684, time 5263.68ms 
iter 9197: loss 2.5789, time 5258.05ms 
iter 9198: loss 2.2582, time 5263.19ms 
iter 9199: loss 2.5295, time 5277.68ms 
step 9200: train loss 2.4658, val loss 2.8565
iter 9200: loss 2.0704, time 20002.35ms 
iter 9201: loss 2.4610, time 5253.98ms 
iter 9202: loss 2.5921, time 5258.37ms 
iter 9203: loss 2.3150, time 5269.01ms 
iter 9204: loss 2.4592, time 5263.01ms 
iter 9205: loss 2.4851, time 5258.60ms 
iter 9206: loss 2.2260, time 5259.82ms 
iter 9207: loss 2.5818, time 5276.40ms 
iter 9208: loss 2.5048, time 5255.21ms 
iter 9209: loss 2.5338, time 5256.30ms 
iter 9210: loss 2.6214, time 5264.04ms 
iter 9211: loss 2.2883, time 5277.31ms 
iter 9212: loss 2.0292, time 5257.79ms 
iter 9213: loss 2.5313, time 5260.55ms 
iter 9214: loss 2.2197, time 5259.28ms 
iter 9215: loss 2.4262, time 5274.53ms 
iter 9216: loss 2.4271, time 5270.63ms 
iter 9217: loss 2.5776, time 5270.07ms 
iter 9218: loss 2.3633, time 5253.04ms 
iter 9219: loss 2.6750, time 5270.86ms 
iter 9220: loss 2.3334, time 5263.43ms 
iter 9221: loss 2.3058, time 5260.28ms 
iter 9222: loss 2.6287, time 5253.58ms 
iter 9223: loss 2.5744, time 5279.48ms 
iter 9224: loss 2.4475, time 5278.78ms 
iter 9225: loss 2.4578, time 5256.32ms 
iter 9226: loss 2.4329, time 5350.57ms 
iter 9227: loss 2.4350, time 5339.15ms 
iter 9228: loss 2.4502, time 5259.93ms 
iter 9229: loss 2.3625, time 5255.64ms 
iter 9230: loss 2.5550, time 5264.51ms 
iter 9231: loss 2.3027, time 5268.63ms 
iter 9232: loss 2.3575, time 5251.31ms 
iter 9233: loss 2.4159, time 5261.27ms 
iter 9234: loss 2.4916, time 5264.42ms 
iter 9235: loss 2.2861, time 5253.98ms 
iter 9236: loss 2.3559, time 5258.74ms 
iter 9237: loss 2.5679, time 5253.72ms 
iter 9238: loss 2.2154, time 5269.44ms 
iter 9239: loss 2.3586, time 5254.83ms 
iter 9240: loss 2.6890, time 5254.41ms 
iter 9241: loss 2.3221, time 5252.86ms 
iter 9242: loss 2.2630, time 5241.56ms 
iter 9243: loss 2.4800, time 5251.35ms 
iter 9244: loss 2.2729, time 5271.08ms 
iter 9245: loss 2.7438, time 5265.20ms 
iter 9246: loss 2.4174, time 5280.02ms 
iter 9247: loss 2.5052, time 5259.79ms 
iter 9248: loss 2.4301, time 5243.88ms 
iter 9249: loss 2.3823, time 5258.20ms 
step 9250: train loss 2.4500, val loss 2.8597
iter 9250: loss 2.7000, time 19986.23ms 
iter 9251: loss 2.5398, time 5254.24ms 
iter 9252: loss 2.3878, time 5236.57ms 
iter 9253: loss 2.5116, time 5250.65ms 
iter 9254: loss 2.4302, time 5263.14ms 
iter 9255: loss 2.4630, time 5250.76ms 
iter 9256: loss 2.0926, time 5250.97ms 
iter 9257: loss 2.5904, time 5267.51ms 
iter 9258: loss 2.4534, time 5267.22ms 
iter 9259: loss 2.3985, time 5248.62ms 
iter 9260: loss 2.4171, time 5259.91ms 
iter 9261: loss 2.6493, time 5266.52ms 
iter 9262: loss 2.6164, time 5258.23ms 
iter 9263: loss 2.3999, time 5253.88ms 
iter 9264: loss 2.4944, time 5239.61ms 
iter 9265: loss 2.4575, time 5216.21ms 
iter 9266: loss 2.5173, time 5261.71ms 
iter 9267: loss 2.5450, time 5250.53ms 
iter 9268: loss 2.5111, time 5248.12ms 
iter 9269: loss 2.5359, time 5263.54ms 
iter 9270: loss 2.6242, time 5259.38ms 
iter 9271: loss 2.6615, time 5254.47ms 
iter 9272: loss 2.4150, time 5257.11ms 
iter 9273: loss 2.4013, time 5279.79ms 
iter 9274: loss 2.5487, time 5275.56ms 
iter 9275: loss 2.5905, time 5253.06ms 
iter 9276: loss 2.6007, time 5250.29ms 
iter 9277: loss 2.4535, time 5245.25ms 
iter 9278: loss 2.6284, time 5245.40ms 
iter 9279: loss 2.4403, time 5250.93ms 
iter 9280: loss 2.2671, time 5254.05ms 
iter 9281: loss 2.6030, time 5257.15ms 
iter 9282: loss 2.5066, time 5223.60ms 
iter 9283: loss 2.4995, time 5219.99ms 
iter 9284: loss 2.6334, time 5255.84ms 
iter 9285: loss 2.5691, time 5249.30ms 
iter 9286: loss 2.3221, time 5234.97ms 
iter 9287: loss 2.2833, time 5236.71ms 
iter 9288: loss 2.4190, time 5247.25ms 
iter 9289: loss 2.5097, time 5227.95ms 
iter 9290: loss 2.3440, time 5228.34ms 
iter 9291: loss 2.5123, time 5245.48ms 
iter 9292: loss 2.6205, time 5241.66ms 
iter 9293: loss 2.5443, time 5227.17ms 
iter 9294: loss 2.3518, time 5223.36ms 
iter 9295: loss 2.5097, time 5229.64ms 
iter 9296: loss 2.4991, time 5226.18ms 
iter 9297: loss 2.3846, time 5223.22ms 
iter 9298: loss 2.2825, time 5226.60ms 
iter 9299: loss 2.3660, time 5247.66ms 
step 9300: train loss 2.4497, val loss 2.8467
iter 9300: loss 2.4157, time 20032.22ms 
iter 9301: loss 2.5414, time 5209.22ms 
iter 9302: loss 2.3209, time 5205.28ms 
iter 9303: loss 2.4669, time 5250.25ms 
iter 9304: loss 2.1727, time 5239.42ms 
iter 9305: loss 2.4786, time 5248.08ms 
iter 9306: loss 2.5829, time 5241.15ms 
iter 9307: loss 2.5690, time 5258.80ms 
iter 9308: loss 2.7536, time 5206.97ms 
iter 9309: loss 2.5274, time 5206.42ms 
iter 9310: loss 2.3908, time 5253.89ms 
iter 9311: loss 2.4938, time 5258.14ms 
iter 9312: loss 2.5700, time 5211.04ms 
iter 9313: loss 2.3731, time 5249.81ms 
iter 9314: loss 2.3029, time 5201.82ms 
iter 9315: loss 2.4006, time 5252.59ms 
iter 9316: loss 2.6868, time 5245.87ms 
iter 9317: loss 2.4433, time 5240.19ms 
iter 9318: loss 2.4324, time 5250.90ms 
iter 9319: loss 2.6731, time 5229.84ms 
iter 9320: loss 2.4708, time 5251.13ms 
iter 9321: loss 2.5660, time 5249.60ms 
iter 9322: loss 2.4783, time 5250.59ms 
iter 9323: loss 2.6412, time 5214.75ms 
iter 9324: loss 2.6117, time 5244.76ms 
iter 9325: loss 2.1682, time 5229.40ms 
iter 9326: loss 2.5930, time 5245.22ms 
iter 9327: loss 2.5537, time 5232.53ms 
iter 9328: loss 2.4557, time 5238.37ms 
iter 9329: loss 2.5233, time 5255.33ms 
iter 9330: loss 2.4649, time 5250.54ms 
iter 9331: loss 2.5241, time 5249.30ms 
iter 9332: loss 2.2952, time 5227.72ms 
iter 9333: loss 2.2153, time 5237.09ms 
iter 9334: loss 2.2433, time 5234.87ms 
iter 9335: loss 2.4881, time 5252.96ms 
iter 9336: loss 2.3170, time 5245.19ms 
iter 9337: loss 2.2927, time 5240.31ms 
iter 9338: loss 2.4828, time 5236.95ms 
iter 9339: loss 2.4448, time 5232.44ms 
iter 9340: loss 2.4254, time 5251.96ms 
iter 9341: loss 2.4126, time 5240.29ms 
iter 9342: loss 2.4976, time 5232.03ms 
iter 9343: loss 2.3800, time 5181.77ms 
iter 9344: loss 2.4158, time 5137.22ms 
iter 9345: loss 2.3252, time 5223.91ms 
iter 9346: loss 2.6242, time 5184.93ms 
iter 9347: loss 2.2401, time 5228.40ms 
iter 9348: loss 2.6448, time 5211.63ms 
iter 9349: loss 2.2205, time 5229.56ms 
step 9350: train loss 2.4649, val loss 2.8461
iter 9350: loss 2.3504, time 19986.55ms 
iter 9351: loss 2.2289, time 5188.70ms 
iter 9352: loss 2.2642, time 5256.92ms 
iter 9353: loss 2.2736, time 5254.50ms 
iter 9354: loss 2.3829, time 5270.37ms 
iter 9355: loss 2.4893, time 5235.82ms 
iter 9356: loss 2.5774, time 5222.88ms 
iter 9357: loss 2.4678, time 5241.18ms 
iter 9358: loss 2.4731, time 5270.10ms 
iter 9359: loss 2.3323, time 5265.10ms 
iter 9360: loss 2.6432, time 5259.20ms 
iter 9361: loss 2.5371, time 5264.36ms 
iter 9362: loss 2.5748, time 5256.55ms 
iter 9363: loss 2.4317, time 5266.47ms 
iter 9364: loss 2.4454, time 5254.56ms 
iter 9365: loss 2.4069, time 5276.76ms 
iter 9366: loss 2.6920, time 5260.42ms 
iter 9367: loss 2.6892, time 5249.13ms 
iter 9368: loss 2.5471, time 5261.85ms 
iter 9369: loss 2.3010, time 5257.52ms 
iter 9370: loss 2.3627, time 5266.30ms 
iter 9371: loss 2.5511, time 5249.93ms 
iter 9372: loss 2.5561, time 5268.39ms 
iter 9373: loss 2.2366, time 5260.05ms 
iter 9374: loss 2.6592, time 5261.44ms 
iter 9375: loss 2.6630, time 5264.34ms 
iter 9376: loss 2.3878, time 5261.07ms 
iter 9377: loss 2.3815, time 5250.64ms 
iter 9378: loss 2.4763, time 5258.04ms 
iter 9379: loss 2.2608, time 5263.37ms 
iter 9380: loss 2.2641, time 5257.99ms 
iter 9381: loss 2.2977, time 5255.37ms 
iter 9382: loss 2.4794, time 5266.52ms 
iter 9383: loss 2.4234, time 5260.52ms 
iter 9384: loss 2.5786, time 5256.85ms 
iter 9385: loss 2.4794, time 5146.71ms 
iter 9386: loss 2.3671, time 5084.33ms 
iter 9387: loss 2.2812, time 5115.56ms 
iter 9388: loss 2.4686, time 5109.14ms 
iter 9389: loss 2.4149, time 5103.30ms 
iter 9390: loss 2.3475, time 5104.27ms 
iter 9391: loss 2.4543, time 5103.41ms 
iter 9392: loss 2.5526, time 5078.62ms 
iter 9393: loss 2.4105, time 5073.47ms 
iter 9394: loss 2.3734, time 5091.48ms 
iter 9395: loss 2.3742, time 5094.74ms 
iter 9396: loss 2.5932, time 5090.73ms 
iter 9397: loss 2.4407, time 5082.43ms 
iter 9398: loss 2.5360, time 5070.30ms 
iter 9399: loss 2.3325, time 5167.85ms 
step 9400: train loss 2.4474, val loss 2.8483
iter 9400: loss 2.3735, time 20022.34ms 
iter 9401: loss 2.6186, time 5248.85ms 
iter 9402: loss 2.2838, time 5252.73ms 
iter 9403: loss 2.4245, time 5248.42ms 
iter 9404: loss 2.3821, time 5259.36ms 
iter 9405: loss 2.4849, time 5233.60ms 
iter 9406: loss 2.1261, time 5236.02ms 
iter 9407: loss 2.6423, time 5249.89ms 
iter 9408: loss 2.5866, time 5245.58ms 
iter 9409: loss 2.5331, time 5263.43ms 
iter 9410: loss 2.4853, time 5257.27ms 
iter 9411: loss 2.7277, time 5249.88ms 
iter 9412: loss 2.6713, time 5251.17ms 
iter 9413: loss 2.5505, time 5254.14ms 
iter 9414: loss 2.3394, time 5251.95ms 
iter 9415: loss 2.4971, time 5255.89ms 
iter 9416: loss 2.5012, time 5254.50ms 
iter 9417: loss 2.0390, time 5253.74ms 
iter 9418: loss 2.5749, time 5246.95ms 
iter 9419: loss 2.4335, time 5263.90ms 
iter 9420: loss 2.5105, time 5259.78ms 
iter 9421: loss 2.6389, time 5245.78ms 
iter 9422: loss 2.3518, time 5241.00ms 
iter 9423: loss 2.5276, time 5276.74ms 
iter 9424: loss 2.4156, time 5274.34ms 
iter 9425: loss 2.5780, time 5295.16ms 
iter 9426: loss 2.1870, time 5295.00ms 
iter 9427: loss 2.4359, time 5302.73ms 
iter 9428: loss 2.5964, time 5217.74ms 
iter 9429: loss 2.5663, time 5244.20ms 
iter 9430: loss 2.4542, time 5266.78ms 
iter 9431: loss 2.5276, time 5267.25ms 
iter 9432: loss 2.6767, time 5252.35ms 
iter 9433: loss 2.4880, time 5256.83ms 
iter 9434: loss 2.4775, time 5273.18ms 
iter 9435: loss 2.1854, time 5212.88ms 
iter 9436: loss 2.4445, time 5142.29ms 
iter 9437: loss 2.5214, time 5214.75ms 
iter 9438: loss 2.2905, time 5226.21ms 
iter 9439: loss 2.1312, time 5163.27ms 
iter 9440: loss 2.2632, time 5243.80ms 
iter 9441: loss 2.3426, time 5259.28ms 
iter 9442: loss 2.2815, time 5264.44ms 
iter 9443: loss 2.4967, time 5209.85ms 
iter 9444: loss 2.2150, time 5245.83ms 
iter 9445: loss 2.5520, time 5216.70ms 
iter 9446: loss 2.5040, time 5181.33ms 
iter 9447: loss 2.5107, time 5207.95ms 
iter 9448: loss 2.4808, time 5237.29ms 
iter 9449: loss 2.3571, time 5233.72ms 
step 9450: train loss 2.4583, val loss 2.8315
iter 9450: loss 2.4889, time 19962.00ms 
iter 9451: loss 2.3768, time 5244.05ms 
iter 9452: loss 2.4983, time 5246.32ms 
iter 9453: loss 2.5315, time 5269.64ms 
iter 9454: loss 2.3052, time 5196.64ms 
iter 9455: loss 2.4927, time 5247.04ms 
iter 9456: loss 2.5085, time 5355.62ms 
iter 9457: loss 2.4146, time 5454.75ms 
iter 9458: loss 2.5687, time 5439.61ms 
iter 9459: loss 2.6663, time 5433.14ms 
iter 9460: loss 2.2225, time 5330.66ms 
iter 9461: loss 2.7556, time 5252.66ms 
iter 9462: loss 2.2568, time 5254.14ms 
iter 9463: loss 2.7039, time 5257.86ms 
iter 9464: loss 2.6191, time 5267.74ms 
iter 9465: loss 2.5162, time 5265.56ms 
iter 9466: loss 2.3030, time 5259.47ms 
iter 9467: loss 2.4238, time 5272.17ms 
iter 9468: loss 2.6355, time 5235.99ms 
iter 9469: loss 2.2392, time 5268.58ms 
iter 9470: loss 2.5371, time 5252.86ms 
iter 9471: loss 2.4692, time 5267.18ms 
iter 9472: loss 2.4606, time 5247.42ms 
iter 9473: loss 2.5481, time 5242.11ms 
iter 9474: loss 2.4386, time 5261.24ms 
iter 9475: loss 2.6523, time 5248.62ms 
iter 9476: loss 2.5527, time 5250.38ms 
iter 9477: loss 2.5145, time 5250.02ms 
iter 9478: loss 2.1649, time 5279.20ms 
iter 9479: loss 2.3182, time 5184.97ms 
iter 9480: loss 2.5005, time 5242.03ms 
iter 9481: loss 2.3699, time 5267.18ms 
iter 9482: loss 2.3481, time 5249.70ms 
iter 9483: loss 2.2561, time 5250.32ms 
iter 9484: loss 2.6706, time 5259.03ms 
iter 9485: loss 2.3113, time 5288.29ms 
iter 9486: loss 2.2764, time 5252.26ms 
iter 9487: loss 2.6160, time 5257.03ms 
iter 9488: loss 2.3082, time 5253.93ms 
iter 9489: loss 2.5547, time 5260.77ms 
iter 9490: loss 2.4562, time 5252.83ms 
iter 9491: loss 2.1519, time 5248.94ms 
iter 9492: loss 2.3451, time 5263.27ms 
iter 9493: loss 2.3973, time 5251.97ms 
iter 9494: loss 2.2902, time 5253.06ms 
iter 9495: loss 2.3468, time 5254.15ms 
iter 9496: loss 2.4434, time 5276.97ms 
iter 9497: loss 2.5815, time 5250.15ms 
iter 9498: loss 2.5114, time 5246.09ms 
iter 9499: loss 2.1560, time 5266.36ms 
step 9500: train loss 2.4574, val loss 2.8722
iter 9500: loss 2.5111, time 20008.92ms 
iter 9501: loss 2.2974, time 5261.81ms 
iter 9502: loss 2.5584, time 5264.24ms 
iter 9503: loss 2.4275, time 5268.70ms 
iter 9504: loss 2.5565, time 5232.36ms 
iter 9505: loss 2.5057, time 5243.93ms 
iter 9506: loss 2.3589, time 5257.40ms 
iter 9507: loss 2.5257, time 5257.66ms 
iter 9508: loss 2.1415, time 5256.79ms 
iter 9509: loss 2.6862, time 5250.50ms 
iter 9510: loss 2.6041, time 5258.51ms 
iter 9511: loss 2.5814, time 5259.94ms 
iter 9512: loss 2.4166, time 5248.43ms 
iter 9513: loss 2.3897, time 5259.04ms 
iter 9514: loss 2.2396, time 5273.92ms 
iter 9515: loss 2.6204, time 5252.61ms 
iter 9516: loss 2.5742, time 5259.08ms 
iter 9517: loss 2.4689, time 5258.74ms 
iter 9518: loss 2.4726, time 5270.63ms 
iter 9519: loss 2.5824, time 5249.18ms 
iter 9520: loss 2.4755, time 5250.86ms 
iter 9521: loss 2.4376, time 5266.17ms 
iter 9522: loss 2.5323, time 5265.98ms 
iter 9523: loss 2.3409, time 5271.35ms 
iter 9524: loss 2.5547, time 5252.14ms 
iter 9525: loss 2.5144, time 5267.01ms 
iter 9526: loss 2.3078, time 5244.48ms 
iter 9527: loss 2.2167, time 5255.02ms 
iter 9528: loss 2.6119, time 5257.74ms 
iter 9529: loss 2.5133, time 5268.62ms 
iter 9530: loss 2.1408, time 5261.21ms 
iter 9531: loss 2.6483, time 5256.03ms 
iter 9532: loss 2.3611, time 5266.02ms 
iter 9533: loss 2.3230, time 5255.61ms 
iter 9534: loss 2.7334, time 5253.21ms 
iter 9535: loss 2.4980, time 5265.85ms 
iter 9536: loss 2.4149, time 5265.14ms 
iter 9537: loss 2.5390, time 5253.59ms 
iter 9538: loss 2.4648, time 5256.10ms 
iter 9539: loss 2.4111, time 5262.82ms 
iter 9540: loss 2.5662, time 5257.04ms 
iter 9541: loss 2.4061, time 5261.43ms 
iter 9542: loss 2.6312, time 5257.52ms 
iter 9543: loss 2.5010, time 5267.68ms 
iter 9544: loss 2.4211, time 5255.99ms 
iter 9545: loss 2.4849, time 5251.81ms 
iter 9546: loss 2.3307, time 5262.37ms 
iter 9547: loss 2.2695, time 5263.04ms 
iter 9548: loss 2.3619, time 5254.37ms 
iter 9549: loss 2.4817, time 5257.48ms 
step 9550: train loss 2.4700, val loss 2.8500
iter 9550: loss 2.2382, time 19867.45ms 
iter 9551: loss 2.4450, time 5265.08ms 
iter 9552: loss 2.3908, time 5254.05ms 
iter 9553: loss 2.6035, time 5256.24ms 
iter 9554: loss 2.4529, time 5268.30ms 
iter 9555: loss 2.5789, time 5265.09ms 
iter 9556: loss 2.4293, time 5242.90ms 
iter 9557: loss 2.3537, time 5266.55ms 
iter 9558: loss 2.4877, time 5276.47ms 
iter 9559: loss 2.6543, time 5265.73ms 
iter 9560: loss 2.3115, time 5255.58ms 
iter 9561: loss 2.4339, time 5269.52ms 
iter 9562: loss 2.4026, time 5272.29ms 
iter 9563: loss 2.3626, time 5262.43ms 
iter 9564: loss 2.2101, time 5264.13ms 
iter 9565: loss 2.4819, time 5259.08ms 
iter 9566: loss 2.4754, time 5264.69ms 
iter 9567: loss 2.5418, time 5257.99ms 
iter 9568: loss 2.3865, time 5265.70ms 
iter 9569: loss 2.3051, time 5260.44ms 
iter 9570: loss 2.3695, time 5254.66ms 
iter 9571: loss 2.3674, time 5260.10ms 
iter 9572: loss 2.4434, time 5267.33ms 
iter 9573: loss 2.3160, time 5261.79ms 
iter 9574: loss 2.6116, time 5263.91ms 
iter 9575: loss 2.3751, time 5260.85ms 
iter 9576: loss 2.5649, time 5269.63ms 
iter 9577: loss 2.6686, time 5267.74ms 
iter 9578: loss 2.4649, time 5260.28ms 
iter 9579: loss 2.4258, time 5276.56ms 
iter 9580: loss 2.5974, time 5263.24ms 
iter 9581: loss 2.6328, time 5277.81ms 
iter 9582: loss 2.5811, time 5266.66ms 
iter 9583: loss 2.7205, time 5265.51ms 
iter 9584: loss 2.5465, time 5257.01ms 
iter 9585: loss 2.4260, time 5180.39ms 
iter 9586: loss 2.5811, time 5161.45ms 
iter 9587: loss 2.4112, time 5153.35ms 
iter 9588: loss 2.3286, time 5173.43ms 
iter 9589: loss 2.6328, time 5149.46ms 
iter 9590: loss 2.4085, time 5171.90ms 
iter 9591: loss 2.5617, time 5186.53ms 
iter 9592: loss 2.5800, time 5160.28ms 
iter 9593: loss 2.2541, time 5156.12ms 
iter 9594: loss 2.4516, time 5174.29ms 
iter 9595: loss 2.1260, time 5157.47ms 
iter 9596: loss 2.5123, time 5243.31ms 
iter 9597: loss 2.6376, time 5252.27ms 
iter 9598: loss 2.8238, time 5266.99ms 
iter 9599: loss 2.3612, time 5252.48ms 
step 9600: train loss 2.4746, val loss 2.8624
iter 9600: loss 2.5271, time 19969.82ms 
iter 9601: loss 2.5231, time 5270.90ms 
iter 9602: loss 2.5524, time 5237.47ms 
iter 9603: loss 2.2804, time 5181.13ms 
iter 9604: loss 2.6602, time 5153.74ms 
iter 9605: loss 2.4401, time 5180.47ms 
iter 9606: loss 2.4857, time 5182.18ms 
iter 9607: loss 2.3870, time 5273.37ms 
iter 9608: loss 2.5732, time 5260.79ms 
iter 9609: loss 2.6402, time 5267.98ms 
iter 9610: loss 2.0179, time 5256.58ms 
iter 9611: loss 2.2892, time 5194.43ms 
iter 9612: loss 2.5012, time 5255.52ms 
iter 9613: loss 2.3676, time 5259.45ms 
iter 9614: loss 2.4665, time 5249.40ms 
iter 9615: loss 2.6042, time 5244.57ms 
iter 9616: loss 2.6483, time 5265.90ms 
iter 9617: loss 2.4449, time 5234.16ms 
iter 9618: loss 2.5947, time 5246.27ms 
iter 9619: loss 2.3619, time 5265.81ms 
iter 9620: loss 2.6181, time 5281.16ms 
iter 9621: loss 2.4306, time 5248.92ms 
iter 9622: loss 2.1257, time 5272.70ms 
iter 9623: loss 2.3414, time 5282.80ms 
iter 9624: loss 2.4271, time 5273.82ms 
iter 9625: loss 2.4967, time 5270.31ms 
iter 9626: loss 2.4040, time 5265.87ms 
iter 9627: loss 2.5772, time 5267.97ms 
iter 9628: loss 2.4092, time 5194.25ms 
iter 9629: loss 2.4456, time 5259.99ms 
iter 9630: loss 2.4679, time 5287.02ms 
iter 9631: loss 2.4978, time 5265.84ms 
iter 9632: loss 2.4061, time 5273.08ms 
iter 9633: loss 2.3752, time 5254.21ms 
iter 9634: loss 2.5773, time 5257.66ms 
iter 9635: loss 2.5470, time 5262.61ms 
iter 9636: loss 2.5574, time 5281.87ms 
iter 9637: loss 2.5870, time 5265.81ms 
iter 9638: loss 2.4935, time 5261.23ms 
iter 9639: loss 2.3877, time 5262.48ms 
iter 9640: loss 2.5017, time 5264.03ms 
iter 9641: loss 2.3596, time 5257.05ms 
iter 9642: loss 2.4742, time 5299.01ms 
iter 9643: loss 2.5383, time 5258.72ms 
iter 9644: loss 2.3671, time 5265.33ms 
iter 9645: loss 2.5925, time 5260.32ms 
iter 9646: loss 2.3170, time 5261.63ms 
iter 9647: loss 2.5796, time 5257.90ms 
iter 9648: loss 2.5074, time 5273.77ms 
iter 9649: loss 2.5734, time 5259.81ms 
step 9650: train loss 2.4633, val loss 2.8453
iter 9650: loss 2.3842, time 19893.12ms 
iter 9651: loss 2.2097, time 5258.43ms 
iter 9652: loss 2.4664, time 5250.81ms 
iter 9653: loss 2.4475, time 5249.37ms 
iter 9654: loss 2.3677, time 5247.70ms 
iter 9655: loss 2.4329, time 5259.94ms 
iter 9656: loss 2.4796, time 5246.22ms 
iter 9657: loss 2.6618, time 5248.00ms 
iter 9658: loss 2.5068, time 5256.18ms 
iter 9659: loss 2.7771, time 5253.26ms 
iter 9660: loss 2.2917, time 5247.83ms 
iter 9661: loss 2.5030, time 5249.96ms 
iter 9662: loss 2.3685, time 5258.81ms 
iter 9663: loss 2.4884, time 5255.73ms 
iter 9664: loss 2.4438, time 5249.17ms 
iter 9665: loss 2.6535, time 5255.57ms 
iter 9666: loss 2.2118, time 5284.93ms 
iter 9667: loss 2.3298, time 5249.76ms 
iter 9668: loss 2.4367, time 5250.50ms 
iter 9669: loss 2.6060, time 5256.23ms 
iter 9670: loss 2.5117, time 5258.81ms 
iter 9671: loss 2.4553, time 5250.11ms 
iter 9672: loss 2.4563, time 5247.11ms 
iter 9673: loss 2.3032, time 5257.56ms 
iter 9674: loss 2.4802, time 5255.91ms 
iter 9675: loss 2.3245, time 5248.64ms 
iter 9676: loss 2.4701, time 5247.18ms 
iter 9677: loss 2.6019, time 5263.36ms 
iter 9678: loss 2.4369, time 5248.49ms 
iter 9679: loss 2.4260, time 5252.07ms 
iter 9680: loss 2.2534, time 5249.46ms 
iter 9681: loss 2.5533, time 5255.08ms 
iter 9682: loss 2.6018, time 5251.15ms 
iter 9683: loss 2.4921, time 5246.06ms 
iter 9684: loss 2.0533, time 5263.24ms 
iter 9685: loss 2.4528, time 5261.09ms 
iter 9686: loss 2.4458, time 5266.50ms 
iter 9687: loss 2.3755, time 5261.84ms 
iter 9688: loss 2.3919, time 5199.19ms 
iter 9689: loss 2.6313, time 5285.70ms 
iter 9690: loss 2.2761, time 5211.82ms 
iter 9691: loss 2.5146, time 5257.40ms 
iter 9692: loss 2.4400, time 5270.78ms 
iter 9693: loss 2.4418, time 5262.86ms 
iter 9694: loss 2.3009, time 5246.99ms 
iter 9695: loss 2.6130, time 5255.74ms 
iter 9696: loss 2.4059, time 5255.54ms 
iter 9697: loss 2.5192, time 5253.76ms 
iter 9698: loss 2.4183, time 5225.42ms 
iter 9699: loss 2.5893, time 5277.62ms 
step 9700: train loss 2.4433, val loss 2.8731
iter 9700: loss 2.4405, time 19972.14ms 
iter 9701: loss 2.5728, time 5244.34ms 
iter 9702: loss 2.4670, time 5248.85ms 
iter 9703: loss 2.5406, time 5254.54ms 
iter 9704: loss 2.3616, time 5245.13ms 
iter 9705: loss 2.5078, time 5247.22ms 
iter 9706: loss 2.4717, time 5255.46ms 
iter 9707: loss 2.3603, time 5260.49ms 
iter 9708: loss 2.4482, time 5258.54ms 
iter 9709: loss 2.6125, time 5262.66ms 
iter 9710: loss 2.4853, time 5260.84ms 
iter 9711: loss 2.4888, time 5260.43ms 
iter 9712: loss 2.5088, time 5254.83ms 
iter 9713: loss 2.3746, time 5265.38ms 
iter 9714: loss 2.5754, time 5263.16ms 
iter 9715: loss 2.5792, time 5253.80ms 
iter 9716: loss 2.4273, time 5285.83ms 
iter 9717: loss 2.4211, time 5296.77ms 
iter 9718: loss 2.6245, time 5291.55ms 
iter 9719: loss 2.4836, time 5277.04ms 
iter 9720: loss 2.4838, time 5242.75ms 
iter 9721: loss 2.2876, time 5240.96ms 
iter 9722: loss 2.3734, time 5250.80ms 
iter 9723: loss 2.2487, time 5247.77ms 
iter 9724: loss 2.5645, time 5271.95ms 
iter 9725: loss 2.1776, time 5250.81ms 
iter 9726: loss 2.3768, time 5248.77ms 
iter 9727: loss 2.4596, time 5251.71ms 
iter 9728: loss 2.6313, time 5252.21ms 
iter 9729: loss 2.5051, time 5245.02ms 
iter 9730: loss 2.4203, time 5229.83ms 
iter 9731: loss 2.3871, time 5235.94ms 
iter 9732: loss 2.6431, time 5276.47ms 
iter 9733: loss 2.4391, time 5263.70ms 
iter 9734: loss 2.6653, time 5259.04ms 
iter 9735: loss 2.4626, time 5267.15ms 
iter 9736: loss 2.2795, time 5245.24ms 
iter 9737: loss 2.3086, time 5248.54ms 
iter 9738: loss 2.3389, time 5245.66ms 
iter 9739: loss 2.4600, time 5237.93ms 
iter 9740: loss 2.6951, time 5244.47ms 
iter 9741: loss 2.4746, time 5248.05ms 
iter 9742: loss 2.4497, time 5270.56ms 
iter 9743: loss 2.2842, time 5231.57ms 
iter 9744: loss 2.3627, time 5267.59ms 
iter 9745: loss 2.6899, time 5275.18ms 
iter 9746: loss 2.5699, time 5277.00ms 
iter 9747: loss 2.3901, time 5269.54ms 
iter 9748: loss 2.4849, time 5253.77ms 
iter 9749: loss 2.5480, time 5278.96ms 
step 9750: train loss 2.4700, val loss 2.8347
iter 9750: loss 2.5990, time 19968.49ms 
iter 9751: loss 2.6614, time 5260.52ms 
iter 9752: loss 2.6943, time 5267.52ms 
iter 9753: loss 2.5429, time 5253.52ms 
iter 9754: loss 2.2426, time 5232.29ms 
iter 9755: loss 2.5400, time 5261.16ms 
iter 9756: loss 2.3823, time 5234.67ms 
iter 9757: loss 2.2932, time 5255.06ms 
iter 9758: loss 2.4947, time 5264.81ms 
iter 9759: loss 2.4100, time 5276.25ms 
iter 9760: loss 2.4903, time 5277.72ms 
iter 9761: loss 2.4637, time 5264.28ms 
iter 9762: loss 2.5353, time 5271.28ms 
iter 9763: loss 2.5076, time 5275.25ms 
iter 9764: loss 2.4003, time 5263.55ms 
iter 9765: loss 2.4095, time 5259.09ms 
iter 9766: loss 2.5715, time 5276.04ms 
iter 9767: loss 2.5159, time 5258.28ms 
iter 9768: loss 2.4427, time 5259.71ms 
iter 9769: loss 2.7040, time 5266.31ms 
iter 9770: loss 2.5235, time 5280.04ms 
iter 9771: loss 2.5286, time 5219.32ms 
iter 9772: loss 2.5972, time 5260.35ms 
iter 9773: loss 2.5298, time 5191.52ms 
iter 9774: loss 2.4856, time 5298.81ms 
iter 9775: loss 2.5127, time 5281.79ms 
iter 9776: loss 2.4412, time 5309.03ms 
iter 9777: loss 2.4258, time 5300.56ms 
iter 9778: loss 2.5965, time 5274.67ms 
iter 9779: loss 2.5752, time 5296.42ms 
iter 9780: loss 2.4698, time 5272.46ms 
iter 9781: loss 2.2340, time 5317.51ms 
iter 9782: loss 2.3478, time 5301.02ms 
iter 9783: loss 2.5116, time 5317.50ms 
iter 9784: loss 2.5993, time 5308.19ms 
iter 9785: loss 2.3853, time 5305.05ms 
iter 9786: loss 2.4249, time 5295.80ms 
iter 9787: loss 2.2602, time 5314.09ms 
iter 9788: loss 2.4207, time 5300.98ms 
iter 9789: loss 2.3489, time 5303.47ms 
iter 9790: loss 2.3510, time 5311.02ms 
iter 9791: loss 2.5854, time 5279.47ms 
iter 9792: loss 2.5143, time 5282.79ms 
iter 9793: loss 2.7892, time 5297.14ms 
iter 9794: loss 2.4270, time 5316.93ms 
iter 9795: loss 2.4169, time 5300.77ms 
iter 9796: loss 2.5941, time 5296.04ms 
iter 9797: loss 2.4305, time 5289.75ms 
iter 9798: loss 2.8724, time 5303.37ms 
iter 9799: loss 2.5942, time 5282.29ms 
step 9800: train loss 2.4412, val loss 2.8496
iter 9800: loss 2.3832, time 20143.72ms 
iter 9801: loss 2.5534, time 5288.10ms 
iter 9802: loss 2.4788, time 5291.62ms 
iter 9803: loss 2.3794, time 5299.51ms 
iter 9804: loss 2.4393, time 5252.90ms 
iter 9805: loss 2.5436, time 5315.72ms 
iter 9806: loss 2.4546, time 5293.03ms 
iter 9807: loss 2.4168, time 5251.61ms 
iter 9808: loss 2.5747, time 5303.30ms 
iter 9809: loss 2.3651, time 5272.17ms 
iter 9810: loss 2.5517, time 5267.99ms 
iter 9811: loss 2.3531, time 5269.47ms 
iter 9812: loss 2.6349, time 5275.24ms 
iter 9813: loss 2.3283, time 5261.35ms 
iter 9814: loss 2.3648, time 5263.55ms 
iter 9815: loss 2.4042, time 5208.82ms 
iter 9816: loss 2.6083, time 5264.73ms 
iter 9817: loss 2.4488, time 5266.26ms 
iter 9818: loss 2.1319, time 5292.02ms 
iter 9819: loss 2.4015, time 5295.45ms 
iter 9820: loss 2.5975, time 5339.25ms 
iter 9821: loss 2.5586, time 5299.65ms 
iter 9822: loss 2.5198, time 5296.10ms 
iter 9823: loss 2.6843, time 5309.77ms 
iter 9824: loss 2.3708, time 5288.52ms 
iter 9825: loss 2.6212, time 5293.57ms 
iter 9826: loss 2.4487, time 5290.88ms 
iter 9827: loss 2.3740, time 5311.60ms 
iter 9828: loss 2.6052, time 5289.70ms 
iter 9829: loss 2.3760, time 5290.63ms 
iter 9830: loss 2.2003, time 5296.83ms 
iter 9831: loss 2.4395, time 5302.91ms 
iter 9832: loss 2.5932, time 5300.31ms 
iter 9833: loss 2.3855, time 5307.25ms 
iter 9834: loss 2.2742, time 5314.46ms 
iter 9835: loss 2.4597, time 5290.71ms 
iter 9836: loss 2.2449, time 5295.61ms 
iter 9837: loss 2.3172, time 5260.23ms 
iter 9838: loss 2.5080, time 5312.85ms 
iter 9839: loss 2.3698, time 5291.49ms 
iter 9840: loss 2.4500, time 5311.30ms 
iter 9841: loss 2.6167, time 5315.87ms 
iter 9842: loss 2.4364, time 5303.16ms 
iter 9843: loss 2.4684, time 5285.45ms 
iter 9844: loss 2.8377, time 5295.89ms 
iter 9845: loss 2.6560, time 5314.09ms 
iter 9846: loss 2.5405, time 5299.82ms 
iter 9847: loss 2.4173, time 5295.13ms 
iter 9848: loss 2.4198, time 5301.77ms 
iter 9849: loss 2.4790, time 5296.01ms 
step 9850: train loss 2.4395, val loss 2.8754
iter 9850: loss 2.5526, time 20139.44ms 
iter 9851: loss 2.3425, time 5309.43ms 
iter 9852: loss 2.2390, time 5314.63ms 
iter 9853: loss 2.5662, time 5319.43ms 
iter 9854: loss 2.5101, time 5312.84ms 
iter 9855: loss 2.4634, time 5254.99ms 
iter 9856: loss 2.6073, time 5288.10ms 
iter 9857: loss 2.4088, time 5291.45ms 
iter 9858: loss 2.4314, time 5294.05ms 
iter 9859: loss 2.4738, time 5301.13ms 
iter 9860: loss 2.3914, time 5288.19ms 
iter 9861: loss 2.6170, time 5286.54ms 
iter 9862: loss 2.3636, time 5286.41ms 
iter 9863: loss 2.5162, time 5297.79ms 
iter 9864: loss 2.3382, time 5288.97ms 
iter 9865: loss 2.3740, time 5283.66ms 
iter 9866: loss 2.3172, time 5304.55ms 
iter 9867: loss 2.1041, time 5296.85ms 
iter 9868: loss 2.5374, time 5286.03ms 
iter 9869: loss 2.6063, time 5289.91ms 
iter 9870: loss 2.3835, time 5295.65ms 
iter 9871: loss 2.5661, time 5292.36ms 
iter 9872: loss 2.1788, time 5286.33ms 
iter 9873: loss 2.6943, time 5296.21ms 
iter 9874: loss 2.4612, time 5296.05ms 
iter 9875: loss 2.5103, time 5287.73ms 
iter 9876: loss 2.5273, time 5289.61ms 
iter 9877: loss 2.5512, time 5287.27ms 
iter 9878: loss 2.3866, time 5289.75ms 
iter 9879: loss 2.0706, time 5249.53ms 
iter 9880: loss 2.4371, time 5307.85ms 
iter 9881: loss 2.6703, time 5297.53ms 
iter 9882: loss 2.2242, time 5292.83ms 
iter 9883: loss 2.4532, time 5292.12ms 
iter 9884: loss 2.4571, time 5306.04ms 
iter 9885: loss 2.6217, time 5291.72ms 
iter 9886: loss 2.4564, time 5299.61ms 
iter 9887: loss 2.2812, time 5306.17ms 
iter 9888: loss 2.4196, time 5294.93ms 
iter 9889: loss 2.4009, time 5291.08ms 
iter 9890: loss 2.4907, time 5283.37ms 
iter 9891: loss 2.5332, time 5297.24ms 
iter 9892: loss 2.5412, time 5290.67ms 
iter 9893: loss 2.4865, time 5286.22ms 
iter 9894: loss 2.5054, time 5310.74ms 
iter 9895: loss 2.2248, time 5294.97ms 
iter 9896: loss 2.4580, time 5235.99ms 
iter 9897: loss 2.4305, time 5290.44ms 
iter 9898: loss 2.4912, time 5303.21ms 
iter 9899: loss 2.6415, time 5297.97ms 
step 9900: train loss 2.4324, val loss 2.8254
iter 9900: loss 2.5313, time 19951.60ms 
iter 9901: loss 2.6992, time 5234.66ms 
iter 9902: loss 2.6412, time 5316.17ms 
iter 9903: loss 2.3956, time 5302.28ms 
iter 9904: loss 2.3398, time 5316.33ms 
iter 9905: loss 2.3506, time 5307.11ms 
iter 9906: loss 2.4250, time 5310.51ms 
iter 9907: loss 2.7087, time 5269.82ms 
iter 9908: loss 2.4713, time 5255.71ms 
iter 9909: loss 2.2560, time 5287.74ms 
iter 9910: loss 2.4773, time 5288.63ms 
iter 9911: loss 2.6383, time 5305.01ms 
iter 9912: loss 2.5264, time 5295.58ms 
iter 9913: loss 2.3922, time 5294.78ms 
iter 9914: loss 2.6233, time 5302.43ms 
iter 9915: loss 2.3262, time 5292.43ms 
iter 9916: loss 2.3476, time 5301.57ms 
iter 9917: loss 2.4243, time 5308.92ms 
iter 9918: loss 2.4705, time 5310.72ms 
iter 9919: loss 2.2249, time 5304.32ms 
iter 9920: loss 2.5988, time 5304.14ms 
iter 9921: loss 2.3696, time 5312.83ms 
iter 9922: loss 2.3529, time 5301.00ms 
iter 9923: loss 2.4400, time 5293.96ms 
iter 9924: loss 2.6332, time 5302.08ms 
iter 9925: loss 2.5258, time 5305.49ms 
iter 9926: loss 2.4412, time 5287.90ms 
iter 9927: loss 2.5379, time 5291.95ms 
iter 9928: loss 2.4355, time 5315.17ms 
iter 9929: loss 2.6193, time 5299.44ms 
iter 9930: loss 2.6065, time 5268.50ms 
iter 9931: loss 2.1307, time 5297.23ms 
iter 9932: loss 2.4643, time 5290.47ms 
iter 9933: loss 2.2267, time 5286.71ms 
iter 9934: loss 2.4536, time 5273.43ms 
iter 9935: loss 2.4877, time 5255.53ms 
iter 9936: loss 2.4067, time 5300.35ms 
iter 9937: loss 2.5278, time 5297.83ms 
iter 9938: loss 2.6293, time 5310.04ms 
iter 9939: loss 2.4373, time 5296.88ms 
iter 9940: loss 2.3609, time 5304.54ms 
iter 9941: loss 2.4089, time 5303.02ms 
iter 9942: loss 2.5082, time 5299.22ms 
iter 9943: loss 2.5069, time 5301.16ms 
iter 9944: loss 2.4238, time 5293.85ms 
iter 9945: loss 2.3671, time 5308.50ms 
iter 9946: loss 2.5516, time 5312.19ms 
iter 9947: loss 2.3625, time 5292.56ms 
iter 9948: loss 2.5409, time 5193.19ms 
iter 9949: loss 2.5905, time 5276.67ms 
step 9950: train loss 2.4395, val loss 2.8551
iter 9950: loss 2.5727, time 20204.28ms 
iter 9951: loss 2.4257, time 5294.29ms 
iter 9952: loss 2.6052, time 5300.81ms 
iter 9953: loss 2.6426, time 5290.98ms 
iter 9954: loss 2.2555, time 5287.93ms 
iter 9955: loss 2.5456, time 5291.06ms 
iter 9956: loss 2.3589, time 5304.26ms 
iter 9957: loss 2.3650, time 5284.87ms 
iter 9958: loss 2.3474, time 5292.38ms 
iter 9959: loss 2.6758, time 5301.06ms 
iter 9960: loss 2.2163, time 5293.59ms 
iter 9961: loss 2.5446, time 5283.47ms 
iter 9962: loss 2.4299, time 5298.55ms 
iter 9963: loss 2.1894, time 5217.03ms 
iter 9964: loss 2.6194, time 5236.76ms 
iter 9965: loss 2.6366, time 5272.81ms 
iter 9966: loss 2.4595, time 5245.71ms 
iter 9967: loss 2.5184, time 5305.86ms 
iter 9968: loss 2.4485, time 5275.17ms 
iter 9969: loss 2.2979, time 5304.89ms 
iter 9970: loss 2.4694, time 5260.86ms 
iter 9971: loss 2.2815, time 5303.00ms 
iter 9972: loss 2.4668, time 5290.66ms 
iter 9973: loss 2.3761, time 5293.65ms 
iter 9974: loss 2.5181, time 5308.79ms 
iter 9975: loss 2.6963, time 5302.94ms 
iter 9976: loss 2.5512, time 5308.06ms 
iter 9977: loss 2.4406, time 5302.84ms 
iter 9978: loss 2.7332, time 5299.71ms 
iter 9979: loss 2.3351, time 5289.85ms 
iter 9980: loss 2.5140, time 5313.45ms 
iter 9981: loss 2.4365, time 5305.04ms 
iter 9982: loss 2.4663, time 5306.34ms 
iter 9983: loss 2.4761, time 5315.80ms 
iter 9984: loss 2.4956, time 5319.02ms 
iter 9985: loss 2.3691, time 5316.93ms 
iter 9986: loss 2.5729, time 5274.82ms 
iter 9987: loss 2.3526, time 5266.47ms 
iter 9988: loss 2.5648, time 5263.72ms 
iter 9989: loss 2.5851, time 5295.26ms 
iter 9990: loss 2.5200, time 5306.10ms 
iter 9991: loss 2.2895, time 5297.74ms 
iter 9992: loss 2.6140, time 5279.60ms 
iter 9993: loss 2.5117, time 5303.87ms 
iter 9994: loss 2.5768, time 5305.15ms 
iter 9995: loss 2.3682, time 5309.69ms 
iter 9996: loss 2.3146, time 5324.06ms 
iter 9997: loss 2.6426, time 5326.79ms 
iter 9998: loss 2.3371, time 5315.98ms 
iter 9999: loss 2.4648, time 5319.35ms 
step 10000: train loss 2.4666, val loss 2.8525
iter 10000: loss 2.5283, time 20135.42ms 
iter 10001: loss 2.6904, time 5310.07ms 
iter 10002: loss 2.3274, time 5277.11ms 
iter 10003: loss 2.4447, time 5296.11ms 
iter 10004: loss 2.5505, time 5320.63ms 
iter 10005: loss 2.3425, time 5278.85ms 
iter 10006: loss 2.4799, time 5298.59ms 
iter 10007: loss 2.4238, time 5305.00ms 
iter 10008: loss 2.3520, time 5305.41ms 
iter 10009: loss 2.1681, time 5325.82ms 
iter 10010: loss 2.4214, time 5275.82ms 
iter 10011: loss 2.4179, time 5304.22ms 
iter 10012: loss 2.6094, time 5307.82ms 
iter 10013: loss 2.4362, time 5303.92ms 
iter 10014: loss 2.1889, time 5324.41ms 
iter 10015: loss 2.6112, time 5308.14ms 
iter 10016: loss 2.2826, time 5314.09ms 
iter 10017: loss 2.4188, time 5287.32ms 
iter 10018: loss 2.3302, time 5285.64ms 
iter 10019: loss 2.3443, time 5306.39ms 
iter 10020: loss 2.3620, time 5268.07ms 
iter 10021: loss 2.3121, time 5277.57ms 
iter 10022: loss 2.3332, time 5294.35ms 
iter 10023: loss 2.2858, time 5287.03ms 
iter 10024: loss 2.4271, time 5300.98ms 
iter 10025: loss 2.4674, time 5245.73ms 
iter 10026: loss 2.4801, time 5293.87ms 
iter 10027: loss 2.4216, time 5291.04ms 
iter 10028: loss 2.5128, time 5304.88ms 
iter 10029: loss 2.4008, time 5311.63ms 
iter 10030: loss 2.5265, time 5301.43ms 
iter 10031: loss 2.3029, time 5301.27ms 
iter 10032: loss 2.4274, time 5316.99ms 
iter 10033: loss 2.4633, time 5137.59ms 
iter 10034: loss 2.5850, time 5300.49ms 
iter 10035: loss 2.6263, time 5310.60ms 
iter 10036: loss 2.5787, time 5289.59ms 
iter 10037: loss 2.5948, time 5302.90ms 
iter 10038: loss 2.5317, time 5309.45ms 
iter 10039: loss 2.4335, time 5297.99ms 
iter 10040: loss 2.4765, time 5262.56ms 
iter 10041: loss 2.5276, time 5310.05ms 
iter 10042: loss 2.4064, time 5298.22ms 
iter 10043: loss 2.3820, time 5283.44ms 
iter 10044: loss 2.7096, time 5299.84ms 
iter 10045: loss 2.5872, time 5297.82ms 
iter 10046: loss 2.5245, time 5284.72ms 
iter 10047: loss 2.4440, time 5293.43ms 
iter 10048: loss 2.3820, time 5307.10ms 
iter 10049: loss 2.6111, time 5295.27ms 
step 10050: train loss 2.4432, val loss 2.8453
iter 10050: loss 2.7299, time 20164.45ms 
iter 10051: loss 2.5455, time 5284.72ms 
iter 10052: loss 2.6071, time 5292.65ms 
iter 10053: loss 2.4509, time 5314.18ms 
iter 10054: loss 2.2874, time 5327.97ms 
iter 10055: loss 2.4607, time 5280.95ms 
iter 10056: loss 2.1296, time 5308.41ms 
iter 10057: loss 2.3915, time 5297.47ms 
iter 10058: loss 2.5851, time 5283.37ms 
iter 10059: loss 2.4413, time 5312.19ms 
iter 10060: loss 2.5118, time 5309.40ms 
iter 10061: loss 2.5944, time 5287.73ms 
iter 10062: loss 2.4862, time 5299.98ms 
iter 10063: loss 2.4197, time 5294.55ms 
iter 10064: loss 2.2014, time 5296.72ms 
iter 10065: loss 2.6003, time 5305.31ms 
iter 10066: loss 2.5319, time 5305.11ms 
iter 10067: loss 2.3712, time 5299.52ms 
iter 10068: loss 2.3508, time 5312.26ms 
iter 10069: loss 2.4243, time 5260.03ms 
iter 10070: loss 2.3080, time 5284.57ms 
iter 10071: loss 2.5961, time 5292.19ms 
iter 10072: loss 2.5826, time 5285.88ms 
iter 10073: loss 2.4588, time 5287.85ms 
iter 10074: loss 2.6781, time 5292.60ms 
iter 10075: loss 2.5733, time 5242.67ms 
iter 10076: loss 2.3805, time 5293.08ms 
iter 10077: loss 2.4841, time 5333.10ms 
iter 10078: loss 2.4950, time 5339.60ms 
iter 10079: loss 2.4396, time 5323.54ms 
iter 10080: loss 2.2117, time 5337.18ms 
iter 10081: loss 2.3710, time 5336.13ms 
iter 10082: loss 2.5438, time 5333.45ms 
iter 10083: loss 2.3829, time 5311.84ms 
iter 10084: loss 2.2856, time 5300.23ms 
iter 10085: loss 2.6109, time 5285.64ms 
iter 10086: loss 2.2171, time 5298.79ms 
iter 10087: loss 2.4156, time 5305.46ms 
iter 10088: loss 2.2259, time 5289.56ms 
iter 10089: loss 2.4430, time 5294.41ms 
iter 10090: loss 2.6020, time 5267.32ms 
iter 10091: loss 2.5295, time 5316.41ms 
iter 10092: loss 2.4450, time 5300.36ms 
iter 10093: loss 2.7103, time 5308.59ms 
iter 10094: loss 2.5426, time 5292.65ms 
iter 10095: loss 2.4350, time 5298.37ms 
iter 10096: loss 2.2718, time 5285.15ms 
iter 10097: loss 2.6037, time 5298.92ms 
iter 10098: loss 2.5376, time 5291.64ms 
iter 10099: loss 2.2854, time 5310.10ms 
step 10100: train loss 2.4427, val loss 2.8478
iter 10100: loss 2.3338, time 20110.06ms 
iter 10101: loss 2.5681, time 5288.28ms 
iter 10102: loss 2.5142, time 5273.14ms 
iter 10103: loss 2.4310, time 5269.74ms 
iter 10104: loss 2.3590, time 5302.87ms 
iter 10105: loss 2.4637, time 5276.71ms 
iter 10106: loss 2.4051, time 5295.70ms 
iter 10107: loss 2.4184, time 5311.76ms 
iter 10108: loss 2.6368, time 5303.60ms 
iter 10109: loss 2.3321, time 5244.43ms 
iter 10110: loss 2.2003, time 5275.13ms 
iter 10111: loss 2.5362, time 5298.63ms 
iter 10112: loss 2.4934, time 5314.15ms 
iter 10113: loss 2.4074, time 5284.81ms 
iter 10114: loss 2.4934, time 5301.41ms 
iter 10115: loss 2.4121, time 5231.87ms 
iter 10116: loss 2.3121, time 5251.63ms 
iter 10117: loss 2.5553, time 5305.14ms 
iter 10118: loss 2.5402, time 5296.85ms 
iter 10119: loss 2.6538, time 5253.52ms 
iter 10120: loss 2.3741, time 5288.79ms 
iter 10121: loss 2.3747, time 5250.44ms 
iter 10122: loss 2.5505, time 5263.71ms 
iter 10123: loss 2.5598, time 5248.16ms 
iter 10124: loss 2.4381, time 5183.51ms 
iter 10125: loss 2.6116, time 5205.65ms 
iter 10126: loss 2.2240, time 5236.34ms 
iter 10127: loss 2.2734, time 5276.28ms 
iter 10128: loss 2.3898, time 5294.58ms 
iter 10129: loss 2.3838, time 5237.98ms 
iter 10130: loss 2.3421, time 5261.61ms 
iter 10131: loss 2.3576, time 5278.84ms 
iter 10132: loss 2.4512, time 5287.85ms 
iter 10133: loss 2.3739, time 5293.03ms 
iter 10134: loss 2.5817, time 5275.79ms 
iter 10135: loss 2.4137, time 5242.25ms 
iter 10136: loss 2.6122, time 5246.86ms 
iter 10137: loss 2.3573, time 5307.67ms 
iter 10138: loss 2.5493, time 5291.65ms 
iter 10139: loss 2.3035, time 5293.43ms 
iter 10140: loss 2.2519, time 5287.57ms 
iter 10141: loss 2.2568, time 5250.45ms 
iter 10142: loss 2.4099, time 5229.39ms 
iter 10143: loss 2.4636, time 5250.60ms 
iter 10144: loss 2.3394, time 5309.50ms 
iter 10145: loss 2.3300, time 5294.68ms 
iter 10146: loss 2.6056, time 5311.21ms 
iter 10147: loss 2.3364, time 5307.83ms 
iter 10148: loss 2.4867, time 5299.81ms 
iter 10149: loss 2.5670, time 5283.21ms 
step 10150: train loss 2.4387, val loss 2.8587
iter 10150: loss 2.3524, time 20108.76ms 
iter 10151: loss 2.6466, time 5249.08ms 
iter 10152: loss 2.4033, time 5251.36ms 
iter 10153: loss 2.4739, time 5296.43ms 
iter 10154: loss 2.4082, time 5308.95ms 
iter 10155: loss 2.5208, time 5294.66ms 
iter 10156: loss 2.2195, time 5290.88ms 
iter 10157: loss 2.3483, time 5261.87ms 
iter 10158: loss 2.3056, time 5280.68ms 
iter 10159: loss 2.4558, time 5253.02ms 
iter 10160: loss 2.3285, time 5225.81ms 
iter 10161: loss 2.2850, time 5298.02ms 
iter 10162: loss 2.2907, time 5318.35ms 
iter 10163: loss 2.3781, time 5278.49ms 
iter 10164: loss 2.4301, time 5279.32ms 
iter 10165: loss 2.2677, time 5281.52ms 
iter 10166: loss 2.4923, time 5265.19ms 
iter 10167: loss 2.6277, time 5257.06ms 
iter 10168: loss 2.4242, time 5225.69ms 
iter 10169: loss 2.3614, time 5313.37ms 
iter 10170: loss 2.4008, time 5297.55ms 
iter 10171: loss 2.4507, time 5296.88ms 
iter 10172: loss 2.1892, time 5272.92ms 
iter 10173: loss 2.4974, time 5292.50ms 
iter 10174: loss 2.3998, time 5246.74ms 
iter 10175: loss 2.5541, time 5291.94ms 
iter 10176: loss 2.4822, time 5309.02ms 
iter 10177: loss 2.6485, time 5296.61ms 
iter 10178: loss 2.4187, time 5286.64ms 
iter 10179: loss 2.5145, time 5296.58ms 
iter 10180: loss 2.4661, time 5311.71ms 
iter 10181: loss 2.3863, time 5274.09ms 
iter 10182: loss 2.4386, time 5291.03ms 
iter 10183: loss 2.2069, time 5303.11ms 
iter 10184: loss 2.3534, time 5298.48ms 
iter 10185: loss 2.4645, time 5285.42ms 
iter 10186: loss 2.5682, time 5153.92ms 
iter 10187: loss 2.4286, time 5156.48ms 
iter 10188: loss 2.4473, time 5318.37ms 
iter 10189: loss 2.0546, time 5305.69ms 
iter 10190: loss 2.5575, time 5310.81ms 
iter 10191: loss 2.5085, time 5297.55ms 
iter 10192: loss 2.2663, time 5249.81ms 
iter 10193: loss 2.3630, time 5300.00ms 
iter 10194: loss 2.5266, time 5282.11ms 
iter 10195: loss 2.3510, time 5300.51ms 
iter 10196: loss 2.4188, time 5298.37ms 
iter 10197: loss 2.3803, time 5310.66ms 
iter 10198: loss 2.4754, time 5302.11ms 
iter 10199: loss 2.3288, time 5306.69ms 
step 10200: train loss 2.4498, val loss 2.8616
iter 10200: loss 2.3874, time 20118.49ms 
iter 10201: loss 2.4219, time 5305.24ms 
iter 10202: loss 2.4519, time 5307.51ms 
iter 10203: loss 2.4404, time 5300.85ms 
iter 10204: loss 2.3450, time 5325.29ms 
iter 10205: loss 2.4995, time 5308.68ms 
iter 10206: loss 2.3148, time 5262.23ms 
iter 10207: loss 2.6302, time 5287.16ms 
iter 10208: loss 2.4963, time 5308.63ms 
iter 10209: loss 2.4607, time 5305.70ms 
iter 10210: loss 2.3504, time 5290.95ms 
iter 10211: loss 2.5343, time 5294.77ms 
iter 10212: loss 2.5069, time 5305.95ms 
iter 10213: loss 2.3358, time 5287.10ms 
iter 10214: loss 2.5536, time 5238.58ms 
iter 10215: loss 2.2421, time 5278.07ms 
iter 10216: loss 2.4205, time 5282.39ms 
iter 10217: loss 2.3587, time 5289.12ms 
iter 10218: loss 2.5006, time 5301.24ms 
iter 10219: loss 2.2239, time 5262.01ms 
iter 10220: loss 2.3161, time 5308.62ms 
iter 10221: loss 2.5694, time 5300.80ms 
iter 10222: loss 2.3305, time 5290.11ms 
iter 10223: loss 2.2911, time 5308.84ms 
iter 10224: loss 2.5488, time 5298.99ms 
iter 10225: loss 2.5172, time 5300.88ms 
iter 10226: loss 2.5167, time 5298.21ms 
iter 10227: loss 2.3167, time 5304.15ms 
iter 10228: loss 2.6030, time 5289.21ms 
iter 10229: loss 2.7036, time 5300.20ms 
iter 10230: loss 2.4474, time 5301.10ms 
iter 10231: loss 2.3299, time 5308.31ms 
iter 10232: loss 2.5131, time 5302.52ms 
iter 10233: loss 2.7136, time 5304.02ms 
iter 10234: loss 2.6234, time 5297.62ms 
iter 10235: loss 2.4817, time 5298.03ms 
iter 10236: loss 2.3539, time 5299.90ms 
iter 10237: loss 2.4687, time 5270.54ms 
iter 10238: loss 2.2625, time 5242.39ms 
iter 10239: loss 2.5509, time 5297.43ms 
iter 10240: loss 2.6160, time 5276.05ms 
iter 10241: loss 2.4312, time 5298.40ms 
iter 10242: loss 2.5348, time 5301.16ms 
iter 10243: loss 2.3368, time 5298.91ms 
iter 10244: loss 2.2829, time 5295.70ms 
iter 10245: loss 2.4344, time 5292.53ms 
iter 10246: loss 2.4641, time 5306.54ms 
iter 10247: loss 2.3618, time 5284.30ms 
iter 10248: loss 2.3768, time 5275.99ms 
iter 10249: loss 2.5461, time 5281.81ms 
step 10250: train loss 2.4266, val loss 2.8476
iter 10250: loss 2.3949, time 20129.69ms 
iter 10251: loss 2.5814, time 5299.49ms 
iter 10252: loss 2.2968, time 5296.22ms 
iter 10253: loss 2.4817, time 5319.67ms 
iter 10254: loss 2.4403, time 5313.56ms 
iter 10255: loss 2.4298, time 5309.84ms 
iter 10256: loss 2.3326, time 5298.67ms 
iter 10257: loss 2.4996, time 5298.58ms 
iter 10258: loss 2.4888, time 5292.47ms 
iter 10259: loss 2.3391, time 5293.55ms 
iter 10260: loss 2.5041, time 5281.76ms 
iter 10261: loss 2.6800, time 5312.78ms 
iter 10262: loss 2.4009, time 5288.75ms 
iter 10263: loss 2.2702, time 5294.27ms 
iter 10264: loss 2.5720, time 5286.43ms 
iter 10265: loss 2.5720, time 5292.59ms 
iter 10266: loss 2.2928, time 5285.78ms 
iter 10267: loss 2.4896, time 5276.69ms 
iter 10268: loss 2.4645, time 5299.20ms 
iter 10269: loss 2.3162, time 5300.46ms 
iter 10270: loss 2.3155, time 5295.16ms 
iter 10271: loss 2.6023, time 5293.67ms 
iter 10272: loss 2.5052, time 5311.14ms 
iter 10273: loss 2.2192, time 5295.02ms 
iter 10274: loss 2.3899, time 5314.86ms 
iter 10275: loss 2.4312, time 5308.85ms 
iter 10276: loss 2.4475, time 5303.33ms 
iter 10277: loss 2.4357, time 5299.93ms 
iter 10278: loss 2.4703, time 5304.11ms 
iter 10279: loss 2.4905, time 5264.21ms 
iter 10280: loss 2.6377, time 5147.99ms 
iter 10281: loss 2.4645, time 5235.82ms 
iter 10282: loss 2.4426, time 5267.76ms 
iter 10283: loss 2.5420, time 5292.41ms 
iter 10284: loss 2.4239, time 5281.53ms 
iter 10285: loss 2.5398, time 5280.67ms 
iter 10286: loss 2.5366, time 5269.08ms 
iter 10287: loss 2.2131, time 5260.14ms 
iter 10288: loss 2.3600, time 5253.41ms 
iter 10289: loss 2.3987, time 5262.04ms 
iter 10290: loss 2.4891, time 5236.09ms 
iter 10291: loss 2.2192, time 5258.44ms 
iter 10292: loss 2.3523, time 5225.05ms 
iter 10293: loss 2.4882, time 5298.43ms 
iter 10294: loss 2.2267, time 5289.59ms 
iter 10295: loss 2.5045, time 5304.92ms 
iter 10296: loss 2.3767, time 5301.38ms 
iter 10297: loss 2.6575, time 5312.05ms 
iter 10298: loss 2.5490, time 5304.78ms 
iter 10299: loss 2.4335, time 5301.20ms 
step 10300: train loss 2.4338, val loss 2.8573
iter 10300: loss 2.4045, time 20147.98ms 
iter 10301: loss 2.6147, time 5299.51ms 
iter 10302: loss 2.5058, time 5294.40ms 
iter 10303: loss 2.6456, time 5283.51ms 
iter 10304: loss 2.4430, time 5264.95ms 
iter 10305: loss 2.3459, time 5274.12ms 
iter 10306: loss 2.3975, time 5257.24ms 
iter 10307: loss 2.0585, time 5240.69ms 
iter 10308: loss 2.4670, time 5294.09ms 
iter 10309: loss 2.6133, time 5301.26ms 
iter 10310: loss 2.5300, time 5299.97ms 
iter 10311: loss 2.4670, time 5299.63ms 
iter 10312: loss 2.3912, time 5277.84ms 
iter 10313: loss 2.3413, time 5181.59ms 
iter 10314: loss 2.5151, time 5264.88ms 
iter 10315: loss 2.2818, time 5314.22ms 
iter 10316: loss 2.3630, time 5301.45ms 
iter 10317: loss 2.0472, time 5311.13ms 
iter 10318: loss 2.5779, time 5306.46ms 
iter 10319: loss 2.5159, time 5288.48ms 
iter 10320: loss 2.2218, time 5303.31ms 
iter 10321: loss 2.4325, time 5295.34ms 
iter 10322: loss 2.0855, time 5306.76ms 
iter 10323: loss 2.4062, time 5294.05ms 
iter 10324: loss 2.4123, time 5287.96ms 
iter 10325: loss 2.3430, time 5306.98ms 
iter 10326: loss 2.4471, time 5297.30ms 
iter 10327: loss 2.3669, time 5307.05ms 
iter 10328: loss 2.3460, time 5302.26ms 
iter 10329: loss 2.4493, time 5311.60ms 
iter 10330: loss 2.5183, time 5299.40ms 
iter 10331: loss 2.3925, time 5293.96ms 
iter 10332: loss 2.4891, time 5308.52ms 
iter 10333: loss 2.2318, time 5293.33ms 
iter 10334: loss 2.5174, time 5290.18ms 
iter 10335: loss 2.3183, time 5300.78ms 
iter 10336: loss 2.4377, time 5237.75ms 
iter 10337: loss 2.4157, time 5297.77ms 
iter 10338: loss 2.5252, time 5287.53ms 
iter 10339: loss 2.3457, time 5301.36ms 
iter 10340: loss 2.3819, time 5300.79ms 
iter 10341: loss 2.6194, time 5290.30ms 
iter 10342: loss 2.4830, time 5284.14ms 
iter 10343: loss 2.4992, time 5305.41ms 
iter 10344: loss 2.7309, time 5292.90ms 
iter 10345: loss 2.5236, time 5291.34ms 
iter 10346: loss 2.5073, time 5308.11ms 
iter 10347: loss 2.2926, time 5291.22ms 
iter 10348: loss 2.3104, time 5296.44ms 
iter 10349: loss 2.3718, time 5263.25ms 
step 10350: train loss 2.4630, val loss 2.8741
iter 10350: loss 2.3041, time 20138.34ms 
iter 10351: loss 2.3192, time 5312.76ms 
iter 10352: loss 2.2456, time 5194.77ms 
iter 10353: loss 2.3980, time 5169.83ms 
iter 10354: loss 2.4459, time 5171.02ms 
iter 10355: loss 2.3062, time 5181.07ms 
iter 10356: loss 2.2214, time 5300.69ms 
iter 10357: loss 2.6107, time 5294.70ms 
iter 10358: loss 2.4504, time 5283.15ms 
iter 10359: loss 2.5126, time 5283.57ms 
iter 10360: loss 2.5260, time 5310.91ms 
iter 10361: loss 2.4928, time 5302.86ms 
iter 10362: loss 2.8929, time 5268.89ms 
iter 10363: loss 2.2393, time 5277.45ms 
iter 10364: loss 2.2814, time 5278.00ms 
iter 10365: loss 2.3635, time 5261.59ms 
iter 10366: loss 2.5096, time 5262.93ms 
iter 10367: loss 2.4608, time 5293.34ms 
iter 10368: loss 2.5198, time 5236.92ms 
iter 10369: loss 2.4845, time 5268.32ms 
iter 10370: loss 2.4771, time 5263.88ms 
iter 10371: loss 2.5405, time 5257.66ms 
iter 10372: loss 2.3324, time 5235.67ms 
iter 10373: loss 2.4003, time 5209.91ms 
iter 10374: loss 2.5048, time 5240.28ms 
iter 10375: loss 2.3116, time 5211.75ms 
iter 10376: loss 2.4522, time 5248.51ms 
iter 10377: loss 2.4659, time 5265.14ms 
iter 10378: loss 2.4008, time 5271.36ms 
iter 10379: loss 2.2676, time 5272.31ms 
iter 10380: loss 2.3334, time 5256.23ms 
iter 10381: loss 2.3738, time 5280.14ms 
iter 10382: loss 2.2693, time 5268.21ms 
iter 10383: loss 2.3482, time 5259.96ms 
iter 10384: loss 2.4911, time 5242.72ms 
iter 10385: loss 2.2701, time 5295.30ms 
iter 10386: loss 2.2269, time 5292.58ms 
iter 10387: loss 2.4871, time 5289.47ms 
iter 10388: loss 2.5127, time 5302.25ms 
iter 10389: loss 2.3717, time 5284.89ms 
iter 10390: loss 2.4053, time 5286.96ms 
iter 10391: loss 2.5284, time 5299.70ms 
iter 10392: loss 2.3142, time 5298.28ms 
iter 10393: loss 2.4065, time 5283.97ms 
iter 10394: loss 2.3625, time 5288.11ms 
iter 10395: loss 2.2915, time 5268.78ms 
iter 10396: loss 2.4345, time 5205.94ms 
iter 10397: loss 2.3414, time 5265.26ms 
iter 10398: loss 2.4382, time 5294.69ms 
iter 10399: loss 2.6967, time 5310.91ms 
step 10400: train loss 2.4388, val loss 2.8665
iter 10400: loss 2.5541, time 20015.23ms 
iter 10401: loss 2.4451, time 5243.81ms 
iter 10402: loss 2.4546, time 5295.06ms 
iter 10403: loss 2.5268, time 5300.91ms 
iter 10404: loss 2.3492, time 5294.72ms 
iter 10405: loss 2.2445, time 5293.01ms 
iter 10406: loss 2.4742, time 5288.56ms 
iter 10407: loss 2.3432, time 5309.51ms 
iter 10408: loss 2.4778, time 5288.38ms 
iter 10409: loss 2.5166, time 5291.56ms 
iter 10410: loss 2.3683, time 5304.08ms 
iter 10411: loss 2.4331, time 5306.45ms 
iter 10412: loss 2.3275, time 5294.90ms 
iter 10413: loss 2.4813, time 5296.65ms 
iter 10414: loss 2.4784, time 5308.25ms 
iter 10415: loss 2.6031, time 5294.31ms 
iter 10416: loss 2.3651, time 5295.81ms 
iter 10417: loss 2.3671, time 5300.88ms 
iter 10418: loss 2.1992, time 5291.00ms 
iter 10419: loss 2.5251, time 5302.40ms 
iter 10420: loss 2.6668, time 5292.09ms 
iter 10421: loss 2.6177, time 5296.82ms 
iter 10422: loss 2.4763, time 5293.82ms 
iter 10423: loss 2.1965, time 5299.41ms 
iter 10424: loss 2.4540, time 5318.95ms 
iter 10425: loss 2.3421, time 5293.89ms 
iter 10426: loss 2.3021, time 5299.29ms 
iter 10427: loss 2.6216, time 5302.46ms 
iter 10428: loss 2.3738, time 5293.90ms 
iter 10429: loss 2.5037, time 5291.13ms 
iter 10430: loss 2.5686, time 5286.93ms 
iter 10431: loss 2.6430, time 5302.69ms 
iter 10432: loss 2.5131, time 5298.51ms 
iter 10433: loss 2.3197, time 5291.72ms 
iter 10434: loss 2.3807, time 5296.88ms 
iter 10435: loss 2.4472, time 5303.84ms 
iter 10436: loss 2.3892, time 5294.98ms 
iter 10437: loss 2.5754, time 5288.66ms 
iter 10438: loss 2.6110, time 5293.39ms 
iter 10439: loss 2.5330, time 5312.27ms 
iter 10440: loss 2.5367, time 5288.07ms 
iter 10441: loss 2.5439, time 5286.87ms 
iter 10442: loss 2.2695, time 5302.76ms 
iter 10443: loss 2.4981, time 5289.01ms 
iter 10444: loss 2.4739, time 5288.86ms 
iter 10445: loss 2.3001, time 5245.59ms 
iter 10446: loss 2.2684, time 5285.13ms 
iter 10447: loss 2.3875, time 5256.08ms 
iter 10448: loss 2.5254, time 5285.47ms 
iter 10449: loss 2.4782, time 5302.58ms 
step 10450: train loss 2.4369, val loss 2.8700
iter 10450: loss 2.4496, time 20094.55ms 
iter 10451: loss 2.5403, time 5292.00ms 
iter 10452: loss 2.2943, time 5298.36ms 
iter 10453: loss 2.5596, time 5290.12ms 
iter 10454: loss 2.4077, time 5286.29ms 
iter 10455: loss 2.4169, time 5297.39ms 
iter 10456: loss 2.6251, time 5297.99ms 
iter 10457: loss 2.0944, time 5285.24ms 
iter 10458: loss 2.5957, time 5283.23ms 
iter 10459: loss 2.4933, time 5296.85ms 
iter 10460: loss 2.7331, time 5290.67ms 
iter 10461: loss 2.3766, time 5263.74ms 
iter 10462: loss 2.5268, time 5228.32ms 
iter 10463: loss 2.4167, time 5267.22ms 
iter 10464: loss 2.7655, time 5282.06ms 
iter 10465: loss 2.5803, time 5289.80ms 
iter 10466: loss 2.5053, time 5321.20ms 
iter 10467: loss 2.4489, time 5295.95ms 
iter 10468: loss 2.7542, time 5142.00ms 
iter 10469: loss 2.4931, time 5228.65ms 
iter 10470: loss 2.5049, time 5311.48ms 
iter 10471: loss 2.0231, time 5302.41ms 
iter 10472: loss 2.3929, time 5287.64ms 
iter 10473: loss 2.6079, time 5295.41ms 
iter 10474: loss 2.3606, time 5292.28ms 
iter 10475: loss 2.5064, time 5292.60ms 
iter 10476: loss 2.5255, time 5282.61ms 
iter 10477: loss 2.3563, time 5297.34ms 
iter 10478: loss 2.2664, time 5286.17ms 
iter 10479: loss 2.3823, time 5295.02ms 
iter 10480: loss 2.1158, time 5294.56ms 
iter 10481: loss 2.5659, time 5283.47ms 
iter 10482: loss 2.4161, time 5283.60ms 
iter 10483: loss 2.4242, time 5302.30ms 
iter 10484: loss 2.6144, time 5291.11ms 
iter 10485: loss 2.4274, time 5248.85ms 
iter 10486: loss 2.3045, time 5285.41ms 
iter 10487: loss 2.3546, time 5293.40ms 
iter 10488: loss 2.6068, time 5290.88ms 
iter 10489: loss 2.3046, time 5295.80ms 
iter 10490: loss 2.3686, time 5290.86ms 
iter 10491: loss 2.4766, time 5302.92ms 
iter 10492: loss 2.6207, time 5277.09ms 
iter 10493: loss 2.2946, time 5286.27ms 
iter 10494: loss 2.7354, time 5298.74ms 
iter 10495: loss 2.1809, time 5284.38ms 
iter 10496: loss 2.4821, time 5277.06ms 
iter 10497: loss 2.3243, time 5291.51ms 
iter 10498: loss 2.1705, time 5309.16ms 
iter 10499: loss 2.1862, time 5278.87ms 
step 10500: train loss 2.4291, val loss 2.8543
iter 10500: loss 2.2665, time 20095.90ms 
iter 10501: loss 2.5021, time 5305.76ms 
iter 10502: loss 2.1785, time 5281.95ms 
iter 10503: loss 2.5018, time 5336.89ms 
iter 10504: loss 2.4589, time 5283.31ms 
iter 10505: loss 2.3412, time 5282.58ms 
iter 10506: loss 2.4116, time 5276.78ms 
iter 10507: loss 2.6259, time 5286.57ms 
iter 10508: loss 2.2476, time 5295.55ms 
iter 10509: loss 2.3537, time 5282.21ms 
iter 10510: loss 2.1388, time 5239.33ms 
iter 10511: loss 2.0936, time 5276.89ms 
iter 10512: loss 2.4829, time 5261.84ms 
iter 10513: loss 2.5232, time 5202.49ms 
iter 10514: loss 2.5626, time 5234.27ms 
iter 10515: loss 2.4274, time 5291.19ms 
iter 10516: loss 2.5836, time 5294.70ms 
iter 10517: loss 2.4811, time 5296.33ms 
iter 10518: loss 2.3337, time 5296.27ms 
iter 10519: loss 2.5206, time 5299.78ms 
iter 10520: loss 2.4445, time 5292.28ms 
iter 10521: loss 2.5169, time 5298.73ms 
iter 10522: loss 2.5477, time 5303.93ms 
iter 10523: loss 2.4288, time 5283.96ms 
iter 10524: loss 2.3063, time 5248.19ms 
iter 10525: loss 2.4612, time 5250.17ms 
iter 10526: loss 2.3525, time 5301.10ms 
iter 10527: loss 2.4049, time 5290.89ms 
iter 10528: loss 2.2975, time 5300.09ms 
iter 10529: loss 2.5212, time 5312.43ms 
iter 10530: loss 2.3537, time 5267.00ms 
iter 10531: loss 2.5137, time 5293.63ms 
iter 10532: loss 2.4854, time 5294.17ms 
iter 10533: loss 2.7599, time 5290.79ms 
iter 10534: loss 2.6012, time 5284.92ms 
iter 10535: loss 2.6616, time 5290.90ms 
iter 10536: loss 2.5026, time 5298.65ms 
iter 10537: loss 2.4755, time 5299.43ms 
iter 10538: loss 2.2903, time 5301.60ms 
iter 10539: loss 2.3657, time 5296.05ms 
iter 10540: loss 2.6865, time 5260.16ms 
iter 10541: loss 2.2944, time 5286.96ms 
iter 10542: loss 2.3270, time 5290.73ms 
iter 10543: loss 2.5066, time 5313.67ms 
iter 10544: loss 2.4915, time 5315.94ms 
iter 10545: loss 2.3748, time 5316.00ms 
iter 10546: loss 2.6531, time 5299.37ms 
iter 10547: loss 2.1844, time 5289.70ms 
iter 10548: loss 2.7563, time 5289.64ms 
iter 10549: loss 2.3700, time 5296.22ms 
step 10550: train loss 2.4223, val loss 2.8522
iter 10550: loss 2.3652, time 20172.14ms 
iter 10551: loss 2.5993, time 5303.55ms 
iter 10552: loss 2.4112, time 5289.29ms 
iter 10553: loss 2.4514, time 5285.87ms 
iter 10554: loss 2.4044, time 5304.25ms 
iter 10555: loss 2.1117, time 5290.12ms 
iter 10556: loss 2.5613, time 5295.53ms 
iter 10557: loss 2.4362, time 5279.89ms 
iter 10558: loss 2.6521, time 5304.70ms 
iter 10559: loss 2.3360, time 5291.00ms 
iter 10560: loss 2.4022, time 5296.21ms 
iter 10561: loss 2.5470, time 5306.39ms 
iter 10562: loss 2.2059, time 5289.87ms 
iter 10563: loss 2.5194, time 5292.88ms 
iter 10564: loss 2.5951, time 5243.64ms 
iter 10565: loss 2.6978, time 5306.17ms 
iter 10566: loss 2.3972, time 5300.22ms 
iter 10567: loss 2.1777, time 5290.03ms 
iter 10568: loss 2.4820, time 5305.82ms 
iter 10569: loss 2.2915, time 5298.81ms 
iter 10570: loss 2.5195, time 5298.62ms 
iter 10571: loss 2.1809, time 5300.41ms 
iter 10572: loss 2.3404, time 5299.67ms 
iter 10573: loss 2.3963, time 5293.00ms 
iter 10574: loss 2.5464, time 5265.72ms 
iter 10575: loss 2.4608, time 5303.75ms 
iter 10576: loss 2.2854, time 5286.61ms 
iter 10577: loss 2.4061, time 5296.75ms 
iter 10578: loss 2.3698, time 5292.50ms 
iter 10579: loss 2.6145, time 5320.15ms 
iter 10580: loss 2.5271, time 5296.98ms 
iter 10581: loss 2.3679, time 5302.17ms 
iter 10582: loss 2.3647, time 5301.07ms 
iter 10583: loss 2.4767, time 5290.81ms 
iter 10584: loss 2.6278, time 5297.96ms 
iter 10585: loss 2.5465, time 5302.32ms 
iter 10586: loss 2.6062, time 5312.67ms 
iter 10587: loss 2.1746, time 5294.15ms 
iter 10588: loss 2.4784, time 5303.20ms 
iter 10589: loss 2.5192, time 5297.27ms 
iter 10590: loss 2.4233, time 5276.44ms 
iter 10591: loss 2.4214, time 5287.78ms 
iter 10592: loss 2.2997, time 5292.91ms 
iter 10593: loss 2.3473, time 5290.97ms 
iter 10594: loss 2.4159, time 5296.59ms 
iter 10595: loss 2.2345, time 5082.74ms 
iter 10596: loss 2.4436, time 5228.73ms 
iter 10597: loss 2.6025, time 5296.56ms 
iter 10598: loss 2.4688, time 5298.19ms 
iter 10599: loss 2.3186, time 5289.57ms 
step 10600: train loss 2.4449, val loss 2.8848
iter 10600: loss 2.3127, time 20116.14ms 
iter 10601: loss 2.5157, time 5271.22ms 
iter 10602: loss 2.4519, time 5304.72ms 
iter 10603: loss 2.1989, time 5272.57ms 
iter 10604: loss 2.4558, time 5274.27ms 
iter 10605: loss 2.4380, time 5302.60ms 
iter 10606: loss 2.4711, time 5294.83ms 
iter 10607: loss 2.3922, time 5289.84ms 
iter 10608: loss 2.3397, time 5296.50ms 
iter 10609: loss 2.5189, time 5311.25ms 
iter 10610: loss 2.2552, time 5317.36ms 
iter 10611: loss 2.0955, time 5303.53ms 
iter 10612: loss 2.5440, time 5305.17ms 
iter 10613: loss 2.5061, time 5292.47ms 
iter 10614: loss 2.5385, time 5312.68ms 
iter 10615: loss 2.5870, time 5314.79ms 
iter 10616: loss 2.3893, time 5250.84ms 
iter 10617: loss 2.1728, time 5304.63ms 
iter 10618: loss 2.4419, time 5301.89ms 
iter 10619: loss 2.2683, time 5293.68ms 
iter 10620: loss 2.2427, time 5302.52ms 
iter 10621: loss 2.5423, time 5296.90ms 
iter 10622: loss 2.3812, time 5297.98ms 
iter 10623: loss 2.3698, time 5299.43ms 
iter 10624: loss 2.4861, time 5310.24ms 
iter 10625: loss 2.4510, time 5293.51ms 
iter 10626: loss 2.4367, time 5292.16ms 
iter 10627: loss 2.3991, time 5315.15ms 
iter 10628: loss 2.3629, time 5300.97ms 
iter 10629: loss 2.1348, time 5302.80ms 
iter 10630: loss 2.5001, time 5291.52ms 
iter 10631: loss 2.5081, time 5303.50ms 
iter 10632: loss 2.5388, time 5291.44ms 
iter 10633: loss 2.3452, time 5253.32ms 
iter 10634: loss 2.4658, time 5257.03ms 
iter 10635: loss 2.6053, time 5249.18ms 
iter 10636: loss 2.5529, time 5314.63ms 
iter 10637: loss 2.4274, time 5283.24ms 
iter 10638: loss 2.2499, time 5315.11ms 
iter 10639: loss 2.4377, time 5311.15ms 
iter 10640: loss 2.5613, time 5277.12ms 
iter 10641: loss 2.1884, time 5308.00ms 
iter 10642: loss 2.5623, time 5298.96ms 
iter 10643: loss 2.4039, time 5310.90ms 
iter 10644: loss 2.2374, time 5312.90ms 
iter 10645: loss 2.4769, time 5313.09ms 
iter 10646: loss 2.3413, time 5304.47ms 
iter 10647: loss 2.2463, time 5309.34ms 
iter 10648: loss 2.5555, time 5310.16ms 
