tokens per iteration will be: 491,520
Initializing a new model from scratch
config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.75,
    2.0,
    2.25
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    4,
    4,
    4,
    4,
    5,
    5
  ],
  "num_query_heads": [
    10,
    12,
    12,
    14,
    16,
    18,
    18,
    20
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.5,
    1.75,
    2.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 954,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    5
  ],
  "num_query_heads": [
    6,
    6,
    6,
    6,
    8,
    8,
    8,
    10
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

num decayed parameter tensors: 33, with 87,875,802 parameters
num non-decayed parameter tensors: 33, with 17,242 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
number of parameters: 87.89M
number of transformer parameters: 39.95M
step 0: train loss 11.0772, val loss 11.0651
iter 0: loss 11.1351, time 55135.87ms 
iter 1: loss 11.0440, time 5194.85ms 
iter 2: loss 11.0143, time 5288.51ms 
iter 3: loss 10.8843, time 5121.94ms 
iter 4: loss 10.6914, time 5002.97ms 
iter 5: loss 10.3177, time 5025.70ms 
iter 6: loss 10.1069, time 5017.23ms 
iter 7: loss 9.6684, time 5020.29ms 
iter 8: loss 9.2105, time 5013.40ms 
iter 9: loss 8.8902, time 5025.02ms 
iter 10: loss 8.4163, time 5231.77ms 
iter 11: loss 7.8807, time 5252.33ms 
iter 12: loss 7.5321, time 5248.89ms 
iter 13: loss 7.2515, time 5251.05ms 
iter 14: loss 7.3037, time 5241.91ms 
iter 15: loss 7.2477, time 5252.04ms 
iter 16: loss 7.0541, time 5247.49ms 
iter 17: loss 7.2186, time 5246.67ms 
iter 18: loss 7.3420, time 5251.21ms 
iter 19: loss 6.8376, time 5242.65ms 
iter 20: loss 6.8536, time 5247.59ms 
iter 21: loss 6.9843, time 5258.07ms 
iter 22: loss 6.6266, time 5264.83ms 
iter 23: loss 6.9035, time 5254.48ms 
iter 24: loss 6.6511, time 5259.43ms 
iter 25: loss 6.4304, time 5255.64ms 
iter 26: loss 6.8046, time 5246.29ms 
iter 27: loss 6.4249, time 5247.20ms 
iter 28: loss 6.5224, time 5251.88ms 
iter 29: loss 6.3336, time 5252.25ms 
iter 30: loss 6.0914, time 5247.87ms 
iter 31: loss 6.2045, time 5254.57ms 
iter 32: loss 6.1996, time 5244.00ms 
iter 33: loss 6.3329, time 5247.99ms 
iter 34: loss 6.3206, time 5258.96ms 
iter 35: loss 6.2404, time 5267.51ms 
iter 36: loss 6.2439, time 5347.02ms 
iter 37: loss 6.1771, time 5260.40ms 
iter 38: loss 6.3528, time 5205.12ms 
iter 39: loss 6.0413, time 5047.36ms 
iter 40: loss 6.0637, time 5046.99ms 
iter 41: loss 5.9750, time 5288.39ms 
iter 42: loss 6.1242, time 5289.59ms 
iter 43: loss 5.8182, time 5263.02ms 
iter 44: loss 6.2618, time 5269.73ms 
iter 45: loss 5.9283, time 5290.83ms 
iter 46: loss 5.9543, time 5279.97ms 
iter 47: loss 6.1724, time 5274.90ms 
iter 48: loss 6.1503, time 5268.08ms 
iter 49: loss 6.3722, time 5254.37ms 
step 50: train loss 5.9468, val loss 5.8688
iter 50: loss 6.0228, time 20067.92ms 
iter 51: loss 6.3464, time 5286.53ms 
iter 52: loss 5.7254, time 5285.45ms 
iter 53: loss 5.8651, time 5268.68ms 
iter 54: loss 5.6732, time 5288.54ms 
iter 55: loss 6.1982, time 5351.03ms 
iter 56: loss 5.7838, time 5287.50ms 
iter 57: loss 5.8390, time 5282.38ms 
iter 58: loss 5.7587, time 5276.85ms 
iter 59: loss 5.7423, time 5277.79ms 
iter 60: loss 5.7176, time 5285.14ms 
iter 61: loss 5.4688, time 5297.83ms 
iter 62: loss 5.6902, time 5304.36ms 
iter 63: loss 5.4685, time 5301.60ms 
iter 64: loss 5.7925, time 5294.22ms 
iter 65: loss 5.4052, time 5220.12ms 
iter 66: loss 5.6611, time 5190.49ms 
iter 67: loss 5.4568, time 5273.50ms 
iter 68: loss 5.4349, time 5273.98ms 
iter 69: loss 5.6875, time 5238.08ms 
iter 70: loss 5.5962, time 5278.12ms 
iter 71: loss 5.6709, time 5274.60ms 
iter 72: loss 5.5030, time 5269.27ms 
iter 73: loss 5.6802, time 5275.11ms 
iter 74: loss 5.5547, time 5265.18ms 
iter 75: loss 5.8939, time 5274.89ms 
iter 76: loss 5.3629, time 5272.02ms 
iter 77: loss 5.5499, time 5263.59ms 
iter 78: loss 5.8156, time 5272.47ms 
iter 79: loss 5.7434, time 5261.46ms 
iter 80: loss 5.3438, time 5260.22ms 
iter 81: loss 5.3893, time 5275.59ms 
iter 82: loss 5.6441, time 5239.80ms 
iter 83: loss 5.8631, time 5276.53ms 
iter 84: loss 5.5354, time 5268.94ms 
iter 85: loss 5.2044, time 5271.79ms 
iter 86: loss 5.1844, time 5278.86ms 
iter 87: loss 5.3270, time 5263.20ms 
iter 88: loss 5.1318, time 5259.42ms 
iter 89: loss 5.2289, time 5252.54ms 
iter 90: loss 5.1823, time 5260.69ms 
iter 91: loss 5.4961, time 5258.86ms 
iter 92: loss 5.1424, time 5262.86ms 
iter 93: loss 5.1018, time 5266.19ms 
iter 94: loss 5.0939, time 5246.79ms 
iter 95: loss 5.4256, time 5267.27ms 
iter 96: loss 5.1353, time 5273.29ms 
iter 97: loss 5.0098, time 5260.20ms 
iter 98: loss 5.1661, time 5266.04ms 
iter 99: loss 5.2607, time 5266.05ms 
step 100: train loss 5.0855, val loss 5.0486
iter 100: loss 5.0577, time 20069.79ms 
iter 101: loss 5.0379, time 5254.66ms 
iter 102: loss 4.8885, time 5263.97ms 
iter 103: loss 5.1276, time 5259.85ms 
iter 104: loss 5.2132, time 5266.16ms 
iter 105: loss 5.0205, time 5260.34ms 
iter 106: loss 4.9020, time 5259.32ms 
iter 107: loss 4.9516, time 5258.37ms 
iter 108: loss 5.1266, time 5251.80ms 
iter 109: loss 4.9559, time 5262.86ms 
iter 110: loss 4.6994, time 5251.29ms 
iter 111: loss 4.9832, time 5256.55ms 
iter 112: loss 4.6270, time 5230.37ms 
iter 113: loss 4.9102, time 5259.70ms 
iter 114: loss 4.8844, time 5258.40ms 
iter 115: loss 4.8440, time 5262.59ms 
iter 116: loss 4.6063, time 5256.88ms 
iter 117: loss 4.9097, time 5254.35ms 
iter 118: loss 4.7803, time 5262.92ms 
iter 119: loss 4.9708, time 5257.43ms 
iter 120: loss 4.8614, time 5259.26ms 
iter 121: loss 4.5969, time 5264.79ms 
iter 122: loss 4.6964, time 5264.14ms 
iter 123: loss 4.8830, time 5259.99ms 
iter 124: loss 4.7549, time 5262.04ms 
iter 125: loss 4.8065, time 5259.84ms 
iter 126: loss 4.9925, time 5275.81ms 
iter 127: loss 4.8750, time 5271.23ms 
iter 128: loss 4.7395, time 5265.61ms 
iter 129: loss 4.8049, time 5263.79ms 
iter 130: loss 4.8226, time 5252.41ms 
iter 131: loss 4.8298, time 5255.49ms 
iter 132: loss 4.5778, time 5255.97ms 
iter 133: loss 4.7229, time 5259.35ms 
iter 134: loss 4.7396, time 5255.22ms 
iter 135: loss 4.7295, time 5281.76ms 
iter 136: loss 4.4955, time 5338.46ms 
iter 137: loss 4.6420, time 5309.62ms 
iter 138: loss 4.6639, time 5257.64ms 
iter 139: loss 4.4485, time 5263.36ms 
iter 140: loss 4.6018, time 5266.25ms 
iter 141: loss 4.4045, time 5265.88ms 
iter 142: loss 4.5750, time 5169.22ms 
iter 143: loss 4.4202, time 5154.86ms 
iter 144: loss 4.3958, time 5196.64ms 
iter 145: loss 4.5089, time 5255.71ms 
iter 146: loss 4.4364, time 5252.17ms 
iter 147: loss 4.4274, time 5208.54ms 
iter 148: loss 4.7708, time 5254.39ms 
iter 149: loss 4.2280, time 5263.22ms 
step 150: train loss 4.4314, val loss 4.3661
iter 150: loss 4.4007, time 20010.07ms 
iter 151: loss 4.3728, time 5260.55ms 
iter 152: loss 4.3991, time 5260.06ms 
iter 153: loss 4.1769, time 5246.81ms 
iter 154: loss 4.2251, time 5246.18ms 
iter 155: loss 4.3944, time 5254.46ms 
iter 156: loss 4.2019, time 5096.26ms 
iter 157: loss 4.5435, time 5157.97ms 
iter 158: loss 4.1661, time 5168.12ms 
iter 159: loss 4.2573, time 5163.68ms 
iter 160: loss 4.3554, time 5150.22ms 
iter 161: loss 4.4649, time 5174.58ms 
iter 162: loss 4.1978, time 5259.51ms 
iter 163: loss 4.2312, time 5171.59ms 
iter 164: loss 4.4146, time 5225.14ms 
iter 165: loss 4.3158, time 5248.54ms 
iter 166: loss 4.1717, time 5203.01ms 
iter 167: loss 4.1204, time 5125.83ms 
iter 168: loss 4.3093, time 5145.78ms 
iter 169: loss 4.3807, time 5147.75ms 
iter 170: loss 4.2584, time 5184.09ms 
iter 171: loss 4.1338, time 5192.28ms 
iter 172: loss 4.1376, time 5132.59ms 
iter 173: loss 4.3979, time 5159.43ms 
iter 174: loss 4.1930, time 5114.85ms 
iter 175: loss 4.4768, time 5197.71ms 
iter 176: loss 4.0991, time 5170.80ms 
iter 177: loss 4.2362, time 5199.86ms 
iter 178: loss 4.1591, time 5136.74ms 
iter 179: loss 4.2199, time 5154.90ms 
iter 180: loss 4.3332, time 5220.18ms 
iter 181: loss 4.0750, time 5258.76ms 
iter 182: loss 4.0669, time 5269.65ms 
iter 183: loss 4.2315, time 5267.91ms 
iter 184: loss 4.0629, time 5263.15ms 
iter 185: loss 4.4723, time 5191.71ms 
iter 186: loss 4.1932, time 5179.72ms 
iter 187: loss 4.2370, time 5175.13ms 
iter 188: loss 4.4044, time 5277.64ms 
iter 189: loss 4.3270, time 5188.20ms 
iter 190: loss 4.1616, time 5176.34ms 
iter 191: loss 4.2329, time 5214.64ms 
iter 192: loss 4.0893, time 5203.98ms 
iter 193: loss 3.9502, time 5182.49ms 
iter 194: loss 3.9808, time 5131.71ms 
iter 195: loss 4.1427, time 5155.77ms 
iter 196: loss 4.0265, time 5175.51ms 
iter 197: loss 4.0686, time 5176.31ms 
iter 198: loss 4.0024, time 5261.98ms 
iter 199: loss 4.0466, time 5266.96ms 
step 200: train loss 4.0669, val loss 3.9952
iter 200: loss 3.8832, time 20053.61ms 
iter 201: loss 3.9109, time 5255.70ms 
iter 202: loss 4.1212, time 5257.60ms 
iter 203: loss 4.1097, time 5264.18ms 
iter 204: loss 4.0575, time 5257.78ms 
iter 205: loss 3.8907, time 5241.34ms 
iter 206: loss 3.8615, time 5142.50ms 
iter 207: loss 4.0221, time 5168.64ms 
iter 208: loss 4.1764, time 5149.38ms 
iter 209: loss 3.9345, time 5149.77ms 
iter 210: loss 3.8686, time 5156.55ms 
iter 211: loss 3.9325, time 5247.55ms 
iter 212: loss 3.9498, time 5267.12ms 
iter 213: loss 4.1189, time 5256.62ms 
iter 214: loss 4.2125, time 5258.40ms 
iter 215: loss 3.9767, time 5261.31ms 
iter 216: loss 3.8365, time 5261.66ms 
iter 217: loss 3.8870, time 5212.69ms 
iter 218: loss 4.0831, time 5169.40ms 
iter 219: loss 4.0368, time 5262.90ms 
iter 220: loss 3.7530, time 5268.84ms 
iter 221: loss 3.9466, time 5240.17ms 
iter 222: loss 4.0418, time 5243.87ms 
iter 223: loss 3.8093, time 5249.57ms 
iter 224: loss 3.7727, time 5246.94ms 
iter 225: loss 4.2878, time 5268.83ms 
iter 226: loss 4.0540, time 5258.93ms 
iter 227: loss 3.8596, time 5252.30ms 
iter 228: loss 3.8848, time 5256.08ms 
iter 229: loss 3.9811, time 5247.98ms 
iter 230: loss 3.8129, time 5266.66ms 
iter 231: loss 3.9956, time 5261.69ms 
iter 232: loss 3.8988, time 5257.90ms 
iter 233: loss 4.0726, time 5258.47ms 
iter 234: loss 4.0687, time 5256.54ms 
iter 235: loss 3.7740, time 5261.08ms 
iter 236: loss 3.9823, time 5266.13ms 
iter 237: loss 3.7798, time 5263.96ms 
iter 238: loss 4.0417, time 5266.23ms 
iter 239: loss 3.7776, time 5266.39ms 
iter 240: loss 3.8920, time 5262.88ms 
iter 241: loss 3.9164, time 5284.84ms 
iter 242: loss 4.2043, time 5278.35ms 
iter 243: loss 3.8760, time 5280.67ms 
iter 244: loss 3.8706, time 5271.99ms 
iter 245: loss 4.1761, time 5264.29ms 
iter 246: loss 3.9189, time 5258.87ms 
iter 247: loss 3.9923, time 5259.54ms 
iter 248: loss 3.7980, time 5254.27ms 
iter 249: loss 4.2732, time 5251.43ms 
step 250: train loss 3.9116, val loss 3.8611
iter 250: loss 3.9671, time 20028.91ms 
iter 251: loss 3.8811, time 5257.15ms 
iter 252: loss 4.0266, time 5268.92ms 
iter 253: loss 3.8264, time 5269.87ms 
iter 254: loss 3.8650, time 5266.20ms 
iter 255: loss 3.8148, time 5271.02ms 
iter 256: loss 4.0377, time 5271.85ms 
iter 257: loss 3.9122, time 5262.20ms 
iter 258: loss 3.7916, time 5275.33ms 
iter 259: loss 3.6681, time 5268.14ms 
iter 260: loss 3.9469, time 5273.40ms 
iter 261: loss 3.7521, time 5280.68ms 
iter 262: loss 3.9503, time 5285.88ms 
iter 263: loss 3.9179, time 5263.55ms 
iter 264: loss 3.8290, time 5266.41ms 
iter 265: loss 3.8142, time 5271.33ms 
iter 266: loss 3.8236, time 5258.64ms 
iter 267: loss 3.8540, time 5261.35ms 
iter 268: loss 3.7660, time 5261.10ms 
iter 269: loss 3.7151, time 5255.15ms 
iter 270: loss 4.0376, time 5259.91ms 
iter 271: loss 3.8421, time 5260.36ms 
iter 272: loss 3.8925, time 5260.39ms 
iter 273: loss 3.8841, time 5254.43ms 
iter 274: loss 3.8078, time 5261.48ms 
iter 275: loss 3.7837, time 5261.96ms 
iter 276: loss 3.7639, time 5272.93ms 
iter 277: loss 3.9388, time 5264.32ms 
iter 278: loss 3.8638, time 5267.61ms 
iter 279: loss 3.9723, time 5255.27ms 
iter 280: loss 3.8243, time 5257.04ms 
iter 281: loss 3.8616, time 5254.31ms 
iter 282: loss 4.0004, time 5249.55ms 
iter 283: loss 3.8792, time 5261.30ms 
iter 284: loss 4.0227, time 5265.36ms 
iter 285: loss 3.9389, time 5258.91ms 
iter 286: loss 3.6957, time 5259.54ms 
iter 287: loss 3.7460, time 5256.24ms 
iter 288: loss 3.7493, time 5265.47ms 
iter 289: loss 3.8323, time 5259.33ms 
iter 290: loss 3.7071, time 5256.55ms 
iter 291: loss 3.8460, time 5276.99ms 
iter 292: loss 3.9740, time 5261.34ms 
iter 293: loss 3.6727, time 5275.95ms 
iter 294: loss 3.7985, time 5260.67ms 
iter 295: loss 3.8884, time 5254.94ms 
iter 296: loss 3.6803, time 5256.09ms 
iter 297: loss 3.8853, time 5264.41ms 
iter 298: loss 3.6663, time 5262.85ms 
iter 299: loss 3.8885, time 5273.71ms 
step 300: train loss 3.8190, val loss 3.7531
iter 300: loss 3.9599, time 20069.39ms 
iter 301: loss 3.8586, time 5274.21ms 
iter 302: loss 3.7496, time 5276.88ms 
iter 303: loss 3.6930, time 5271.80ms 
iter 304: loss 3.6918, time 5271.98ms 
iter 305: loss 3.7559, time 5271.12ms 
iter 306: loss 3.7677, time 5261.73ms 
iter 307: loss 3.7854, time 5265.39ms 
iter 308: loss 3.6752, time 5261.18ms 
iter 309: loss 3.7612, time 5263.40ms 
iter 310: loss 3.7227, time 5272.53ms 
iter 311: loss 3.5641, time 5272.59ms 
iter 312: loss 3.7836, time 5267.84ms 
iter 313: loss 3.6176, time 5260.34ms 
iter 314: loss 3.7837, time 5258.66ms 
iter 315: loss 4.0753, time 5267.20ms 
iter 316: loss 3.8999, time 5261.17ms 
iter 317: loss 3.6617, time 5259.83ms 
iter 318: loss 3.6789, time 5261.19ms 
iter 319: loss 3.7447, time 5250.94ms 
iter 320: loss 3.8187, time 5257.64ms 
iter 321: loss 3.8857, time 5263.32ms 
iter 322: loss 3.6914, time 5257.66ms 
iter 323: loss 3.6930, time 5254.33ms 
iter 324: loss 3.8507, time 5239.14ms 
iter 325: loss 3.6736, time 5247.37ms 
iter 326: loss 3.6308, time 5261.22ms 
iter 327: loss 3.8045, time 5264.68ms 
iter 328: loss 3.7748, time 5298.71ms 
iter 329: loss 3.6393, time 5252.71ms 
iter 330: loss 3.6381, time 5255.36ms 
iter 331: loss 3.7693, time 5257.03ms 
iter 332: loss 3.7387, time 5255.83ms 
iter 333: loss 3.6253, time 5256.30ms 
iter 334: loss 3.7791, time 5271.01ms 
iter 335: loss 3.8418, time 5275.51ms 
iter 336: loss 3.8280, time 5271.52ms 
iter 337: loss 3.7272, time 5274.83ms 
iter 338: loss 3.8173, time 5261.06ms 
iter 339: loss 3.6672, time 5259.19ms 
iter 340: loss 3.5980, time 5273.06ms 
iter 341: loss 3.7402, time 5271.42ms 
iter 342: loss 3.8373, time 5273.75ms 
iter 343: loss 3.6914, time 5268.12ms 
iter 344: loss 3.5207, time 5273.49ms 
iter 345: loss 3.7968, time 5268.56ms 
iter 346: loss 4.0642, time 5260.35ms 
iter 347: loss 3.6331, time 5224.77ms 
iter 348: loss 3.9472, time 5261.08ms 
iter 349: loss 3.8688, time 5263.05ms 
step 350: train loss 3.7134, val loss 3.6693
iter 350: loss 3.6773, time 20060.17ms 
iter 351: loss 3.7555, time 5263.76ms 
iter 352: loss 3.5894, time 5258.88ms 
iter 353: loss 3.7679, time 5267.82ms 
iter 354: loss 3.6182, time 5260.59ms 
iter 355: loss 3.6368, time 5259.52ms 
iter 356: loss 3.5628, time 5257.20ms 
iter 357: loss 3.7172, time 5261.07ms 
iter 358: loss 3.6976, time 5258.86ms 
iter 359: loss 3.5692, time 5259.73ms 
iter 360: loss 3.8031, time 5260.32ms 
iter 361: loss 3.6866, time 5255.20ms 
iter 362: loss 3.5466, time 5262.28ms 
iter 363: loss 3.7552, time 5256.09ms 
iter 364: loss 3.7825, time 5256.47ms 
iter 365: loss 3.6339, time 5258.55ms 
iter 366: loss 3.4077, time 5252.83ms 
iter 367: loss 3.7992, time 5263.42ms 
iter 368: loss 4.0125, time 5243.40ms 
iter 369: loss 3.5652, time 5248.89ms 
iter 370: loss 3.8940, time 5251.44ms 
iter 371: loss 3.8622, time 5258.84ms 
iter 372: loss 3.5116, time 5255.84ms 
iter 373: loss 3.8245, time 5254.71ms 
iter 374: loss 3.7653, time 5268.90ms 
iter 375: loss 3.6997, time 5269.99ms 
iter 376: loss 3.8753, time 5262.80ms 
iter 377: loss 3.8721, time 5261.41ms 
iter 378: loss 3.5517, time 5267.52ms 
iter 379: loss 3.7179, time 5223.82ms 
iter 380: loss 3.6157, time 5243.92ms 
iter 381: loss 3.5612, time 5256.60ms 
iter 382: loss 3.6680, time 5263.11ms 
iter 383: loss 3.6522, time 5278.59ms 
iter 384: loss 3.7687, time 5258.82ms 
iter 385: loss 3.6951, time 5266.12ms 
iter 386: loss 3.7154, time 5270.32ms 
iter 387: loss 3.5537, time 5277.96ms 
iter 388: loss 3.8165, time 5271.11ms 
iter 389: loss 3.5854, time 5283.72ms 
iter 390: loss 3.7218, time 5266.65ms 
iter 391: loss 3.6071, time 5241.64ms 
iter 392: loss 3.7078, time 5258.75ms 
iter 393: loss 3.6129, time 5252.16ms 
iter 394: loss 3.8808, time 5250.87ms 
iter 395: loss 3.8203, time 5254.12ms 
iter 396: loss 3.6083, time 5260.70ms 
iter 397: loss 3.4970, time 5258.00ms 
iter 398: loss 3.6594, time 5256.59ms 
iter 399: loss 3.5715, time 5254.49ms 
step 400: train loss 3.6222, val loss 3.5855
iter 400: loss 3.6831, time 20075.76ms 
iter 401: loss 3.5792, time 5265.28ms 
iter 402: loss 3.7572, time 5257.17ms 
iter 403: loss 3.3997, time 5256.54ms 
iter 404: loss 3.5546, time 5264.77ms 
iter 405: loss 3.7111, time 5255.88ms 
iter 406: loss 3.6935, time 5240.12ms 
iter 407: loss 3.5625, time 5258.29ms 
iter 408: loss 3.6421, time 5267.28ms 
iter 409: loss 3.6130, time 5262.20ms 
iter 410: loss 3.9514, time 5277.09ms 
iter 411: loss 3.5348, time 5278.05ms 
iter 412: loss 3.4283, time 5274.59ms 
iter 413: loss 3.5641, time 5273.98ms 
iter 414: loss 3.6455, time 5268.69ms 
iter 415: loss 3.4804, time 5250.72ms 
iter 416: loss 3.5628, time 5256.62ms 
iter 417: loss 3.5451, time 5248.44ms 
iter 418: loss 3.7216, time 5260.77ms 
iter 419: loss 3.6887, time 5261.36ms 
iter 420: loss 3.5318, time 5257.78ms 
iter 421: loss 3.6662, time 5267.89ms 
iter 422: loss 3.7396, time 5262.28ms 
iter 423: loss 3.4850, time 5265.88ms 
iter 424: loss 3.6638, time 5262.10ms 
iter 425: loss 3.4390, time 5258.80ms 
iter 426: loss 3.7256, time 5273.58ms 
iter 427: loss 3.4922, time 5244.61ms 
iter 428: loss 3.4668, time 5264.63ms 
iter 429: loss 3.6207, time 5240.44ms 
iter 430: loss 3.5565, time 5208.08ms 
iter 431: loss 3.4032, time 5227.80ms 
iter 432: loss 3.7743, time 5261.07ms 
iter 433: loss 4.0076, time 5264.90ms 
iter 434: loss 3.6230, time 5256.51ms 
iter 435: loss 3.5068, time 5306.16ms 
iter 436: loss 3.5821, time 5261.67ms 
iter 437: loss 3.5215, time 5271.29ms 
iter 438: loss 3.5580, time 5266.23ms 
iter 439: loss 3.5081, time 5271.47ms 
iter 440: loss 3.6483, time 5266.78ms 
iter 441: loss 3.5812, time 5279.15ms 
iter 442: loss 3.6081, time 5262.93ms 
iter 443: loss 3.6208, time 5230.09ms 
iter 444: loss 3.7128, time 5257.59ms 
iter 445: loss 3.6602, time 5258.30ms 
iter 446: loss 3.5252, time 5238.52ms 
iter 447: loss 3.5001, time 5257.01ms 
iter 448: loss 3.6408, time 5257.77ms 
iter 449: loss 3.6515, time 5256.79ms 
step 450: train loss 3.5917, val loss 3.5564
iter 450: loss 3.6142, time 20069.49ms 
iter 451: loss 3.5333, time 5258.21ms 
iter 452: loss 3.3283, time 5255.09ms 
iter 453: loss 3.6746, time 5273.63ms 
iter 454: loss 3.5499, time 5268.24ms 
iter 455: loss 3.5654, time 5271.83ms 
iter 456: loss 3.4454, time 5270.88ms 
iter 457: loss 3.5594, time 5283.12ms 
iter 458: loss 3.7177, time 5271.42ms 
iter 459: loss 3.5968, time 5274.67ms 
iter 460: loss 3.3881, time 5264.95ms 
iter 461: loss 3.4042, time 5259.18ms 
iter 462: loss 3.5802, time 5263.18ms 
iter 463: loss 3.4306, time 5264.75ms 
iter 464: loss 3.5273, time 5269.34ms 
iter 465: loss 3.6693, time 5264.25ms 
iter 466: loss 3.8133, time 5266.99ms 
iter 467: loss 3.5775, time 5278.99ms 
iter 468: loss 3.7418, time 5271.61ms 
iter 469: loss 3.6846, time 5273.24ms 
iter 470: loss 3.7167, time 5277.20ms 
iter 471: loss 3.5003, time 5267.50ms 
iter 472: loss 3.7723, time 5265.84ms 
iter 473: loss 3.4458, time 5264.83ms 
iter 474: loss 3.4407, time 5255.03ms 
iter 475: loss 3.5839, time 5260.39ms 
iter 476: loss 3.4749, time 5257.77ms 
iter 477: loss 3.5793, time 5258.03ms 
iter 478: loss 3.4815, time 5226.49ms 
iter 479: loss 3.4338, time 5258.97ms 
iter 480: loss 3.5738, time 5254.96ms 
iter 481: loss 3.6703, time 5270.07ms 
iter 482: loss 3.4884, time 5259.35ms 
iter 483: loss 3.6078, time 5258.98ms 
iter 484: loss 3.3362, time 5257.81ms 
iter 485: loss 3.5287, time 5261.08ms 
iter 486: loss 3.6110, time 5227.54ms 
iter 487: loss 3.7083, time 5259.13ms 
iter 488: loss 3.3778, time 5254.91ms 
iter 489: loss 3.3931, time 5272.95ms 
iter 490: loss 3.4013, time 5253.74ms 
iter 491: loss 3.5367, time 5256.34ms 
iter 492: loss 3.5926, time 5261.19ms 
iter 493: loss 3.4323, time 5262.47ms 
iter 494: loss 3.5509, time 5254.75ms 
iter 495: loss 3.6549, time 5268.77ms 
iter 496: loss 3.6121, time 5227.89ms 
iter 497: loss 3.4857, time 5259.77ms 
iter 498: loss 3.3745, time 5252.74ms 
iter 499: loss 3.3709, time 5258.59ms 
step 500: train loss 3.4934, val loss 3.4838
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 500: loss 3.4166, time 21772.25ms 
iter 501: loss 3.6800, time 5257.24ms 
iter 502: loss 3.4992, time 5263.67ms 
iter 503: loss 3.4942, time 5257.09ms 
iter 504: loss 3.4980, time 5256.50ms 
iter 505: loss 3.6179, time 5247.99ms 
iter 506: loss 3.5402, time 5240.68ms 
iter 507: loss 3.3161, time 5243.17ms 
iter 508: loss 3.2687, time 5236.16ms 
iter 509: loss 3.7053, time 5260.14ms 
iter 510: loss 3.4016, time 5240.59ms 
iter 511: loss 3.5594, time 5213.73ms 
iter 512: loss 3.6256, time 5261.07ms 
iter 513: loss 3.4514, time 5254.20ms 
iter 514: loss 3.5093, time 5255.26ms 
iter 515: loss 3.4092, time 5258.77ms 
iter 516: loss 3.4612, time 5257.27ms 
iter 517: loss 3.5444, time 5264.95ms 
iter 518: loss 3.4690, time 5257.12ms 
iter 519: loss 3.6677, time 5237.94ms 
iter 520: loss 3.4169, time 5242.79ms 
iter 521: loss 3.6261, time 5247.68ms 
iter 522: loss 3.4855, time 5238.86ms 
iter 523: loss 3.3888, time 5248.13ms 
iter 524: loss 3.3076, time 5232.29ms 
iter 525: loss 3.3735, time 5237.42ms 
iter 526: loss 3.5703, time 5232.11ms 
iter 527: loss 3.2962, time 5251.35ms 
iter 528: loss 3.5600, time 5263.03ms 
iter 529: loss 3.4641, time 5229.02ms 
iter 530: loss 3.4674, time 5242.10ms 
iter 531: loss 3.5208, time 5259.88ms 
iter 532: loss 3.6209, time 5257.18ms 
iter 533: loss 3.4251, time 5250.13ms 
iter 534: loss 3.5393, time 5255.02ms 
iter 535: loss 3.4658, time 5256.29ms 
iter 536: loss 3.4498, time 5251.07ms 
iter 537: loss 3.2656, time 5252.15ms 
iter 538: loss 3.4247, time 5225.39ms 
iter 539: loss 3.3868, time 5224.75ms 
iter 540: loss 3.3280, time 5246.36ms 
iter 541: loss 3.4738, time 5250.84ms 
iter 542: loss 3.5914, time 5254.78ms 
iter 543: loss 3.3280, time 5257.62ms 
iter 544: loss 3.4535, time 5257.16ms 
iter 545: loss 3.3285, time 5259.01ms 
iter 546: loss 3.2868, time 5259.02ms 
iter 547: loss 3.3692, time 5252.28ms 
iter 548: loss 3.5246, time 5263.39ms 
iter 549: loss 3.4329, time 5256.10ms 
step 550: train loss 3.4624, val loss 3.4435
iter 550: loss 3.3130, time 20026.50ms 
iter 551: loss 3.3235, time 5278.27ms 
iter 552: loss 3.5005, time 5277.48ms 
iter 553: loss 3.3481, time 5256.49ms 
iter 554: loss 3.4130, time 5258.70ms 
iter 555: loss 3.3854, time 5261.12ms 
iter 556: loss 3.2522, time 5266.49ms 
iter 557: loss 3.6046, time 5257.34ms 
iter 558: loss 3.5123, time 5260.53ms 
iter 559: loss 3.3727, time 5277.03ms 
iter 560: loss 3.5315, time 5274.09ms 
iter 561: loss 3.3804, time 5287.62ms 
iter 562: loss 3.5743, time 5281.72ms 
iter 563: loss 3.2687, time 5272.82ms 
iter 564: loss 3.4152, time 5272.29ms 
iter 565: loss 3.4099, time 5259.99ms 
iter 566: loss 3.4172, time 5275.94ms 
iter 567: loss 3.4159, time 5270.18ms 
iter 568: loss 3.4642, time 5278.47ms 
iter 569: loss 3.2357, time 5272.92ms 
iter 570: loss 3.3375, time 5270.46ms 
iter 571: loss 3.4194, time 5270.14ms 
iter 572: loss 3.2454, time 5264.94ms 
iter 573: loss 3.4524, time 5257.07ms 
iter 574: loss 3.6344, time 5254.74ms 
iter 575: loss 3.5991, time 5273.59ms 
iter 576: loss 3.5940, time 5270.41ms 
iter 577: loss 3.4140, time 5255.14ms 
iter 578: loss 3.4595, time 5275.20ms 
iter 579: loss 3.2841, time 5269.48ms 
iter 580: loss 3.4044, time 5269.22ms 
iter 581: loss 3.2736, time 5264.84ms 
iter 582: loss 3.5589, time 5265.31ms 
iter 583: loss 3.4355, time 5270.25ms 
iter 584: loss 3.4487, time 5264.80ms 
iter 585: loss 3.5830, time 5256.52ms 
iter 586: loss 3.3567, time 5258.69ms 
iter 587: loss 3.5147, time 5253.57ms 
iter 588: loss 3.2808, time 5268.09ms 
iter 589: loss 3.4151, time 5268.56ms 
iter 590: loss 3.4980, time 5258.93ms 
iter 591: loss 3.2186, time 5264.98ms 
iter 592: loss 3.6676, time 5277.03ms 
iter 593: loss 3.4064, time 5264.95ms 
iter 594: loss 3.3084, time 5272.46ms 
iter 595: loss 3.3138, time 5272.64ms 
iter 596: loss 3.5349, time 5269.26ms 
iter 597: loss 3.4491, time 5262.92ms 
iter 598: loss 3.4117, time 5279.58ms 
iter 599: loss 3.4860, time 5276.73ms 
step 600: train loss 3.4166, val loss 3.4072
iter 600: loss 3.4340, time 20029.49ms 
iter 601: loss 3.4996, time 5263.64ms 
iter 602: loss 3.3203, time 5264.47ms 
iter 603: loss 3.3749, time 5267.35ms 
iter 604: loss 3.5499, time 5266.68ms 
iter 605: loss 3.3405, time 5263.46ms 
iter 606: loss 3.3307, time 5261.82ms 
iter 607: loss 3.4098, time 5264.38ms 
iter 608: loss 3.3635, time 5272.38ms 
iter 609: loss 3.2732, time 5243.55ms 
iter 610: loss 3.4079, time 5273.25ms 
iter 611: loss 3.4031, time 5273.64ms 
iter 612: loss 3.4350, time 5140.22ms 
iter 613: loss 3.3452, time 5145.07ms 
iter 614: loss 3.3713, time 5180.84ms 
iter 615: loss 3.3433, time 5201.16ms 
iter 616: loss 3.1549, time 5259.88ms 
iter 617: loss 3.2948, time 5259.60ms 
iter 618: loss 3.3051, time 5265.09ms 
iter 619: loss 3.2391, time 5266.15ms 
iter 620: loss 3.3433, time 5263.39ms 
iter 621: loss 3.3769, time 5267.23ms 
iter 622: loss 3.4925, time 5274.73ms 
iter 623: loss 3.3822, time 5272.90ms 
iter 624: loss 3.4994, time 5274.61ms 
iter 625: loss 3.4425, time 5267.50ms 
iter 626: loss 3.3908, time 5272.38ms 
iter 627: loss 3.5757, time 5286.45ms 
iter 628: loss 3.1773, time 5266.73ms 
iter 629: loss 3.4183, time 5272.28ms 
iter 630: loss 3.5780, time 5279.07ms 
iter 631: loss 3.4019, time 5278.03ms 
iter 632: loss 3.2587, time 5287.28ms 
iter 633: loss 3.2470, time 5301.72ms 
iter 634: loss 3.3244, time 5259.07ms 
iter 635: loss 3.3306, time 5176.28ms 
iter 636: loss 3.3512, time 5303.37ms 
iter 637: loss 3.3732, time 5286.47ms 
iter 638: loss 3.2416, time 5276.61ms 
iter 639: loss 3.4396, time 5276.10ms 
iter 640: loss 3.2398, time 5274.10ms 
iter 641: loss 3.5179, time 5240.07ms 
iter 642: loss 3.3135, time 5265.36ms 
iter 643: loss 3.6105, time 5255.19ms 
iter 644: loss 3.3865, time 5264.99ms 
iter 645: loss 3.4694, time 5258.87ms 
iter 646: loss 3.3697, time 5280.90ms 
iter 647: loss 3.3662, time 5273.99ms 
iter 648: loss 3.4026, time 5260.14ms 
iter 649: loss 3.4728, time 5257.46ms 
step 650: train loss 3.3595, val loss 3.3592
iter 650: loss 3.4355, time 20035.68ms 
iter 651: loss 3.2474, time 5267.81ms 
iter 652: loss 3.3123, time 5257.27ms 
iter 653: loss 3.1924, time 5260.26ms 
iter 654: loss 3.2997, time 5260.66ms 
iter 655: loss 3.3529, time 5264.19ms 
iter 656: loss 3.3680, time 5257.68ms 
iter 657: loss 3.4286, time 5261.76ms 
iter 658: loss 3.7759, time 5269.64ms 
iter 659: loss 3.3484, time 5256.07ms 
iter 660: loss 3.4119, time 5260.33ms 
iter 661: loss 3.4997, time 5233.57ms 
iter 662: loss 3.4334, time 5256.46ms 
iter 663: loss 3.3509, time 5267.90ms 
iter 664: loss 3.3358, time 5262.15ms 
iter 665: loss 3.2186, time 5254.63ms 
iter 666: loss 3.5308, time 5255.05ms 
iter 667: loss 3.2175, time 5249.26ms 
iter 668: loss 3.4021, time 5257.95ms 
iter 669: loss 3.3668, time 5269.14ms 
iter 670: loss 3.4074, time 5267.34ms 
iter 671: loss 3.3514, time 5270.72ms 
iter 672: loss 3.3534, time 5266.80ms 
iter 673: loss 3.1599, time 5256.77ms 
iter 674: loss 3.2179, time 5262.79ms 
iter 675: loss 3.4233, time 5268.84ms 
iter 676: loss 3.5153, time 5276.04ms 
iter 677: loss 3.2955, time 5275.14ms 
iter 678: loss 3.3827, time 5276.77ms 
iter 679: loss 3.3710, time 5335.42ms 
iter 680: loss 3.3379, time 5305.74ms 
iter 681: loss 3.3356, time 5271.73ms 
iter 682: loss 3.2504, time 5271.57ms 
iter 683: loss 3.2165, time 5278.30ms 
iter 684: loss 3.1315, time 5276.64ms 
iter 685: loss 3.3824, time 5274.89ms 
iter 686: loss 3.1345, time 5269.80ms 
iter 687: loss 3.3180, time 5269.60ms 
iter 688: loss 3.2815, time 5271.20ms 
iter 689: loss 3.3360, time 5263.95ms 
iter 690: loss 3.4347, time 5271.91ms 
iter 691: loss 3.3018, time 5268.88ms 
iter 692: loss 3.3956, time 5264.25ms 
iter 693: loss 3.4173, time 5262.82ms 
iter 694: loss 3.2792, time 5262.82ms 
iter 695: loss 3.2204, time 5269.83ms 
iter 696: loss 3.3586, time 5264.05ms 
iter 697: loss 3.3611, time 5258.77ms 
iter 698: loss 3.4095, time 5257.65ms 
iter 699: loss 3.2346, time 5256.13ms 
step 700: train loss 3.3284, val loss 3.3264
iter 700: loss 3.2105, time 20041.40ms 
iter 701: loss 3.2502, time 5268.04ms 
iter 702: loss 3.3260, time 5273.34ms 
iter 703: loss 3.2539, time 5278.54ms 
iter 704: loss 3.2644, time 5281.43ms 
iter 705: loss 3.4640, time 5274.83ms 
iter 706: loss 3.3636, time 5280.02ms 
iter 707: loss 3.3147, time 5277.45ms 
iter 708: loss 3.1227, time 5270.69ms 
iter 709: loss 3.4857, time 5267.84ms 
iter 710: loss 3.2350, time 5273.38ms 
iter 711: loss 3.4457, time 5269.99ms 
iter 712: loss 3.0581, time 5263.88ms 
iter 713: loss 3.2957, time 5268.04ms 
iter 714: loss 3.2497, time 5258.29ms 
iter 715: loss 3.4834, time 5271.80ms 
iter 716: loss 3.3611, time 5274.70ms 
iter 717: loss 3.1109, time 5280.87ms 
iter 718: loss 3.3046, time 5271.84ms 
iter 719: loss 3.3914, time 5275.03ms 
iter 720: loss 3.3724, time 5281.52ms 
iter 721: loss 3.2697, time 5271.91ms 
iter 722: loss 3.6006, time 5265.73ms 
iter 723: loss 3.3741, time 5267.14ms 
iter 724: loss 3.2746, time 5262.04ms 
iter 725: loss 3.3755, time 5267.89ms 
iter 726: loss 3.3906, time 5265.77ms 
iter 727: loss 3.3763, time 5266.87ms 
iter 728: loss 3.2076, time 5268.15ms 
iter 729: loss 3.3580, time 5272.40ms 
iter 730: loss 3.3390, time 5276.80ms 
iter 731: loss 3.3355, time 5271.72ms 
iter 732: loss 3.2949, time 5270.01ms 
iter 733: loss 3.3047, time 5265.82ms 
iter 734: loss 3.3496, time 5271.26ms 
iter 735: loss 3.3801, time 5263.61ms 
iter 736: loss 3.1505, time 5267.78ms 
iter 737: loss 3.3752, time 5268.19ms 
iter 738: loss 3.3556, time 5262.77ms 
iter 739: loss 3.3662, time 5260.63ms 
iter 740: loss 3.3896, time 5266.51ms 
iter 741: loss 3.4032, time 5259.77ms 
iter 742: loss 3.3265, time 5264.72ms 
iter 743: loss 3.2118, time 5224.41ms 
iter 744: loss 3.2534, time 5039.87ms 
iter 745: loss 3.3750, time 5014.93ms 
iter 746: loss 3.2854, time 5075.50ms 
iter 747: loss 3.2559, time 5273.97ms 
iter 748: loss 3.3794, time 5276.88ms 
iter 749: loss 3.4027, time 5276.89ms 
step 750: train loss 3.2898, val loss 3.2897
iter 750: loss 3.2845, time 20067.34ms 
iter 751: loss 3.0736, time 5261.96ms 
iter 752: loss 3.4457, time 5264.95ms 
iter 753: loss 3.1329, time 5249.40ms 
iter 754: loss 3.3254, time 5188.25ms 
iter 755: loss 3.5962, time 5151.30ms 
iter 756: loss 3.1205, time 5137.26ms 
iter 757: loss 3.3010, time 5207.21ms 
iter 758: loss 3.3633, time 5268.17ms 
iter 759: loss 3.1463, time 5268.72ms 
iter 760: loss 3.2653, time 5264.58ms 
iter 761: loss 3.4138, time 5259.26ms 
iter 762: loss 3.5016, time 5258.26ms 
iter 763: loss 3.2875, time 5256.77ms 
iter 764: loss 3.1897, time 5256.34ms 
iter 765: loss 3.2176, time 5264.26ms 
iter 766: loss 3.4359, time 5219.44ms 
iter 767: loss 3.1981, time 5215.22ms 
iter 768: loss 3.1954, time 5250.45ms 
iter 769: loss 3.2821, time 5252.90ms 
iter 770: loss 3.4615, time 5253.78ms 
iter 771: loss 3.5157, time 5246.18ms 
iter 772: loss 3.1767, time 5243.94ms 
iter 773: loss 3.2362, time 5253.35ms 
iter 774: loss 3.2700, time 5262.83ms 
iter 775: loss 3.4148, time 5243.13ms 
iter 776: loss 3.1509, time 5240.32ms 
iter 777: loss 3.2513, time 5256.25ms 
iter 778: loss 3.1532, time 5255.18ms 
iter 779: loss 3.1286, time 5264.09ms 
iter 780: loss 3.3600, time 5270.03ms 
iter 781: loss 3.4561, time 5238.42ms 
iter 782: loss 3.2729, time 5272.87ms 
iter 783: loss 3.4570, time 5241.40ms 
iter 784: loss 3.4419, time 5254.33ms 
iter 785: loss 3.1169, time 5249.43ms 
iter 786: loss 3.2504, time 5250.41ms 
iter 787: loss 3.2645, time 5256.94ms 
iter 788: loss 3.1804, time 5247.19ms 
iter 789: loss 3.4219, time 5255.39ms 
iter 790: loss 3.3637, time 5264.94ms 
iter 791: loss 3.2720, time 5249.39ms 
iter 792: loss 3.1780, time 5247.53ms 
iter 793: loss 3.3671, time 5247.60ms 
iter 794: loss 3.2074, time 5254.22ms 
iter 795: loss 3.1887, time 5265.38ms 
iter 796: loss 3.4457, time 5265.30ms 
iter 797: loss 3.2708, time 5253.76ms 
iter 798: loss 3.2416, time 5263.20ms 
iter 799: loss 3.2664, time 5260.42ms 
step 800: train loss 3.2608, val loss 3.2641
iter 800: loss 3.0869, time 20072.59ms 
iter 801: loss 3.5028, time 5260.41ms 
iter 802: loss 3.2085, time 5258.13ms 
iter 803: loss 3.2558, time 5259.68ms 
iter 804: loss 3.1873, time 5222.20ms 
iter 805: loss 3.3529, time 5270.23ms 
iter 806: loss 3.1987, time 5273.67ms 
iter 807: loss 3.1169, time 5234.02ms 
iter 808: loss 3.2441, time 5271.31ms 
iter 809: loss 3.3513, time 5258.23ms 
iter 810: loss 3.0987, time 5277.22ms 
iter 811: loss 3.1634, time 5258.45ms 
iter 812: loss 3.2072, time 5263.41ms 
iter 813: loss 3.0345, time 5234.32ms 
iter 814: loss 3.3491, time 5268.79ms 
iter 815: loss 3.2517, time 5275.31ms 
iter 816: loss 3.2182, time 5276.27ms 
iter 817: loss 3.2367, time 5281.84ms 
iter 818: loss 3.1634, time 5266.69ms 
iter 819: loss 3.2503, time 5272.26ms 
iter 820: loss 3.1158, time 5255.93ms 
iter 821: loss 3.2551, time 5245.61ms 
iter 822: loss 3.3280, time 5251.80ms 
iter 823: loss 3.2160, time 5264.71ms 
iter 824: loss 3.5487, time 5253.45ms 
iter 825: loss 3.2712, time 5258.94ms 
iter 826: loss 3.3991, time 5277.73ms 
iter 827: loss 3.1798, time 5256.77ms 
iter 828: loss 3.3302, time 5263.70ms 
iter 829: loss 3.3090, time 5265.46ms 
iter 830: loss 3.2997, time 5254.20ms 
iter 831: loss 3.0949, time 5263.53ms 
iter 832: loss 3.4234, time 5258.90ms 
iter 833: loss 3.2220, time 5267.97ms 
iter 834: loss 3.2736, time 5260.57ms 
iter 835: loss 3.2323, time 5262.48ms 
iter 836: loss 3.2210, time 5268.34ms 
iter 837: loss 3.0756, time 5264.98ms 
iter 838: loss 3.2141, time 5257.61ms 
iter 839: loss 3.2072, time 5266.79ms 
iter 840: loss 3.0811, time 5264.35ms 
iter 841: loss 3.3203, time 5263.47ms 
iter 842: loss 3.2957, time 5266.56ms 
iter 843: loss 3.2613, time 5261.76ms 
iter 844: loss 3.5412, time 5270.92ms 
iter 845: loss 3.2050, time 5265.35ms 
iter 846: loss 3.3189, time 5278.40ms 
iter 847: loss 3.2882, time 5273.85ms 
iter 848: loss 3.0755, time 5271.36ms 
iter 849: loss 3.0588, time 5277.47ms 
step 850: train loss 3.2411, val loss 3.2359
iter 850: loss 3.3283, time 20074.38ms 
iter 851: loss 3.1218, time 5269.70ms 
iter 852: loss 3.1097, time 5272.71ms 
iter 853: loss 3.2821, time 5231.13ms 
iter 854: loss 3.1864, time 5258.06ms 
iter 855: loss 3.3171, time 5272.34ms 
iter 856: loss 3.2518, time 5265.66ms 
iter 857: loss 3.1776, time 5268.07ms 
iter 858: loss 3.3068, time 5266.89ms 
iter 859: loss 3.1813, time 5264.22ms 
iter 860: loss 3.2526, time 5278.43ms 
iter 861: loss 3.2884, time 5277.42ms 
iter 862: loss 3.1251, time 5274.37ms 
iter 863: loss 3.1020, time 5261.72ms 
iter 864: loss 3.0489, time 5279.18ms 
iter 865: loss 3.0600, time 5268.86ms 
iter 866: loss 3.1480, time 5277.36ms 
iter 867: loss 3.2727, time 5271.92ms 
iter 868: loss 3.1610, time 5267.85ms 
iter 869: loss 3.1236, time 5276.54ms 
iter 870: loss 3.1679, time 5264.87ms 
iter 871: loss 3.2226, time 5276.19ms 
iter 872: loss 3.3445, time 5253.10ms 
iter 873: loss 3.2360, time 5247.25ms 
iter 874: loss 3.4343, time 5264.51ms 
iter 875: loss 3.2957, time 5280.18ms 
iter 876: loss 2.9732, time 5276.36ms 
iter 877: loss 3.1046, time 5248.75ms 
iter 878: loss 3.2394, time 5274.03ms 
iter 879: loss 3.0039, time 5284.15ms 
iter 880: loss 3.1954, time 5288.70ms 
iter 881: loss 3.2508, time 5272.86ms 
iter 882: loss 3.2350, time 5236.59ms 
iter 883: loss 3.2487, time 5260.68ms 
iter 884: loss 3.3148, time 5258.51ms 
iter 885: loss 3.3353, time 5260.75ms 
iter 886: loss 3.3262, time 5267.47ms 
iter 887: loss 3.4424, time 5263.64ms 
iter 888: loss 3.1093, time 5267.56ms 
iter 889: loss 3.4384, time 5270.93ms 
iter 890: loss 3.2298, time 5261.67ms 
iter 891: loss 3.2882, time 5257.20ms 
iter 892: loss 3.2855, time 5263.12ms 
iter 893: loss 3.3461, time 5264.44ms 
iter 894: loss 3.0755, time 5275.23ms 
iter 895: loss 3.2166, time 5266.54ms 
iter 896: loss 3.3263, time 5270.04ms 
iter 897: loss 3.1575, time 5262.96ms 
iter 898: loss 3.1841, time 5264.98ms 
iter 899: loss 3.1445, time 5271.28ms 
step 900: train loss 3.2082, val loss 3.2080
iter 900: loss 3.2858, time 20046.01ms 
iter 901: loss 3.3579, time 5256.55ms 
iter 902: loss 3.2446, time 5240.79ms 
iter 903: loss 3.1039, time 5264.57ms 
iter 904: loss 3.1298, time 5262.46ms 
iter 905: loss 3.1460, time 5273.42ms 
iter 906: loss 3.2283, time 5277.20ms 
iter 907: loss 3.1994, time 5263.91ms 
iter 908: loss 3.0567, time 5258.87ms 
iter 909: loss 3.2953, time 5273.24ms 
iter 910: loss 3.3083, time 5271.40ms 
iter 911: loss 3.0866, time 5267.26ms 
iter 912: loss 3.1657, time 5271.29ms 
iter 913: loss 3.2846, time 5268.26ms 
iter 914: loss 3.2766, time 5256.98ms 
iter 915: loss 3.4036, time 5258.75ms 
iter 916: loss 3.3015, time 5260.56ms 
iter 917: loss 3.0843, time 5259.95ms 
iter 918: loss 3.2357, time 5273.10ms 
iter 919: loss 3.1190, time 5273.95ms 
iter 920: loss 3.2357, time 5263.75ms 
iter 921: loss 3.1875, time 5264.27ms 
iter 922: loss 2.9914, time 5271.56ms 
iter 923: loss 3.1597, time 5263.53ms 
iter 924: loss 3.2285, time 5262.20ms 
iter 925: loss 3.1928, time 5269.30ms 
iter 926: loss 3.3998, time 5270.38ms 
iter 927: loss 3.1746, time 5271.03ms 
iter 928: loss 3.0494, time 5267.22ms 
iter 929: loss 3.1311, time 5271.98ms 
iter 930: loss 3.0515, time 5264.03ms 
iter 931: loss 3.0729, time 5267.58ms 
iter 932: loss 3.4071, time 5268.45ms 
iter 933: loss 3.2887, time 5263.20ms 
iter 934: loss 3.3334, time 5256.35ms 
iter 935: loss 3.2776, time 5264.17ms 
iter 936: loss 3.0876, time 5267.34ms 
iter 937: loss 3.0202, time 5263.49ms 
iter 938: loss 3.0341, time 5281.42ms 
iter 939: loss 3.4246, time 5267.05ms 
iter 940: loss 3.2726, time 5265.26ms 
iter 941: loss 2.9687, time 5256.84ms 
iter 942: loss 3.1827, time 5264.62ms 
iter 943: loss 3.2820, time 5243.55ms 
iter 944: loss 3.0827, time 5243.53ms 
iter 945: loss 3.0818, time 5250.64ms 
iter 946: loss 3.0522, time 5243.41ms 
iter 947: loss 3.1022, time 5258.05ms 
iter 948: loss 3.1623, time 5259.33ms 
iter 949: loss 3.0016, time 5238.26ms 
step 950: train loss 3.1774, val loss 3.1899
iter 950: loss 3.0918, time 20048.27ms 
iter 951: loss 3.0880, time 5258.58ms 
iter 952: loss 3.2703, time 5258.40ms 
iter 953: loss 3.1879, time 5256.08ms 
iter 954: loss 3.0652, time 5259.51ms 
iter 955: loss 3.1630, time 5265.23ms 
iter 956: loss 3.3120, time 5261.55ms 
iter 957: loss 3.1783, time 5269.82ms 
iter 958: loss 3.2811, time 5288.72ms 
iter 959: loss 3.1247, time 5277.73ms 
iter 960: loss 3.2720, time 5283.73ms 
iter 961: loss 3.1217, time 5285.06ms 
iter 962: loss 3.0090, time 5285.98ms 
iter 963: loss 3.1297, time 5272.53ms 
iter 964: loss 3.1452, time 5268.37ms 
iter 965: loss 3.0715, time 5265.52ms 
iter 966: loss 3.2562, time 5259.08ms 
iter 967: loss 3.0580, time 5257.18ms 
iter 968: loss 3.1892, time 5270.78ms 
iter 969: loss 3.1277, time 5260.24ms 
iter 970: loss 3.2034, time 5256.93ms 
iter 971: loss 3.1207, time 5259.55ms 
iter 972: loss 3.1598, time 5258.80ms 
iter 973: loss 3.0354, time 5257.38ms 
iter 974: loss 3.1999, time 5263.88ms 
iter 975: loss 3.1584, time 5271.51ms 
iter 976: loss 3.1064, time 5268.56ms 
iter 977: loss 3.1147, time 5267.62ms 
iter 978: loss 3.1964, time 5264.21ms 
iter 979: loss 3.1489, time 5261.82ms 
iter 980: loss 3.0437, time 5259.63ms 
iter 981: loss 3.1623, time 5260.28ms 
iter 982: loss 3.0551, time 5257.91ms 
iter 983: loss 3.2026, time 5266.21ms 
iter 984: loss 3.4011, time 5268.92ms 
iter 985: loss 3.3914, time 5267.14ms 
iter 986: loss 3.0089, time 5267.50ms 
iter 987: loss 3.1782, time 5265.59ms 
iter 988: loss 3.2318, time 5270.44ms 
iter 989: loss 3.0785, time 5258.91ms 
iter 990: loss 3.1730, time 5257.43ms 
iter 991: loss 3.0372, time 5264.78ms 
iter 992: loss 3.0162, time 5263.02ms 
iter 993: loss 3.2946, time 5264.38ms 
iter 994: loss 3.1418, time 5267.96ms 
iter 995: loss 3.2844, time 5274.92ms 
iter 996: loss 3.3502, time 5266.40ms 
iter 997: loss 3.2083, time 5266.09ms 
iter 998: loss 3.1300, time 5270.95ms 
iter 999: loss 3.0733, time 5262.48ms 
step 1000: train loss 3.1560, val loss 3.1647
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1000: loss 3.1746, time 21313.32ms 
iter 1001: loss 3.1060, time 5268.96ms 
iter 1002: loss 3.1209, time 5272.15ms 
iter 1003: loss 2.9692, time 5260.90ms 
iter 1004: loss 3.1552, time 5261.06ms 
iter 1005: loss 3.4631, time 5261.78ms 
iter 1006: loss 3.0472, time 5264.49ms 
iter 1007: loss 3.1119, time 5257.21ms 
iter 1008: loss 3.2016, time 5262.48ms 
iter 1009: loss 3.3783, time 5255.38ms 
iter 1010: loss 3.2810, time 5257.16ms 
iter 1011: loss 3.1741, time 5268.43ms 
iter 1012: loss 3.0936, time 5259.61ms 
iter 1013: loss 3.2078, time 5259.55ms 
iter 1014: loss 3.0735, time 5262.72ms 
iter 1015: loss 2.9994, time 5268.51ms 
iter 1016: loss 2.9748, time 5271.62ms 
iter 1017: loss 3.2091, time 5261.04ms 
iter 1018: loss 3.3168, time 5261.84ms 
iter 1019: loss 3.1027, time 5266.79ms 
iter 1020: loss 3.1273, time 5271.53ms 
iter 1021: loss 3.2512, time 5270.41ms 
iter 1022: loss 2.9743, time 5259.16ms 
iter 1023: loss 3.2708, time 5282.98ms 
iter 1024: loss 3.2559, time 5266.47ms 
iter 1025: loss 3.1350, time 5259.47ms 
iter 1026: loss 3.0716, time 5262.00ms 
iter 1027: loss 3.0570, time 5271.09ms 
iter 1028: loss 3.1938, time 5267.15ms 
iter 1029: loss 3.0083, time 5258.42ms 
iter 1030: loss 3.1224, time 5264.85ms 
iter 1031: loss 3.0532, time 5265.88ms 
iter 1032: loss 3.1562, time 5266.21ms 
iter 1033: loss 3.0561, time 5261.23ms 
iter 1034: loss 3.0859, time 5269.41ms 
iter 1035: loss 3.1753, time 5258.88ms 
iter 1036: loss 3.1121, time 5256.83ms 
iter 1037: loss 3.2826, time 5267.40ms 
iter 1038: loss 3.1151, time 5239.25ms 
iter 1039: loss 3.2848, time 5269.89ms 
iter 1040: loss 3.1335, time 5268.39ms 
iter 1041: loss 3.0074, time 5230.93ms 
iter 1042: loss 3.0558, time 5276.68ms 
iter 1043: loss 3.1239, time 5271.54ms 
iter 1044: loss 3.3196, time 5274.97ms 
iter 1045: loss 3.1533, time 5275.91ms 
iter 1046: loss 2.9594, time 5274.42ms 
iter 1047: loss 2.9892, time 5275.43ms 
iter 1048: loss 3.0722, time 5267.20ms 
iter 1049: loss 3.2561, time 5261.75ms 
step 1050: train loss 3.1308, val loss 3.1604
iter 1050: loss 3.1674, time 20027.50ms 
iter 1051: loss 3.2491, time 5258.81ms 
iter 1052: loss 3.0583, time 5255.71ms 
iter 1053: loss 3.1462, time 5258.74ms 
iter 1054: loss 3.0335, time 5263.09ms 
iter 1055: loss 3.0493, time 5255.51ms 
iter 1056: loss 3.2002, time 5264.89ms 
iter 1057: loss 3.2010, time 5258.20ms 
iter 1058: loss 3.1383, time 5256.95ms 
iter 1059: loss 2.9341, time 5261.90ms 
iter 1060: loss 3.0675, time 5253.39ms 
iter 1061: loss 3.2702, time 5264.38ms 
iter 1062: loss 3.1370, time 5259.90ms 
iter 1063: loss 3.0413, time 5266.60ms 
iter 1064: loss 3.1998, time 5262.46ms 
iter 1065: loss 3.2067, time 5268.97ms 
iter 1066: loss 3.0660, time 5256.99ms 
iter 1067: loss 3.1013, time 5266.46ms 
iter 1068: loss 3.0797, time 5266.07ms 
iter 1069: loss 2.9521, time 5259.68ms 
iter 1070: loss 3.0011, time 5262.30ms 
iter 1071: loss 3.1359, time 5259.40ms 
iter 1072: loss 3.2147, time 5257.05ms 
iter 1073: loss 3.0670, time 5272.80ms 
iter 1074: loss 2.9404, time 5274.57ms 
iter 1075: loss 3.0589, time 5261.63ms 
iter 1076: loss 2.9975, time 5260.98ms 
iter 1077: loss 3.1049, time 5263.48ms 
iter 1078: loss 2.9898, time 5260.00ms 
iter 1079: loss 3.0404, time 5256.10ms 
iter 1080: loss 3.0138, time 5268.58ms 
iter 1081: loss 3.2530, time 5267.61ms 
iter 1082: loss 3.0419, time 5261.49ms 
iter 1083: loss 3.0851, time 5259.42ms 
iter 1084: loss 3.2058, time 5270.94ms 
iter 1085: loss 3.1535, time 5269.51ms 
iter 1086: loss 3.2154, time 5268.94ms 
iter 1087: loss 3.2068, time 5270.37ms 
iter 1088: loss 3.1714, time 5271.62ms 
iter 1089: loss 3.2941, time 5264.46ms 
iter 1090: loss 3.1459, time 5268.52ms 
iter 1091: loss 3.1070, time 5265.67ms 
iter 1092: loss 3.0983, time 5257.10ms 
iter 1093: loss 3.0890, time 5260.01ms 
iter 1094: loss 3.1327, time 5267.76ms 
iter 1095: loss 3.0566, time 5257.44ms 
iter 1096: loss 3.2663, time 5254.72ms 
iter 1097: loss 3.1601, time 5262.14ms 
iter 1098: loss 2.9245, time 5264.36ms 
iter 1099: loss 2.9938, time 5254.03ms 
step 1100: train loss 3.1064, val loss 3.1224
iter 1100: loss 3.0493, time 20090.16ms 
iter 1101: loss 2.9584, time 5272.20ms 
iter 1102: loss 2.9974, time 5273.69ms 
iter 1103: loss 3.1501, time 5271.72ms 
iter 1104: loss 3.3036, time 5272.30ms 
iter 1105: loss 3.0553, time 5272.10ms 
iter 1106: loss 3.0601, time 5268.81ms 
iter 1107: loss 2.9818, time 5272.18ms 
iter 1108: loss 3.2167, time 5268.33ms 
iter 1109: loss 3.0627, time 5267.99ms 
iter 1110: loss 3.0525, time 5270.69ms 
iter 1111: loss 3.2424, time 5275.30ms 
iter 1112: loss 3.1069, time 5258.05ms 
iter 1113: loss 2.9176, time 5260.82ms 
iter 1114: loss 3.0388, time 5260.52ms 
iter 1115: loss 3.1277, time 5339.29ms 
iter 1116: loss 3.1340, time 5340.40ms 
iter 1117: loss 3.0748, time 5275.76ms 
iter 1118: loss 3.2515, time 5259.36ms 
iter 1119: loss 3.1545, time 5258.00ms 
iter 1120: loss 3.3976, time 5311.11ms 
iter 1121: loss 3.0565, time 5274.30ms 
iter 1122: loss 2.9931, time 5269.67ms 
iter 1123: loss 3.0265, time 5273.04ms 
iter 1124: loss 3.0037, time 5273.66ms 
iter 1125: loss 3.0522, time 5285.76ms 
iter 1126: loss 3.1363, time 5270.77ms 
iter 1127: loss 2.9675, time 5275.39ms 
iter 1128: loss 3.1314, time 5268.90ms 
iter 1129: loss 3.0609, time 5273.13ms 
iter 1130: loss 2.9348, time 5273.80ms 
iter 1131: loss 2.9713, time 5264.52ms 
iter 1132: loss 3.1100, time 5278.16ms 
iter 1133: loss 3.1453, time 5264.72ms 
iter 1134: loss 3.0882, time 5268.44ms 
iter 1135: loss 3.1229, time 5241.96ms 
iter 1136: loss 3.0355, time 5222.75ms 
iter 1137: loss 3.0375, time 5269.59ms 
iter 1138: loss 3.1492, time 5261.79ms 
iter 1139: loss 3.1461, time 5282.20ms 
iter 1140: loss 3.1586, time 5264.53ms 
iter 1141: loss 2.9554, time 5256.33ms 
iter 1142: loss 2.8297, time 5259.86ms 
iter 1143: loss 3.1767, time 5266.35ms 
iter 1144: loss 3.0589, time 5262.24ms 
iter 1145: loss 3.0296, time 5256.41ms 
iter 1146: loss 2.9912, time 5248.16ms 
iter 1147: loss 3.0228, time 5263.90ms 
iter 1148: loss 3.0665, time 5264.80ms 
iter 1149: loss 3.1555, time 5259.21ms 
step 1150: train loss 3.0914, val loss 3.1030
iter 1150: loss 2.9477, time 20079.10ms 
iter 1151: loss 3.2821, time 5264.69ms 
iter 1152: loss 3.2365, time 5273.64ms 
iter 1153: loss 2.8876, time 5268.49ms 
iter 1154: loss 3.0964, time 5266.02ms 
iter 1155: loss 3.0959, time 5273.35ms 
iter 1156: loss 2.9469, time 5270.17ms 
iter 1157: loss 3.0443, time 5259.60ms 
iter 1158: loss 3.2345, time 5259.69ms 
iter 1159: loss 3.1161, time 5271.55ms 
iter 1160: loss 3.1134, time 5263.38ms 
iter 1161: loss 3.3484, time 5267.78ms 
iter 1162: loss 3.1438, time 5265.49ms 
iter 1163: loss 3.0185, time 5273.33ms 
iter 1164: loss 3.1908, time 5274.00ms 
iter 1165: loss 3.0039, time 5260.63ms 
iter 1166: loss 3.0716, time 5269.49ms 
iter 1167: loss 2.9879, time 5271.72ms 
iter 1168: loss 2.9452, time 5275.31ms 
iter 1169: loss 2.8995, time 5266.70ms 
iter 1170: loss 3.1194, time 5272.02ms 
iter 1171: loss 3.1767, time 5267.42ms 
iter 1172: loss 2.9201, time 5267.95ms 
iter 1173: loss 2.8543, time 5270.73ms 
iter 1174: loss 2.9009, time 5272.68ms 
iter 1175: loss 3.1489, time 5267.97ms 
iter 1176: loss 3.0597, time 5267.19ms 
iter 1177: loss 3.1264, time 5258.89ms 
iter 1178: loss 3.0175, time 5258.31ms 
iter 1179: loss 3.2156, time 5264.64ms 
iter 1180: loss 3.0313, time 5274.27ms 
iter 1181: loss 3.0779, time 5267.76ms 
iter 1182: loss 3.1593, time 5264.34ms 
iter 1183: loss 3.0748, time 5262.22ms 
iter 1184: loss 2.9636, time 5272.39ms 
iter 1185: loss 3.1126, time 5265.11ms 
iter 1186: loss 3.2034, time 5263.42ms 
iter 1187: loss 3.1577, time 5259.35ms 
iter 1188: loss 2.9719, time 5258.49ms 
iter 1189: loss 3.2208, time 5262.13ms 
iter 1190: loss 3.0950, time 5256.21ms 
iter 1191: loss 3.1092, time 5256.30ms 
iter 1192: loss 3.1194, time 5266.55ms 
iter 1193: loss 3.0163, time 5270.34ms 
iter 1194: loss 2.9776, time 5256.56ms 
iter 1195: loss 2.9515, time 5240.23ms 
iter 1196: loss 2.9685, time 5256.17ms 
iter 1197: loss 3.2253, time 5255.40ms 
iter 1198: loss 3.1738, time 5257.21ms 
iter 1199: loss 3.0099, time 5259.56ms 
step 1200: train loss 3.0692, val loss 3.0961
iter 1200: loss 3.0623, time 20065.00ms 
iter 1201: loss 3.1052, time 5261.03ms 
iter 1202: loss 2.9698, time 5269.08ms 
iter 1203: loss 3.0956, time 5261.55ms 
iter 1204: loss 2.9671, time 5257.31ms 
iter 1205: loss 2.9546, time 5253.31ms 
iter 1206: loss 3.0274, time 5262.36ms 
iter 1207: loss 3.0747, time 5262.24ms 
iter 1208: loss 3.1254, time 5263.41ms 
iter 1209: loss 2.9293, time 5263.23ms 
iter 1210: loss 2.9989, time 5265.06ms 
iter 1211: loss 3.0673, time 5257.37ms 
iter 1212: loss 3.2374, time 5260.48ms 
iter 1213: loss 3.0541, time 5260.95ms 
iter 1214: loss 3.0010, time 5260.53ms 
iter 1215: loss 3.0205, time 5250.90ms 
iter 1216: loss 2.9722, time 5260.72ms 
iter 1217: loss 3.0015, time 5269.42ms 
iter 1218: loss 3.1857, time 5260.14ms 
iter 1219: loss 3.2011, time 5262.50ms 
iter 1220: loss 3.0809, time 5268.71ms 
iter 1221: loss 3.0118, time 5257.75ms 
iter 1222: loss 3.3312, time 5272.29ms 
iter 1223: loss 3.2040, time 5271.69ms 
iter 1224: loss 3.1274, time 5268.35ms 
iter 1225: loss 3.0840, time 5267.78ms 
iter 1226: loss 2.9045, time 5262.00ms 
iter 1227: loss 3.0332, time 5261.26ms 
iter 1228: loss 2.9983, time 5264.60ms 
iter 1229: loss 3.2176, time 5268.01ms 
iter 1230: loss 2.9143, time 5261.88ms 
iter 1231: loss 3.0762, time 5259.03ms 
iter 1232: loss 3.1624, time 5275.30ms 
iter 1233: loss 3.0160, time 5267.83ms 
iter 1234: loss 2.9641, time 5271.72ms 
iter 1235: loss 2.9151, time 5268.90ms 
iter 1236: loss 3.0062, time 5261.92ms 
iter 1237: loss 3.1157, time 5258.50ms 
iter 1238: loss 3.0172, time 5260.98ms 
iter 1239: loss 2.9642, time 5298.68ms 
iter 1240: loss 3.0864, time 5267.32ms 
iter 1241: loss 2.9338, time 5290.84ms 
iter 1242: loss 3.0293, time 5243.01ms 
iter 1243: loss 2.9940, time 5265.26ms 
iter 1244: loss 3.0120, time 5256.73ms 
iter 1245: loss 3.1714, time 5238.36ms 
iter 1246: loss 3.1911, time 5253.21ms 
iter 1247: loss 2.9062, time 5265.28ms 
iter 1248: loss 3.0193, time 5264.34ms 
iter 1249: loss 3.3342, time 5271.98ms 
step 1250: train loss 3.0488, val loss 3.0911
iter 1250: loss 2.8871, time 19958.47ms 
iter 1251: loss 3.1649, time 5255.74ms 
iter 1252: loss 2.9927, time 5261.84ms 
iter 1253: loss 3.0322, time 5256.94ms 
iter 1254: loss 3.1573, time 5257.68ms 
iter 1255: loss 3.3342, time 5249.68ms 
iter 1256: loss 3.1834, time 5253.86ms 
iter 1257: loss 2.9730, time 5270.25ms 
iter 1258: loss 3.0802, time 5260.18ms 
iter 1259: loss 3.2537, time 5234.04ms 
iter 1260: loss 3.0824, time 5270.17ms 
iter 1261: loss 2.9946, time 5261.89ms 
iter 1262: loss 3.0333, time 5258.23ms 
iter 1263: loss 3.1341, time 5255.86ms 
iter 1264: loss 3.1267, time 5253.08ms 
iter 1265: loss 2.9708, time 5263.49ms 
iter 1266: loss 3.0010, time 5265.70ms 
iter 1267: loss 2.8941, time 5257.57ms 
iter 1268: loss 3.0971, time 5261.31ms 
iter 1269: loss 3.1087, time 5265.83ms 
iter 1270: loss 3.0500, time 5257.88ms 
iter 1271: loss 3.2203, time 5261.10ms 
iter 1272: loss 3.0697, time 5274.93ms 
iter 1273: loss 3.1859, time 5270.44ms 
iter 1274: loss 3.1954, time 5270.03ms 
iter 1275: loss 2.8842, time 5257.63ms 
iter 1276: loss 2.9310, time 5260.50ms 
iter 1277: loss 3.1621, time 5268.92ms 
iter 1278: loss 2.9998, time 5258.77ms 
iter 1279: loss 3.0342, time 5272.96ms 
iter 1280: loss 2.9483, time 5269.25ms 
iter 1281: loss 3.3735, time 5261.01ms 
iter 1282: loss 2.9205, time 5263.41ms 
iter 1283: loss 3.0550, time 5269.67ms 
iter 1284: loss 3.1934, time 5258.74ms 
iter 1285: loss 3.1329, time 5270.35ms 
iter 1286: loss 3.0581, time 5267.88ms 
iter 1287: loss 2.9919, time 5272.94ms 
iter 1288: loss 2.7212, time 5269.56ms 
iter 1289: loss 3.0286, time 5269.12ms 
iter 1290: loss 3.0530, time 5261.84ms 
iter 1291: loss 3.0749, time 5267.65ms 
iter 1292: loss 2.9914, time 5261.63ms 
iter 1293: loss 3.0204, time 5257.88ms 
iter 1294: loss 2.8986, time 5258.29ms 
iter 1295: loss 3.1625, time 5263.22ms 
iter 1296: loss 3.0935, time 5259.64ms 
iter 1297: loss 2.9465, time 5258.36ms 
iter 1298: loss 2.8952, time 5274.32ms 
iter 1299: loss 2.8302, time 5263.29ms 
step 1300: train loss 3.0359, val loss 3.0713
iter 1300: loss 2.9374, time 20064.81ms 
iter 1301: loss 3.0778, time 5258.88ms 
iter 1302: loss 2.9590, time 5258.34ms 
iter 1303: loss 2.9602, time 5256.85ms 
iter 1304: loss 3.0392, time 5254.88ms 
iter 1305: loss 3.1509, time 5261.35ms 
iter 1306: loss 3.2010, time 5273.67ms 
iter 1307: loss 3.0435, time 5271.33ms 
iter 1308: loss 2.8736, time 5271.56ms 
iter 1309: loss 3.1649, time 5265.18ms 
iter 1310: loss 3.2765, time 5261.07ms 
iter 1311: loss 3.1513, time 5257.09ms 
iter 1312: loss 2.9037, time 5266.39ms 
iter 1313: loss 3.0305, time 5263.49ms 
iter 1314: loss 2.9519, time 5259.21ms 
iter 1315: loss 3.0572, time 5260.58ms 
iter 1316: loss 3.1023, time 5260.50ms 
iter 1317: loss 3.1297, time 5264.32ms 
iter 1318: loss 3.0101, time 5265.45ms 
iter 1319: loss 3.1560, time 5272.39ms 
iter 1320: loss 3.1033, time 5264.34ms 
iter 1321: loss 3.2738, time 5266.90ms 
iter 1322: loss 3.1665, time 5257.99ms 
iter 1323: loss 2.9555, time 5258.51ms 
iter 1324: loss 2.9891, time 5256.26ms 
iter 1325: loss 3.2398, time 5248.14ms 
iter 1326: loss 2.8655, time 5253.46ms 
iter 1327: loss 2.8794, time 5262.37ms 
iter 1328: loss 2.9035, time 5274.89ms 
iter 1329: loss 3.0078, time 5260.36ms 
iter 1330: loss 3.0408, time 5258.48ms 
iter 1331: loss 3.0877, time 5257.93ms 
iter 1332: loss 2.9915, time 5261.24ms 
iter 1333: loss 3.0344, time 5265.84ms 
iter 1334: loss 3.3179, time 5253.85ms 
iter 1335: loss 2.8378, time 5265.58ms 
iter 1336: loss 2.9570, time 5266.57ms 
iter 1337: loss 2.9899, time 5268.85ms 
iter 1338: loss 3.0986, time 5270.00ms 
iter 1339: loss 2.9262, time 5259.11ms 
iter 1340: loss 3.0185, time 5257.57ms 
iter 1341: loss 3.1188, time 5223.55ms 
iter 1342: loss 3.2381, time 5263.64ms 
iter 1343: loss 2.9586, time 5256.51ms 
iter 1344: loss 2.9697, time 5261.22ms 
iter 1345: loss 3.0967, time 5264.01ms 
iter 1346: loss 2.9515, time 5269.22ms 
iter 1347: loss 2.9841, time 5258.91ms 
iter 1348: loss 3.2742, time 5271.93ms 
iter 1349: loss 2.9493, time 5274.10ms 
step 1350: train loss 3.0209, val loss 3.0522
iter 1350: loss 3.0721, time 20061.45ms 
iter 1351: loss 3.0330, time 5269.13ms 
iter 1352: loss 3.3005, time 5274.50ms 
iter 1353: loss 2.9958, time 5272.62ms 
iter 1354: loss 3.0505, time 5267.83ms 
iter 1355: loss 2.9211, time 5272.29ms 
iter 1356: loss 3.0140, time 5256.88ms 
iter 1357: loss 3.3050, time 5253.81ms 
iter 1358: loss 3.1510, time 5263.87ms 
iter 1359: loss 3.0063, time 5258.09ms 
iter 1360: loss 3.1365, time 5254.79ms 
iter 1361: loss 3.0839, time 5259.44ms 
iter 1362: loss 3.1264, time 5256.45ms 
iter 1363: loss 2.8963, time 5263.31ms 
iter 1364: loss 3.0440, time 5265.25ms 
iter 1365: loss 3.0277, time 5268.54ms 
iter 1366: loss 2.8906, time 5260.87ms 
iter 1367: loss 3.2014, time 5256.26ms 
iter 1368: loss 3.2314, time 5257.48ms 
iter 1369: loss 2.9420, time 5266.31ms 
iter 1370: loss 2.9485, time 5261.43ms 
iter 1371: loss 2.9950, time 5270.48ms 
iter 1372: loss 3.1331, time 5273.71ms 
iter 1373: loss 2.9670, time 5264.11ms 
iter 1374: loss 3.0168, time 5274.23ms 
iter 1375: loss 3.0408, time 5272.85ms 
iter 1376: loss 2.8724, time 5267.19ms 
iter 1377: loss 2.9810, time 5264.65ms 
iter 1378: loss 2.8927, time 5257.54ms 
iter 1379: loss 3.0067, time 5254.20ms 
iter 1380: loss 3.0408, time 5255.34ms 
iter 1381: loss 3.0957, time 5273.82ms 
iter 1382: loss 3.0441, time 5256.35ms 
iter 1383: loss 2.9404, time 5268.00ms 
iter 1384: loss 2.9509, time 5271.55ms 
iter 1385: loss 2.8939, time 5270.95ms 
iter 1386: loss 3.0972, time 5264.97ms 
iter 1387: loss 3.2014, time 5264.90ms 
iter 1388: loss 3.0058, time 5270.50ms 
iter 1389: loss 3.0972, time 5265.10ms 
iter 1390: loss 2.8722, time 5266.77ms 
iter 1391: loss 2.8983, time 5258.59ms 
iter 1392: loss 3.1078, time 5273.49ms 
iter 1393: loss 2.9960, time 5264.62ms 
iter 1394: loss 3.1880, time 5272.71ms 
iter 1395: loss 3.1342, time 5267.96ms 
iter 1396: loss 3.0856, time 5264.32ms 
iter 1397: loss 3.2393, time 5270.86ms 
iter 1398: loss 3.0462, time 5259.91ms 
iter 1399: loss 3.0127, time 5275.92ms 
step 1400: train loss 3.0072, val loss 3.0495
iter 1400: loss 3.1233, time 20101.15ms 
iter 1401: loss 3.2392, time 5271.62ms 
iter 1402: loss 2.8583, time 5272.50ms 
iter 1403: loss 2.9599, time 5269.12ms 
iter 1404: loss 3.1720, time 5275.58ms 
iter 1405: loss 2.9478, time 5274.11ms 
iter 1406: loss 2.9158, time 5268.26ms 
iter 1407: loss 2.8877, time 5273.15ms 
iter 1408: loss 2.9690, time 5284.64ms 
iter 1409: loss 3.0750, time 5261.49ms 
iter 1410: loss 3.2273, time 5261.67ms 
iter 1411: loss 3.0255, time 5261.15ms 
iter 1412: loss 2.9994, time 5262.72ms 
iter 1413: loss 3.1741, time 5257.29ms 
iter 1414: loss 2.9299, time 5265.15ms 
iter 1415: loss 3.0351, time 5271.17ms 
iter 1416: loss 2.9324, time 5270.25ms 
iter 1417: loss 3.0094, time 5270.90ms 
iter 1418: loss 3.1986, time 5274.37ms 
iter 1419: loss 3.0767, time 5264.78ms 
iter 1420: loss 3.0025, time 5273.20ms 
iter 1421: loss 2.9046, time 5269.62ms 
iter 1422: loss 3.0364, time 5258.91ms 
iter 1423: loss 2.9924, time 5265.42ms 
iter 1424: loss 2.8195, time 5265.91ms 
iter 1425: loss 2.9767, time 5263.52ms 
iter 1426: loss 2.8860, time 5272.35ms 
iter 1427: loss 2.7916, time 5261.85ms 
iter 1428: loss 3.0198, time 5178.68ms 
iter 1429: loss 3.1238, time 5150.90ms 
iter 1430: loss 3.0930, time 5215.71ms 
iter 1431: loss 2.9731, time 5266.92ms 
iter 1432: loss 2.9923, time 5268.08ms 
iter 1433: loss 3.0038, time 5264.00ms 
iter 1434: loss 3.1727, time 5270.20ms 
iter 1435: loss 2.9486, time 5261.87ms 
iter 1436: loss 2.8238, time 5257.81ms 
iter 1437: loss 2.9501, time 5262.70ms 
iter 1438: loss 3.1840, time 5258.44ms 
iter 1439: loss 3.0971, time 5266.54ms 
iter 1440: loss 2.9561, time 5267.11ms 
iter 1441: loss 2.8461, time 5269.72ms 
iter 1442: loss 2.9877, time 5270.08ms 
iter 1443: loss 3.2672, time 5262.40ms 
iter 1444: loss 2.9132, time 5265.66ms 
iter 1445: loss 2.9161, time 5261.77ms 
iter 1446: loss 2.9684, time 5267.89ms 
iter 1447: loss 3.1154, time 5273.31ms 
iter 1448: loss 2.9552, time 5267.48ms 
iter 1449: loss 3.0326, time 5271.69ms 
step 1450: train loss 2.9730, val loss 3.0131
iter 1450: loss 3.1055, time 20084.83ms 
iter 1451: loss 2.9398, time 5269.54ms 
iter 1452: loss 2.7998, time 5249.87ms 
iter 1453: loss 3.1187, time 5268.05ms 
iter 1454: loss 3.0032, time 5268.31ms 
iter 1455: loss 2.9711, time 5269.52ms 
iter 1456: loss 2.8262, time 5268.88ms 
iter 1457: loss 3.1769, time 5273.09ms 
iter 1458: loss 2.9282, time 5264.86ms 
iter 1459: loss 3.0414, time 5280.71ms 
iter 1460: loss 2.8724, time 5279.66ms 
iter 1461: loss 3.0669, time 5282.47ms 
iter 1462: loss 2.8488, time 5267.20ms 
iter 1463: loss 2.9517, time 5263.85ms 
iter 1464: loss 3.0369, time 5276.09ms 
iter 1465: loss 2.9436, time 5271.09ms 
iter 1466: loss 3.1002, time 5273.97ms 
iter 1467: loss 2.9273, time 5269.71ms 
iter 1468: loss 2.9072, time 5267.45ms 
iter 1469: loss 2.9961, time 5276.10ms 
iter 1470: loss 2.9680, time 5276.89ms 
iter 1471: loss 2.8809, time 5269.26ms 
iter 1472: loss 3.0502, time 5261.57ms 
iter 1473: loss 2.9961, time 5262.68ms 
iter 1474: loss 2.8631, time 5262.22ms 
iter 1475: loss 2.9883, time 5243.54ms 
iter 1476: loss 2.9982, time 5259.25ms 
iter 1477: loss 2.8974, time 5252.32ms 
iter 1478: loss 2.8474, time 5247.67ms 
iter 1479: loss 3.0452, time 5248.20ms 
iter 1480: loss 2.7600, time 5181.17ms 
iter 1481: loss 3.1487, time 5250.23ms 
iter 1482: loss 3.0565, time 5247.23ms 
iter 1483: loss 3.0084, time 5280.04ms 
iter 1484: loss 3.0714, time 5283.84ms 
iter 1485: loss 2.9479, time 5274.46ms 
iter 1486: loss 2.9227, time 5279.38ms 
iter 1487: loss 3.1375, time 5281.83ms 
iter 1488: loss 3.0001, time 5282.52ms 
iter 1489: loss 2.8986, time 5300.28ms 
iter 1490: loss 2.9514, time 5265.04ms 
iter 1491: loss 3.1779, time 5258.29ms 
iter 1492: loss 2.9403, time 5246.34ms 
iter 1493: loss 2.9478, time 5229.50ms 
iter 1494: loss 2.9464, time 5259.89ms 
iter 1495: loss 3.1254, time 5259.71ms 
iter 1496: loss 2.8094, time 5260.40ms 
iter 1497: loss 3.0337, time 5274.87ms 
iter 1498: loss 2.9619, time 5258.59ms 
iter 1499: loss 2.8811, time 5251.25ms 
step 1500: train loss 2.9710, val loss 3.0150
iter 1500: loss 3.0952, time 20063.45ms 
iter 1501: loss 2.9315, time 5263.55ms 
iter 1502: loss 3.1492, time 5262.14ms 
iter 1503: loss 3.0390, time 5270.87ms 
iter 1504: loss 2.8175, time 5246.73ms 
iter 1505: loss 2.9426, time 5256.16ms 
iter 1506: loss 3.0312, time 5249.34ms 
iter 1507: loss 2.9872, time 5219.65ms 
iter 1508: loss 2.7725, time 5251.48ms 
iter 1509: loss 2.9635, time 5257.78ms 
iter 1510: loss 3.0082, time 5258.08ms 
iter 1511: loss 3.0755, time 5309.70ms 
iter 1512: loss 2.8080, time 5278.76ms 
iter 1513: loss 2.8424, time 5262.16ms 
iter 1514: loss 2.9654, time 5263.27ms 
iter 1515: loss 2.9355, time 5263.26ms 
iter 1516: loss 3.0878, time 5269.42ms 
iter 1517: loss 3.1085, time 5259.37ms 
iter 1518: loss 3.0575, time 5272.24ms 
iter 1519: loss 2.9407, time 5270.25ms 
iter 1520: loss 2.9158, time 5264.59ms 
iter 1521: loss 3.0174, time 5270.41ms 
iter 1522: loss 3.2246, time 5267.12ms 
iter 1523: loss 3.1622, time 5264.24ms 
iter 1524: loss 2.9535, time 5268.33ms 
iter 1525: loss 3.1314, time 5264.15ms 
iter 1526: loss 3.0210, time 5266.30ms 
iter 1527: loss 3.0425, time 5266.63ms 
iter 1528: loss 3.0776, time 5268.25ms 
iter 1529: loss 3.0456, time 5275.48ms 
iter 1530: loss 2.9426, time 5254.83ms 
iter 1531: loss 2.9233, time 5252.97ms 
iter 1532: loss 3.2576, time 5252.55ms 
iter 1533: loss 3.0381, time 5253.42ms 
iter 1534: loss 3.1217, time 5249.88ms 
iter 1535: loss 3.1621, time 5261.33ms 
iter 1536: loss 2.8836, time 5263.52ms 
iter 1537: loss 2.8530, time 5265.73ms 
iter 1538: loss 2.9625, time 5265.22ms 
iter 1539: loss 2.8293, time 5260.77ms 
iter 1540: loss 2.9289, time 5261.28ms 
iter 1541: loss 3.0697, time 5262.76ms 
iter 1542: loss 2.8644, time 5259.68ms 
iter 1543: loss 3.2166, time 5265.20ms 
iter 1544: loss 3.0138, time 5256.49ms 
iter 1545: loss 2.8713, time 5261.24ms 
iter 1546: loss 2.9044, time 5265.92ms 
iter 1547: loss 3.0397, time 5270.60ms 
iter 1548: loss 2.8560, time 5267.06ms 
iter 1549: loss 2.8303, time 5259.99ms 
step 1550: train loss 2.9689, val loss 3.0208
iter 1550: loss 2.9311, time 20061.39ms 
iter 1551: loss 2.8627, time 5267.71ms 
iter 1552: loss 2.9812, time 5264.99ms 
iter 1553: loss 3.0798, time 5265.71ms 
iter 1554: loss 2.8681, time 5262.86ms 
iter 1555: loss 2.9823, time 5259.25ms 
iter 1556: loss 2.9194, time 5270.68ms 
iter 1557: loss 2.8530, time 5270.43ms 
iter 1558: loss 2.9702, time 5280.85ms 
iter 1559: loss 3.0924, time 5274.30ms 
iter 1560: loss 3.0822, time 5269.80ms 
iter 1561: loss 2.9091, time 5269.66ms 
iter 1562: loss 3.0486, time 5269.77ms 
iter 1563: loss 2.8683, time 5267.22ms 
iter 1564: loss 2.9161, time 5264.89ms 
iter 1565: loss 2.7716, time 5264.03ms 
iter 1566: loss 3.1325, time 5276.70ms 
iter 1567: loss 2.8524, time 5272.52ms 
iter 1568: loss 2.8663, time 5278.57ms 
iter 1569: loss 2.9910, time 5280.98ms 
iter 1570: loss 2.8142, time 5241.39ms 
iter 1571: loss 2.9119, time 5271.91ms 
iter 1572: loss 2.8439, time 5266.52ms 
iter 1573: loss 2.9179, time 5250.26ms 
iter 1574: loss 2.8839, time 5271.89ms 
iter 1575: loss 3.1327, time 5266.01ms 
iter 1576: loss 2.8898, time 5277.83ms 
iter 1577: loss 3.0157, time 5266.20ms 
iter 1578: loss 2.9303, time 5268.48ms 
iter 1579: loss 2.8889, time 5266.58ms 
iter 1580: loss 3.2352, time 5266.82ms 
iter 1581: loss 3.0212, time 5255.41ms 
iter 1582: loss 3.0424, time 5267.13ms 
iter 1583: loss 3.3150, time 5277.05ms 
iter 1584: loss 3.0875, time 5280.89ms 
iter 1585: loss 3.0613, time 5268.54ms 
iter 1586: loss 2.8120, time 5271.59ms 
iter 1587: loss 2.9918, time 5251.16ms 
iter 1588: loss 2.9502, time 5268.31ms 
iter 1589: loss 2.9171, time 5256.79ms 
iter 1590: loss 3.0777, time 5250.72ms 
iter 1591: loss 2.8380, time 5252.32ms 
iter 1592: loss 3.2404, time 5246.33ms 
iter 1593: loss 2.7443, time 5245.92ms 
iter 1594: loss 3.0440, time 5262.35ms 
iter 1595: loss 2.9359, time 5271.00ms 
iter 1596: loss 3.0699, time 5271.61ms 
iter 1597: loss 2.9985, time 5267.76ms 
iter 1598: loss 3.0064, time 5264.68ms 
iter 1599: loss 2.8027, time 5266.37ms 
step 1600: train loss 2.9619, val loss 3.0144
iter 1600: loss 3.0400, time 20061.67ms 
iter 1601: loss 3.0138, time 5260.09ms 
iter 1602: loss 2.9374, time 5271.48ms 
iter 1603: loss 3.0005, time 5275.72ms 
iter 1604: loss 2.8530, time 5272.82ms 
iter 1605: loss 2.9466, time 5268.51ms 
iter 1606: loss 3.0613, time 5271.67ms 
iter 1607: loss 2.8371, time 5273.47ms 
iter 1608: loss 2.9841, time 5275.60ms 
iter 1609: loss 2.7839, time 5261.26ms 
iter 1610: loss 2.8100, time 5240.43ms 
iter 1611: loss 2.9116, time 5243.87ms 
iter 1612: loss 3.0624, time 5265.55ms 
iter 1613: loss 2.7582, time 5257.91ms 
iter 1614: loss 3.0119, time 5255.74ms 
iter 1615: loss 2.9273, time 5265.60ms 
iter 1616: loss 2.8829, time 5261.02ms 
iter 1617: loss 3.0950, time 5257.89ms 
iter 1618: loss 2.9538, time 5253.77ms 
iter 1619: loss 2.9623, time 5265.61ms 
iter 1620: loss 2.9925, time 5264.72ms 
iter 1621: loss 2.8614, time 5260.48ms 
iter 1622: loss 2.8129, time 5244.31ms 
iter 1623: loss 3.0259, time 5240.63ms 
iter 1624: loss 3.0166, time 5247.04ms 
iter 1625: loss 2.9245, time 5243.97ms 
iter 1626: loss 2.8852, time 5245.85ms 
iter 1627: loss 2.9841, time 5244.40ms 
iter 1628: loss 3.0925, time 5277.25ms 
iter 1629: loss 2.9524, time 5250.62ms 
iter 1630: loss 2.8984, time 5277.90ms 
iter 1631: loss 2.8573, time 5272.72ms 
iter 1632: loss 2.8902, time 5275.99ms 
iter 1633: loss 2.9756, time 5257.53ms 
iter 1634: loss 2.8765, time 5258.85ms 
iter 1635: loss 2.8838, time 5266.45ms 
iter 1636: loss 3.0347, time 5262.33ms 
iter 1637: loss 2.9269, time 5255.49ms 
iter 1638: loss 2.8031, time 5267.88ms 
iter 1639: loss 3.0362, time 5263.66ms 
iter 1640: loss 2.9859, time 5253.89ms 
iter 1641: loss 3.0212, time 5260.25ms 
iter 1642: loss 2.8671, time 5263.57ms 
iter 1643: loss 2.9464, time 5255.94ms 
iter 1644: loss 2.9382, time 5265.30ms 
iter 1645: loss 3.0098, time 5257.72ms 
iter 1646: loss 2.9692, time 5261.53ms 
iter 1647: loss 2.7444, time 5259.02ms 
iter 1648: loss 2.8203, time 5242.52ms 
iter 1649: loss 3.2273, time 5254.73ms 
step 1650: train loss 2.9335, val loss 3.0035
iter 1650: loss 2.9917, time 19964.72ms 
iter 1651: loss 2.8721, time 5259.58ms 
iter 1652: loss 3.0710, time 5239.77ms 
iter 1653: loss 3.1158, time 5254.19ms 
iter 1654: loss 2.9955, time 5254.23ms 
iter 1655: loss 2.9514, time 5255.65ms 
iter 1656: loss 2.9676, time 5233.44ms 
iter 1657: loss 2.7636, time 5253.81ms 
iter 1658: loss 2.9714, time 5263.39ms 
iter 1659: loss 2.8615, time 5262.76ms 
iter 1660: loss 3.0295, time 5276.72ms 
iter 1661: loss 3.1061, time 5268.30ms 
iter 1662: loss 2.7929, time 5274.29ms 
iter 1663: loss 2.9395, time 5262.27ms 
iter 1664: loss 2.9831, time 5264.96ms 
iter 1665: loss 3.0343, time 5230.89ms 
iter 1666: loss 2.8989, time 5261.90ms 
iter 1667: loss 2.9572, time 5255.30ms 
iter 1668: loss 2.7890, time 5258.43ms 
iter 1669: loss 3.2972, time 5257.57ms 
iter 1670: loss 3.1284, time 5256.23ms 
iter 1671: loss 2.8313, time 5261.17ms 
iter 1672: loss 2.9069, time 5260.80ms 
iter 1673: loss 2.8885, time 5251.28ms 
iter 1674: loss 2.8939, time 5262.10ms 
iter 1675: loss 3.0737, time 5254.90ms 
iter 1676: loss 2.7264, time 5260.24ms 
iter 1677: loss 2.7361, time 5257.94ms 
iter 1678: loss 2.8729, time 5256.26ms 
iter 1679: loss 2.8297, time 5257.48ms 
iter 1680: loss 2.9794, time 5260.71ms 
iter 1681: loss 3.0139, time 5258.64ms 
iter 1682: loss 2.9588, time 5260.85ms 
iter 1683: loss 3.0423, time 5254.45ms 
iter 1684: loss 2.9673, time 5257.11ms 
iter 1685: loss 3.0454, time 5274.89ms 
iter 1686: loss 3.1061, time 5232.00ms 
iter 1687: loss 2.8656, time 5229.60ms 
iter 1688: loss 2.8766, time 5258.09ms 
iter 1689: loss 3.1586, time 5239.75ms 
iter 1690: loss 2.8927, time 5240.92ms 
iter 1691: loss 2.9962, time 5207.54ms 
iter 1692: loss 2.8226, time 5214.90ms 
iter 1693: loss 2.9777, time 5229.94ms 
iter 1694: loss 3.1477, time 5218.39ms 
iter 1695: loss 2.9632, time 5202.90ms 
iter 1696: loss 2.9292, time 5218.49ms 
iter 1697: loss 3.1583, time 5231.52ms 
iter 1698: loss 2.9999, time 5191.66ms 
iter 1699: loss 2.8496, time 5229.91ms 
step 1700: train loss 2.9207, val loss 3.0024
iter 1700: loss 2.8130, time 20053.39ms 
iter 1701: loss 2.8047, time 5268.85ms 
iter 1702: loss 2.9082, time 5221.20ms 
iter 1703: loss 3.0117, time 5246.75ms 
iter 1704: loss 2.9609, time 5215.62ms 
iter 1705: loss 2.8911, time 5194.31ms 
iter 1706: loss 3.1913, time 5215.28ms 
iter 1707: loss 2.9651, time 5190.58ms 
iter 1708: loss 2.8520, time 5233.12ms 
iter 1709: loss 3.0300, time 5239.43ms 
iter 1710: loss 2.8788, time 5207.36ms 
iter 1711: loss 2.8133, time 5243.20ms 
iter 1712: loss 3.0213, time 5220.78ms 
iter 1713: loss 2.9167, time 5215.87ms 
iter 1714: loss 2.7415, time 5221.78ms 
iter 1715: loss 2.7455, time 5222.85ms 
iter 1716: loss 3.2168, time 5203.54ms 
iter 1717: loss 2.8402, time 5231.89ms 
iter 1718: loss 2.8777, time 5209.87ms 
iter 1719: loss 2.7703, time 5220.39ms 
iter 1720: loss 2.9703, time 5258.46ms 
iter 1721: loss 3.0382, time 5261.41ms 
iter 1722: loss 2.8619, time 5260.74ms 
iter 1723: loss 2.9029, time 5263.52ms 
iter 1724: loss 2.8811, time 5271.44ms 
iter 1725: loss 2.9264, time 5273.41ms 
iter 1726: loss 2.8118, time 5237.59ms 
iter 1727: loss 2.9450, time 5219.14ms 
iter 1728: loss 2.8708, time 5253.57ms 
iter 1729: loss 3.0181, time 5254.89ms 
iter 1730: loss 2.9104, time 5266.82ms 
iter 1731: loss 2.9056, time 5252.30ms 
iter 1732: loss 2.8889, time 5272.40ms 
iter 1733: loss 2.7628, time 5265.09ms 
iter 1734: loss 2.7983, time 5258.89ms 
iter 1735: loss 2.9520, time 5251.18ms 
iter 1736: loss 2.9327, time 5257.93ms 
iter 1737: loss 2.7238, time 5257.37ms 
iter 1738: loss 2.9382, time 5260.41ms 
iter 1739: loss 2.6925, time 5262.47ms 
iter 1740: loss 3.0662, time 5268.08ms 
iter 1741: loss 2.9817, time 5259.90ms 
iter 1742: loss 2.9901, time 5266.55ms 
iter 1743: loss 2.8574, time 5260.55ms 
iter 1744: loss 2.8929, time 5260.49ms 
iter 1745: loss 2.7621, time 5260.83ms 
iter 1746: loss 2.9430, time 5255.62ms 
iter 1747: loss 2.9819, time 5255.62ms 
iter 1748: loss 2.7375, time 5269.78ms 
iter 1749: loss 2.8751, time 5276.45ms 
step 1750: train loss 2.9266, val loss 2.9929
iter 1750: loss 2.8567, time 20060.09ms 
iter 1751: loss 3.0157, time 5271.56ms 
iter 1752: loss 2.8511, time 5267.86ms 
iter 1753: loss 2.9296, time 5270.94ms 
iter 1754: loss 3.0527, time 5270.15ms 
iter 1755: loss 2.8309, time 5274.56ms 
iter 1756: loss 2.9420, time 5265.87ms 
iter 1757: loss 2.9477, time 5262.09ms 
iter 1758: loss 2.9157, time 5275.95ms 
iter 1759: loss 2.9595, time 5259.79ms 
iter 1760: loss 2.9574, time 5261.41ms 
iter 1761: loss 2.9125, time 5243.33ms 
iter 1762: loss 2.9952, time 5276.36ms 
iter 1763: loss 2.9449, time 5258.51ms 
iter 1764: loss 2.9594, time 5257.80ms 
iter 1765: loss 3.1212, time 5260.45ms 
iter 1766: loss 3.0251, time 5253.39ms 
iter 1767: loss 3.0039, time 5255.23ms 
iter 1768: loss 2.9041, time 5266.70ms 
iter 1769: loss 2.9187, time 5263.47ms 
iter 1770: loss 2.7798, time 5261.18ms 
iter 1771: loss 2.9496, time 5269.93ms 
iter 1772: loss 2.9048, time 5265.78ms 
iter 1773: loss 2.7827, time 5260.29ms 
iter 1774: loss 2.9074, time 5260.54ms 
iter 1775: loss 2.7986, time 5256.55ms 
iter 1776: loss 3.0523, time 5257.56ms 
iter 1777: loss 2.9737, time 5264.37ms 
iter 1778: loss 2.9718, time 5263.09ms 
iter 1779: loss 2.5835, time 5286.06ms 
iter 1780: loss 2.9481, time 5275.47ms 
iter 1781: loss 2.9318, time 5276.93ms 
iter 1782: loss 3.1462, time 5273.54ms 
iter 1783: loss 2.8968, time 5277.88ms 
iter 1784: loss 2.7165, time 5268.12ms 
iter 1785: loss 2.8444, time 5282.59ms 
iter 1786: loss 2.9598, time 5269.95ms 
iter 1787: loss 2.7782, time 5264.17ms 
iter 1788: loss 2.7244, time 5287.01ms 
iter 1789: loss 2.7638, time 5257.86ms 
iter 1790: loss 2.8210, time 5254.42ms 
iter 1791: loss 2.7360, time 5227.52ms 
iter 1792: loss 2.8079, time 5256.18ms 
iter 1793: loss 2.8831, time 5267.23ms 
iter 1794: loss 2.9336, time 5271.31ms 
iter 1795: loss 2.9282, time 5269.60ms 
iter 1796: loss 2.9939, time 5257.83ms 
iter 1797: loss 2.9386, time 5257.93ms 
iter 1798: loss 2.8910, time 5259.55ms 
iter 1799: loss 2.9118, time 5259.02ms 
step 1800: train loss 2.9038, val loss 2.9670
iter 1800: loss 3.0910, time 20065.62ms 
iter 1801: loss 2.9919, time 5261.99ms 
iter 1802: loss 2.8994, time 5260.25ms 
iter 1803: loss 2.9116, time 5254.36ms 
iter 1804: loss 2.9974, time 5266.09ms 
iter 1805: loss 2.9626, time 5269.24ms 
iter 1806: loss 2.9190, time 5253.47ms 
iter 1807: loss 2.8105, time 5266.72ms 
iter 1808: loss 3.0227, time 5274.68ms 
iter 1809: loss 2.8770, time 5265.04ms 
iter 1810: loss 2.9411, time 5255.99ms 
iter 1811: loss 2.8443, time 5280.56ms 
iter 1812: loss 3.0019, time 5244.64ms 
iter 1813: loss 3.0407, time 5257.18ms 
iter 1814: loss 2.9727, time 5225.27ms 
iter 1815: loss 3.0246, time 5218.89ms 
iter 1816: loss 2.8631, time 5258.18ms 
iter 1817: loss 2.7010, time 5186.45ms 
iter 1818: loss 2.9107, time 5215.92ms 
iter 1819: loss 2.6640, time 5197.24ms 
iter 1820: loss 3.0860, time 5227.92ms 
iter 1821: loss 2.8481, time 5219.31ms 
iter 1822: loss 2.7513, time 5221.54ms 
iter 1823: loss 2.7923, time 5215.64ms 
iter 1824: loss 2.9190, time 5218.12ms 
iter 1825: loss 2.6802, time 5210.40ms 
iter 1826: loss 2.8110, time 5231.99ms 
iter 1827: loss 2.8632, time 5213.92ms 
iter 1828: loss 2.8941, time 5216.02ms 
iter 1829: loss 2.6749, time 5214.35ms 
iter 1830: loss 2.8363, time 5234.51ms 
iter 1831: loss 2.9880, time 5256.59ms 
iter 1832: loss 2.8870, time 5224.69ms 
iter 1833: loss 2.9069, time 5200.94ms 
iter 1834: loss 2.8430, time 5272.90ms 
iter 1835: loss 2.9252, time 5202.56ms 
iter 1836: loss 2.8805, time 5180.12ms 
iter 1837: loss 2.8828, time 5196.92ms 
iter 1838: loss 3.0285, time 5233.22ms 
iter 1839: loss 2.8028, time 5201.92ms 
iter 1840: loss 2.9496, time 5244.76ms 
iter 1841: loss 2.8189, time 5213.75ms 
iter 1842: loss 2.8235, time 5220.19ms 
iter 1843: loss 2.8312, time 5224.50ms 
iter 1844: loss 2.9609, time 5271.23ms 
iter 1845: loss 2.5707, time 5208.95ms 
iter 1846: loss 2.7869, time 5256.53ms 
iter 1847: loss 2.8277, time 5213.70ms 
iter 1848: loss 3.0317, time 5211.78ms 
iter 1849: loss 2.7467, time 5230.54ms 
step 1850: train loss 2.9058, val loss 2.9785
iter 1850: loss 2.9463, time 20039.28ms 
iter 1851: loss 2.7067, time 5236.32ms 
iter 1852: loss 2.9889, time 5217.00ms 
iter 1853: loss 2.6880, time 5176.71ms 
iter 1854: loss 2.8439, time 5222.96ms 
iter 1855: loss 2.8002, time 5225.34ms 
iter 1856: loss 2.8742, time 5278.24ms 
iter 1857: loss 2.9099, time 5274.15ms 
iter 1858: loss 3.0215, time 5285.53ms 
iter 1859: loss 2.8602, time 5277.66ms 
iter 1860: loss 3.0356, time 5257.94ms 
iter 1861: loss 2.5563, time 5272.33ms 
iter 1862: loss 2.9263, time 5263.02ms 
iter 1863: loss 2.9468, time 5266.67ms 
iter 1864: loss 2.9683, time 5265.29ms 
iter 1865: loss 2.9924, time 5268.36ms 
iter 1866: loss 2.8448, time 5268.30ms 
iter 1867: loss 3.0156, time 5266.75ms 
iter 1868: loss 2.8271, time 5270.81ms 
iter 1869: loss 2.7752, time 5257.78ms 
iter 1870: loss 2.9193, time 5261.99ms 
iter 1871: loss 2.9408, time 5268.66ms 
iter 1872: loss 2.8464, time 5277.38ms 
iter 1873: loss 2.9822, time 5268.85ms 
iter 1874: loss 2.8706, time 5265.96ms 
iter 1875: loss 2.8033, time 5257.16ms 
iter 1876: loss 2.9011, time 5251.81ms 
iter 1877: loss 3.2067, time 5267.45ms 
iter 1878: loss 3.2085, time 5266.03ms 
iter 1879: loss 2.8119, time 5259.53ms 
iter 1880: loss 2.8327, time 5260.88ms 
iter 1881: loss 2.8801, time 5259.80ms 
iter 1882: loss 3.0866, time 5263.46ms 
iter 1883: loss 3.0104, time 5254.48ms 
iter 1884: loss 2.8874, time 5244.61ms 
iter 1885: loss 2.7584, time 5258.34ms 
iter 1886: loss 2.8450, time 5216.28ms 
iter 1887: loss 2.9345, time 5272.96ms 
iter 1888: loss 2.9304, time 5257.25ms 
iter 1889: loss 2.7908, time 5259.27ms 
iter 1890: loss 3.0530, time 5257.82ms 
iter 1891: loss 2.8563, time 5262.91ms 
iter 1892: loss 2.8781, time 5258.20ms 
iter 1893: loss 3.0221, time 5266.28ms 
iter 1894: loss 2.7622, time 5273.77ms 
iter 1895: loss 2.9340, time 5275.20ms 
iter 1896: loss 3.0239, time 5262.54ms 
iter 1897: loss 2.8933, time 5237.87ms 
iter 1898: loss 2.9315, time 5269.50ms 
iter 1899: loss 2.9148, time 5269.36ms 
step 1900: train loss 2.8974, val loss 2.9773
iter 1900: loss 3.0477, time 20083.98ms 
iter 1901: loss 2.9465, time 5261.95ms 
iter 1902: loss 2.8197, time 5259.16ms 
iter 1903: loss 2.9335, time 5262.23ms 
iter 1904: loss 2.8238, time 5258.54ms 
iter 1905: loss 2.9900, time 5258.59ms 
iter 1906: loss 2.9039, time 5256.93ms 
iter 1907: loss 2.8275, time 5266.05ms 
iter 1908: loss 2.8335, time 5262.56ms 
iter 1909: loss 2.7051, time 5277.71ms 
iter 1910: loss 2.9867, time 5261.74ms 
iter 1911: loss 2.8577, time 5271.25ms 
iter 1912: loss 2.8907, time 5263.92ms 
iter 1913: loss 2.8489, time 5262.88ms 
iter 1914: loss 2.9998, time 5252.93ms 
iter 1915: loss 2.9229, time 5267.33ms 
iter 1916: loss 2.8905, time 5264.11ms 
iter 1917: loss 2.9374, time 5258.02ms 
iter 1918: loss 2.7762, time 5268.86ms 
iter 1919: loss 2.8249, time 5260.81ms 
iter 1920: loss 2.9611, time 5270.62ms 
iter 1921: loss 2.7840, time 5259.41ms 
iter 1922: loss 2.7531, time 5257.61ms 
iter 1923: loss 3.0135, time 5260.80ms 
iter 1924: loss 2.9197, time 5261.34ms 
iter 1925: loss 2.9031, time 5260.23ms 
iter 1926: loss 2.7442, time 5261.66ms 
iter 1927: loss 2.9109, time 5265.34ms 
iter 1928: loss 3.0915, time 5258.82ms 
iter 1929: loss 2.9998, time 5256.91ms 
iter 1930: loss 2.8224, time 5262.90ms 
iter 1931: loss 3.0510, time 5257.31ms 
iter 1932: loss 2.9075, time 5260.79ms 
iter 1933: loss 3.0949, time 5262.62ms 
iter 1934: loss 2.9244, time 5261.20ms 
iter 1935: loss 2.7776, time 5275.42ms 
iter 1936: loss 2.9714, time 5307.71ms 
iter 1937: loss 2.9718, time 5268.24ms 
iter 1938: loss 2.8940, time 5256.08ms 
iter 1939: loss 2.9322, time 5261.95ms 
iter 1940: loss 2.8710, time 5222.43ms 
iter 1941: loss 3.1117, time 5256.53ms 
iter 1942: loss 3.0183, time 5258.21ms 
iter 1943: loss 2.7696, time 5243.55ms 
iter 1944: loss 3.0368, time 5265.64ms 
iter 1945: loss 2.7652, time 5265.03ms 
iter 1946: loss 2.8329, time 5273.13ms 
iter 1947: loss 3.0510, time 5262.79ms 
iter 1948: loss 2.8747, time 5273.09ms 
iter 1949: loss 2.9072, time 5271.04ms 
step 1950: train loss 2.8824, val loss 2.9541
iter 1950: loss 2.7450, time 20062.40ms 
iter 1951: loss 2.8164, time 5260.79ms 
iter 1952: loss 2.7530, time 5259.59ms 
iter 1953: loss 2.9075, time 5256.00ms 
iter 1954: loss 2.8325, time 5268.72ms 
iter 1955: loss 2.9776, time 5290.64ms 
iter 1956: loss 2.9153, time 5274.10ms 
iter 1957: loss 2.8499, time 5241.63ms 
iter 1958: loss 2.8619, time 5262.26ms 
iter 1959: loss 2.7530, time 5260.97ms 
iter 1960: loss 3.0034, time 5274.53ms 
iter 1961: loss 2.9358, time 5262.53ms 
iter 1962: loss 2.7740, time 5263.59ms 
iter 1963: loss 2.8314, time 5262.05ms 
iter 1964: loss 2.8395, time 5260.57ms 
iter 1965: loss 2.7892, time 5261.04ms 
iter 1966: loss 2.8041, time 5276.01ms 
iter 1967: loss 2.9705, time 5257.60ms 
iter 1968: loss 2.7480, time 5254.23ms 
iter 1969: loss 3.1057, time 5259.10ms 
iter 1970: loss 2.7768, time 5272.06ms 
iter 1971: loss 2.9491, time 5261.37ms 
iter 1972: loss 2.7133, time 5265.60ms 
iter 1973: loss 2.9226, time 5268.13ms 
iter 1974: loss 2.8353, time 5267.73ms 
iter 1975: loss 2.9173, time 5254.93ms 
iter 1976: loss 2.5692, time 5260.47ms 
iter 1977: loss 2.9215, time 5262.71ms 
iter 1978: loss 3.0218, time 5258.08ms 
iter 1979: loss 2.8871, time 5261.95ms 
iter 1980: loss 2.7121, time 5267.01ms 
iter 1981: loss 2.9121, time 5262.26ms 
iter 1982: loss 2.8660, time 5259.85ms 
iter 1983: loss 2.8715, time 5262.91ms 
iter 1984: loss 2.8716, time 5268.11ms 
iter 1985: loss 2.7988, time 5280.51ms 
iter 1986: loss 2.7727, time 5278.27ms 
iter 1987: loss 2.6960, time 5276.04ms 
iter 1988: loss 3.1274, time 5284.97ms 
iter 1989: loss 2.8164, time 5261.96ms 
iter 1990: loss 2.8187, time 5262.84ms 
iter 1991: loss 2.8534, time 5272.02ms 
iter 1992: loss 2.9691, time 5258.75ms 
iter 1993: loss 2.7805, time 5262.19ms 
iter 1994: loss 3.0256, time 5265.09ms 
iter 1995: loss 2.6488, time 5259.24ms 
iter 1996: loss 2.8257, time 5263.59ms 
iter 1997: loss 2.9088, time 5255.76ms 
iter 1998: loss 2.7237, time 5250.09ms 
iter 1999: loss 2.9370, time 5260.66ms 
step 2000: train loss 2.8674, val loss 2.9550
iter 2000: loss 2.9749, time 20088.28ms 
iter 2001: loss 2.8324, time 5253.26ms 
iter 2002: loss 2.8120, time 5262.97ms 
iter 2003: loss 2.7114, time 5265.53ms 
iter 2004: loss 2.8816, time 5244.91ms 
iter 2005: loss 2.9815, time 5264.80ms 
iter 2006: loss 2.7592, time 5266.07ms 
iter 2007: loss 2.9528, time 5279.63ms 
iter 2008: loss 2.4575, time 5271.74ms 
iter 2009: loss 2.8517, time 5274.21ms 
iter 2010: loss 2.8257, time 5274.20ms 
iter 2011: loss 2.9853, time 5269.93ms 
iter 2012: loss 2.9536, time 5269.59ms 
iter 2013: loss 2.9209, time 5270.36ms 
iter 2014: loss 2.8369, time 5270.90ms 
iter 2015: loss 2.9484, time 5266.32ms 
iter 2016: loss 2.8911, time 5257.55ms 
iter 2017: loss 2.9898, time 5244.15ms 
iter 2018: loss 3.1808, time 5285.42ms 
iter 2019: loss 2.8496, time 5278.27ms 
iter 2020: loss 2.7782, time 5230.96ms 
iter 2021: loss 2.7917, time 5269.86ms 
iter 2022: loss 2.9109, time 5271.39ms 
iter 2023: loss 3.0013, time 5274.05ms 
iter 2024: loss 2.9051, time 5272.72ms 
iter 2025: loss 2.9122, time 5269.18ms 
iter 2026: loss 2.8223, time 5267.32ms 
iter 2027: loss 2.8397, time 5268.46ms 
iter 2028: loss 2.9179, time 5266.67ms 
iter 2029: loss 2.8917, time 5263.78ms 
iter 2030: loss 2.7048, time 5273.75ms 
iter 2031: loss 2.7209, time 5266.81ms 
iter 2032: loss 2.7592, time 5260.54ms 
iter 2033: loss 2.7342, time 5272.54ms 
iter 2034: loss 2.6068, time 5267.41ms 
iter 2035: loss 3.0180, time 5265.52ms 
iter 2036: loss 2.7950, time 5270.37ms 
iter 2037: loss 2.8506, time 5260.42ms 
iter 2038: loss 2.8746, time 5272.25ms 
iter 2039: loss 2.8862, time 5273.21ms 
iter 2040: loss 2.6528, time 5262.11ms 
iter 2041: loss 2.7487, time 5259.66ms 
iter 2042: loss 2.8098, time 5271.31ms 
iter 2043: loss 2.9293, time 5274.17ms 
iter 2044: loss 2.9742, time 5277.76ms 
iter 2045: loss 2.9616, time 5264.43ms 
iter 2046: loss 2.8443, time 5271.36ms 
iter 2047: loss 2.8829, time 5266.32ms 
iter 2048: loss 2.8671, time 5260.93ms 
iter 2049: loss 2.7137, time 5275.87ms 
step 2050: train loss 2.8646, val loss 2.9495
iter 2050: loss 2.8088, time 20033.44ms 
iter 2051: loss 2.8291, time 5259.78ms 
iter 2052: loss 2.8830, time 5280.42ms 
iter 2053: loss 2.8099, time 5249.76ms 
iter 2054: loss 2.8375, time 5270.71ms 
iter 2055: loss 2.8133, time 5275.60ms 
iter 2056: loss 2.9103, time 5261.81ms 
iter 2057: loss 2.8772, time 5273.42ms 
iter 2058: loss 2.9999, time 5266.96ms 
iter 2059: loss 2.7414, time 5250.05ms 
iter 2060: loss 2.9146, time 5265.21ms 
iter 2061: loss 2.8096, time 5276.72ms 
iter 2062: loss 2.9736, time 5274.50ms 
iter 2063: loss 2.7625, time 5274.20ms 
iter 2064: loss 2.5712, time 5271.64ms 
iter 2065: loss 2.7809, time 5271.73ms 
iter 2066: loss 2.8055, time 5264.16ms 
iter 2067: loss 2.8906, time 5256.32ms 
iter 2068: loss 2.9192, time 5262.56ms 
iter 2069: loss 2.6764, time 5345.46ms 
iter 2070: loss 2.8324, time 5269.35ms 
iter 2071: loss 2.7690, time 5265.26ms 
iter 2072: loss 2.8205, time 5270.67ms 
iter 2073: loss 2.7922, time 5264.68ms 
iter 2074: loss 2.8779, time 5274.18ms 
iter 2075: loss 3.0972, time 5269.28ms 
iter 2076: loss 3.0371, time 5264.69ms 
iter 2077: loss 2.7368, time 5259.79ms 
iter 2078: loss 2.7148, time 5265.35ms 
iter 2079: loss 2.8975, time 5292.06ms 
iter 2080: loss 2.9115, time 5268.25ms 
iter 2081: loss 3.0102, time 5260.07ms 
iter 2082: loss 2.9256, time 5266.45ms 
iter 2083: loss 2.8495, time 5277.03ms 
iter 2084: loss 2.8911, time 5276.41ms 
iter 2085: loss 3.0424, time 5287.08ms 
iter 2086: loss 2.7960, time 5342.60ms 
iter 2087: loss 3.0800, time 5333.85ms 
iter 2088: loss 2.7675, time 5344.58ms 
iter 2089: loss 2.6447, time 5332.41ms 
iter 2090: loss 2.8577, time 5261.80ms 
iter 2091: loss 3.0513, time 5311.97ms 
iter 2092: loss 2.8714, time 5345.77ms 
iter 2093: loss 2.7695, time 5347.48ms 
iter 2094: loss 2.8364, time 5339.68ms 
iter 2095: loss 2.8362, time 5342.34ms 
iter 2096: loss 2.8863, time 5272.17ms 
iter 2097: loss 2.8814, time 5275.39ms 
iter 2098: loss 2.7694, time 5272.07ms 
iter 2099: loss 3.1608, time 5268.32ms 
step 2100: train loss 2.8535, val loss 2.9427
iter 2100: loss 2.8883, time 20080.41ms 
iter 2101: loss 2.8124, time 5271.68ms 
iter 2102: loss 2.8468, time 5263.23ms 
iter 2103: loss 2.9482, time 5257.98ms 
iter 2104: loss 2.9302, time 5269.17ms 
iter 2105: loss 2.9321, time 5269.79ms 
iter 2106: loss 2.7869, time 5263.97ms 
iter 2107: loss 2.8889, time 5270.40ms 
iter 2108: loss 3.0526, time 5260.13ms 
iter 2109: loss 2.8279, time 5328.04ms 
iter 2110: loss 2.9260, time 5255.96ms 
iter 2111: loss 2.8189, time 5264.27ms 
iter 2112: loss 2.8706, time 5276.41ms 
iter 2113: loss 2.9833, time 5259.32ms 
iter 2114: loss 2.9575, time 5262.26ms 
iter 2115: loss 2.9884, time 5278.17ms 
iter 2116: loss 2.7949, time 5262.72ms 
iter 2117: loss 2.8389, time 5274.11ms 
iter 2118: loss 2.6041, time 5275.56ms 
iter 2119: loss 2.8795, time 5275.17ms 
iter 2120: loss 3.0745, time 5254.71ms 
iter 2121: loss 2.9720, time 5238.15ms 
iter 2122: loss 2.7727, time 5261.68ms 
iter 2123: loss 3.0572, time 5265.05ms 
iter 2124: loss 2.8362, time 5268.53ms 
iter 2125: loss 3.0151, time 5282.73ms 
iter 2126: loss 2.8811, time 5265.55ms 
iter 2127: loss 2.8471, time 5260.46ms 
iter 2128: loss 2.7837, time 5266.17ms 
iter 2129: loss 2.9341, time 5291.21ms 
iter 2130: loss 2.8358, time 5292.22ms 
iter 2131: loss 2.7708, time 5262.50ms 
iter 2132: loss 2.9522, time 5208.71ms 
iter 2133: loss 2.9471, time 5261.64ms 
iter 2134: loss 2.8806, time 5258.23ms 
iter 2135: loss 2.7271, time 5342.42ms 
iter 2136: loss 2.9614, time 5289.84ms 
iter 2137: loss 2.8745, time 5272.12ms 
iter 2138: loss 2.8596, time 5336.60ms 
iter 2139: loss 2.8065, time 5339.57ms 
iter 2140: loss 2.9430, time 5340.20ms 
iter 2141: loss 2.8262, time 5269.76ms 
iter 2142: loss 2.9261, time 5259.69ms 
iter 2143: loss 2.7696, time 5260.56ms 
iter 2144: loss 2.7581, time 5231.66ms 
iter 2145: loss 2.9116, time 5273.70ms 
iter 2146: loss 2.8666, time 5261.74ms 
iter 2147: loss 2.8517, time 5269.10ms 
iter 2148: loss 2.8742, time 5270.12ms 
iter 2149: loss 2.7334, time 5265.74ms 
step 2150: train loss 2.8299, val loss 2.9376
iter 2150: loss 2.7042, time 20086.29ms 
iter 2151: loss 2.9702, time 5293.57ms 
iter 2152: loss 2.9981, time 5289.85ms 
iter 2153: loss 2.8429, time 5258.98ms 
iter 2154: loss 2.7164, time 5283.10ms 
iter 2155: loss 2.9122, time 5331.74ms 
iter 2156: loss 2.6021, time 5329.11ms 
iter 2157: loss 2.7994, time 5262.83ms 
iter 2158: loss 2.6074, time 5259.57ms 
iter 2159: loss 2.8641, time 5294.90ms 
iter 2160: loss 3.0895, time 5241.84ms 
iter 2161: loss 2.6671, time 5256.42ms 
iter 2162: loss 2.8639, time 5256.50ms 
iter 2163: loss 2.8376, time 5257.88ms 
iter 2164: loss 2.7674, time 5266.92ms 
iter 2165: loss 2.6091, time 5242.36ms 
iter 2166: loss 2.9481, time 5227.14ms 
iter 2167: loss 2.7510, time 5264.55ms 
iter 2168: loss 2.7261, time 5267.55ms 
iter 2169: loss 2.7751, time 5266.98ms 
iter 2170: loss 2.9271, time 5259.00ms 
iter 2171: loss 2.8796, time 5268.31ms 
iter 2172: loss 2.8823, time 5227.76ms 
iter 2173: loss 2.9429, time 5264.23ms 
iter 2174: loss 2.8600, time 5262.22ms 
iter 2175: loss 2.9655, time 5267.21ms 
iter 2176: loss 2.6296, time 5267.74ms 
iter 2177: loss 2.7354, time 5315.35ms 
iter 2178: loss 2.7066, time 5263.89ms 
iter 2179: loss 2.9493, time 5288.55ms 
iter 2180: loss 2.7495, time 5221.07ms 
iter 2181: loss 2.9032, time 5262.49ms 
iter 2182: loss 2.6887, time 5271.70ms 
iter 2183: loss 2.7703, time 5265.39ms 
iter 2184: loss 2.8669, time 5269.26ms 
iter 2185: loss 2.6611, time 5266.47ms 
iter 2186: loss 2.6809, time 5266.21ms 
iter 2187: loss 2.9119, time 5277.84ms 
iter 2188: loss 2.7111, time 5272.08ms 
iter 2189: loss 2.7841, time 5260.80ms 
iter 2190: loss 2.7294, time 5262.43ms 
iter 2191: loss 2.7075, time 5291.77ms 
iter 2192: loss 2.8194, time 5334.13ms 
iter 2193: loss 2.6457, time 5253.02ms 
iter 2194: loss 2.8035, time 5260.52ms 
iter 2195: loss 2.8866, time 5267.37ms 
iter 2196: loss 2.8057, time 5285.68ms 
iter 2197: loss 2.7734, time 5313.94ms 
iter 2198: loss 2.8481, time 5265.09ms 
iter 2199: loss 3.0785, time 5235.77ms 
step 2200: train loss 2.8433, val loss 2.9311
iter 2200: loss 2.8761, time 20061.69ms 
iter 2201: loss 2.7530, time 5294.10ms 
iter 2202: loss 2.8485, time 5342.32ms 
iter 2203: loss 2.9305, time 5296.70ms 
iter 2204: loss 3.0102, time 5273.67ms 
iter 2205: loss 2.8822, time 5297.90ms 
iter 2206: loss 2.6858, time 5263.69ms 
iter 2207: loss 2.8012, time 5285.17ms 
iter 2208: loss 2.7650, time 5269.40ms 
iter 2209: loss 2.7541, time 5280.46ms 
iter 2210: loss 3.0071, time 5264.62ms 
iter 2211: loss 2.8707, time 5277.09ms 
iter 2212: loss 2.8584, time 5265.63ms 
iter 2213: loss 3.0373, time 5260.64ms 
iter 2214: loss 2.7200, time 5256.78ms 
iter 2215: loss 2.9411, time 5255.82ms 
iter 2216: loss 2.7950, time 5265.20ms 
iter 2217: loss 2.8942, time 5259.56ms 
iter 2218: loss 2.7964, time 5197.46ms 
iter 2219: loss 2.7833, time 5167.38ms 
iter 2220: loss 2.7085, time 5144.10ms 
iter 2221: loss 2.8533, time 5195.87ms 
iter 2222: loss 2.6157, time 5201.25ms 
iter 2223: loss 2.7414, time 5157.59ms 
iter 2224: loss 2.8842, time 5191.07ms 
iter 2225: loss 2.9015, time 5189.97ms 
iter 2226: loss 2.8321, time 5150.62ms 
iter 2227: loss 2.9141, time 5247.10ms 
iter 2228: loss 2.7298, time 5263.71ms 
iter 2229: loss 2.9292, time 5242.17ms 
iter 2230: loss 2.9618, time 5233.64ms 
iter 2231: loss 2.7085, time 5181.56ms 
iter 2232: loss 3.0347, time 5282.11ms 
iter 2233: loss 2.8885, time 5218.67ms 
iter 2234: loss 2.9313, time 5250.01ms 
iter 2235: loss 2.7025, time 5294.87ms 
iter 2236: loss 2.9352, time 5195.29ms 
iter 2237: loss 2.8173, time 5255.18ms 
iter 2238: loss 2.8403, time 5265.47ms 
iter 2239: loss 2.8006, time 5261.79ms 
iter 2240: loss 2.7081, time 5252.77ms 
iter 2241: loss 2.9325, time 5253.76ms 
iter 2242: loss 2.8759, time 5257.39ms 
iter 2243: loss 2.9881, time 5254.54ms 
iter 2244: loss 3.0165, time 5262.68ms 
iter 2245: loss 2.6846, time 5258.02ms 
iter 2246: loss 3.1044, time 5258.40ms 
iter 2247: loss 2.8881, time 5260.82ms 
iter 2248: loss 2.9186, time 5259.81ms 
iter 2249: loss 2.9061, time 5245.68ms 
step 2250: train loss 2.8446, val loss 2.9247
iter 2250: loss 2.7482, time 20031.84ms 
iter 2251: loss 2.8375, time 5260.10ms 
iter 2252: loss 2.9126, time 5262.86ms 
iter 2253: loss 2.7618, time 5295.07ms 
iter 2254: loss 2.6621, time 5314.09ms 
iter 2255: loss 2.8393, time 5323.89ms 
iter 2256: loss 2.7166, time 5256.55ms 
iter 2257: loss 2.6810, time 5266.42ms 
iter 2258: loss 2.9377, time 5320.76ms 
iter 2259: loss 2.9378, time 5332.13ms 
iter 2260: loss 3.0004, time 5258.80ms 
iter 2261: loss 2.8642, time 5268.97ms 
iter 2262: loss 2.7346, time 5280.34ms 
iter 2263: loss 2.7298, time 5267.88ms 
iter 2264: loss 2.8829, time 5277.15ms 
iter 2265: loss 2.9075, time 5272.17ms 
iter 2266: loss 2.7761, time 5275.81ms 
iter 2267: loss 2.8370, time 5265.22ms 
iter 2268: loss 2.8274, time 5279.54ms 
iter 2269: loss 2.8184, time 5270.82ms 
iter 2270: loss 2.8396, time 5298.30ms 
iter 2271: loss 2.8852, time 5289.88ms 
iter 2272: loss 2.8790, time 5261.16ms 
iter 2273: loss 2.9168, time 5266.14ms 
iter 2274: loss 2.7061, time 5270.88ms 
iter 2275: loss 2.7978, time 5266.97ms 
iter 2276: loss 2.8120, time 5263.30ms 
iter 2277: loss 2.9486, time 5260.25ms 
iter 2278: loss 2.6891, time 5275.91ms 
iter 2279: loss 2.7466, time 5268.60ms 
iter 2280: loss 2.8973, time 5272.59ms 
iter 2281: loss 2.6356, time 5273.08ms 
iter 2282: loss 2.8997, time 5262.76ms 
iter 2283: loss 2.7892, time 5267.28ms 
iter 2284: loss 2.6801, time 5260.54ms 
iter 2285: loss 2.8004, time 5272.96ms 
iter 2286: loss 2.9303, time 5271.85ms 
iter 2287: loss 2.8089, time 5273.75ms 
iter 2288: loss 3.0157, time 5269.92ms 
iter 2289: loss 2.8538, time 5259.30ms 
iter 2290: loss 2.8089, time 5278.55ms 
iter 2291: loss 2.8456, time 5264.79ms 
iter 2292: loss 2.9368, time 5254.96ms 
iter 2293: loss 2.9368, time 5267.20ms 
iter 2294: loss 2.9384, time 5291.24ms 
iter 2295: loss 2.8526, time 5273.55ms 
iter 2296: loss 2.6962, time 5261.42ms 
iter 2297: loss 2.7394, time 5272.98ms 
iter 2298: loss 2.7714, time 5260.66ms 
iter 2299: loss 2.9505, time 5264.85ms 
step 2300: train loss 2.8116, val loss 2.9144
iter 2300: loss 2.7493, time 20069.18ms 
iter 2301: loss 2.6864, time 5272.93ms 
iter 2302: loss 2.8894, time 5266.65ms 
iter 2303: loss 2.8137, time 5267.15ms 
iter 2304: loss 2.8159, time 5268.49ms 
iter 2305: loss 2.9468, time 5264.81ms 
iter 2306: loss 2.7950, time 5266.96ms 
iter 2307: loss 2.7414, time 5276.91ms 
iter 2308: loss 2.8363, time 5283.33ms 
iter 2309: loss 2.9486, time 5262.59ms 
iter 2310: loss 2.9524, time 5274.40ms 
iter 2311: loss 2.7394, time 5270.71ms 
iter 2312: loss 3.0944, time 5259.32ms 
iter 2313: loss 2.8459, time 5311.36ms 
iter 2314: loss 2.9497, time 5282.80ms 
iter 2315: loss 2.8781, time 5285.91ms 
iter 2316: loss 2.8046, time 5262.25ms 
iter 2317: loss 2.8170, time 5263.40ms 
iter 2318: loss 2.6468, time 5258.79ms 
iter 2319: loss 2.7020, time 5261.42ms 
iter 2320: loss 2.9248, time 5286.48ms 
iter 2321: loss 2.8292, time 5281.33ms 
iter 2322: loss 2.6929, time 5269.31ms 
iter 2323: loss 2.8714, time 5266.80ms 
iter 2324: loss 2.9188, time 5268.23ms 
iter 2325: loss 2.8480, time 5265.52ms 
iter 2326: loss 2.8523, time 5259.51ms 
iter 2327: loss 2.8850, time 5255.00ms 
iter 2328: loss 2.8774, time 5251.24ms 
iter 2329: loss 2.9422, time 5256.35ms 
iter 2330: loss 2.8172, time 5268.49ms 
iter 2331: loss 2.9698, time 5264.44ms 
iter 2332: loss 2.7673, time 5258.08ms 
iter 2333: loss 2.9480, time 5258.71ms 
iter 2334: loss 2.7964, time 5260.63ms 
iter 2335: loss 2.8449, time 5258.07ms 
iter 2336: loss 2.9245, time 5267.24ms 
iter 2337: loss 2.6801, time 5263.69ms 
iter 2338: loss 2.8460, time 5262.34ms 
iter 2339: loss 2.9564, time 5265.61ms 
iter 2340: loss 3.0224, time 5271.14ms 
iter 2341: loss 2.8097, time 5269.52ms 
iter 2342: loss 3.0063, time 5263.42ms 
iter 2343: loss 2.8041, time 5268.21ms 
iter 2344: loss 2.6284, time 5270.05ms 
iter 2345: loss 2.8077, time 5283.00ms 
iter 2346: loss 2.7675, time 5287.02ms 
iter 2347: loss 2.8753, time 5273.26ms 
iter 2348: loss 2.7793, time 5281.07ms 
iter 2349: loss 2.8000, time 5275.01ms 
step 2350: train loss 2.8031, val loss 2.9080
iter 2350: loss 2.8310, time 20064.55ms 
iter 2351: loss 2.7012, time 5260.60ms 
iter 2352: loss 2.7969, time 5259.19ms 
iter 2353: loss 3.0150, time 5262.54ms 
iter 2354: loss 2.7113, time 5257.74ms 
iter 2355: loss 2.7607, time 5264.19ms 
iter 2356: loss 2.5460, time 5256.81ms 
iter 2357: loss 2.9207, time 5266.75ms 
iter 2358: loss 2.7254, time 5257.67ms 
iter 2359: loss 2.8477, time 5258.62ms 
iter 2360: loss 2.8368, time 5257.69ms 
iter 2361: loss 2.6140, time 5263.88ms 
iter 2362: loss 2.6154, time 5267.47ms 
iter 2363: loss 2.7771, time 5260.82ms 
iter 2364: loss 2.9878, time 5264.55ms 
iter 2365: loss 2.8778, time 5270.60ms 
iter 2366: loss 2.8518, time 5257.80ms 
iter 2367: loss 2.6168, time 5269.76ms 
iter 2368: loss 2.7707, time 5263.73ms 
iter 2369: loss 3.0852, time 5263.72ms 
iter 2370: loss 2.6397, time 5263.17ms 
iter 2371: loss 2.7342, time 5268.07ms 
iter 2372: loss 2.7543, time 5263.71ms 
iter 2373: loss 2.8832, time 5266.53ms 
iter 2374: loss 2.6648, time 5261.93ms 
iter 2375: loss 2.6934, time 5266.44ms 
iter 2376: loss 2.8317, time 5276.35ms 
iter 2377: loss 2.8232, time 5271.00ms 
iter 2378: loss 2.8267, time 5272.32ms 
iter 2379: loss 2.8112, time 5265.50ms 
iter 2380: loss 2.7788, time 5270.41ms 
iter 2381: loss 2.5374, time 5264.89ms 
iter 2382: loss 2.6061, time 5273.86ms 
iter 2383: loss 2.7516, time 5266.87ms 
iter 2384: loss 2.8640, time 5276.12ms 
iter 2385: loss 3.0768, time 5269.55ms 
iter 2386: loss 2.8546, time 5276.51ms 
iter 2387: loss 2.8928, time 5270.54ms 
iter 2388: loss 2.7275, time 5271.11ms 
iter 2389: loss 2.7160, time 5265.32ms 
iter 2390: loss 2.6695, time 5277.58ms 
iter 2391: loss 2.8838, time 5269.40ms 
iter 2392: loss 2.9030, time 5279.76ms 
iter 2393: loss 2.8578, time 5265.41ms 
iter 2394: loss 2.7874, time 5258.16ms 
iter 2395: loss 2.7484, time 5257.54ms 
iter 2396: loss 2.9569, time 5262.08ms 
iter 2397: loss 2.8757, time 5274.62ms 
iter 2398: loss 2.8678, time 5263.45ms 
iter 2399: loss 2.6595, time 5271.28ms 
step 2400: train loss 2.8084, val loss 2.9136
iter 2400: loss 2.7675, time 20055.85ms 
iter 2401: loss 2.7394, time 5253.05ms 
iter 2402: loss 2.7436, time 5269.86ms 
iter 2403: loss 2.9403, time 5261.63ms 
iter 2404: loss 2.6439, time 5268.56ms 
iter 2405: loss 2.9108, time 5260.99ms 
iter 2406: loss 2.7150, time 5260.41ms 
iter 2407: loss 2.9146, time 5263.29ms 
iter 2408: loss 2.8628, time 5262.41ms 
iter 2409: loss 2.7710, time 5266.32ms 
iter 2410: loss 2.5887, time 5260.90ms 
iter 2411: loss 2.9270, time 5258.93ms 
iter 2412: loss 2.9119, time 5255.84ms 
iter 2413: loss 2.7538, time 5264.59ms 
iter 2414: loss 2.6069, time 5268.10ms 
iter 2415: loss 2.8944, time 5264.74ms 
iter 2416: loss 2.8041, time 5270.39ms 
iter 2417: loss 2.6744, time 5242.17ms 
iter 2418: loss 2.7861, time 5258.71ms 
iter 2419: loss 2.6613, time 5268.24ms 
iter 2420: loss 2.7996, time 5261.32ms 
iter 2421: loss 2.7232, time 5268.77ms 
iter 2422: loss 2.7483, time 5262.45ms 
iter 2423: loss 2.8515, time 5267.39ms 
iter 2424: loss 2.7249, time 5256.66ms 
iter 2425: loss 2.8719, time 5259.54ms 
iter 2426: loss 2.7793, time 5259.81ms 
iter 2427: loss 2.8903, time 5256.01ms 
iter 2428: loss 2.8558, time 5263.77ms 
iter 2429: loss 2.8225, time 5261.77ms 
iter 2430: loss 2.6661, time 5265.04ms 
iter 2431: loss 2.7364, time 5266.37ms 
iter 2432: loss 2.6589, time 5261.61ms 
iter 2433: loss 2.7806, time 5264.59ms 
iter 2434: loss 2.7258, time 5283.39ms 
iter 2435: loss 2.7170, time 5281.94ms 
iter 2436: loss 2.7029, time 5266.21ms 
iter 2437: loss 2.6298, time 5258.25ms 
iter 2438: loss 2.8510, time 5256.61ms 
iter 2439: loss 2.7477, time 5260.63ms 
iter 2440: loss 2.7118, time 5233.94ms 
iter 2441: loss 2.7884, time 5263.18ms 
iter 2442: loss 2.7833, time 5270.36ms 
iter 2443: loss 2.8811, time 5276.71ms 
iter 2444: loss 2.6041, time 5251.12ms 
iter 2445: loss 2.6778, time 5267.00ms 
iter 2446: loss 3.0475, time 5236.78ms 
iter 2447: loss 2.8548, time 5243.74ms 
iter 2448: loss 2.7494, time 5280.94ms 
iter 2449: loss 2.8030, time 5287.48ms 
step 2450: train loss 2.8101, val loss 2.9178
iter 2450: loss 2.6790, time 20105.52ms 
iter 2451: loss 2.8553, time 5274.39ms 
iter 2452: loss 3.0022, time 5262.86ms 
iter 2453: loss 2.7134, time 5269.41ms 
iter 2454: loss 2.9830, time 5266.24ms 
iter 2455: loss 2.7274, time 5259.62ms 
iter 2456: loss 2.6979, time 5267.84ms 
iter 2457: loss 2.7679, time 5260.64ms 
iter 2458: loss 2.6560, time 5273.95ms 
iter 2459: loss 2.7960, time 5277.31ms 
iter 2460: loss 2.6234, time 5271.91ms 
iter 2461: loss 2.7442, time 5260.73ms 
iter 2462: loss 2.9618, time 5268.76ms 
iter 2463: loss 2.8844, time 5346.17ms 
iter 2464: loss 2.7327, time 5338.41ms 
iter 2465: loss 2.7731, time 5272.70ms 
iter 2466: loss 2.6736, time 5285.01ms 
iter 2467: loss 2.7245, time 5327.67ms 
iter 2468: loss 2.8217, time 5341.28ms 
iter 2469: loss 2.7750, time 5320.61ms 
iter 2470: loss 2.8224, time 5285.22ms 
iter 2471: loss 2.6982, time 5289.03ms 
iter 2472: loss 2.9203, time 5269.95ms 
iter 2473: loss 2.6722, time 5275.32ms 
iter 2474: loss 2.5213, time 5269.56ms 
iter 2475: loss 2.9519, time 5270.87ms 
iter 2476: loss 2.6537, time 5291.55ms 
iter 2477: loss 2.8283, time 5270.83ms 
iter 2478: loss 2.7892, time 5314.09ms 
iter 2479: loss 2.8522, time 5271.95ms 
iter 2480: loss 2.9921, time 5271.03ms 
iter 2481: loss 2.7114, time 5268.01ms 
iter 2482: loss 3.0439, time 5239.19ms 
iter 2483: loss 2.7323, time 5269.61ms 
iter 2484: loss 2.6988, time 5261.38ms 
iter 2485: loss 2.9738, time 5268.37ms 
iter 2486: loss 2.6914, time 5237.75ms 
iter 2487: loss 2.5391, time 5265.51ms 
iter 2488: loss 2.7911, time 5274.22ms 
iter 2489: loss 2.6822, time 5268.31ms 
iter 2490: loss 2.7316, time 5248.36ms 
iter 2491: loss 2.9613, time 5258.25ms 
iter 2492: loss 2.8017, time 5272.29ms 
iter 2493: loss 2.9521, time 5258.38ms 
iter 2494: loss 2.7048, time 5265.45ms 
iter 2495: loss 2.4861, time 5257.22ms 
iter 2496: loss 2.6893, time 5226.83ms 
iter 2497: loss 2.8315, time 5263.76ms 
iter 2498: loss 2.7430, time 5269.76ms 
iter 2499: loss 2.8302, time 5264.99ms 
step 2500: train loss 2.7857, val loss 2.9063
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2500: loss 2.8894, time 21321.79ms 
iter 2501: loss 2.8662, time 5258.66ms 
iter 2502: loss 2.8687, time 5314.49ms 
iter 2503: loss 2.9412, time 5291.67ms 
iter 2504: loss 2.6871, time 5316.77ms 
iter 2505: loss 2.7132, time 5343.41ms 
iter 2506: loss 2.7842, time 5343.36ms 
iter 2507: loss 2.8483, time 5344.07ms 
iter 2508: loss 2.8164, time 5281.69ms 
iter 2509: loss 2.7193, time 5305.53ms 
iter 2510: loss 2.8363, time 5345.46ms 
iter 2511: loss 3.0045, time 5223.06ms 
iter 2512: loss 2.8376, time 5292.39ms 
iter 2513: loss 2.9851, time 5265.97ms 
iter 2514: loss 2.8012, time 5266.07ms 
iter 2515: loss 2.6337, time 5274.79ms 
iter 2516: loss 2.5858, time 5261.41ms 
iter 2517: loss 2.5548, time 5259.24ms 
iter 2518: loss 2.6435, time 5279.42ms 
iter 2519: loss 2.7008, time 5266.06ms 
iter 2520: loss 2.7432, time 5327.52ms 
iter 2521: loss 2.7209, time 5297.08ms 
iter 2522: loss 2.8610, time 5262.97ms 
iter 2523: loss 2.6602, time 5273.73ms 
iter 2524: loss 2.6329, time 5299.93ms 
iter 2525: loss 2.8645, time 5253.47ms 
iter 2526: loss 2.9671, time 5262.88ms 
iter 2527: loss 2.7721, time 5312.14ms 
iter 2528: loss 2.8549, time 5269.69ms 
iter 2529: loss 2.7537, time 5271.11ms 
iter 2530: loss 2.5715, time 5277.88ms 
iter 2531: loss 2.7026, time 5262.71ms 
iter 2532: loss 2.7525, time 5285.74ms 
iter 2533: loss 2.8298, time 5266.07ms 
iter 2534: loss 3.0982, time 5257.50ms 
iter 2535: loss 3.0066, time 5283.16ms 
iter 2536: loss 2.7250, time 5257.34ms 
iter 2537: loss 2.6822, time 5263.84ms 
iter 2538: loss 2.8583, time 5262.85ms 
iter 2539: loss 2.7269, time 5320.73ms 
iter 2540: loss 2.7444, time 5328.69ms 
iter 2541: loss 2.8853, time 5262.22ms 
iter 2542: loss 2.7880, time 5280.42ms 
iter 2543: loss 2.7745, time 5289.00ms 
iter 2544: loss 2.6111, time 5260.47ms 
iter 2545: loss 2.7452, time 5276.04ms 
iter 2546: loss 2.9782, time 5275.78ms 
iter 2547: loss 2.9258, time 5276.12ms 
iter 2548: loss 2.8408, time 5273.88ms 
iter 2549: loss 2.8987, time 5255.49ms 
step 2550: train loss 2.7623, val loss 2.8933
iter 2550: loss 2.7875, time 20080.67ms 
iter 2551: loss 2.6989, time 5242.17ms 
iter 2552: loss 2.7263, time 5276.94ms 
iter 2553: loss 2.5797, time 5332.35ms 
iter 2554: loss 2.9716, time 5308.95ms 
iter 2555: loss 2.9305, time 5307.87ms 
iter 2556: loss 2.8209, time 5339.12ms 
iter 2557: loss 2.8854, time 5335.99ms 
iter 2558: loss 2.5704, time 5342.98ms 
iter 2559: loss 2.8261, time 5323.22ms 
iter 2560: loss 2.7831, time 5324.89ms 
iter 2561: loss 2.7756, time 5320.17ms 
iter 2562: loss 2.7898, time 5332.42ms 
iter 2563: loss 2.5944, time 5300.70ms 
iter 2564: loss 2.7938, time 5211.39ms 
iter 2565: loss 2.8462, time 5281.87ms 
iter 2566: loss 2.8660, time 5260.91ms 
iter 2567: loss 2.8192, time 5262.57ms 
iter 2568: loss 2.6636, time 5263.43ms 
iter 2569: loss 2.8923, time 5270.94ms 
iter 2570: loss 2.6992, time 5276.55ms 
iter 2571: loss 2.9167, time 5271.93ms 
iter 2572: loss 2.6124, time 5333.94ms 
iter 2573: loss 2.8194, time 5337.25ms 
iter 2574: loss 2.9194, time 5345.06ms 
iter 2575: loss 2.6999, time 5343.25ms 
iter 2576: loss 2.8367, time 5158.75ms 
iter 2577: loss 2.9425, time 5157.57ms 
iter 2578: loss 2.7282, time 5262.15ms 
iter 2579: loss 2.9119, time 5267.84ms 
iter 2580: loss 2.6948, time 5256.62ms 
iter 2581: loss 2.8058, time 5269.72ms 
iter 2582: loss 2.9424, time 5264.08ms 
iter 2583: loss 2.9536, time 5265.49ms 
iter 2584: loss 2.7142, time 5286.35ms 
iter 2585: loss 2.7975, time 5269.85ms 
iter 2586: loss 2.9226, time 5301.36ms 
iter 2587: loss 2.8273, time 5295.31ms 
iter 2588: loss 2.7527, time 5292.37ms 
iter 2589: loss 2.6802, time 5279.93ms 
iter 2590: loss 2.8307, time 5275.40ms 
iter 2591: loss 2.7068, time 5274.80ms 
iter 2592: loss 2.5409, time 5272.04ms 
iter 2593: loss 2.9400, time 5287.77ms 
iter 2594: loss 2.6498, time 5267.49ms 
iter 2595: loss 2.6047, time 5263.52ms 
iter 2596: loss 2.6951, time 5262.92ms 
iter 2597: loss 2.6884, time 5268.64ms 
iter 2598: loss 2.8433, time 5253.69ms 
iter 2599: loss 2.9405, time 5262.71ms 
step 2600: train loss 2.7788, val loss 2.8967
iter 2600: loss 2.5272, time 20067.13ms 
iter 2601: loss 2.7663, time 5255.29ms 
iter 2602: loss 2.8910, time 5262.97ms 
iter 2603: loss 2.7473, time 5259.42ms 
iter 2604: loss 2.8389, time 5257.95ms 
iter 2605: loss 2.6683, time 5263.37ms 
iter 2606: loss 2.6196, time 5244.24ms 
iter 2607: loss 2.5935, time 5263.56ms 
iter 2608: loss 2.7893, time 5265.24ms 
iter 2609: loss 2.7639, time 5268.26ms 
iter 2610: loss 2.7886, time 5267.57ms 
iter 2611: loss 3.0243, time 5260.94ms 
iter 2612: loss 2.7596, time 5258.81ms 
iter 2613: loss 2.7556, time 5256.13ms 
iter 2614: loss 2.9770, time 5255.05ms 
iter 2615: loss 2.7408, time 5256.52ms 
iter 2616: loss 2.5703, time 5274.44ms 
iter 2617: loss 2.8861, time 5259.50ms 
iter 2618: loss 2.4870, time 5259.96ms 
iter 2619: loss 2.7380, time 5260.13ms 
iter 2620: loss 2.7111, time 5259.49ms 
iter 2621: loss 2.8572, time 5244.15ms 
iter 2622: loss 2.7661, time 5244.22ms 
iter 2623: loss 2.7753, time 5236.30ms 
iter 2624: loss 2.9006, time 5245.99ms 
iter 2625: loss 2.6179, time 5244.32ms 
iter 2626: loss 2.8554, time 5246.25ms 
iter 2627: loss 2.7105, time 5260.51ms 
iter 2628: loss 2.7894, time 5254.95ms 
iter 2629: loss 2.7873, time 5259.58ms 
iter 2630: loss 2.6830, time 5261.37ms 
iter 2631: loss 2.7096, time 5274.44ms 
iter 2632: loss 2.8310, time 5269.68ms 
iter 2633: loss 2.6577, time 5262.52ms 
iter 2634: loss 2.8685, time 5271.44ms 
iter 2635: loss 2.7566, time 5270.99ms 
iter 2636: loss 2.8662, time 5275.17ms 
iter 2637: loss 2.8645, time 5274.91ms 
iter 2638: loss 2.9211, time 5281.22ms 
iter 2639: loss 2.7868, time 5260.94ms 
iter 2640: loss 2.8461, time 5274.17ms 
iter 2641: loss 2.7902, time 5257.51ms 
iter 2642: loss 2.7951, time 5266.30ms 
iter 2643: loss 2.6481, time 5262.48ms 
iter 2644: loss 2.7684, time 5274.14ms 
iter 2645: loss 2.8948, time 5273.41ms 
iter 2646: loss 2.6764, time 5260.30ms 
iter 2647: loss 2.7742, time 5270.37ms 
iter 2648: loss 2.8217, time 5260.48ms 
iter 2649: loss 3.0578, time 5261.29ms 
step 2650: train loss 2.7710, val loss 2.8930
iter 2650: loss 2.7056, time 20087.73ms 
iter 2651: loss 2.8230, time 5260.37ms 
iter 2652: loss 2.7781, time 5259.63ms 
iter 2653: loss 2.8535, time 5261.91ms 
iter 2654: loss 2.6454, time 5261.29ms 
iter 2655: loss 2.7448, time 5263.29ms 
iter 2656: loss 2.5932, time 5260.73ms 
iter 2657: loss 2.9289, time 5268.40ms 
iter 2658: loss 2.6106, time 5260.32ms 
iter 2659: loss 2.9453, time 5255.54ms 
iter 2660: loss 2.6706, time 5264.71ms 
iter 2661: loss 2.8009, time 5267.17ms 
iter 2662: loss 2.6197, time 5259.70ms 
iter 2663: loss 2.6004, time 5260.79ms 
iter 2664: loss 2.6604, time 5264.89ms 
iter 2665: loss 2.7155, time 5256.44ms 
iter 2666: loss 2.7185, time 5256.74ms 
iter 2667: loss 2.9357, time 5269.88ms 
iter 2668: loss 2.6342, time 5264.30ms 
iter 2669: loss 2.7284, time 5264.30ms 
iter 2670: loss 2.9551, time 5264.92ms 
iter 2671: loss 2.7742, time 5266.68ms 
iter 2672: loss 2.7028, time 5269.28ms 
iter 2673: loss 2.8871, time 5265.28ms 
iter 2674: loss 2.4407, time 5280.39ms 
iter 2675: loss 2.6546, time 5271.29ms 
iter 2676: loss 2.8305, time 5275.89ms 
iter 2677: loss 2.7804, time 5273.81ms 
iter 2678: loss 2.8440, time 5269.95ms 
iter 2679: loss 2.7039, time 5269.19ms 
iter 2680: loss 2.7351, time 5275.23ms 
iter 2681: loss 2.8752, time 5280.23ms 
iter 2682: loss 2.6033, time 5273.07ms 
iter 2683: loss 2.6879, time 5272.63ms 
iter 2684: loss 2.7196, time 5266.50ms 
iter 2685: loss 2.8010, time 5264.91ms 
iter 2686: loss 2.9096, time 5264.11ms 
iter 2687: loss 2.6603, time 5272.74ms 
iter 2688: loss 2.6266, time 5273.72ms 
iter 2689: loss 2.6018, time 5269.70ms 
iter 2690: loss 2.8476, time 5263.44ms 
iter 2691: loss 2.6440, time 5268.42ms 
iter 2692: loss 2.7290, time 5260.51ms 
iter 2693: loss 2.9701, time 5278.51ms 
iter 2694: loss 2.9032, time 5251.84ms 
iter 2695: loss 2.8231, time 5271.40ms 
iter 2696: loss 2.6359, time 5266.15ms 
iter 2697: loss 2.7278, time 5269.21ms 
iter 2698: loss 2.6431, time 5277.21ms 
iter 2699: loss 2.6604, time 5240.85ms 
step 2700: train loss 2.7588, val loss 2.8848
iter 2700: loss 2.4484, time 20098.79ms 
iter 2701: loss 2.6519, time 5252.52ms 
iter 2702: loss 2.6646, time 5258.90ms 
iter 2703: loss 2.8951, time 5242.05ms 
iter 2704: loss 2.5702, time 5264.24ms 
iter 2705: loss 2.7209, time 5263.02ms 
iter 2706: loss 2.9262, time 5262.95ms 
iter 2707: loss 2.8119, time 5224.00ms 
iter 2708: loss 2.8102, time 5237.08ms 
iter 2709: loss 2.7399, time 5265.30ms 
iter 2710: loss 2.7682, time 5269.37ms 
iter 2711: loss 2.7986, time 5262.71ms 
iter 2712: loss 2.7920, time 5259.68ms 
iter 2713: loss 2.7169, time 5228.93ms 
iter 2714: loss 3.0628, time 5266.82ms 
iter 2715: loss 2.6294, time 5255.80ms 
iter 2716: loss 2.9227, time 5248.15ms 
iter 2717: loss 2.6819, time 5179.47ms 
iter 2718: loss 2.7356, time 5267.86ms 
iter 2719: loss 2.8322, time 5260.48ms 
iter 2720: loss 2.6147, time 5162.07ms 
iter 2721: loss 2.6171, time 5274.39ms 
iter 2722: loss 2.6031, time 5276.51ms 
iter 2723: loss 2.7002, time 5286.39ms 
iter 2724: loss 2.7346, time 5283.25ms 
iter 2725: loss 2.9978, time 5304.71ms 
iter 2726: loss 2.5047, time 5281.77ms 
iter 2727: loss 2.8472, time 5338.70ms 
iter 2728: loss 2.9267, time 5311.53ms 
iter 2729: loss 2.8296, time 5257.46ms 
iter 2730: loss 2.8527, time 5266.70ms 
iter 2731: loss 2.7305, time 5262.44ms 
iter 2732: loss 2.7926, time 5265.26ms 
iter 2733: loss 2.6661, time 5265.81ms 
iter 2734: loss 2.7917, time 5249.10ms 
iter 2735: loss 2.9730, time 5258.55ms 
iter 2736: loss 2.7569, time 5298.55ms 
iter 2737: loss 2.7603, time 5269.64ms 
iter 2738: loss 2.8170, time 5260.70ms 
iter 2739: loss 2.7333, time 5264.70ms 
iter 2740: loss 2.6498, time 5257.31ms 
iter 2741: loss 2.8154, time 5267.80ms 
iter 2742: loss 2.6758, time 5257.14ms 
iter 2743: loss 2.7272, time 5164.92ms 
iter 2744: loss 2.7616, time 5161.53ms 
iter 2745: loss 2.9836, time 5161.57ms 
iter 2746: loss 2.8349, time 5224.74ms 
iter 2747: loss 2.6054, time 5279.80ms 
iter 2748: loss 2.7012, time 5268.84ms 
iter 2749: loss 2.7612, time 5279.07ms 
step 2750: train loss 2.7552, val loss 2.8960
iter 2750: loss 2.8177, time 20079.23ms 
iter 2751: loss 2.7595, time 5267.72ms 
iter 2752: loss 2.7164, time 5241.87ms 
iter 2753: loss 2.8343, time 5245.66ms 
iter 2754: loss 2.6471, time 5261.81ms 
iter 2755: loss 2.7764, time 5300.95ms 
iter 2756: loss 2.8205, time 5288.48ms 
iter 2757: loss 2.8300, time 5255.88ms 
iter 2758: loss 2.8314, time 5231.25ms 
iter 2759: loss 2.7278, time 5259.99ms 
iter 2760: loss 2.8910, time 5220.53ms 
iter 2761: loss 2.7729, time 5235.99ms 
iter 2762: loss 2.8427, time 5177.94ms 
iter 2763: loss 2.7459, time 5260.56ms 
iter 2764: loss 2.6337, time 5267.63ms 
iter 2765: loss 2.8904, time 5274.87ms 
iter 2766: loss 2.7748, time 5265.61ms 
iter 2767: loss 2.8180, time 5267.01ms 
iter 2768: loss 2.6645, time 5259.05ms 
iter 2769: loss 2.8397, time 5248.91ms 
iter 2770: loss 2.8153, time 5197.43ms 
iter 2771: loss 2.6339, time 5200.09ms 
iter 2772: loss 2.7396, time 5250.66ms 
iter 2773: loss 3.0218, time 5263.88ms 
iter 2774: loss 2.7367, time 5242.97ms 
iter 2775: loss 2.9226, time 5250.89ms 
iter 2776: loss 2.5503, time 5253.08ms 
iter 2777: loss 2.6942, time 5222.68ms 
iter 2778: loss 2.8998, time 5245.24ms 
iter 2779: loss 2.9605, time 5201.45ms 
iter 2780: loss 2.9050, time 5175.28ms 
iter 2781: loss 2.6857, time 5134.97ms 
iter 2782: loss 2.9320, time 5123.98ms 
iter 2783: loss 2.6583, time 5108.28ms 
iter 2784: loss 2.6636, time 5253.36ms 
iter 2785: loss 2.6144, time 5262.62ms 
iter 2786: loss 2.7173, time 5259.87ms 
iter 2787: loss 2.7498, time 5261.56ms 
iter 2788: loss 2.7437, time 5267.10ms 
iter 2789: loss 2.6985, time 5270.49ms 
iter 2790: loss 2.8951, time 5267.52ms 
iter 2791: loss 2.6553, time 5258.35ms 
iter 2792: loss 2.8531, time 5260.66ms 
iter 2793: loss 2.8622, time 5260.89ms 
iter 2794: loss 2.7590, time 5257.07ms 
iter 2795: loss 2.7724, time 5269.37ms 
iter 2796: loss 2.7935, time 5257.37ms 
iter 2797: loss 2.7299, time 5257.20ms 
iter 2798: loss 2.8339, time 5257.75ms 
iter 2799: loss 2.5460, time 5257.23ms 
step 2800: train loss 2.7471, val loss 2.8911
iter 2800: loss 2.8246, time 20063.23ms 
iter 2801: loss 2.7332, time 5255.07ms 
iter 2802: loss 2.7554, time 5254.37ms 
iter 2803: loss 2.6865, time 5254.75ms 
iter 2804: loss 2.4954, time 5261.29ms 
iter 2805: loss 2.8752, time 5267.03ms 
iter 2806: loss 2.7744, time 5274.33ms 
iter 2807: loss 2.6658, time 5276.21ms 
iter 2808: loss 2.5367, time 5259.42ms 
iter 2809: loss 2.9694, time 5269.09ms 
iter 2810: loss 2.6922, time 5265.98ms 
iter 2811: loss 2.6612, time 5264.37ms 
iter 2812: loss 2.9085, time 5265.40ms 
iter 2813: loss 2.8663, time 5278.47ms 
iter 2814: loss 2.7851, time 5280.09ms 
iter 2815: loss 2.5664, time 5272.76ms 
iter 2816: loss 2.7420, time 5274.24ms 
iter 2817: loss 2.8513, time 5260.33ms 
iter 2818: loss 2.8562, time 5261.12ms 
iter 2819: loss 2.8967, time 5273.13ms 
iter 2820: loss 2.7979, time 5279.94ms 
iter 2821: loss 2.6502, time 5282.75ms 
iter 2822: loss 2.7674, time 5276.18ms 
iter 2823: loss 2.7628, time 5282.07ms 
iter 2824: loss 2.9284, time 5272.03ms 
iter 2825: loss 2.9829, time 5270.97ms 
iter 2826: loss 2.7739, time 5278.32ms 
iter 2827: loss 2.6688, time 5281.00ms 
iter 2828: loss 2.6441, time 5280.73ms 
iter 2829: loss 2.6735, time 5280.74ms 
iter 2830: loss 2.9303, time 5273.73ms 
iter 2831: loss 2.7751, time 5279.92ms 
iter 2832: loss 2.8521, time 5272.99ms 
iter 2833: loss 2.5388, time 5270.01ms 
iter 2834: loss 2.8256, time 5271.65ms 
iter 2835: loss 2.7849, time 5275.67ms 
iter 2836: loss 2.7173, time 5265.24ms 
iter 2837: loss 2.7867, time 5265.88ms 
iter 2838: loss 2.8071, time 5275.87ms 
iter 2839: loss 2.6798, time 5280.64ms 
iter 2840: loss 2.8295, time 5267.72ms 
iter 2841: loss 2.7159, time 5259.71ms 
iter 2842: loss 2.7250, time 5267.10ms 
iter 2843: loss 2.8077, time 5271.53ms 
iter 2844: loss 2.5692, time 5269.45ms 
iter 2845: loss 2.8383, time 5275.61ms 
iter 2846: loss 2.7472, time 5275.29ms 
iter 2847: loss 2.6499, time 5262.02ms 
iter 2848: loss 2.7631, time 5261.36ms 
iter 2849: loss 2.6847, time 5260.17ms 
step 2850: train loss 2.7393, val loss 2.8956
iter 2850: loss 2.9446, time 20089.39ms 
iter 2851: loss 2.6734, time 5261.86ms 
iter 2852: loss 2.7950, time 5261.97ms 
iter 2853: loss 2.7778, time 5269.09ms 
iter 2854: loss 2.9073, time 5260.74ms 
iter 2855: loss 2.6959, time 5267.69ms 
iter 2856: loss 2.6240, time 5265.70ms 
iter 2857: loss 2.4957, time 5257.62ms 
iter 2858: loss 2.8064, time 5259.49ms 
iter 2859: loss 2.9420, time 5271.59ms 
iter 2860: loss 2.6761, time 5269.26ms 
iter 2861: loss 2.7604, time 5262.46ms 
iter 2862: loss 2.7084, time 5262.65ms 
iter 2863: loss 2.8741, time 5261.56ms 
iter 2864: loss 2.8845, time 5272.06ms 
iter 2865: loss 2.6262, time 5271.29ms 
iter 2866: loss 2.6883, time 5266.84ms 
iter 2867: loss 2.7341, time 5259.19ms 
iter 2868: loss 2.7522, time 5263.78ms 
iter 2869: loss 2.7053, time 5267.61ms 
iter 2870: loss 2.7124, time 5263.78ms 
iter 2871: loss 2.6675, time 5267.76ms 
iter 2872: loss 2.8455, time 5265.83ms 
iter 2873: loss 2.8532, time 5258.84ms 
iter 2874: loss 2.7719, time 5263.97ms 
iter 2875: loss 2.7080, time 5264.95ms 
iter 2876: loss 2.7322, time 5273.57ms 
iter 2877: loss 2.8303, time 5264.73ms 
iter 2878: loss 3.1113, time 5271.29ms 
iter 2879: loss 2.5745, time 5266.62ms 
iter 2880: loss 3.0022, time 5300.85ms 
iter 2881: loss 2.5969, time 5289.99ms 
iter 2882: loss 2.7587, time 5266.46ms 
iter 2883: loss 2.8781, time 5279.76ms 
iter 2884: loss 2.7034, time 5277.12ms 
iter 2885: loss 2.9149, time 5264.21ms 
iter 2886: loss 2.6389, time 5269.67ms 
iter 2887: loss 2.9439, time 5260.19ms 
iter 2888: loss 2.6460, time 5276.40ms 
iter 2889: loss 2.5093, time 5264.01ms 
iter 2890: loss 2.6941, time 5264.30ms 
iter 2891: loss 2.7523, time 5287.80ms 
iter 2892: loss 2.6258, time 5291.48ms 
iter 2893: loss 2.7968, time 5264.44ms 
iter 2894: loss 2.7533, time 5261.80ms 
iter 2895: loss 2.7075, time 5278.42ms 
iter 2896: loss 2.8734, time 5266.97ms 
iter 2897: loss 2.7612, time 5274.82ms 
iter 2898: loss 2.7588, time 5279.23ms 
iter 2899: loss 2.7475, time 5265.82ms 
step 2900: train loss 2.7448, val loss 2.8862
iter 2900: loss 2.8435, time 20121.91ms 
iter 2901: loss 2.6722, time 5267.33ms 
iter 2902: loss 2.5429, time 5319.95ms 
iter 2903: loss 2.7700, time 5352.03ms 
iter 2904: loss 2.6559, time 5315.31ms 
iter 2905: loss 2.7262, time 5275.62ms 
iter 2906: loss 2.8908, time 5253.11ms 
iter 2907: loss 2.7850, time 5278.20ms 
iter 2908: loss 2.6401, time 5269.61ms 
iter 2909: loss 2.6221, time 5269.23ms 
iter 2910: loss 2.7170, time 5270.65ms 
iter 2911: loss 2.8830, time 5272.25ms 
iter 2912: loss 2.6924, time 5280.11ms 
iter 2913: loss 2.7074, time 5282.13ms 
iter 2914: loss 2.6597, time 5280.26ms 
iter 2915: loss 2.6570, time 5271.43ms 
iter 2916: loss 2.6278, time 5258.47ms 
iter 2917: loss 2.8670, time 5248.93ms 
iter 2918: loss 2.8016, time 5263.86ms 
iter 2919: loss 2.8099, time 5264.81ms 
iter 2920: loss 3.0185, time 5260.30ms 
iter 2921: loss 2.7580, time 5260.35ms 
iter 2922: loss 2.7227, time 5261.42ms 
iter 2923: loss 2.6736, time 5265.22ms 
iter 2924: loss 2.7492, time 5261.80ms 
iter 2925: loss 2.7168, time 5269.01ms 
iter 2926: loss 3.0656, time 5259.79ms 
iter 2927: loss 2.7003, time 5261.05ms 
iter 2928: loss 2.7837, time 5258.68ms 
iter 2929: loss 2.7249, time 5269.61ms 
iter 2930: loss 2.7964, time 5274.42ms 
iter 2931: loss 2.7464, time 5265.46ms 
iter 2932: loss 2.6786, time 5267.13ms 
iter 2933: loss 2.7275, time 5260.57ms 
iter 2934: loss 2.9878, time 5265.92ms 
iter 2935: loss 2.7614, time 5275.01ms 
iter 2936: loss 2.8861, time 5274.26ms 
iter 2937: loss 2.8304, time 5264.10ms 
iter 2938: loss 2.6281, time 5237.48ms 
iter 2939: loss 2.6026, time 5267.25ms 
iter 2940: loss 2.5620, time 5257.25ms 
iter 2941: loss 2.7284, time 5230.28ms 
iter 2942: loss 2.7161, time 5265.65ms 
iter 2943: loss 2.8899, time 5266.53ms 
iter 2944: loss 2.8265, time 5270.77ms 
iter 2945: loss 2.6903, time 5259.13ms 
iter 2946: loss 2.8398, time 5263.10ms 
iter 2947: loss 2.7787, time 5260.74ms 
iter 2948: loss 2.7851, time 5260.21ms 
iter 2949: loss 2.6035, time 5260.51ms 
step 2950: train loss 2.7178, val loss 2.8877
iter 2950: loss 2.9270, time 20068.64ms 
iter 2951: loss 2.8788, time 5274.85ms 
iter 2952: loss 3.0335, time 5268.72ms 
iter 2953: loss 2.6644, time 5268.21ms 
iter 2954: loss 2.8734, time 5272.35ms 
iter 2955: loss 2.6383, time 5268.11ms 
iter 2956: loss 2.7427, time 5263.45ms 
iter 2957: loss 2.7790, time 5260.27ms 
iter 2958: loss 2.6745, time 5265.58ms 
iter 2959: loss 2.7845, time 5267.57ms 
iter 2960: loss 2.6941, time 5269.57ms 
iter 2961: loss 2.8172, time 5271.88ms 
iter 2962: loss 2.6811, time 5271.89ms 
iter 2963: loss 2.8042, time 5263.31ms 
iter 2964: loss 2.4511, time 5279.95ms 
iter 2965: loss 2.6364, time 5277.69ms 
iter 2966: loss 2.6690, time 5269.63ms 
iter 2967: loss 2.6326, time 5255.12ms 
iter 2968: loss 2.8315, time 5266.35ms 
iter 2969: loss 2.8718, time 5267.28ms 
iter 2970: loss 2.7032, time 5264.36ms 
iter 2971: loss 2.6825, time 5263.85ms 
iter 2972: loss 2.8639, time 5259.55ms 
iter 2973: loss 2.8078, time 5264.12ms 
iter 2974: loss 2.5729, time 5242.25ms 
iter 2975: loss 2.9186, time 5263.05ms 
iter 2976: loss 2.8184, time 5261.79ms 
iter 2977: loss 2.8497, time 5257.87ms 
iter 2978: loss 2.7749, time 5269.58ms 
iter 2979: loss 2.8403, time 5265.93ms 
iter 2980: loss 2.7836, time 5269.11ms 
iter 2981: loss 2.8216, time 5269.67ms 
iter 2982: loss 2.7984, time 5237.96ms 
iter 2983: loss 2.7300, time 5258.45ms 
iter 2984: loss 2.9033, time 5263.88ms 
iter 2985: loss 2.6680, time 5265.49ms 
iter 2986: loss 2.7941, time 5315.28ms 
iter 2987: loss 2.6123, time 5320.38ms 
iter 2988: loss 2.8016, time 5283.30ms 
iter 2989: loss 2.7156, time 5257.82ms 
iter 2990: loss 2.5922, time 5297.39ms 
iter 2991: loss 2.7107, time 5274.06ms 
iter 2992: loss 2.6278, time 5264.79ms 
iter 2993: loss 2.8564, time 5261.64ms 
iter 2994: loss 2.8019, time 5262.12ms 
iter 2995: loss 2.8110, time 5193.70ms 
iter 2996: loss 2.6218, time 5257.11ms 
iter 2997: loss 2.5725, time 5302.32ms 
iter 2998: loss 2.6539, time 5330.10ms 
iter 2999: loss 2.6360, time 5305.80ms 
step 3000: train loss 2.7293, val loss 2.8618
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 3000: loss 2.7328, time 21374.88ms 
iter 3001: loss 2.6483, time 5309.84ms 
iter 3002: loss 2.8631, time 5295.73ms 
iter 3003: loss 2.8438, time 5285.27ms 
iter 3004: loss 2.6913, time 5309.15ms 
iter 3005: loss 2.7817, time 5277.87ms 
iter 3006: loss 2.8477, time 5274.27ms 
iter 3007: loss 2.5408, time 5284.36ms 
iter 3008: loss 2.8223, time 5277.59ms 
iter 3009: loss 2.7671, time 5284.13ms 
iter 3010: loss 2.7357, time 5275.78ms 
iter 3011: loss 2.7778, time 5270.86ms 
iter 3012: loss 2.9236, time 5277.47ms 
iter 3013: loss 2.8939, time 5335.10ms 
iter 3014: loss 2.8108, time 5262.25ms 
iter 3015: loss 2.6957, time 5268.36ms 
iter 3016: loss 2.5990, time 5347.46ms 
iter 3017: loss 2.7852, time 5327.72ms 
iter 3018: loss 2.6862, time 5282.75ms 
iter 3019: loss 2.9945, time 5289.04ms 
iter 3020: loss 2.4738, time 5321.09ms 
iter 3021: loss 2.6490, time 5349.53ms 
iter 3022: loss 2.5504, time 5032.91ms 
iter 3023: loss 2.7833, time 5310.97ms 
iter 3024: loss 2.8100, time 5338.99ms 
iter 3025: loss 2.5640, time 5340.24ms 
iter 3026: loss 2.6457, time 5266.46ms 
iter 3027: loss 2.9576, time 5286.85ms 
iter 3028: loss 2.9186, time 5276.60ms 
iter 3029: loss 2.5057, time 5258.00ms 
iter 3030: loss 2.7841, time 5263.08ms 
iter 3031: loss 2.4862, time 5278.43ms 
iter 3032: loss 2.7927, time 5267.14ms 
iter 3033: loss 2.6299, time 5280.47ms 
iter 3034: loss 2.7566, time 5271.33ms 
iter 3035: loss 2.5961, time 5277.74ms 
iter 3036: loss 2.8323, time 5271.76ms 
iter 3037: loss 2.7902, time 5272.82ms 
iter 3038: loss 2.6870, time 5283.75ms 
iter 3039: loss 2.9215, time 5295.46ms 
iter 3040: loss 2.7263, time 5279.92ms 
iter 3041: loss 2.7952, time 5268.34ms 
iter 3042: loss 2.8721, time 5272.62ms 
iter 3043: loss 2.6165, time 5277.53ms 
iter 3044: loss 2.7069, time 5289.67ms 
iter 3045: loss 2.5508, time 5292.14ms 
iter 3046: loss 2.8516, time 5292.08ms 
iter 3047: loss 2.6713, time 5263.32ms 
iter 3048: loss 2.6496, time 5261.89ms 
iter 3049: loss 2.6056, time 5322.94ms 
step 3050: train loss 2.7138, val loss 2.8653
iter 3050: loss 2.9411, time 20060.43ms 
iter 3051: loss 2.5882, time 5253.33ms 
iter 3052: loss 2.6262, time 5264.06ms 
iter 3053: loss 2.6519, time 5340.03ms 
iter 3054: loss 2.8575, time 5345.54ms 
iter 3055: loss 2.9367, time 5272.08ms 
iter 3056: loss 2.5641, time 5271.15ms 
iter 3057: loss 2.6156, time 5261.99ms 
iter 3058: loss 2.6979, time 5276.02ms 
iter 3059: loss 2.6043, time 5266.45ms 
iter 3060: loss 2.7125, time 5264.33ms 
iter 3061: loss 2.5936, time 5270.99ms 
iter 3062: loss 2.7035, time 5267.97ms 
iter 3063: loss 2.8349, time 5300.24ms 
iter 3064: loss 2.7979, time 5268.07ms 
iter 3065: loss 2.6285, time 5274.01ms 
iter 3066: loss 2.7229, time 5322.86ms 
iter 3067: loss 2.7457, time 5294.73ms 
iter 3068: loss 2.8186, time 5272.68ms 
iter 3069: loss 2.9234, time 5273.67ms 
iter 3070: loss 2.6798, time 5347.67ms 
iter 3071: loss 2.4242, time 5312.06ms 
iter 3072: loss 2.7953, time 5343.38ms 
iter 3073: loss 2.7453, time 5299.63ms 
iter 3074: loss 2.8191, time 5272.44ms 
iter 3075: loss 2.8002, time 5261.84ms 
iter 3076: loss 2.8488, time 5256.43ms 
iter 3077: loss 2.6113, time 5277.03ms 
iter 3078: loss 2.6278, time 5270.58ms 
iter 3079: loss 2.6512, time 5264.40ms 
iter 3080: loss 2.6965, time 5263.95ms 
iter 3081: loss 2.7518, time 5259.45ms 
iter 3082: loss 2.6222, time 5269.00ms 
iter 3083: loss 2.9059, time 5268.68ms 
iter 3084: loss 2.7635, time 5264.09ms 
iter 3085: loss 2.6795, time 5257.91ms 
iter 3086: loss 2.9346, time 5257.08ms 
iter 3087: loss 2.5633, time 5258.58ms 
iter 3088: loss 2.6643, time 5282.15ms 
iter 3089: loss 2.7529, time 5279.37ms 
iter 3090: loss 2.6877, time 5282.17ms 
iter 3091: loss 2.6323, time 5274.13ms 
iter 3092: loss 2.5810, time 5272.97ms 
iter 3093: loss 2.7375, time 5260.08ms 
iter 3094: loss 2.7139, time 5258.42ms 
iter 3095: loss 2.6132, time 5263.82ms 
iter 3096: loss 2.6712, time 5265.35ms 
iter 3097: loss 2.9553, time 5265.22ms 
iter 3098: loss 2.7977, time 5262.38ms 
iter 3099: loss 2.7167, time 5289.82ms 
step 3100: train loss 2.7139, val loss 2.8761
iter 3100: loss 2.8612, time 20127.04ms 
iter 3101: loss 2.6877, time 5277.00ms 
iter 3102: loss 2.8175, time 5272.96ms 
iter 3103: loss 2.7215, time 5282.80ms 
iter 3104: loss 2.9666, time 5298.27ms 
iter 3105: loss 2.6288, time 5278.04ms 
iter 3106: loss 2.4802, time 5272.03ms 
iter 3107: loss 2.7198, time 5261.39ms 
iter 3108: loss 2.6433, time 5242.34ms 
iter 3109: loss 3.0006, time 5282.02ms 
iter 3110: loss 2.5033, time 5341.34ms 
iter 3111: loss 2.7184, time 5349.58ms 
iter 3112: loss 2.6771, time 5349.40ms 
iter 3113: loss 2.7102, time 5343.85ms 
iter 3114: loss 2.6966, time 5313.11ms 
iter 3115: loss 2.8845, time 5271.99ms 
iter 3116: loss 2.6760, time 5288.15ms 
iter 3117: loss 2.5321, time 5296.83ms 
iter 3118: loss 2.8222, time 5333.45ms 
iter 3119: loss 2.7969, time 5264.71ms 
iter 3120: loss 2.6714, time 5256.87ms 
iter 3121: loss 2.6173, time 5258.08ms 
iter 3122: loss 2.8616, time 5269.20ms 
iter 3123: loss 2.6852, time 5263.32ms 
iter 3124: loss 2.7790, time 5262.93ms 
iter 3125: loss 2.6932, time 5260.16ms 
iter 3126: loss 2.7341, time 5296.44ms 
iter 3127: loss 2.5490, time 5279.50ms 
iter 3128: loss 2.7210, time 5265.07ms 
iter 3129: loss 2.6726, time 5321.21ms 
iter 3130: loss 2.6925, time 5313.97ms 
iter 3131: loss 2.7735, time 5328.63ms 
iter 3132: loss 2.5564, time 5289.23ms 
iter 3133: loss 2.6649, time 5297.61ms 
iter 3134: loss 2.6467, time 5286.16ms 
iter 3135: loss 2.5735, time 5278.08ms 
iter 3136: loss 2.5026, time 5262.55ms 
iter 3137: loss 2.5468, time 5259.46ms 
iter 3138: loss 2.6406, time 5262.51ms 
iter 3139: loss 2.8126, time 5257.65ms 
iter 3140: loss 2.7483, time 5259.23ms 
iter 3141: loss 2.7605, time 5264.65ms 
iter 3142: loss 2.7917, time 5274.97ms 
iter 3143: loss 2.6169, time 5260.35ms 
iter 3144: loss 2.7578, time 5258.71ms 
iter 3145: loss 2.6757, time 5259.97ms 
iter 3146: loss 2.6609, time 5280.55ms 
iter 3147: loss 2.6463, time 5266.83ms 
iter 3148: loss 2.7700, time 5301.84ms 
iter 3149: loss 2.6486, time 5304.55ms 
step 3150: train loss 2.7129, val loss 2.8583
iter 3150: loss 2.8458, time 20181.60ms 
iter 3151: loss 2.6489, time 5282.94ms 
iter 3152: loss 2.7200, time 5281.95ms 
iter 3153: loss 2.8959, time 5269.25ms 
iter 3154: loss 2.4302, time 5275.48ms 
iter 3155: loss 2.7793, time 5276.91ms 
iter 3156: loss 2.7477, time 5257.97ms 
iter 3157: loss 2.5809, time 5265.43ms 
iter 3158: loss 2.8764, time 5248.45ms 
iter 3159: loss 2.7078, time 5277.88ms 
iter 3160: loss 2.8194, time 5274.74ms 
iter 3161: loss 2.7999, time 5273.83ms 
iter 3162: loss 2.5327, time 5281.89ms 
iter 3163: loss 2.6678, time 5262.80ms 
iter 3164: loss 2.7113, time 5263.23ms 
iter 3165: loss 2.7153, time 5258.06ms 
iter 3166: loss 2.7388, time 5257.34ms 
iter 3167: loss 2.7436, time 5260.70ms 
iter 3168: loss 2.7714, time 5257.57ms 
iter 3169: loss 2.4863, time 5254.58ms 
iter 3170: loss 2.5563, time 5259.69ms 
iter 3171: loss 2.9707, time 5271.08ms 
iter 3172: loss 2.7103, time 5267.03ms 
iter 3173: loss 2.6922, time 5259.77ms 
iter 3174: loss 2.6805, time 5260.39ms 
iter 3175: loss 2.6484, time 5274.41ms 
iter 3176: loss 2.6995, time 5309.73ms 
iter 3177: loss 2.6164, time 5277.01ms 
iter 3178: loss 2.5731, time 5277.73ms 
iter 3179: loss 2.4516, time 5338.05ms 
iter 3180: loss 2.8424, time 5306.55ms 
iter 3181: loss 2.8613, time 5324.44ms 
iter 3182: loss 2.6969, time 5327.34ms 
iter 3183: loss 2.8842, time 5276.83ms 
iter 3184: loss 2.7428, time 5288.77ms 
iter 3185: loss 2.7870, time 5245.59ms 
iter 3186: loss 2.5868, time 5298.47ms 
iter 3187: loss 2.7147, time 5270.95ms 
iter 3188: loss 2.8347, time 5275.13ms 
iter 3189: loss 2.6931, time 5265.02ms 
iter 3190: loss 2.6490, time 5261.61ms 
iter 3191: loss 2.8503, time 5263.52ms 
iter 3192: loss 2.6477, time 5260.96ms 
iter 3193: loss 2.5709, time 5259.68ms 
iter 3194: loss 2.8173, time 5258.70ms 
iter 3195: loss 2.6106, time 5267.83ms 
iter 3196: loss 2.8633, time 5260.58ms 
iter 3197: loss 2.8402, time 5262.18ms 
iter 3198: loss 2.6110, time 5257.32ms 
iter 3199: loss 2.8909, time 5258.54ms 
step 3200: train loss 2.7196, val loss 2.8689
iter 3200: loss 2.8467, time 20100.48ms 
iter 3201: loss 2.5037, time 5320.42ms 
iter 3202: loss 2.7645, time 5265.06ms 
iter 3203: loss 2.7993, time 5266.32ms 
iter 3204: loss 2.7560, time 5278.69ms 
iter 3205: loss 2.6096, time 5264.80ms 
iter 3206: loss 2.8972, time 5269.05ms 
iter 3207: loss 2.6619, time 5265.19ms 
iter 3208: loss 2.5633, time 5271.10ms 
iter 3209: loss 2.5965, time 5263.82ms 
iter 3210: loss 2.8479, time 5273.51ms 
iter 3211: loss 2.7189, time 5299.31ms 
iter 3212: loss 2.9320, time 5333.84ms 
iter 3213: loss 2.8177, time 5343.47ms 
iter 3214: loss 2.8955, time 5298.28ms 
iter 3215: loss 2.6938, time 5254.09ms 
iter 3216: loss 2.6724, time 5275.89ms 
iter 3217: loss 2.7163, time 5269.86ms 
iter 3218: loss 2.6185, time 5268.53ms 
iter 3219: loss 2.7781, time 5264.07ms 
iter 3220: loss 2.7680, time 5261.07ms 
iter 3221: loss 2.6137, time 5260.52ms 
iter 3222: loss 2.7391, time 5275.64ms 
iter 3223: loss 2.6416, time 5262.79ms 
iter 3224: loss 2.7204, time 5274.84ms 
iter 3225: loss 2.6249, time 5275.61ms 
iter 3226: loss 2.6219, time 5271.64ms 
iter 3227: loss 2.8883, time 5265.42ms 
iter 3228: loss 2.7343, time 5281.23ms 
iter 3229: loss 2.6724, time 5331.43ms 
iter 3230: loss 2.7188, time 5264.28ms 
iter 3231: loss 2.8413, time 5289.71ms 
iter 3232: loss 2.7594, time 5270.55ms 
iter 3233: loss 2.9534, time 5267.91ms 
iter 3234: loss 2.8166, time 5238.67ms 
iter 3235: loss 2.7949, time 5272.69ms 
iter 3236: loss 2.7948, time 5281.12ms 
iter 3237: loss 2.7462, time 5273.49ms 
iter 3238: loss 2.3353, time 5281.59ms 
iter 3239: loss 2.7942, time 5265.70ms 
iter 3240: loss 2.7642, time 5263.78ms 
iter 3241: loss 2.5984, time 5269.29ms 
iter 3242: loss 2.6979, time 5271.20ms 
iter 3243: loss 2.7279, time 5270.47ms 
iter 3244: loss 2.8219, time 5261.02ms 
iter 3245: loss 2.8002, time 5260.32ms 
iter 3246: loss 2.5690, time 5263.86ms 
iter 3247: loss 2.7206, time 5270.23ms 
iter 3248: loss 2.4664, time 5256.43ms 
iter 3249: loss 2.7357, time 5258.59ms 
step 3250: train loss 2.7073, val loss 2.8648
iter 3250: loss 2.6768, time 20087.20ms 
iter 3251: loss 2.6332, time 5262.79ms 
iter 3252: loss 2.6762, time 5235.91ms 
iter 3253: loss 2.5159, time 5258.31ms 
iter 3254: loss 2.6352, time 5276.92ms 
iter 3255: loss 2.7141, time 5273.98ms 
iter 3256: loss 2.9298, time 5280.68ms 
iter 3257: loss 2.7947, time 5272.30ms 
iter 3258: loss 2.7733, time 5264.56ms 
iter 3259: loss 2.6818, time 5281.36ms 
iter 3260: loss 2.6260, time 5278.55ms 
iter 3261: loss 2.6588, time 5283.63ms 
iter 3262: loss 2.6313, time 5282.44ms 
iter 3263: loss 2.6990, time 5267.83ms 
iter 3264: loss 2.6728, time 5304.59ms 
iter 3265: loss 2.7626, time 5296.41ms 
iter 3266: loss 2.8308, time 5293.14ms 
iter 3267: loss 2.5473, time 5313.07ms 
iter 3268: loss 2.7507, time 5314.84ms 
iter 3269: loss 2.6163, time 5311.88ms 
iter 3270: loss 2.7715, time 5260.45ms 
iter 3271: loss 2.7295, time 5267.72ms 
iter 3272: loss 2.7838, time 5263.95ms 
iter 3273: loss 2.7629, time 5259.20ms 
iter 3274: loss 2.5507, time 5260.10ms 
iter 3275: loss 2.5059, time 5320.11ms 
iter 3276: loss 2.7885, time 5290.68ms 
iter 3277: loss 2.6849, time 5264.38ms 
iter 3278: loss 2.4302, time 5270.97ms 
iter 3279: loss 2.6791, time 5274.33ms 
iter 3280: loss 2.7252, time 5266.53ms 
iter 3281: loss 2.4256, time 5268.43ms 
iter 3282: loss 2.7401, time 5225.96ms 
iter 3283: loss 2.7225, time 5262.12ms 
iter 3284: loss 2.8596, time 5267.14ms 
iter 3285: loss 2.7310, time 5266.26ms 
iter 3286: loss 2.6898, time 5266.25ms 
iter 3287: loss 2.6224, time 5272.92ms 
iter 3288: loss 2.8085, time 5255.60ms 
iter 3289: loss 2.9690, time 5268.42ms 
iter 3290: loss 2.7299, time 5240.96ms 
iter 3291: loss 2.8896, time 5252.61ms 
iter 3292: loss 2.5284, time 5256.38ms 
iter 3293: loss 2.5818, time 5255.06ms 
iter 3294: loss 2.6564, time 5256.26ms 
iter 3295: loss 2.6618, time 5263.12ms 
iter 3296: loss 2.6004, time 5265.38ms 
iter 3297: loss 2.6204, time 5260.93ms 
iter 3298: loss 2.5405, time 5264.65ms 
iter 3299: loss 2.7611, time 5254.68ms 
step 3300: train loss 2.7050, val loss 2.8782
iter 3300: loss 2.9182, time 20064.50ms 
iter 3301: loss 2.7238, time 5256.20ms 
iter 3302: loss 2.7800, time 5256.29ms 
iter 3303: loss 2.6741, time 5254.88ms 
iter 3304: loss 2.7110, time 5268.51ms 
iter 3305: loss 2.4738, time 5261.70ms 
iter 3306: loss 2.7620, time 5233.28ms 
iter 3307: loss 2.5838, time 5287.29ms 
iter 3308: loss 2.5641, time 5258.91ms 
iter 3309: loss 2.7007, time 5281.96ms 
iter 3310: loss 2.6784, time 5305.38ms 
iter 3311: loss 2.7552, time 5280.21ms 
iter 3312: loss 2.6795, time 5273.08ms 
iter 3313: loss 2.6939, time 5268.93ms 
iter 3314: loss 2.6677, time 5266.76ms 
iter 3315: loss 2.7949, time 5320.44ms 
iter 3316: loss 2.7766, time 5259.07ms 
iter 3317: loss 2.7483, time 5263.50ms 
iter 3318: loss 2.6616, time 5254.71ms 
iter 3319: loss 2.7726, time 5280.80ms 
iter 3320: loss 2.8150, time 5297.69ms 
iter 3321: loss 2.7865, time 5299.73ms 
iter 3322: loss 2.8232, time 5319.93ms 
iter 3323: loss 2.6940, time 5302.45ms 
iter 3324: loss 2.8235, time 5319.48ms 
iter 3325: loss 2.6074, time 5259.77ms 
iter 3326: loss 2.4262, time 5273.18ms 
iter 3327: loss 2.5982, time 5267.65ms 
iter 3328: loss 2.6279, time 5262.29ms 
iter 3329: loss 2.5527, time 5269.87ms 
iter 3330: loss 2.5840, time 5262.75ms 
iter 3331: loss 2.5120, time 5263.49ms 
iter 3332: loss 2.5709, time 5254.49ms 
iter 3333: loss 2.6334, time 5252.77ms 
iter 3334: loss 2.6149, time 5260.95ms 
iter 3335: loss 2.7117, time 5255.34ms 
iter 3336: loss 2.7701, time 5263.64ms 
iter 3337: loss 2.7956, time 5255.67ms 
iter 3338: loss 2.7674, time 5258.58ms 
iter 3339: loss 2.9162, time 5257.53ms 
iter 3340: loss 2.4703, time 5264.72ms 
iter 3341: loss 2.7467, time 5272.50ms 
iter 3342: loss 2.7464, time 5269.97ms 
iter 3343: loss 2.6850, time 5258.16ms 
iter 3344: loss 2.6499, time 5258.76ms 
iter 3345: loss 2.7730, time 5260.15ms 
iter 3346: loss 2.8851, time 5275.97ms 
iter 3347: loss 2.8757, time 5261.69ms 
iter 3348: loss 2.6621, time 5255.80ms 
iter 3349: loss 2.8607, time 5248.03ms 
step 3350: train loss 2.6936, val loss 2.8709
iter 3350: loss 2.6162, time 20044.01ms 
iter 3351: loss 2.6998, time 5265.58ms 
iter 3352: loss 2.9413, time 5259.71ms 
iter 3353: loss 2.6171, time 5268.49ms 
iter 3354: loss 2.6982, time 5259.01ms 
iter 3355: loss 2.6869, time 5263.71ms 
iter 3356: loss 2.9192, time 5269.45ms 
iter 3357: loss 2.6997, time 5257.05ms 
iter 3358: loss 2.6623, time 5254.30ms 
iter 3359: loss 2.6376, time 5232.95ms 
iter 3360: loss 2.7272, time 5263.00ms 
iter 3361: loss 2.7224, time 5258.93ms 
iter 3362: loss 2.7244, time 5267.21ms 
iter 3363: loss 2.8349, time 5261.29ms 
iter 3364: loss 2.4646, time 5258.58ms 
iter 3365: loss 2.7654, time 5255.38ms 
iter 3366: loss 2.6827, time 5268.22ms 
iter 3367: loss 2.7651, time 5267.02ms 
iter 3368: loss 2.6635, time 5257.63ms 
iter 3369: loss 2.6099, time 5266.65ms 
iter 3370: loss 2.6919, time 5257.70ms 
iter 3371: loss 2.7437, time 5261.57ms 
iter 3372: loss 2.8293, time 5286.99ms 
iter 3373: loss 2.8016, time 5311.67ms 
iter 3374: loss 2.6798, time 5266.94ms 
iter 3375: loss 2.5848, time 5264.46ms 
iter 3376: loss 2.6351, time 5281.29ms 
iter 3377: loss 2.5009, time 5264.66ms 
iter 3378: loss 2.5456, time 5275.97ms 
iter 3379: loss 2.8132, time 5297.48ms 
iter 3380: loss 2.6538, time 5259.05ms 
iter 3381: loss 2.8877, time 5255.86ms 
iter 3382: loss 2.4648, time 5259.90ms 
iter 3383: loss 2.7502, time 5304.93ms 
iter 3384: loss 2.7555, time 5330.41ms 
iter 3385: loss 2.6049, time 5317.44ms 
iter 3386: loss 2.9838, time 5337.57ms 
iter 3387: loss 2.8025, time 5332.12ms 
iter 3388: loss 2.7310, time 5304.16ms 
iter 3389: loss 2.6978, time 5342.72ms 
iter 3390: loss 2.7134, time 5276.86ms 
iter 3391: loss 2.6677, time 5266.97ms 
iter 3392: loss 2.4535, time 5272.45ms 
iter 3393: loss 2.5939, time 5336.75ms 
iter 3394: loss 2.7899, time 5294.95ms 
iter 3395: loss 2.8004, time 5275.98ms 
iter 3396: loss 2.8636, time 5272.97ms 
iter 3397: loss 2.7233, time 5274.25ms 
iter 3398: loss 2.8307, time 5268.80ms 
iter 3399: loss 2.7825, time 5263.31ms 
step 3400: train loss 2.6935, val loss 2.8821
iter 3400: loss 2.7397, time 20053.41ms 
iter 3401: loss 2.5110, time 5247.48ms 
iter 3402: loss 2.6073, time 5265.92ms 
iter 3403: loss 2.4928, time 5311.62ms 
iter 3404: loss 2.6525, time 5267.51ms 
iter 3405: loss 2.6907, time 5266.96ms 
iter 3406: loss 2.8077, time 5279.91ms 
iter 3407: loss 2.6283, time 5273.99ms 
iter 3408: loss 2.8411, time 5273.57ms 
iter 3409: loss 2.5973, time 5275.20ms 
iter 3410: loss 2.6254, time 5261.68ms 
iter 3411: loss 2.8210, time 5268.40ms 
iter 3412: loss 2.7951, time 5268.22ms 
iter 3413: loss 2.6551, time 5264.75ms 
iter 3414: loss 2.7816, time 5264.34ms 
iter 3415: loss 2.5674, time 5285.98ms 
iter 3416: loss 2.7362, time 5338.82ms 
iter 3417: loss 2.6617, time 5280.30ms 
iter 3418: loss 2.5456, time 5259.97ms 
iter 3419: loss 2.5747, time 5249.54ms 
iter 3420: loss 2.6812, time 5269.59ms 
iter 3421: loss 2.7298, time 5264.25ms 
iter 3422: loss 2.8429, time 5274.35ms 
iter 3423: loss 2.5776, time 5345.53ms 
iter 3424: loss 2.5968, time 5349.54ms 
iter 3425: loss 2.5578, time 5289.28ms 
iter 3426: loss 2.6431, time 5261.03ms 
iter 3427: loss 2.7952, time 5329.47ms 
iter 3428: loss 3.0135, time 5276.59ms 
iter 3429: loss 2.6263, time 5280.34ms 
iter 3430: loss 2.6282, time 5279.40ms 
iter 3431: loss 2.8621, time 5266.68ms 
iter 3432: loss 2.6683, time 5268.36ms 
iter 3433: loss 2.8091, time 5314.83ms 
iter 3434: loss 2.5378, time 5272.28ms 
iter 3435: loss 2.7617, time 5341.37ms 
iter 3436: loss 2.5226, time 5349.85ms 
iter 3437: loss 2.6741, time 5358.00ms 
iter 3438: loss 2.7622, time 5279.69ms 
iter 3439: loss 3.0074, time 5277.01ms 
iter 3440: loss 2.7451, time 5265.56ms 
iter 3441: loss 2.4894, time 5275.29ms 
iter 3442: loss 2.7276, time 5270.20ms 
iter 3443: loss 2.6805, time 5271.79ms 
iter 3444: loss 2.7168, time 5264.37ms 
iter 3445: loss 2.4600, time 5276.68ms 
iter 3446: loss 2.7027, time 5266.89ms 
iter 3447: loss 2.7000, time 5265.38ms 
iter 3448: loss 2.6789, time 5270.78ms 
iter 3449: loss 2.6889, time 5267.81ms 
step 3450: train loss 2.6732, val loss 2.8659
iter 3450: loss 2.8488, time 20068.00ms 
iter 3451: loss 2.6353, time 5269.18ms 
iter 3452: loss 2.6913, time 5275.32ms 
iter 3453: loss 2.7049, time 5267.46ms 
iter 3454: loss 2.5015, time 5261.98ms 
iter 3455: loss 2.8021, time 5260.67ms 
iter 3456: loss 2.6956, time 5264.67ms 
iter 3457: loss 2.6626, time 5261.27ms 
iter 3458: loss 2.6920, time 5267.53ms 
iter 3459: loss 2.6390, time 5265.23ms 
iter 3460: loss 2.7112, time 5266.07ms 
iter 3461: loss 2.7739, time 5275.48ms 
iter 3462: loss 2.5652, time 5260.82ms 
iter 3463: loss 2.5865, time 5262.34ms 
iter 3464: loss 2.4129, time 5261.28ms 
iter 3465: loss 2.6636, time 5270.32ms 
iter 3466: loss 2.7665, time 5268.15ms 
iter 3467: loss 2.7563, time 5236.08ms 
iter 3468: loss 2.5679, time 5270.27ms 
iter 3469: loss 2.6870, time 5257.46ms 
iter 3470: loss 2.6215, time 5316.07ms 
iter 3471: loss 2.7236, time 5269.59ms 
iter 3472: loss 2.7096, time 5293.77ms 
iter 3473: loss 2.7410, time 5285.06ms 
iter 3474: loss 2.8008, time 5284.92ms 
iter 3475: loss 2.6716, time 5274.52ms 
iter 3476: loss 2.7264, time 5280.85ms 
iter 3477: loss 2.6776, time 5272.14ms 
iter 3478: loss 2.6138, time 5281.62ms 
iter 3479: loss 2.5459, time 5277.39ms 
iter 3480: loss 2.6404, time 5281.79ms 
iter 3481: loss 2.5587, time 5297.61ms 
iter 3482: loss 2.4680, time 5335.28ms 
iter 3483: loss 2.6411, time 5336.77ms 
iter 3484: loss 2.6691, time 5271.58ms 
iter 3485: loss 2.7450, time 5275.67ms 
iter 3486: loss 2.7101, time 5260.89ms 
iter 3487: loss 2.6847, time 5270.22ms 
iter 3488: loss 2.6221, time 5255.27ms 
iter 3489: loss 2.6987, time 5275.44ms 
iter 3490: loss 2.6746, time 5271.84ms 
iter 3491: loss 2.7964, time 5272.11ms 
iter 3492: loss 2.6386, time 5352.78ms 
iter 3493: loss 2.4273, time 5314.72ms 
iter 3494: loss 2.6383, time 5273.86ms 
iter 3495: loss 2.6310, time 5262.52ms 
iter 3496: loss 2.7073, time 5258.41ms 
iter 3497: loss 2.4762, time 5268.37ms 
iter 3498: loss 2.6793, time 5289.86ms 
iter 3499: loss 2.7475, time 5274.15ms 
step 3500: train loss 2.6784, val loss 2.8592
iter 3500: loss 2.6428, time 20100.41ms 
iter 3501: loss 2.7903, time 5262.61ms 
iter 3502: loss 2.5706, time 5277.48ms 
iter 3503: loss 2.5567, time 5260.09ms 
iter 3504: loss 2.7824, time 5259.25ms 
iter 3505: loss 2.5784, time 5257.56ms 
iter 3506: loss 2.5697, time 5263.13ms 
iter 3507: loss 2.6521, time 5277.10ms 
iter 3508: loss 2.9414, time 5309.22ms 
iter 3509: loss 2.6078, time 5311.27ms 
iter 3510: loss 2.7047, time 5341.79ms 
iter 3511: loss 2.5257, time 5315.41ms 
iter 3512: loss 2.7170, time 5271.51ms 
iter 3513: loss 2.6655, time 5263.18ms 
iter 3514: loss 2.6633, time 5257.52ms 
iter 3515: loss 2.6646, time 5259.21ms 
iter 3516: loss 2.6988, time 5267.25ms 
iter 3517: loss 2.7308, time 5271.66ms 
iter 3518: loss 2.6755, time 5267.94ms 
iter 3519: loss 2.8544, time 5274.13ms 
iter 3520: loss 2.8660, time 5265.14ms 
iter 3521: loss 2.6301, time 5267.25ms 
iter 3522: loss 2.7090, time 5269.51ms 
iter 3523: loss 2.8035, time 5265.37ms 
iter 3524: loss 2.4954, time 5264.16ms 
iter 3525: loss 2.7515, time 5284.07ms 
iter 3526: loss 2.7847, time 5265.20ms 
iter 3527: loss 2.9465, time 5257.52ms 
iter 3528: loss 2.8062, time 5268.44ms 
iter 3529: loss 2.7776, time 5266.40ms 
iter 3530: loss 2.7298, time 5265.74ms 
iter 3531: loss 2.6533, time 5322.13ms 
iter 3532: loss 2.6511, time 5302.87ms 
iter 3533: loss 2.5956, time 5257.32ms 
iter 3534: loss 2.7741, time 5269.35ms 
iter 3535: loss 2.7988, time 5260.74ms 
iter 3536: loss 2.7034, time 5260.49ms 
iter 3537: loss 2.6590, time 5220.93ms 
iter 3538: loss 2.6504, time 5285.00ms 
iter 3539: loss 2.6406, time 5260.93ms 
iter 3540: loss 2.6866, time 5264.56ms 
iter 3541: loss 2.6472, time 5261.46ms 
iter 3542: loss 2.4934, time 5255.16ms 
iter 3543: loss 2.6632, time 5253.61ms 
iter 3544: loss 2.5927, time 5216.49ms 
iter 3545: loss 2.6900, time 5260.57ms 
iter 3546: loss 2.6484, time 5266.71ms 
iter 3547: loss 2.7997, time 5270.58ms 
iter 3548: loss 2.5566, time 5255.94ms 
iter 3549: loss 2.6143, time 5256.85ms 
step 3550: train loss 2.6752, val loss 2.8451
iter 3550: loss 2.6605, time 20091.32ms 
iter 3551: loss 2.6060, time 5266.71ms 
iter 3552: loss 2.8272, time 5277.36ms 
iter 3553: loss 2.5803, time 5277.60ms 
iter 3554: loss 2.6217, time 5276.50ms 
iter 3555: loss 2.8461, time 5294.27ms 
iter 3556: loss 2.6533, time 5259.43ms 
iter 3557: loss 2.5646, time 5275.83ms 
iter 3558: loss 2.7068, time 5267.41ms 
iter 3559: loss 2.7334, time 5267.52ms 
iter 3560: loss 2.6893, time 5264.03ms 
iter 3561: loss 2.6388, time 5253.04ms 
iter 3562: loss 2.7945, time 5254.05ms 
iter 3563: loss 2.4881, time 5247.58ms 
iter 3564: loss 2.6171, time 5241.82ms 
iter 3565: loss 2.8069, time 5257.99ms 
iter 3566: loss 2.7557, time 5255.14ms 
iter 3567: loss 2.5885, time 5262.39ms 
iter 3568: loss 2.5518, time 5255.91ms 
iter 3569: loss 2.6280, time 5233.65ms 
iter 3570: loss 2.5528, time 5268.54ms 
iter 3571: loss 2.8130, time 5268.15ms 
iter 3572: loss 2.8079, time 5257.51ms 
iter 3573: loss 2.5987, time 5259.80ms 
iter 3574: loss 2.6124, time 5260.49ms 
iter 3575: loss 2.8347, time 5266.96ms 
iter 3576: loss 2.7399, time 5251.14ms 
iter 3577: loss 2.7860, time 5262.70ms 
iter 3578: loss 2.7553, time 5258.14ms 
iter 3579: loss 2.6832, time 5258.72ms 
iter 3580: loss 2.5797, time 5265.69ms 
iter 3581: loss 2.7049, time 5275.02ms 
iter 3582: loss 2.7829, time 5262.62ms 
iter 3583: loss 2.5594, time 5258.72ms 
iter 3584: loss 2.5916, time 5266.03ms 
iter 3585: loss 2.4391, time 5259.89ms 
iter 3586: loss 2.7302, time 5267.94ms 
iter 3587: loss 2.7840, time 5269.63ms 
iter 3588: loss 2.6660, time 5343.02ms 
iter 3589: loss 2.6976, time 5289.10ms 
iter 3590: loss 2.6040, time 5280.36ms 
iter 3591: loss 2.7499, time 5265.37ms 
iter 3592: loss 2.5802, time 5269.18ms 
iter 3593: loss 2.8199, time 5262.48ms 
iter 3594: loss 2.7230, time 5277.33ms 
iter 3595: loss 2.7642, time 5272.90ms 
iter 3596: loss 2.7481, time 5268.04ms 
iter 3597: loss 2.6321, time 5263.74ms 
iter 3598: loss 2.5082, time 5262.60ms 
iter 3599: loss 2.7218, time 5260.10ms 
step 3600: train loss 2.6740, val loss 2.8738
iter 3600: loss 2.4997, time 20040.97ms 
iter 3601: loss 2.5564, time 5264.55ms 
iter 3602: loss 2.7233, time 5262.16ms 
iter 3603: loss 2.6644, time 5266.20ms 
iter 3604: loss 2.7033, time 5263.35ms 
iter 3605: loss 2.9161, time 5267.59ms 
iter 3606: loss 2.4968, time 5272.84ms 
iter 3607: loss 2.6656, time 5265.10ms 
iter 3608: loss 2.4938, time 5258.13ms 
iter 3609: loss 2.6609, time 5255.85ms 
iter 3610: loss 2.5114, time 5257.66ms 
iter 3611: loss 2.6587, time 5265.09ms 
iter 3612: loss 2.5861, time 5294.09ms 
iter 3613: loss 2.6235, time 5271.71ms 
iter 3614: loss 2.7278, time 5269.34ms 
iter 3615: loss 2.6608, time 5262.31ms 
iter 3616: loss 2.7615, time 5266.16ms 
iter 3617: loss 2.6491, time 5272.54ms 
iter 3618: loss 2.6836, time 5273.23ms 
iter 3619: loss 2.9250, time 5259.54ms 
iter 3620: loss 2.6786, time 5257.61ms 
iter 3621: loss 2.5227, time 5269.00ms 
iter 3622: loss 2.5807, time 5266.41ms 
iter 3623: loss 2.5971, time 5259.40ms 
iter 3624: loss 2.5757, time 5260.96ms 
iter 3625: loss 2.7807, time 5261.90ms 
iter 3626: loss 2.6400, time 5258.17ms 
iter 3627: loss 2.7072, time 5257.08ms 
iter 3628: loss 2.6135, time 5267.11ms 
iter 3629: loss 2.8330, time 5268.94ms 
iter 3630: loss 2.5489, time 5255.57ms 
iter 3631: loss 2.7648, time 5260.02ms 
iter 3632: loss 2.5779, time 5261.07ms 
iter 3633: loss 2.7170, time 5264.77ms 
iter 3634: loss 2.7234, time 5262.78ms 
iter 3635: loss 2.5806, time 5256.26ms 
iter 3636: loss 2.5883, time 5254.82ms 
iter 3637: loss 2.6255, time 5266.15ms 
iter 3638: loss 2.5363, time 5275.45ms 
iter 3639: loss 2.6856, time 5270.95ms 
iter 3640: loss 2.6183, time 5264.54ms 
iter 3641: loss 2.5563, time 5259.44ms 
iter 3642: loss 2.6668, time 5264.73ms 
iter 3643: loss 2.6126, time 5274.13ms 
iter 3644: loss 2.6022, time 5269.39ms 
iter 3645: loss 2.6340, time 5265.41ms 
iter 3646: loss 2.6709, time 5269.08ms 
iter 3647: loss 2.6307, time 5263.74ms 
iter 3648: loss 2.6062, time 5264.15ms 
iter 3649: loss 2.6388, time 5264.41ms 
step 3650: train loss 2.6649, val loss 2.8587
iter 3650: loss 2.5844, time 20067.78ms 
iter 3651: loss 2.7104, time 5259.19ms 
iter 3652: loss 2.6965, time 5263.19ms 
iter 3653: loss 2.9423, time 5255.12ms 
iter 3654: loss 2.5693, time 5254.03ms 
iter 3655: loss 2.7200, time 5260.36ms 
iter 3656: loss 2.7899, time 5266.13ms 
iter 3657: loss 2.9036, time 5269.01ms 
iter 3658: loss 2.6622, time 5272.02ms 
iter 3659: loss 2.7028, time 5237.08ms 
iter 3660: loss 2.8220, time 5265.97ms 
iter 3661: loss 2.7350, time 5255.60ms 
iter 3662: loss 2.7312, time 5267.29ms 
iter 3663: loss 2.6473, time 5242.63ms 
iter 3664: loss 2.6327, time 5273.90ms 
iter 3665: loss 2.8204, time 5268.19ms 
iter 3666: loss 2.7150, time 5263.54ms 
iter 3667: loss 2.5096, time 5273.68ms 
iter 3668: loss 2.6588, time 5258.52ms 
iter 3669: loss 2.6564, time 5259.76ms 
iter 3670: loss 2.5998, time 5262.95ms 
iter 3671: loss 2.5604, time 5255.97ms 
iter 3672: loss 2.6972, time 5264.43ms 
iter 3673: loss 2.9057, time 5253.74ms 
iter 3674: loss 2.6521, time 5263.59ms 
iter 3675: loss 2.6421, time 5253.55ms 
iter 3676: loss 2.7439, time 5259.84ms 
iter 3677: loss 2.7612, time 5261.84ms 
iter 3678: loss 2.7601, time 5263.72ms 
iter 3679: loss 2.4502, time 5266.89ms 
iter 3680: loss 2.7408, time 5257.56ms 
iter 3681: loss 2.6630, time 5251.59ms 
iter 3682: loss 2.7723, time 5242.60ms 
iter 3683: loss 2.5919, time 5234.50ms 
iter 3684: loss 2.6874, time 5248.23ms 
iter 3685: loss 2.5015, time 5262.59ms 
iter 3686: loss 2.5448, time 5257.05ms 
iter 3687: loss 2.5159, time 5264.49ms 
iter 3688: loss 2.7008, time 5265.84ms 
iter 3689: loss 2.8899, time 5267.54ms 
iter 3690: loss 2.6890, time 5256.81ms 
iter 3691: loss 2.7153, time 5257.42ms 
iter 3692: loss 2.6515, time 5314.46ms 
iter 3693: loss 2.6120, time 5345.96ms 
iter 3694: loss 2.6517, time 5343.44ms 
iter 3695: loss 2.6785, time 5335.89ms 
iter 3696: loss 2.7155, time 5306.07ms 
iter 3697: loss 2.7243, time 5257.82ms 
iter 3698: loss 2.4954, time 5256.85ms 
iter 3699: loss 2.6880, time 5264.95ms 
step 3700: train loss 2.6742, val loss 2.8499
iter 3700: loss 2.8004, time 20055.44ms 
iter 3701: loss 2.6030, time 5268.23ms 
iter 3702: loss 2.5697, time 5283.66ms 
iter 3703: loss 2.8497, time 5302.75ms 
iter 3704: loss 2.6320, time 5256.24ms 
iter 3705: loss 2.7071, time 5254.87ms 
iter 3706: loss 2.4705, time 5264.56ms 
iter 3707: loss 2.6581, time 5313.13ms 
iter 3708: loss 2.6609, time 5266.71ms 
iter 3709: loss 2.6097, time 5259.13ms 
iter 3710: loss 2.8467, time 5254.36ms 
iter 3711: loss 2.4404, time 5259.57ms 
iter 3712: loss 2.6568, time 5317.94ms 
iter 3713: loss 2.5100, time 5266.94ms 
iter 3714: loss 2.5239, time 5331.02ms 
iter 3715: loss 2.4608, time 5318.82ms 
iter 3716: loss 2.9952, time 5264.95ms 
iter 3717: loss 2.6292, time 5270.00ms 
iter 3718: loss 2.6836, time 5261.79ms 
iter 3719: loss 2.6270, time 5261.45ms 
iter 3720: loss 2.8251, time 5262.48ms 
iter 3721: loss 2.4320, time 5259.34ms 
iter 3722: loss 2.7471, time 5243.98ms 
iter 3723: loss 2.6912, time 5268.18ms 
iter 3724: loss 2.4534, time 5259.74ms 
iter 3725: loss 2.6327, time 5270.19ms 
iter 3726: loss 2.8064, time 5226.17ms 
iter 3727: loss 2.6575, time 5273.83ms 
iter 3728: loss 2.8024, time 5256.69ms 
iter 3729: loss 2.8260, time 5259.31ms 
iter 3730: loss 2.7099, time 5258.43ms 
iter 3731: loss 2.6110, time 5263.68ms 
iter 3732: loss 2.4958, time 5270.54ms 
iter 3733: loss 2.7097, time 5265.67ms 
iter 3734: loss 2.8187, time 5256.11ms 
iter 3735: loss 2.6033, time 5264.14ms 
iter 3736: loss 2.5825, time 5264.44ms 
iter 3737: loss 2.7444, time 5263.46ms 
iter 3738: loss 2.6413, time 5269.77ms 
iter 3739: loss 2.6567, time 5269.89ms 
iter 3740: loss 2.7198, time 5261.61ms 
iter 3741: loss 2.8710, time 5271.02ms 
iter 3742: loss 2.6447, time 5323.31ms 
iter 3743: loss 2.6229, time 5321.26ms 
iter 3744: loss 2.7134, time 5270.89ms 
iter 3745: loss 2.7837, time 5263.13ms 
iter 3746: loss 2.6908, time 5305.64ms 
iter 3747: loss 2.6445, time 5258.90ms 
iter 3748: loss 2.6552, time 5273.18ms 
iter 3749: loss 2.7768, time 5271.04ms 
step 3750: train loss 2.6482, val loss 2.8411
iter 3750: loss 2.6544, time 20064.92ms 
iter 3751: loss 2.8305, time 5260.01ms 
iter 3752: loss 2.6108, time 5300.32ms 
iter 3753: loss 2.6825, time 5265.34ms 
iter 3754: loss 2.8232, time 5345.81ms 
iter 3755: loss 2.6014, time 5292.19ms 
iter 3756: loss 2.6485, time 5280.16ms 
iter 3757: loss 2.6412, time 5323.47ms 
iter 3758: loss 2.6458, time 5275.72ms 
iter 3759: loss 2.5626, time 5261.37ms 
iter 3760: loss 2.7001, time 5263.20ms 
iter 3761: loss 2.7409, time 5264.09ms 
iter 3762: loss 2.7612, time 5284.50ms 
iter 3763: loss 2.6222, time 5269.80ms 
iter 3764: loss 2.7234, time 5263.36ms 
iter 3765: loss 2.6624, time 5262.83ms 
iter 3766: loss 2.7200, time 5265.01ms 
iter 3767: loss 2.7540, time 5265.85ms 
iter 3768: loss 2.6966, time 5274.92ms 
iter 3769: loss 2.5979, time 5266.86ms 
iter 3770: loss 2.5889, time 5265.44ms 
iter 3771: loss 2.5633, time 5276.18ms 
iter 3772: loss 2.9976, time 5301.07ms 
iter 3773: loss 2.3560, time 5303.93ms 
iter 3774: loss 2.6339, time 5300.55ms 
iter 3775: loss 2.8649, time 5264.21ms 
iter 3776: loss 2.7080, time 5265.43ms 
iter 3777: loss 2.5035, time 5268.95ms 
iter 3778: loss 2.5990, time 5270.08ms 
iter 3779: loss 2.6364, time 5270.58ms 
iter 3780: loss 2.5020, time 5268.69ms 
iter 3781: loss 2.7230, time 5260.33ms 
iter 3782: loss 2.9176, time 5324.53ms 
iter 3783: loss 2.5361, time 5281.17ms 
iter 3784: loss 2.5537, time 5279.27ms 
iter 3785: loss 2.7793, time 5267.30ms 
iter 3786: loss 2.8608, time 5267.99ms 
iter 3787: loss 2.6626, time 5265.93ms 
iter 3788: loss 2.6836, time 5276.20ms 
iter 3789: loss 2.6784, time 5264.69ms 
iter 3790: loss 2.5644, time 5257.16ms 
iter 3791: loss 2.7248, time 5269.35ms 
iter 3792: loss 2.6436, time 5268.46ms 
iter 3793: loss 2.6698, time 5264.25ms 
iter 3794: loss 2.8261, time 5265.51ms 
iter 3795: loss 2.7598, time 5259.69ms 
iter 3796: loss 2.7064, time 5255.73ms 
iter 3797: loss 2.6250, time 5261.91ms 
iter 3798: loss 2.6426, time 5265.98ms 
iter 3799: loss 2.5944, time 5280.47ms 
step 3800: train loss 2.6630, val loss 2.8615
iter 3800: loss 2.6355, time 20091.07ms 
iter 3801: loss 2.6636, time 5262.73ms 
iter 3802: loss 2.5665, time 5235.65ms 
iter 3803: loss 2.6032, time 5260.03ms 
iter 3804: loss 2.5753, time 5262.16ms 
iter 3805: loss 2.5972, time 5267.63ms 
iter 3806: loss 2.7672, time 5282.52ms 
iter 3807: loss 2.5828, time 5278.15ms 
iter 3808: loss 2.6603, time 5274.11ms 
iter 3809: loss 2.6951, time 5265.96ms 
iter 3810: loss 2.6345, time 5263.66ms 
iter 3811: loss 2.8085, time 5282.88ms 
iter 3812: loss 2.5088, time 5276.68ms 
iter 3813: loss 2.8949, time 5264.72ms 
iter 3814: loss 2.7326, time 5272.18ms 
iter 3815: loss 2.5124, time 5272.23ms 
iter 3816: loss 2.6812, time 5278.82ms 
iter 3817: loss 2.6989, time 5285.14ms 
iter 3818: loss 2.5461, time 5271.61ms 
iter 3819: loss 2.6522, time 5263.76ms 
iter 3820: loss 2.6394, time 5268.14ms 
iter 3821: loss 2.6717, time 5284.01ms 
iter 3822: loss 2.6223, time 5288.98ms 
iter 3823: loss 2.8406, time 5262.26ms 
iter 3824: loss 2.4292, time 5276.73ms 
iter 3825: loss 2.6023, time 5277.59ms 
iter 3826: loss 2.5582, time 5288.55ms 
iter 3827: loss 2.9629, time 5265.51ms 
iter 3828: loss 2.6939, time 5260.74ms 
iter 3829: loss 2.6612, time 5340.20ms 
iter 3830: loss 2.6835, time 5339.69ms 
iter 3831: loss 2.7180, time 5304.83ms 
iter 3832: loss 2.7104, time 5310.56ms 
iter 3833: loss 2.7170, time 5341.00ms 
iter 3834: loss 2.6335, time 5291.18ms 
iter 3835: loss 2.5305, time 5333.03ms 
iter 3836: loss 2.6104, time 5291.14ms 
iter 3837: loss 2.6352, time 5223.80ms 
iter 3838: loss 2.6001, time 5238.38ms 
iter 3839: loss 2.7913, time 5252.68ms 
iter 3840: loss 2.5004, time 5255.10ms 
iter 3841: loss 2.7068, time 5273.79ms 
iter 3842: loss 2.6276, time 5259.78ms 
iter 3843: loss 2.5857, time 5264.38ms 
iter 3844: loss 2.6302, time 5271.18ms 
iter 3845: loss 2.6022, time 5256.69ms 
iter 3846: loss 2.5843, time 5255.82ms 
iter 3847: loss 2.7038, time 5265.98ms 
iter 3848: loss 2.6841, time 5263.63ms 
iter 3849: loss 2.6158, time 5259.72ms 
step 3850: train loss 2.6605, val loss 2.8423
iter 3850: loss 2.4832, time 20022.73ms 
iter 3851: loss 2.9726, time 5250.66ms 
iter 3852: loss 2.6576, time 5272.50ms 
iter 3853: loss 2.6134, time 5270.18ms 
iter 3854: loss 2.7092, time 5319.37ms 
iter 3855: loss 2.4584, time 5281.59ms 
iter 3856: loss 2.6616, time 5262.94ms 
iter 3857: loss 2.6368, time 5267.09ms 
iter 3858: loss 2.7846, time 5266.50ms 
iter 3859: loss 2.8497, time 5267.22ms 
iter 3860: loss 2.8076, time 5269.47ms 
iter 3861: loss 2.5138, time 5265.20ms 
iter 3862: loss 2.5982, time 5269.25ms 
iter 3863: loss 2.6851, time 5273.69ms 
iter 3864: loss 2.7845, time 5280.65ms 
iter 3865: loss 2.5568, time 5272.41ms 
iter 3866: loss 2.8302, time 5279.30ms 
iter 3867: loss 2.7638, time 5278.10ms 
iter 3868: loss 2.5887, time 5262.53ms 
iter 3869: loss 2.3800, time 5267.25ms 
iter 3870: loss 2.7328, time 5256.94ms 
iter 3871: loss 2.5848, time 5266.33ms 
iter 3872: loss 2.6271, time 5264.54ms 
iter 3873: loss 2.4943, time 5267.85ms 
iter 3874: loss 2.8238, time 5284.57ms 
iter 3875: loss 2.6804, time 5270.58ms 
iter 3876: loss 2.8131, time 5265.96ms 
iter 3877: loss 2.6976, time 5279.08ms 
iter 3878: loss 2.6935, time 5273.30ms 
iter 3879: loss 2.7666, time 5272.48ms 
iter 3880: loss 2.5344, time 5283.75ms 
iter 3881: loss 2.5272, time 5266.61ms 
iter 3882: loss 2.3785, time 5264.24ms 
iter 3883: loss 2.8877, time 5256.66ms 
iter 3884: loss 2.7265, time 5258.10ms 
iter 3885: loss 2.5575, time 5272.96ms 
iter 3886: loss 2.7074, time 5269.35ms 
iter 3887: loss 2.6056, time 5264.23ms 
iter 3888: loss 2.7183, time 5250.63ms 
iter 3889: loss 2.7842, time 5274.55ms 
iter 3890: loss 2.9292, time 5283.24ms 
iter 3891: loss 2.5860, time 5272.39ms 
iter 3892: loss 2.8259, time 5256.87ms 
iter 3893: loss 2.5353, time 5259.84ms 
iter 3894: loss 2.3368, time 5285.88ms 
iter 3895: loss 2.7331, time 5274.67ms 
iter 3896: loss 2.4852, time 5277.17ms 
iter 3897: loss 2.6853, time 5280.02ms 
iter 3898: loss 2.6094, time 5263.08ms 
iter 3899: loss 2.6245, time 5252.22ms 
step 3900: train loss 2.6526, val loss 2.8513
iter 3900: loss 2.6962, time 20122.14ms 
iter 3901: loss 2.6739, time 5292.39ms 
iter 3902: loss 2.6533, time 5269.22ms 
iter 3903: loss 2.5269, time 5264.79ms 
iter 3904: loss 2.6442, time 5277.25ms 
iter 3905: loss 2.6067, time 5270.63ms 
iter 3906: loss 2.6373, time 5275.04ms 
iter 3907: loss 2.6082, time 5310.61ms 
iter 3908: loss 2.7507, time 5272.99ms 
iter 3909: loss 2.4533, time 5312.30ms 
iter 3910: loss 2.6320, time 5260.68ms 
iter 3911: loss 2.7594, time 5277.57ms 
iter 3912: loss 2.4444, time 5273.54ms 
iter 3913: loss 2.8391, time 5279.38ms 
iter 3914: loss 2.4687, time 5228.84ms 
iter 3915: loss 2.7584, time 5263.39ms 
iter 3916: loss 2.5767, time 5256.10ms 
iter 3917: loss 2.5818, time 5259.53ms 
iter 3918: loss 2.6047, time 5261.56ms 
iter 3919: loss 2.9075, time 5281.39ms 
iter 3920: loss 2.5336, time 5326.46ms 
iter 3921: loss 2.7249, time 5337.45ms 
iter 3922: loss 2.7270, time 5328.39ms 
iter 3923: loss 2.8633, time 5328.12ms 
iter 3924: loss 2.5055, time 5346.25ms 
iter 3925: loss 2.4249, time 5338.64ms 
iter 3926: loss 2.7982, time 5307.11ms 
iter 3927: loss 2.7925, time 5261.82ms 
iter 3928: loss 2.5771, time 5261.71ms 
iter 3929: loss 2.4875, time 5261.44ms 
iter 3930: loss 2.5846, time 5301.06ms 
iter 3931: loss 2.6143, time 5341.84ms 
iter 3932: loss 2.5106, time 5339.55ms 
iter 3933: loss 2.5048, time 5268.45ms 
iter 3934: loss 2.5390, time 5274.36ms 
iter 3935: loss 2.6065, time 5273.11ms 
iter 3936: loss 2.6841, time 5305.55ms 
iter 3937: loss 2.9076, time 5296.51ms 
iter 3938: loss 2.7274, time 5283.14ms 
iter 3939: loss 2.6986, time 5286.01ms 
iter 3940: loss 2.5865, time 5333.68ms 
iter 3941: loss 2.5156, time 5294.48ms 
iter 3942: loss 2.6191, time 5261.53ms 
iter 3943: loss 2.5416, time 5291.73ms 
iter 3944: loss 2.7617, time 5275.47ms 
iter 3945: loss 2.6830, time 5269.76ms 
iter 3946: loss 2.6185, time 5264.58ms 
iter 3947: loss 2.5562, time 5254.93ms 
iter 3948: loss 2.4616, time 5262.00ms 
iter 3949: loss 2.6342, time 5264.69ms 
step 3950: train loss 2.6450, val loss 2.8555
iter 3950: loss 2.6971, time 20056.43ms 
iter 3951: loss 2.4984, time 5263.52ms 
iter 3952: loss 2.8081, time 5257.68ms 
iter 3953: loss 2.6233, time 5267.19ms 
iter 3954: loss 2.5927, time 5234.78ms 
iter 3955: loss 2.6851, time 5254.57ms 
iter 3956: loss 2.5172, time 5260.91ms 
iter 3957: loss 2.5189, time 5262.48ms 
iter 3958: loss 2.6407, time 5259.56ms 
iter 3959: loss 2.6391, time 5256.65ms 
iter 3960: loss 2.8550, time 5262.92ms 
iter 3961: loss 2.6914, time 5259.59ms 
iter 3962: loss 2.6601, time 5281.15ms 
iter 3963: loss 2.5526, time 5265.25ms 
iter 3964: loss 2.6140, time 5263.99ms 
iter 3965: loss 2.5532, time 5270.78ms 
iter 3966: loss 2.8257, time 5272.68ms 
iter 3967: loss 2.5333, time 5259.31ms 
iter 3968: loss 2.7192, time 5266.36ms 
iter 3969: loss 2.5120, time 5247.49ms 
iter 3970: loss 2.7637, time 5252.92ms 
iter 3971: loss 2.6399, time 5264.45ms 
iter 3972: loss 2.6187, time 5273.80ms 
iter 3973: loss 2.5998, time 5270.53ms 
iter 3974: loss 2.5658, time 5267.48ms 
iter 3975: loss 2.5400, time 5265.27ms 
iter 3976: loss 2.6478, time 5271.80ms 
iter 3977: loss 2.6052, time 5228.92ms 
iter 3978: loss 2.8759, time 5256.71ms 
iter 3979: loss 2.6892, time 5255.61ms 
iter 3980: loss 2.8920, time 5254.72ms 
iter 3981: loss 2.7251, time 5268.66ms 
iter 3982: loss 2.7220, time 5263.89ms 
iter 3983: loss 2.8098, time 5262.83ms 
iter 3984: loss 2.8204, time 5262.69ms 
iter 3985: loss 2.7678, time 5276.03ms 
iter 3986: loss 2.8031, time 5269.50ms 
iter 3987: loss 2.6024, time 5266.01ms 
iter 3988: loss 2.4227, time 5267.01ms 
iter 3989: loss 2.7486, time 5273.66ms 
iter 3990: loss 2.7704, time 5248.37ms 
iter 3991: loss 2.6662, time 5253.23ms 
iter 3992: loss 2.6724, time 5261.04ms 
iter 3993: loss 2.5976, time 5262.20ms 
iter 3994: loss 2.4615, time 5252.32ms 
iter 3995: loss 2.5159, time 5254.75ms 
iter 3996: loss 2.6098, time 5257.83ms 
iter 3997: loss 2.7151, time 5261.04ms 
iter 3998: loss 2.5534, time 5257.93ms 
iter 3999: loss 2.6243, time 5247.48ms 
step 4000: train loss 2.6480, val loss 2.8419
iter 4000: loss 2.5744, time 19919.96ms 
iter 4001: loss 2.7911, time 5258.85ms 
iter 4002: loss 2.7011, time 5273.91ms 
iter 4003: loss 2.7937, time 5256.11ms 
iter 4004: loss 2.9075, time 5253.30ms 
iter 4005: loss 2.6615, time 5271.94ms 
iter 4006: loss 2.6595, time 5226.78ms 
iter 4007: loss 2.8786, time 5268.55ms 
iter 4008: loss 2.6190, time 5267.72ms 
iter 4009: loss 2.6162, time 5264.62ms 
iter 4010: loss 2.5789, time 5266.56ms 
iter 4011: loss 2.7561, time 5277.63ms 
iter 4012: loss 2.5519, time 5274.30ms 
iter 4013: loss 2.5388, time 5332.45ms 
iter 4014: loss 2.7072, time 5345.53ms 
iter 4015: loss 2.6524, time 5321.43ms 
iter 4016: loss 2.6250, time 5226.56ms 
iter 4017: loss 2.6685, time 5217.07ms 
iter 4018: loss 2.6305, time 5253.90ms 
iter 4019: loss 2.5360, time 5264.53ms 
iter 4020: loss 2.6661, time 5247.00ms 
iter 4021: loss 2.6575, time 5248.09ms 
iter 4022: loss 2.7704, time 5250.57ms 
iter 4023: loss 2.5756, time 5249.69ms 
iter 4024: loss 2.7663, time 5248.27ms 
iter 4025: loss 2.7266, time 5241.60ms 
iter 4026: loss 2.6298, time 5250.02ms 
iter 4027: loss 2.7579, time 5244.55ms 
iter 4028: loss 2.7972, time 5256.47ms 
iter 4029: loss 2.4899, time 5236.09ms 
iter 4030: loss 2.6205, time 5246.61ms 
iter 4031: loss 2.5504, time 5244.71ms 
iter 4032: loss 2.4290, time 5245.66ms 
iter 4033: loss 2.6192, time 5245.22ms 
iter 4034: loss 2.8436, time 5251.30ms 
iter 4035: loss 2.3952, time 5261.34ms 
iter 4036: loss 2.8934, time 5258.15ms 
iter 4037: loss 2.4487, time 5254.20ms 
iter 4038: loss 2.7396, time 5255.32ms 
iter 4039: loss 2.8407, time 5256.21ms 
iter 4040: loss 2.5575, time 5262.19ms 
iter 4041: loss 2.6009, time 5161.18ms 
iter 4042: loss 2.7640, time 5254.65ms 
iter 4043: loss 2.7480, time 5258.59ms 
iter 4044: loss 2.7243, time 5234.42ms 
iter 4045: loss 2.9032, time 5222.70ms 
iter 4046: loss 2.7243, time 5219.46ms 
iter 4047: loss 2.6112, time 5240.99ms 
iter 4048: loss 2.7153, time 5248.89ms 
iter 4049: loss 2.5522, time 5246.83ms 
step 4050: train loss 2.6353, val loss 2.8399
iter 4050: loss 2.5920, time 20030.13ms 
iter 4051: loss 2.5477, time 5241.07ms 
iter 4052: loss 2.4404, time 5255.06ms 
iter 4053: loss 2.5966, time 5261.88ms 
iter 4054: loss 2.7072, time 5260.11ms 
iter 4055: loss 2.6306, time 5254.85ms 
iter 4056: loss 2.4870, time 5263.01ms 
iter 4057: loss 2.7690, time 5250.61ms 
iter 4058: loss 2.8203, time 5238.56ms 
iter 4059: loss 2.4792, time 5247.67ms 
iter 4060: loss 2.7930, time 5248.94ms 
iter 4061: loss 2.4604, time 5254.59ms 
iter 4062: loss 2.5828, time 5210.69ms 
iter 4063: loss 2.7270, time 5158.93ms 
iter 4064: loss 2.6576, time 5127.08ms 
iter 4065: loss 2.4832, time 5162.68ms 
iter 4066: loss 2.6418, time 5157.75ms 
iter 4067: loss 2.7819, time 5176.17ms 
iter 4068: loss 2.7094, time 5166.63ms 
iter 4069: loss 2.5774, time 5173.82ms 
iter 4070: loss 2.5823, time 5247.48ms 
iter 4071: loss 2.6419, time 5251.06ms 
iter 4072: loss 2.7020, time 5277.35ms 
iter 4073: loss 2.3250, time 5260.37ms 
iter 4074: loss 2.5837, time 5259.87ms 
iter 4075: loss 2.7137, time 5273.14ms 
iter 4076: loss 2.5772, time 5270.76ms 
iter 4077: loss 2.6153, time 5260.55ms 
iter 4078: loss 2.6556, time 5291.29ms 
iter 4079: loss 2.6156, time 5288.43ms 
iter 4080: loss 2.4992, time 5286.52ms 
iter 4081: loss 2.9178, time 5306.07ms 
iter 4082: loss 2.6412, time 5314.81ms 
iter 4083: loss 2.6318, time 5295.68ms 
iter 4084: loss 2.6659, time 5270.43ms 
iter 4085: loss 2.7268, time 5283.08ms 
iter 4086: loss 2.6323, time 5275.58ms 
iter 4087: loss 2.7135, time 5260.37ms 
iter 4088: loss 2.7490, time 5262.70ms 
iter 4089: loss 2.5782, time 5268.37ms 
iter 4090: loss 2.5227, time 5271.28ms 
iter 4091: loss 2.5933, time 5273.72ms 
iter 4092: loss 2.7124, time 5282.90ms 
iter 4093: loss 2.8844, time 5261.66ms 
iter 4094: loss 2.5064, time 5260.17ms 
iter 4095: loss 2.5563, time 5276.29ms 
iter 4096: loss 2.5707, time 5265.29ms 
iter 4097: loss 2.5990, time 5268.73ms 
iter 4098: loss 2.6255, time 5269.64ms 
iter 4099: loss 2.6054, time 5307.15ms 
step 4100: train loss 2.6359, val loss 2.8528
iter 4100: loss 2.5544, time 20097.32ms 
iter 4101: loss 2.7068, time 5279.43ms 
iter 4102: loss 2.5995, time 5280.48ms 
iter 4103: loss 2.6418, time 5281.32ms 
iter 4104: loss 2.7818, time 5272.56ms 
iter 4105: loss 2.6129, time 5270.21ms 
iter 4106: loss 2.7245, time 5273.11ms 
iter 4107: loss 2.7264, time 5281.71ms 
iter 4108: loss 2.6557, time 5278.44ms 
iter 4109: loss 2.7687, time 5271.56ms 
iter 4110: loss 2.6436, time 5264.94ms 
iter 4111: loss 2.6467, time 5268.06ms 
iter 4112: loss 2.5262, time 5270.98ms 
iter 4113: loss 2.7015, time 5259.42ms 
iter 4114: loss 2.7904, time 5258.81ms 
iter 4115: loss 2.7987, time 5265.74ms 
iter 4116: loss 2.5693, time 5281.70ms 
iter 4117: loss 2.4893, time 5280.23ms 
iter 4118: loss 2.5915, time 5264.74ms 
iter 4119: loss 2.9785, time 5259.50ms 
iter 4120: loss 2.6806, time 5262.46ms 
iter 4121: loss 2.7305, time 5315.09ms 
iter 4122: loss 2.6436, time 5252.53ms 
iter 4123: loss 2.4989, time 5271.20ms 
iter 4124: loss 2.6650, time 5258.46ms 
iter 4125: loss 2.5350, time 5265.67ms 
iter 4126: loss 2.4991, time 5300.40ms 
iter 4127: loss 2.8360, time 5320.81ms 
iter 4128: loss 2.4530, time 5276.24ms 
iter 4129: loss 2.6196, time 5279.36ms 
iter 4130: loss 2.5249, time 5285.66ms 
iter 4131: loss 2.6368, time 5269.44ms 
iter 4132: loss 2.7130, time 5262.85ms 
iter 4133: loss 2.5119, time 5266.70ms 
iter 4134: loss 2.6673, time 5264.38ms 
iter 4135: loss 2.5719, time 5278.64ms 
iter 4136: loss 2.5371, time 5281.22ms 
iter 4137: loss 2.6753, time 5275.93ms 
iter 4138: loss 2.6810, time 5274.74ms 
iter 4139: loss 2.6590, time 5274.58ms 
iter 4140: loss 2.6335, time 5298.89ms 
iter 4141: loss 2.5420, time 5272.36ms 
iter 4142: loss 2.5704, time 5309.89ms 
iter 4143: loss 2.6609, time 5339.74ms 
iter 4144: loss 2.7190, time 5311.87ms 
iter 4145: loss 2.6454, time 5302.42ms 
iter 4146: loss 2.5468, time 5277.41ms 
iter 4147: loss 2.5421, time 5290.34ms 
iter 4148: loss 2.6237, time 5304.50ms 
iter 4149: loss 2.8618, time 5339.67ms 
step 4150: train loss 2.6330, val loss 2.8228
iter 4150: loss 2.7507, time 20173.87ms 
iter 4151: loss 2.6490, time 5294.27ms 
iter 4152: loss 2.4113, time 5284.94ms 
iter 4153: loss 2.7295, time 5290.23ms 
iter 4154: loss 2.6565, time 5286.56ms 
iter 4155: loss 2.6975, time 5285.36ms 
iter 4156: loss 2.7623, time 5285.62ms 
iter 4157: loss 2.6167, time 5332.07ms 
iter 4158: loss 2.6658, time 5265.52ms 
iter 4159: loss 2.5554, time 5269.79ms 
iter 4160: loss 2.3806, time 5267.03ms 
iter 4161: loss 2.5353, time 5280.24ms 
iter 4162: loss 2.6050, time 5275.29ms 
iter 4163: loss 2.7093, time 5250.75ms 
iter 4164: loss 2.7220, time 5269.22ms 
iter 4165: loss 2.8134, time 5261.90ms 
iter 4166: loss 2.3566, time 5259.54ms 
iter 4167: loss 2.6883, time 5266.74ms 
iter 4168: loss 2.6260, time 5269.03ms 
iter 4169: loss 2.7079, time 5264.36ms 
iter 4170: loss 2.6264, time 5264.62ms 
iter 4171: loss 2.4156, time 5276.49ms 
iter 4172: loss 2.5623, time 5268.38ms 
iter 4173: loss 2.5007, time 5270.91ms 
iter 4174: loss 2.5886, time 5275.35ms 
iter 4175: loss 2.8201, time 5273.22ms 
iter 4176: loss 2.5821, time 5271.54ms 
iter 4177: loss 2.8734, time 5291.00ms 
iter 4178: loss 2.5452, time 5312.35ms 
iter 4179: loss 2.6829, time 5271.94ms 
iter 4180: loss 2.7935, time 5284.01ms 
iter 4181: loss 2.4542, time 5299.01ms 
iter 4182: loss 2.6741, time 5277.36ms 
iter 4183: loss 2.5757, time 5277.81ms 
iter 4184: loss 2.6233, time 5274.21ms 
iter 4185: loss 2.6569, time 5258.38ms 
iter 4186: loss 2.5614, time 5321.64ms 
iter 4187: loss 2.5862, time 5277.33ms 
iter 4188: loss 2.6803, time 5276.67ms 
iter 4189: loss 2.6049, time 5273.05ms 
iter 4190: loss 2.4491, time 5279.03ms 
iter 4191: loss 2.4792, time 5285.71ms 
iter 4192: loss 2.6434, time 5287.26ms 
iter 4193: loss 2.6001, time 5271.45ms 
iter 4194: loss 2.7891, time 5288.99ms 
iter 4195: loss 2.4814, time 5276.90ms 
iter 4196: loss 2.7291, time 5275.37ms 
iter 4197: loss 2.6741, time 5286.03ms 
iter 4198: loss 2.7152, time 5270.36ms 
iter 4199: loss 2.7086, time 5308.99ms 
step 4200: train loss 2.6205, val loss 2.8440
iter 4200: loss 2.4442, time 20156.81ms 
iter 4201: loss 2.5680, time 5309.80ms 
iter 4202: loss 2.7825, time 5282.51ms 
iter 4203: loss 2.6565, time 5347.17ms 
iter 4204: loss 2.6446, time 5338.43ms 
iter 4205: loss 2.6198, time 5338.55ms 
iter 4206: loss 2.6709, time 5332.07ms 
iter 4207: loss 2.5436, time 5327.32ms 
iter 4208: loss 2.8896, time 5346.48ms 
iter 4209: loss 2.6606, time 5342.25ms 
iter 4210: loss 2.8014, time 5322.37ms 
iter 4211: loss 2.6567, time 5264.73ms 
iter 4212: loss 2.5867, time 5267.21ms 
iter 4213: loss 2.3991, time 5266.23ms 
iter 4214: loss 2.3291, time 5255.35ms 
iter 4215: loss 2.4771, time 5259.12ms 
iter 4216: loss 2.7399, time 5260.06ms 
iter 4217: loss 2.6770, time 5264.19ms 
iter 4218: loss 2.6784, time 5263.40ms 
iter 4219: loss 2.7209, time 5259.14ms 
iter 4220: loss 2.5897, time 5256.87ms 
iter 4221: loss 2.6590, time 5259.35ms 
iter 4222: loss 2.4852, time 5263.91ms 
iter 4223: loss 2.7341, time 5258.44ms 
iter 4224: loss 2.6500, time 5258.07ms 
iter 4225: loss 2.5604, time 5256.69ms 
iter 4226: loss 2.5382, time 5255.79ms 
iter 4227: loss 2.7138, time 5272.96ms 
iter 4228: loss 2.7087, time 5288.81ms 
iter 4229: loss 2.4830, time 5281.60ms 
iter 4230: loss 2.7170, time 5285.87ms 
iter 4231: loss 2.6165, time 5254.59ms 
iter 4232: loss 2.6778, time 5265.84ms 
iter 4233: loss 2.7517, time 5265.42ms 
iter 4234: loss 2.6742, time 5255.41ms 
iter 4235: loss 2.4346, time 5258.44ms 
iter 4236: loss 2.6376, time 5267.27ms 
iter 4237: loss 2.9487, time 5283.41ms 
iter 4238: loss 2.5654, time 5262.31ms 
iter 4239: loss 2.5234, time 5263.63ms 
iter 4240: loss 2.6014, time 5265.45ms 
iter 4241: loss 2.4830, time 5271.32ms 
iter 4242: loss 2.5614, time 5284.49ms 
iter 4243: loss 2.4767, time 5277.38ms 
iter 4244: loss 2.6834, time 5269.92ms 
iter 4245: loss 2.5923, time 5258.53ms 
iter 4246: loss 2.6273, time 5241.61ms 
iter 4247: loss 2.5080, time 5259.82ms 
iter 4248: loss 2.7075, time 5263.36ms 
iter 4249: loss 2.7058, time 5259.90ms 
step 4250: train loss 2.6356, val loss 2.8405
iter 4250: loss 2.6318, time 20066.74ms 
iter 4251: loss 2.4462, time 5262.01ms 
iter 4252: loss 2.4200, time 5309.28ms 
iter 4253: loss 2.7414, time 5280.50ms 
iter 4254: loss 2.5674, time 5337.81ms 
iter 4255: loss 2.6208, time 5291.26ms 
iter 4256: loss 2.7114, time 5307.12ms 
iter 4257: loss 2.5249, time 5329.59ms 
iter 4258: loss 2.6013, time 5328.89ms 
iter 4259: loss 2.5403, time 5280.00ms 
iter 4260: loss 2.8258, time 5314.78ms 
iter 4261: loss 2.5937, time 5308.86ms 
iter 4262: loss 2.6394, time 5345.99ms 
iter 4263: loss 2.8706, time 5293.55ms 
iter 4264: loss 2.6312, time 5259.04ms 
iter 4265: loss 2.6716, time 5264.86ms 
iter 4266: loss 2.7324, time 5258.12ms 
iter 4267: loss 2.7191, time 5248.45ms 
iter 4268: loss 2.5075, time 5278.62ms 
iter 4269: loss 2.5567, time 5271.05ms 
iter 4270: loss 2.5701, time 5262.85ms 
iter 4271: loss 2.8143, time 5279.39ms 
iter 4272: loss 2.6251, time 5266.45ms 
iter 4273: loss 2.5787, time 5264.07ms 
iter 4274: loss 2.7407, time 5264.40ms 
iter 4275: loss 2.6027, time 5264.46ms 
iter 4276: loss 2.6825, time 5274.38ms 
iter 4277: loss 2.5808, time 5262.00ms 
iter 4278: loss 2.4738, time 5256.33ms 
iter 4279: loss 2.6963, time 5268.05ms 
iter 4280: loss 2.7030, time 5267.19ms 
iter 4281: loss 2.7390, time 5269.04ms 
iter 4282: loss 2.5642, time 5272.70ms 
iter 4283: loss 2.6201, time 5272.71ms 
iter 4284: loss 2.7851, time 5259.88ms 
iter 4285: loss 2.6543, time 5261.26ms 
iter 4286: loss 2.6018, time 5266.45ms 
iter 4287: loss 2.6194, time 5254.33ms 
iter 4288: loss 2.6065, time 5267.00ms 
iter 4289: loss 2.6570, time 5275.83ms 
iter 4290: loss 2.6050, time 5268.28ms 
iter 4291: loss 2.7195, time 5272.25ms 
iter 4292: loss 2.5246, time 5274.05ms 
iter 4293: loss 2.4922, time 5256.93ms 
iter 4294: loss 2.6209, time 5263.25ms 
iter 4295: loss 2.5413, time 5268.87ms 
iter 4296: loss 2.5712, time 5276.82ms 
iter 4297: loss 2.4010, time 5269.98ms 
iter 4298: loss 2.5501, time 5268.04ms 
iter 4299: loss 2.7324, time 5257.25ms 
step 4300: train loss 2.6189, val loss 2.8524
iter 4300: loss 2.6424, time 20102.26ms 
iter 4301: loss 2.3995, time 5279.96ms 
iter 4302: loss 2.5903, time 5271.69ms 
iter 4303: loss 2.4143, time 5270.17ms 
iter 4304: loss 2.6347, time 5269.34ms 
iter 4305: loss 2.4783, time 5269.48ms 
iter 4306: loss 2.7027, time 5268.11ms 
iter 4307: loss 2.6376, time 5268.49ms 
iter 4308: loss 2.6224, time 5271.35ms 
iter 4309: loss 2.6087, time 5284.00ms 
iter 4310: loss 2.6511, time 5268.92ms 
iter 4311: loss 2.5776, time 5265.64ms 
iter 4312: loss 2.6502, time 5275.61ms 
iter 4313: loss 2.6279, time 5279.41ms 
iter 4314: loss 2.6910, time 5314.94ms 
iter 4315: loss 2.6349, time 5337.70ms 
iter 4316: loss 2.6838, time 5249.48ms 
iter 4317: loss 2.4926, time 5262.07ms 
iter 4318: loss 2.6070, time 5269.79ms 
iter 4319: loss 2.5998, time 5275.71ms 
iter 4320: loss 2.6358, time 5277.24ms 
iter 4321: loss 2.8694, time 5267.29ms 
iter 4322: loss 2.7944, time 5270.88ms 
iter 4323: loss 2.7942, time 5274.25ms 
iter 4324: loss 2.7247, time 5266.93ms 
iter 4325: loss 2.5382, time 5268.43ms 
iter 4326: loss 2.6926, time 5244.96ms 
iter 4327: loss 2.6925, time 5225.32ms 
iter 4328: loss 2.6662, time 5254.92ms 
iter 4329: loss 2.5317, time 5268.57ms 
iter 4330: loss 2.5645, time 5260.24ms 
iter 4331: loss 2.6816, time 5264.00ms 
iter 4332: loss 2.8858, time 5258.03ms 
iter 4333: loss 2.7761, time 5258.63ms 
iter 4334: loss 2.5108, time 5267.43ms 
iter 4335: loss 2.4573, time 5258.04ms 
iter 4336: loss 2.5284, time 5260.11ms 
iter 4337: loss 2.6026, time 5261.32ms 
iter 4338: loss 2.6003, time 5269.32ms 
iter 4339: loss 2.5315, time 5275.33ms 
iter 4340: loss 2.3244, time 5261.38ms 
iter 4341: loss 2.6668, time 5261.95ms 
iter 4342: loss 2.6245, time 5250.61ms 
iter 4343: loss 2.5436, time 5272.42ms 
iter 4344: loss 2.5559, time 5272.03ms 
iter 4345: loss 2.6336, time 5271.10ms 
iter 4346: loss 2.4386, time 5279.22ms 
iter 4347: loss 2.6348, time 5270.23ms 
iter 4348: loss 2.5881, time 5262.91ms 
iter 4349: loss 2.5999, time 5269.58ms 
step 4350: train loss 2.6352, val loss 2.8564
iter 4350: loss 2.7370, time 20104.97ms 
iter 4351: loss 2.4727, time 5269.45ms 
iter 4352: loss 2.6446, time 5266.99ms 
iter 4353: loss 2.6502, time 5234.48ms 
iter 4354: loss 2.7413, time 5267.50ms 
iter 4355: loss 2.7459, time 5268.06ms 
iter 4356: loss 2.5920, time 5253.86ms 
iter 4357: loss 2.6484, time 5266.78ms 
iter 4358: loss 2.7728, time 5260.29ms 
iter 4359: loss 2.6929, time 5276.12ms 
iter 4360: loss 2.5879, time 5270.47ms 
iter 4361: loss 2.5492, time 5273.13ms 
iter 4362: loss 2.6975, time 5275.15ms 
iter 4363: loss 2.7235, time 5269.63ms 
iter 4364: loss 2.7199, time 5265.46ms 
iter 4365: loss 2.6211, time 5267.41ms 
iter 4366: loss 2.5667, time 5266.13ms 
iter 4367: loss 2.4341, time 5278.29ms 
iter 4368: loss 2.5283, time 5272.50ms 
iter 4369: loss 2.4888, time 5265.92ms 
iter 4370: loss 2.5055, time 5268.71ms 
iter 4371: loss 2.6262, time 5261.13ms 
iter 4372: loss 2.7072, time 5266.83ms 
iter 4373: loss 2.5953, time 5275.82ms 
iter 4374: loss 2.5569, time 5268.66ms 
iter 4375: loss 2.4961, time 5261.01ms 
iter 4376: loss 2.7487, time 5259.07ms 
iter 4377: loss 2.4023, time 5273.04ms 
iter 4378: loss 2.4735, time 5265.02ms 
iter 4379: loss 2.6768, time 5262.72ms 
iter 4380: loss 2.6149, time 5270.54ms 
iter 4381: loss 2.7195, time 5266.29ms 
iter 4382: loss 2.6818, time 5267.86ms 
iter 4383: loss 2.5704, time 5257.30ms 
iter 4384: loss 2.5744, time 5265.99ms 
iter 4385: loss 2.7251, time 5273.74ms 
iter 4386: loss 2.5406, time 5274.66ms 
iter 4387: loss 2.4856, time 5284.53ms 
iter 4388: loss 2.4893, time 5286.45ms 
iter 4389: loss 2.6188, time 5346.99ms 
iter 4390: loss 2.5906, time 5322.70ms 
iter 4391: loss 2.7806, time 5325.43ms 
iter 4392: loss 2.5289, time 5342.21ms 
iter 4393: loss 2.5741, time 5266.25ms 
iter 4394: loss 2.6540, time 5277.16ms 
iter 4395: loss 2.7053, time 5308.57ms 
iter 4396: loss 2.5421, time 5343.18ms 
iter 4397: loss 2.6160, time 5312.23ms 
iter 4398: loss 2.7367, time 5292.91ms 
iter 4399: loss 2.5347, time 5295.33ms 
step 4400: train loss 2.6217, val loss 2.8569
iter 4400: loss 2.7379, time 20115.76ms 
iter 4401: loss 2.6258, time 5274.25ms 
iter 4402: loss 2.4801, time 5273.39ms 
iter 4403: loss 2.5066, time 5272.25ms 
iter 4404: loss 2.6532, time 5251.80ms 
iter 4405: loss 2.5741, time 5264.30ms 
iter 4406: loss 2.7489, time 5260.72ms 
iter 4407: loss 2.5321, time 5268.62ms 
iter 4408: loss 2.7264, time 5283.19ms 
iter 4409: loss 2.7103, time 5288.40ms 
iter 4410: loss 2.5586, time 5266.59ms 
iter 4411: loss 2.6194, time 5277.34ms 
iter 4412: loss 2.5238, time 5270.60ms 
iter 4413: loss 2.7417, time 5280.78ms 
iter 4414: loss 2.7016, time 5280.84ms 
iter 4415: loss 2.5400, time 5256.42ms 
iter 4416: loss 2.5941, time 5271.78ms 
iter 4417: loss 2.5114, time 5260.37ms 
iter 4418: loss 2.7169, time 5270.81ms 
iter 4419: loss 2.6427, time 5269.55ms 
iter 4420: loss 2.5082, time 5260.71ms 
iter 4421: loss 2.6849, time 5297.10ms 
iter 4422: loss 2.4852, time 5312.37ms 
iter 4423: loss 2.7231, time 5283.78ms 
iter 4424: loss 2.6830, time 5301.03ms 
iter 4425: loss 2.4920, time 5337.50ms 
iter 4426: loss 2.4615, time 5334.83ms 
iter 4427: loss 2.6839, time 5336.05ms 
iter 4428: loss 2.5710, time 5313.16ms 
iter 4429: loss 2.5142, time 5271.46ms 
iter 4430: loss 2.5306, time 5275.85ms 
iter 4431: loss 2.6894, time 5265.22ms 
iter 4432: loss 2.6528, time 5256.65ms 
iter 4433: loss 2.5755, time 5261.64ms 
iter 4434: loss 2.3257, time 5281.72ms 
iter 4435: loss 2.6148, time 5266.15ms 
iter 4436: loss 2.6980, time 5261.88ms 
iter 4437: loss 2.6350, time 5262.09ms 
iter 4438: loss 2.5922, time 5273.49ms 
iter 4439: loss 2.5121, time 5267.91ms 
iter 4440: loss 2.5589, time 5266.62ms 
iter 4441: loss 2.4988, time 5316.23ms 
iter 4442: loss 2.6655, time 5280.18ms 
iter 4443: loss 2.7589, time 5270.89ms 
iter 4444: loss 2.5768, time 5332.29ms 
iter 4445: loss 2.5253, time 5265.95ms 
iter 4446: loss 2.7493, time 5266.11ms 
iter 4447: loss 2.5787, time 5271.28ms 
iter 4448: loss 2.8297, time 5259.46ms 
iter 4449: loss 2.6545, time 5293.41ms 
step 4450: train loss 2.6104, val loss 2.8408
iter 4450: loss 2.6700, time 20087.62ms 
iter 4451: loss 2.8540, time 5242.38ms 
iter 4452: loss 2.5822, time 5344.42ms 
iter 4453: loss 2.6275, time 5275.72ms 
iter 4454: loss 2.7004, time 5264.30ms 
iter 4455: loss 2.6510, time 5267.66ms 
iter 4456: loss 2.5098, time 5259.68ms 
iter 4457: loss 2.5695, time 5271.69ms 
iter 4458: loss 2.5657, time 5282.74ms 
iter 4459: loss 2.6822, time 5261.16ms 
iter 4460: loss 2.6878, time 5263.60ms 
iter 4461: loss 2.5891, time 5246.57ms 
iter 4462: loss 2.4931, time 5258.61ms 
iter 4463: loss 2.6258, time 5269.13ms 
iter 4464: loss 2.5404, time 5263.77ms 
iter 4465: loss 2.6029, time 5264.39ms 
iter 4466: loss 2.6722, time 5265.64ms 
iter 4467: loss 2.6994, time 5264.04ms 
iter 4468: loss 2.5372, time 5265.03ms 
iter 4469: loss 2.7211, time 5264.44ms 
iter 4470: loss 2.5088, time 5261.37ms 
iter 4471: loss 2.6507, time 5277.46ms 
iter 4472: loss 2.6580, time 5266.09ms 
iter 4473: loss 2.6912, time 5254.78ms 
iter 4474: loss 2.5081, time 5262.92ms 
iter 4475: loss 2.5415, time 5260.94ms 
iter 4476: loss 2.6084, time 5265.39ms 
iter 4477: loss 2.7433, time 5263.36ms 
iter 4478: loss 2.6732, time 5256.22ms 
iter 4479: loss 2.7032, time 5258.37ms 
iter 4480: loss 2.7427, time 5267.10ms 
iter 4481: loss 2.4866, time 5263.86ms 
iter 4482: loss 2.6603, time 5251.49ms 
iter 4483: loss 2.6613, time 5266.39ms 
iter 4484: loss 2.2479, time 5259.85ms 
iter 4485: loss 2.6067, time 5257.54ms 
iter 4486: loss 2.7123, time 5270.65ms 
iter 4487: loss 2.6534, time 5262.75ms 
iter 4488: loss 2.7340, time 5261.38ms 
iter 4489: loss 2.4320, time 5258.16ms 
iter 4490: loss 2.5924, time 5267.82ms 
iter 4491: loss 2.5932, time 5276.87ms 
iter 4492: loss 2.5021, time 5268.56ms 
iter 4493: loss 2.5491, time 5255.92ms 
iter 4494: loss 2.5592, time 5269.94ms 
iter 4495: loss 2.6335, time 5271.83ms 
iter 4496: loss 2.3973, time 5280.02ms 
iter 4497: loss 2.6713, time 5285.05ms 
iter 4498: loss 2.6528, time 5269.45ms 
iter 4499: loss 2.6331, time 5264.82ms 
step 4500: train loss 2.6313, val loss 2.8507
iter 4500: loss 2.4043, time 20082.96ms 
iter 4501: loss 2.6980, time 5256.70ms 
iter 4502: loss 2.6770, time 5269.52ms 
iter 4503: loss 2.6840, time 5265.32ms 
iter 4504: loss 2.5507, time 5262.56ms 
iter 4505: loss 2.6206, time 5254.35ms 
iter 4506: loss 2.5914, time 5257.17ms 
iter 4507: loss 2.5733, time 5262.49ms 
iter 4508: loss 2.6650, time 5279.70ms 
iter 4509: loss 2.6240, time 5262.12ms 
iter 4510: loss 2.5847, time 5274.71ms 
iter 4511: loss 2.4791, time 5269.43ms 
iter 4512: loss 2.6701, time 5256.04ms 
iter 4513: loss 2.5202, time 5272.36ms 
iter 4514: loss 2.6580, time 5262.84ms 
iter 4515: loss 2.3939, time 5256.80ms 
iter 4516: loss 2.4600, time 5261.40ms 
iter 4517: loss 2.4290, time 5259.82ms 
iter 4518: loss 2.5352, time 5241.34ms 
iter 4519: loss 2.6712, time 5256.45ms 
iter 4520: loss 2.3348, time 5205.76ms 
iter 4521: loss 2.7604, time 5231.97ms 
iter 4522: loss 2.4863, time 5158.51ms 
iter 4523: loss 2.5859, time 5136.55ms 
iter 4524: loss 2.7880, time 5136.14ms 
iter 4525: loss 2.7302, time 5137.10ms 
iter 4526: loss 2.6328, time 5110.49ms 
iter 4527: loss 2.5989, time 5092.61ms 
iter 4528: loss 2.4975, time 5101.32ms 
iter 4529: loss 2.7097, time 5087.54ms 
iter 4530: loss 2.4743, time 5130.34ms 
iter 4531: loss 2.7167, time 5075.95ms 
iter 4532: loss 2.5810, time 5104.03ms 
iter 4533: loss 2.5937, time 5166.26ms 
iter 4534: loss 2.4693, time 5125.37ms 
iter 4535: loss 2.7845, time 5141.64ms 
iter 4536: loss 2.5259, time 5111.90ms 
iter 4537: loss 2.7326, time 5133.88ms 
iter 4538: loss 2.5189, time 5269.98ms 
iter 4539: loss 2.6849, time 5276.76ms 
iter 4540: loss 2.6453, time 5269.87ms 
iter 4541: loss 2.5476, time 5266.29ms 
iter 4542: loss 2.3081, time 5274.38ms 
iter 4543: loss 2.7804, time 5279.36ms 
iter 4544: loss 2.7126, time 5273.94ms 
iter 4545: loss 2.6977, time 5257.27ms 
iter 4546: loss 2.5047, time 5270.71ms 
iter 4547: loss 2.5668, time 5278.24ms 
iter 4548: loss 2.6472, time 5267.39ms 
iter 4549: loss 2.7085, time 5264.24ms 
step 4550: train loss 2.6161, val loss 2.8551
iter 4550: loss 2.4725, time 20093.28ms 
iter 4551: loss 2.6310, time 5292.30ms 
iter 4552: loss 2.5467, time 5282.34ms 
iter 4553: loss 2.7141, time 5280.07ms 
iter 4554: loss 2.7161, time 5272.28ms 
iter 4555: loss 2.7551, time 5278.16ms 
iter 4556: loss 2.5865, time 5310.91ms 
iter 4557: loss 2.6094, time 5302.73ms 
iter 4558: loss 2.9426, time 5270.92ms 
iter 4559: loss 2.5287, time 5259.91ms 
iter 4560: loss 2.7042, time 5272.73ms 
iter 4561: loss 2.7384, time 5224.64ms 
iter 4562: loss 2.6370, time 5281.96ms 
iter 4563: loss 2.7558, time 5276.51ms 
iter 4564: loss 2.5436, time 5335.54ms 
iter 4565: loss 2.4678, time 5338.60ms 
iter 4566: loss 2.4769, time 5340.26ms 
iter 4567: loss 2.7078, time 5346.76ms 
iter 4568: loss 2.7511, time 5332.97ms 
iter 4569: loss 2.5361, time 5317.86ms 
iter 4570: loss 2.5918, time 5337.18ms 
iter 4571: loss 2.7719, time 5268.51ms 
iter 4572: loss 2.6588, time 5272.17ms 
iter 4573: loss 2.5088, time 5271.22ms 
iter 4574: loss 2.7436, time 5271.85ms 
iter 4575: loss 2.7673, time 5268.78ms 
iter 4576: loss 2.7597, time 5269.21ms 
iter 4577: loss 2.6704, time 5290.47ms 
iter 4578: loss 2.5325, time 5300.39ms 
iter 4579: loss 2.6436, time 5337.83ms 
iter 4580: loss 2.6320, time 5324.68ms 
iter 4581: loss 2.6176, time 5275.78ms 
iter 4582: loss 2.5006, time 5286.49ms 
iter 4583: loss 2.5898, time 5279.50ms 
iter 4584: loss 2.6861, time 5278.65ms 
iter 4585: loss 2.6572, time 5276.03ms 
iter 4586: loss 2.2050, time 5288.55ms 
iter 4587: loss 2.6731, time 5285.12ms 
iter 4588: loss 2.4612, time 5270.38ms 
iter 4589: loss 2.5359, time 5279.10ms 
iter 4590: loss 2.6681, time 5287.30ms 
iter 4591: loss 2.5231, time 5266.78ms 
iter 4592: loss 2.5116, time 5263.92ms 
iter 4593: loss 2.5920, time 5264.57ms 
iter 4594: loss 2.5600, time 5255.92ms 
iter 4595: loss 2.5556, time 5260.94ms 
iter 4596: loss 2.8618, time 5260.16ms 
iter 4597: loss 2.4397, time 5269.16ms 
iter 4598: loss 2.6885, time 5262.60ms 
iter 4599: loss 2.6914, time 5258.34ms 
step 4600: train loss 2.6217, val loss 2.8559
iter 4600: loss 2.7529, time 20070.89ms 
iter 4601: loss 2.5223, time 5256.31ms 
iter 4602: loss 2.6578, time 5257.82ms 
iter 4603: loss 2.6586, time 5262.07ms 
iter 4604: loss 2.8403, time 5265.03ms 
iter 4605: loss 2.7375, time 5286.78ms 
iter 4606: loss 2.5209, time 5278.50ms 
iter 4607: loss 2.8308, time 5275.91ms 
iter 4608: loss 2.6486, time 5270.54ms 
iter 4609: loss 2.6418, time 5272.54ms 
iter 4610: loss 2.5319, time 5207.04ms 
iter 4611: loss 2.6363, time 5277.78ms 
iter 4612: loss 2.5363, time 5297.11ms 
iter 4613: loss 2.5548, time 5263.76ms 
iter 4614: loss 2.7166, time 5263.37ms 
iter 4615: loss 2.5957, time 5342.97ms 
iter 4616: loss 2.6898, time 5349.96ms 
iter 4617: loss 2.5027, time 5326.33ms 
iter 4618: loss 2.6182, time 5313.43ms 
iter 4619: loss 2.6861, time 5281.60ms 
iter 4620: loss 2.6371, time 5274.87ms 
iter 4621: loss 2.6990, time 5267.14ms 
iter 4622: loss 2.6238, time 5268.58ms 
iter 4623: loss 2.5197, time 5275.71ms 
iter 4624: loss 2.6719, time 5263.92ms 
iter 4625: loss 2.4706, time 5267.11ms 
iter 4626: loss 2.6175, time 5264.84ms 
iter 4627: loss 2.5175, time 5273.73ms 
iter 4628: loss 2.6001, time 5308.55ms 
iter 4629: loss 2.5837, time 5283.58ms 
iter 4630: loss 2.4275, time 5281.50ms 
iter 4631: loss 2.6840, time 5273.13ms 
iter 4632: loss 2.6458, time 5272.08ms 
iter 4633: loss 2.6476, time 5277.12ms 
iter 4634: loss 2.4947, time 5277.44ms 
iter 4635: loss 2.8395, time 5264.80ms 
iter 4636: loss 2.6825, time 5276.20ms 
iter 4637: loss 2.4821, time 5308.47ms 
iter 4638: loss 2.4630, time 5297.02ms 
iter 4639: loss 2.4865, time 5279.67ms 
iter 4640: loss 2.5112, time 5331.43ms 
iter 4641: loss 2.8016, time 5297.95ms 
iter 4642: loss 2.6664, time 5280.07ms 
iter 4643: loss 2.5015, time 5321.33ms 
iter 4644: loss 2.5568, time 5267.62ms 
iter 4645: loss 2.6554, time 5272.27ms 
iter 4646: loss 2.6683, time 5262.81ms 
iter 4647: loss 2.6504, time 5272.78ms 
iter 4648: loss 2.4889, time 5261.52ms 
iter 4649: loss 2.6412, time 5272.99ms 
step 4650: train loss 2.6002, val loss 2.8379
iter 4650: loss 2.5104, time 20082.56ms 
iter 4651: loss 2.4361, time 5268.24ms 
iter 4652: loss 2.7245, time 5266.78ms 
iter 4653: loss 2.7165, time 5277.22ms 
iter 4654: loss 2.5955, time 5276.69ms 
iter 4655: loss 2.5474, time 5275.98ms 
iter 4656: loss 2.5557, time 5270.70ms 
iter 4657: loss 2.5928, time 5268.59ms 
iter 4658: loss 2.6655, time 5268.54ms 
iter 4659: loss 2.5684, time 5286.47ms 
iter 4660: loss 2.5985, time 5279.12ms 
iter 4661: loss 2.4857, time 5259.75ms 
iter 4662: loss 2.6578, time 5269.68ms 
iter 4663: loss 2.5398, time 5270.30ms 
iter 4664: loss 2.6089, time 5264.11ms 
iter 4665: loss 2.8616, time 5322.91ms 
iter 4666: loss 2.6903, time 5280.53ms 
iter 4667: loss 2.7158, time 5328.93ms 
iter 4668: loss 2.3205, time 5285.04ms 
iter 4669: loss 2.6773, time 5277.99ms 
iter 4670: loss 2.7386, time 5272.06ms 
iter 4671: loss 2.4947, time 5267.08ms 
iter 4672: loss 2.5362, time 5266.02ms 
iter 4673: loss 2.5853, time 5260.46ms 
iter 4674: loss 2.5111, time 5268.85ms 
iter 4675: loss 2.6030, time 5267.05ms 
iter 4676: loss 2.6777, time 5278.26ms 
iter 4677: loss 2.6179, time 5259.20ms 
iter 4678: loss 2.5450, time 5278.05ms 
iter 4679: loss 2.5161, time 5281.93ms 
iter 4680: loss 2.5437, time 5257.96ms 
iter 4681: loss 2.7606, time 5258.39ms 
iter 4682: loss 2.2616, time 5256.13ms 
iter 4683: loss 2.5085, time 5265.26ms 
iter 4684: loss 2.3794, time 5265.21ms 
iter 4685: loss 2.7900, time 5260.25ms 
iter 4686: loss 2.6731, time 5269.83ms 
iter 4687: loss 2.5663, time 5261.50ms 
iter 4688: loss 2.6457, time 5277.53ms 
iter 4689: loss 2.6435, time 5276.84ms 
iter 4690: loss 2.7565, time 5280.19ms 
iter 4691: loss 2.5589, time 5273.41ms 
iter 4692: loss 2.5743, time 5284.47ms 
iter 4693: loss 2.7779, time 5281.89ms 
iter 4694: loss 2.7191, time 5275.47ms 
iter 4695: loss 2.5548, time 5280.62ms 
iter 4696: loss 2.8490, time 5268.98ms 
iter 4697: loss 2.6872, time 5276.28ms 
iter 4698: loss 2.5550, time 5274.70ms 
iter 4699: loss 2.6283, time 5271.41ms 
step 4700: train loss 2.6069, val loss 2.8620
iter 4700: loss 2.6770, time 20103.01ms 
iter 4701: loss 2.7066, time 5272.33ms 
iter 4702: loss 2.7120, time 5272.49ms 
iter 4703: loss 2.5065, time 5283.31ms 
iter 4704: loss 2.4461, time 5278.39ms 
iter 4705: loss 2.5547, time 5271.32ms 
iter 4706: loss 2.3376, time 5278.07ms 
iter 4707: loss 2.5039, time 5282.52ms 
iter 4708: loss 2.7400, time 5289.63ms 
iter 4709: loss 2.5948, time 5246.27ms 
iter 4710: loss 2.3781, time 5258.04ms 
iter 4711: loss 2.8756, time 5267.87ms 
iter 4712: loss 2.5765, time 5299.92ms 
iter 4713: loss 2.4328, time 5268.40ms 
iter 4714: loss 2.4636, time 5259.84ms 
iter 4715: loss 2.6174, time 5270.68ms 
iter 4716: loss 2.5954, time 5263.83ms 
iter 4717: loss 2.5468, time 5269.96ms 
iter 4718: loss 2.5423, time 5263.33ms 
iter 4719: loss 2.6322, time 5269.35ms 
iter 4720: loss 2.8048, time 5265.12ms 
iter 4721: loss 2.6025, time 5265.54ms 
iter 4722: loss 2.7035, time 5281.16ms 
iter 4723: loss 2.5539, time 5266.12ms 
iter 4724: loss 2.2979, time 5265.27ms 
iter 4725: loss 2.5594, time 5287.51ms 
iter 4726: loss 2.6811, time 5282.40ms 
iter 4727: loss 2.8301, time 5283.64ms 
iter 4728: loss 2.8089, time 5274.21ms 
iter 4729: loss 2.6671, time 5296.30ms 
iter 4730: loss 2.6172, time 5269.63ms 
iter 4731: loss 2.6625, time 5270.85ms 
iter 4732: loss 2.6952, time 5281.19ms 
iter 4733: loss 2.7328, time 5281.99ms 
iter 4734: loss 2.6496, time 5276.49ms 
iter 4735: loss 2.7255, time 5270.27ms 
iter 4736: loss 2.6381, time 5277.49ms 
iter 4737: loss 2.5756, time 5273.59ms 
iter 4738: loss 2.4340, time 5341.62ms 
iter 4739: loss 2.9783, time 5266.18ms 
iter 4740: loss 2.6260, time 5274.64ms 
iter 4741: loss 2.5056, time 5305.11ms 
iter 4742: loss 2.4949, time 5341.71ms 
iter 4743: loss 2.4807, time 5270.04ms 
iter 4744: loss 2.3580, time 5275.06ms 
iter 4745: loss 2.4551, time 5271.22ms 
iter 4746: loss 2.6142, time 5267.99ms 
iter 4747: loss 2.5610, time 5283.01ms 
iter 4748: loss 2.5579, time 5277.85ms 
iter 4749: loss 2.4298, time 5272.63ms 
step 4750: train loss 2.5946, val loss 2.8603
iter 4750: loss 2.4852, time 20105.95ms 
iter 4751: loss 2.2920, time 5261.57ms 
iter 4752: loss 2.4123, time 5251.06ms 
iter 4753: loss 2.5317, time 5267.65ms 
iter 4754: loss 2.5910, time 5271.79ms 
iter 4755: loss 2.4982, time 5256.72ms 
iter 4756: loss 2.4604, time 5260.96ms 
iter 4757: loss 2.6120, time 5264.90ms 
iter 4758: loss 2.7543, time 5275.91ms 
iter 4759: loss 2.6272, time 5280.27ms 
iter 4760: loss 2.6426, time 5263.80ms 
iter 4761: loss 2.8630, time 5266.33ms 
iter 4762: loss 2.6866, time 5279.09ms 
iter 4763: loss 2.6793, time 5271.38ms 
iter 4764: loss 2.5979, time 5274.72ms 
iter 4765: loss 2.5795, time 5270.05ms 
iter 4766: loss 2.6031, time 5271.36ms 
iter 4767: loss 2.6591, time 5286.64ms 
iter 4768: loss 2.4514, time 5281.52ms 
iter 4769: loss 2.7012, time 5272.48ms 
iter 4770: loss 2.7051, time 5277.35ms 
iter 4771: loss 2.5095, time 5279.91ms 
iter 4772: loss 2.6664, time 5282.27ms 
iter 4773: loss 2.7636, time 5276.63ms 
iter 4774: loss 2.5637, time 5270.48ms 
iter 4775: loss 2.6596, time 5268.96ms 
iter 4776: loss 2.6733, time 5266.42ms 
iter 4777: loss 2.6515, time 5282.36ms 
iter 4778: loss 2.8048, time 5258.65ms 
iter 4779: loss 2.5348, time 5258.40ms 
iter 4780: loss 2.3956, time 5259.21ms 
iter 4781: loss 2.6606, time 5267.89ms 
iter 4782: loss 2.4988, time 5267.14ms 
iter 4783: loss 2.7688, time 5265.96ms 
iter 4784: loss 2.7644, time 5278.61ms 
iter 4785: loss 2.7909, time 5277.30ms 
iter 4786: loss 2.6394, time 5272.58ms 
iter 4787: loss 2.6410, time 5262.70ms 
iter 4788: loss 2.5863, time 5270.71ms 
iter 4789: loss 2.7733, time 5262.87ms 
iter 4790: loss 2.6199, time 5271.57ms 
iter 4791: loss 2.5957, time 5264.94ms 
iter 4792: loss 2.7202, time 5263.26ms 
iter 4793: loss 2.4915, time 5258.12ms 
iter 4794: loss 2.6587, time 5264.50ms 
iter 4795: loss 2.6209, time 5275.58ms 
iter 4796: loss 2.4388, time 5268.21ms 
iter 4797: loss 2.4087, time 5265.34ms 
iter 4798: loss 2.7093, time 5263.27ms 
iter 4799: loss 2.6713, time 5258.50ms 
step 4800: train loss 2.5974, val loss 2.8320
iter 4800: loss 2.6532, time 20105.10ms 
iter 4801: loss 2.6237, time 5266.38ms 
iter 4802: loss 2.5995, time 5269.22ms 
iter 4803: loss 2.5397, time 5258.21ms 
iter 4804: loss 2.5571, time 5261.84ms 
iter 4805: loss 2.6679, time 5263.92ms 
iter 4806: loss 2.4025, time 5262.50ms 
iter 4807: loss 2.6199, time 5273.63ms 
iter 4808: loss 2.5910, time 5260.76ms 
iter 4809: loss 2.5703, time 5261.50ms 
iter 4810: loss 2.5169, time 5261.01ms 
iter 4811: loss 2.5551, time 5278.12ms 
iter 4812: loss 2.5186, time 5268.97ms 
iter 4813: loss 2.6126, time 5274.26ms 
iter 4814: loss 2.4568, time 5275.42ms 
iter 4815: loss 2.7422, time 5275.87ms 
iter 4816: loss 2.7714, time 5277.67ms 
iter 4817: loss 2.5500, time 5278.31ms 
iter 4818: loss 2.6316, time 5270.62ms 
iter 4819: loss 2.7585, time 5272.58ms 
iter 4820: loss 2.4713, time 5291.81ms 
iter 4821: loss 2.5072, time 5291.70ms 
iter 4822: loss 2.7234, time 5278.62ms 
iter 4823: loss 2.4568, time 5307.52ms 
iter 4824: loss 2.6345, time 5299.48ms 
iter 4825: loss 2.5997, time 5296.43ms 
iter 4826: loss 2.4586, time 5257.60ms 
iter 4827: loss 2.5840, time 5265.71ms 
iter 4828: loss 2.5425, time 5257.92ms 
iter 4829: loss 2.6963, time 5267.29ms 
iter 4830: loss 2.5110, time 5270.68ms 
iter 4831: loss 2.4016, time 5223.58ms 
iter 4832: loss 2.3694, time 5266.17ms 
iter 4833: loss 2.6107, time 5265.99ms 
iter 4834: loss 2.5784, time 5271.67ms 
iter 4835: loss 2.7107, time 5269.03ms 
iter 4836: loss 2.6296, time 5263.75ms 
iter 4837: loss 2.4548, time 5265.58ms 
iter 4838: loss 2.5861, time 5266.95ms 
iter 4839: loss 2.6943, time 5273.31ms 
iter 4840: loss 2.7271, time 5265.51ms 
iter 4841: loss 2.5769, time 5258.13ms 
iter 4842: loss 2.5875, time 5255.60ms 
iter 4843: loss 2.7315, time 5269.37ms 
iter 4844: loss 2.6885, time 5235.14ms 
iter 4845: loss 2.6676, time 5261.16ms 
iter 4846: loss 2.4866, time 5269.59ms 
iter 4847: loss 2.4810, time 5266.46ms 
iter 4848: loss 2.4762, time 5277.14ms 
iter 4849: loss 2.5921, time 5242.31ms 
step 4850: train loss 2.5854, val loss 2.8587
iter 4850: loss 2.6459, time 20097.14ms 
iter 4851: loss 2.5020, time 5272.39ms 
iter 4852: loss 2.6226, time 5260.85ms 
iter 4853: loss 2.6320, time 5265.98ms 
iter 4854: loss 2.5667, time 5274.38ms 
iter 4855: loss 2.6687, time 5253.10ms 
iter 4856: loss 2.6989, time 5260.32ms 
iter 4857: loss 2.3578, time 5265.69ms 
iter 4858: loss 2.6017, time 5279.34ms 
iter 4859: loss 2.3388, time 5268.95ms 
iter 4860: loss 2.5488, time 5268.49ms 
iter 4861: loss 2.5245, time 5255.54ms 
iter 4862: loss 2.6525, time 5258.56ms 
iter 4863: loss 2.8390, time 5274.06ms 
iter 4864: loss 2.7951, time 5276.67ms 
iter 4865: loss 2.4247, time 5273.75ms 
iter 4866: loss 2.6827, time 5282.31ms 
iter 4867: loss 2.5769, time 5295.75ms 
iter 4868: loss 2.4631, time 5269.39ms 
iter 4869: loss 2.8186, time 5275.02ms 
iter 4870: loss 2.5878, time 5286.64ms 
iter 4871: loss 2.5596, time 5259.92ms 
iter 4872: loss 2.5713, time 5260.05ms 
iter 4873: loss 2.5555, time 5265.67ms 
iter 4874: loss 2.6198, time 5279.49ms 
iter 4875: loss 2.6325, time 5280.40ms 
iter 4876: loss 2.6043, time 5273.00ms 
iter 4877: loss 2.5696, time 5265.72ms 
iter 4878: loss 2.6893, time 5273.20ms 
iter 4879: loss 2.6476, time 5268.31ms 
iter 4880: loss 2.6298, time 5265.23ms 
iter 4881: loss 2.6276, time 5259.57ms 
iter 4882: loss 2.6517, time 5278.14ms 
iter 4883: loss 2.5691, time 5278.05ms 
iter 4884: loss 2.8827, time 5277.19ms 
iter 4885: loss 2.6001, time 5266.85ms 
iter 4886: loss 2.5729, time 5269.23ms 
iter 4887: loss 2.6837, time 5271.05ms 
iter 4888: loss 2.8246, time 5263.83ms 
iter 4889: loss 2.6948, time 5268.54ms 
iter 4890: loss 2.5865, time 5263.54ms 
iter 4891: loss 2.5150, time 5273.75ms 
iter 4892: loss 2.7917, time 5266.78ms 
iter 4893: loss 2.8106, time 5279.74ms 
iter 4894: loss 2.5949, time 5259.11ms 
iter 4895: loss 2.6054, time 5261.14ms 
iter 4896: loss 2.5239, time 5267.37ms 
iter 4897: loss 2.4634, time 5279.47ms 
iter 4898: loss 2.7548, time 5254.53ms 
iter 4899: loss 2.5969, time 5268.47ms 
step 4900: train loss 2.6003, val loss 2.8515
iter 4900: loss 2.6677, time 20056.04ms 
iter 4901: loss 2.6117, time 5258.07ms 
iter 4902: loss 2.4528, time 5266.45ms 
iter 4903: loss 2.4314, time 5252.73ms 
iter 4904: loss 2.6267, time 5264.78ms 
iter 4905: loss 2.5366, time 5258.73ms 
iter 4906: loss 2.6368, time 5263.13ms 
iter 4907: loss 2.5271, time 5264.48ms 
iter 4908: loss 2.6271, time 5269.94ms 
iter 4909: loss 2.6822, time 5256.83ms 
iter 4910: loss 2.5390, time 5262.93ms 
iter 4911: loss 2.5313, time 5269.32ms 
iter 4912: loss 2.7349, time 5261.48ms 
iter 4913: loss 2.7131, time 5266.84ms 
iter 4914: loss 2.5127, time 5258.64ms 
iter 4915: loss 2.5501, time 5262.28ms 
iter 4916: loss 2.6916, time 5271.35ms 
iter 4917: loss 2.6030, time 5274.53ms 
iter 4918: loss 2.5549, time 5256.86ms 
iter 4919: loss 2.6675, time 5241.94ms 
iter 4920: loss 2.5080, time 5269.05ms 
iter 4921: loss 2.4970, time 5266.10ms 
iter 4922: loss 2.7124, time 5257.69ms 
iter 4923: loss 2.6127, time 5229.05ms 
iter 4924: loss 2.7008, time 5259.09ms 
iter 4925: loss 2.6993, time 5238.55ms 
iter 4926: loss 2.4013, time 5270.84ms 
iter 4927: loss 2.5286, time 5257.03ms 
iter 4928: loss 2.7931, time 5270.36ms 
iter 4929: loss 2.6958, time 5275.56ms 
iter 4930: loss 2.6871, time 5286.92ms 
iter 4931: loss 2.5997, time 5274.40ms 
iter 4932: loss 2.4290, time 5269.21ms 
iter 4933: loss 2.6395, time 5273.23ms 
iter 4934: loss 2.5484, time 5265.83ms 
iter 4935: loss 2.8189, time 5285.74ms 
iter 4936: loss 2.6354, time 5278.21ms 
iter 4937: loss 2.7510, time 5273.39ms 
iter 4938: loss 2.5816, time 5285.63ms 
iter 4939: loss 2.6773, time 5275.77ms 
iter 4940: loss 2.6437, time 5277.27ms 
iter 4941: loss 2.4943, time 5257.37ms 
iter 4942: loss 2.2638, time 5272.32ms 
iter 4943: loss 2.5034, time 5265.47ms 
iter 4944: loss 2.6219, time 5271.35ms 
iter 4945: loss 2.4418, time 5264.44ms 
iter 4946: loss 2.5846, time 5266.78ms 
iter 4947: loss 2.5537, time 5258.25ms 
iter 4948: loss 2.7202, time 5272.83ms 
iter 4949: loss 2.6725, time 5268.36ms 
step 4950: train loss 2.5840, val loss 2.8492
iter 4950: loss 2.5279, time 20124.42ms 
iter 4951: loss 2.7879, time 5259.39ms 
iter 4952: loss 2.6580, time 5260.07ms 
iter 4953: loss 2.5902, time 5262.79ms 
iter 4954: loss 2.4013, time 5264.55ms 
iter 4955: loss 2.6043, time 5264.56ms 
iter 4956: loss 2.4764, time 5267.83ms 
iter 4957: loss 2.5954, time 5267.12ms 
iter 4958: loss 2.5904, time 5263.04ms 
iter 4959: loss 2.2868, time 5272.93ms 
iter 4960: loss 2.6907, time 5271.87ms 
iter 4961: loss 2.5448, time 5262.70ms 
iter 4962: loss 2.5810, time 5235.89ms 
iter 4963: loss 2.6415, time 5262.15ms 
iter 4964: loss 2.7646, time 5260.23ms 
iter 4965: loss 2.5842, time 5261.45ms 
iter 4966: loss 2.7426, time 5255.77ms 
iter 4967: loss 2.5261, time 5261.46ms 
iter 4968: loss 2.7368, time 5274.94ms 
iter 4969: loss 2.5298, time 5257.11ms 
iter 4970: loss 2.5783, time 5263.35ms 
iter 4971: loss 2.6931, time 5260.12ms 
iter 4972: loss 2.5827, time 5266.68ms 
iter 4973: loss 2.6812, time 5269.50ms 
iter 4974: loss 2.8190, time 5259.10ms 
iter 4975: loss 2.4596, time 5263.69ms 
iter 4976: loss 2.5524, time 5260.78ms 
iter 4977: loss 2.3148, time 5267.40ms 
iter 4978: loss 2.5656, time 5258.68ms 
iter 4979: loss 2.6225, time 5262.48ms 
iter 4980: loss 2.5781, time 5259.53ms 
iter 4981: loss 2.4630, time 5235.78ms 
iter 4982: loss 2.5479, time 5262.47ms 
iter 4983: loss 2.6102, time 5267.95ms 
iter 4984: loss 2.6812, time 5258.98ms 
iter 4985: loss 2.4173, time 5237.68ms 
iter 4986: loss 2.7327, time 5263.45ms 
iter 4987: loss 2.4870, time 5264.54ms 
iter 4988: loss 2.4938, time 5259.61ms 
iter 4989: loss 2.6439, time 5207.90ms 
iter 4990: loss 2.3935, time 5250.08ms 
iter 4991: loss 2.7913, time 5170.71ms 
iter 4992: loss 2.5959, time 5112.89ms 
iter 4993: loss 2.3845, time 5184.26ms 
iter 4994: loss 2.5262, time 5182.78ms 
iter 4995: loss 2.5657, time 5239.84ms 
iter 4996: loss 2.7357, time 5259.78ms 
iter 4997: loss 2.6656, time 5261.66ms 
iter 4998: loss 2.7920, time 5267.06ms 
iter 4999: loss 2.5346, time 5261.47ms 
step 5000: train loss 2.6109, val loss 2.8576
iter 5000: loss 2.5541, time 20064.22ms 
iter 5001: loss 2.5419, time 5259.54ms 
iter 5002: loss 2.5364, time 5258.92ms 
iter 5003: loss 2.6246, time 5259.32ms 
iter 5004: loss 2.5316, time 5263.93ms 
iter 5005: loss 2.5363, time 5226.80ms 
iter 5006: loss 2.6759, time 5233.84ms 
iter 5007: loss 2.4494, time 5256.44ms 
iter 5008: loss 2.4888, time 5256.71ms 
iter 5009: loss 2.6969, time 5267.35ms 
iter 5010: loss 2.5857, time 5259.89ms 
iter 5011: loss 2.5932, time 5254.74ms 
iter 5012: loss 2.6832, time 5263.59ms 
iter 5013: loss 2.6187, time 5258.97ms 
iter 5014: loss 2.5761, time 5262.10ms 
iter 5015: loss 2.5318, time 5256.66ms 
iter 5016: loss 2.4463, time 5227.30ms 
iter 5017: loss 2.5673, time 5256.34ms 
iter 5018: loss 2.3660, time 5265.03ms 
iter 5019: loss 2.6569, time 5244.27ms 
iter 5020: loss 2.6640, time 5215.44ms 
iter 5021: loss 2.7354, time 5118.75ms 
iter 5022: loss 2.4454, time 5258.48ms 
iter 5023: loss 2.5174, time 5268.19ms 
iter 5024: loss 2.4303, time 5266.28ms 
iter 5025: loss 2.5221, time 5221.96ms 
iter 5026: loss 2.2728, time 5157.27ms 
iter 5027: loss 2.5881, time 5190.35ms 
iter 5028: loss 2.4840, time 5241.68ms 
iter 5029: loss 2.6623, time 5255.63ms 
iter 5030: loss 2.5865, time 5128.97ms 
iter 5031: loss 2.4786, time 5203.90ms 
iter 5032: loss 2.3874, time 5157.95ms 
iter 5033: loss 2.7115, time 5213.87ms 
iter 5034: loss 2.3013, time 5260.79ms 
iter 5035: loss 2.5481, time 5194.93ms 
iter 5036: loss 2.5884, time 5242.23ms 
iter 5037: loss 2.6621, time 5152.64ms 
iter 5038: loss 2.6819, time 5188.64ms 
iter 5039: loss 2.5906, time 5244.15ms 
iter 5040: loss 2.6743, time 5269.71ms 
iter 5041: loss 2.6345, time 5274.88ms 
iter 5042: loss 2.7126, time 5267.52ms 
iter 5043: loss 2.6843, time 5235.96ms 
iter 5044: loss 2.5084, time 5193.44ms 
iter 5045: loss 2.6542, time 5181.97ms 
iter 5046: loss 2.8341, time 5201.54ms 
iter 5047: loss 2.5092, time 5201.57ms 
iter 5048: loss 2.3804, time 5171.78ms 
iter 5049: loss 2.6161, time 5272.16ms 
step 5050: train loss 2.5891, val loss 2.8584
iter 5050: loss 2.6723, time 20088.32ms 
iter 5051: loss 2.6077, time 5261.19ms 
iter 5052: loss 2.3871, time 5261.31ms 
iter 5053: loss 2.5262, time 5253.84ms 
iter 5054: loss 2.5759, time 5265.18ms 
iter 5055: loss 2.4925, time 5268.09ms 
iter 5056: loss 2.6785, time 5258.24ms 
iter 5057: loss 2.4850, time 5265.56ms 
iter 5058: loss 2.6390, time 5268.80ms 
iter 5059: loss 2.6813, time 5261.35ms 
iter 5060: loss 2.6782, time 5265.69ms 
iter 5061: loss 2.3917, time 5257.15ms 
iter 5062: loss 2.4580, time 5257.66ms 
iter 5063: loss 2.7422, time 5265.75ms 
iter 5064: loss 2.6385, time 5265.58ms 
iter 5065: loss 2.5414, time 5229.12ms 
iter 5066: loss 2.5964, time 5262.22ms 
iter 5067: loss 2.4562, time 5256.15ms 
iter 5068: loss 2.6903, time 5274.92ms 
iter 5069: loss 2.7730, time 5272.41ms 
iter 5070: loss 2.6968, time 5277.42ms 
iter 5071: loss 2.5409, time 5270.58ms 
iter 5072: loss 2.4573, time 5274.20ms 
iter 5073: loss 2.6143, time 5251.25ms 
iter 5074: loss 2.5415, time 5267.54ms 
iter 5075: loss 2.4043, time 5267.50ms 
iter 5076: loss 2.5121, time 5227.65ms 
iter 5077: loss 2.5818, time 5276.25ms 
iter 5078: loss 2.4956, time 5239.74ms 
iter 5079: loss 2.5796, time 5262.19ms 
iter 5080: loss 2.7261, time 5259.63ms 
iter 5081: loss 2.5388, time 5259.60ms 
iter 5082: loss 2.3352, time 5283.64ms 
iter 5083: loss 2.4499, time 5273.68ms 
iter 5084: loss 2.4996, time 5267.40ms 
iter 5085: loss 2.5392, time 5265.52ms 
iter 5086: loss 2.6632, time 5271.36ms 
iter 5087: loss 2.5324, time 5271.23ms 
iter 5088: loss 2.7072, time 5267.54ms 
iter 5089: loss 2.5353, time 5266.85ms 
iter 5090: loss 2.5100, time 5271.27ms 
iter 5091: loss 2.6679, time 5279.59ms 
iter 5092: loss 2.5994, time 5263.46ms 
iter 5093: loss 2.6223, time 5261.34ms 
iter 5094: loss 2.8524, time 5260.15ms 
iter 5095: loss 2.5134, time 5260.69ms 
iter 5096: loss 2.4733, time 5251.48ms 
iter 5097: loss 2.5869, time 5260.35ms 
iter 5098: loss 2.6531, time 5282.46ms 
iter 5099: loss 2.6688, time 5269.19ms 
step 5100: train loss 2.5905, val loss 2.8243
iter 5100: loss 2.5383, time 20081.81ms 
iter 5101: loss 2.5954, time 5278.07ms 
iter 5102: loss 2.5607, time 5261.93ms 
iter 5103: loss 2.5382, time 5260.63ms 
iter 5104: loss 2.6392, time 5260.43ms 
iter 5105: loss 2.7896, time 5277.02ms 
iter 5106: loss 2.4142, time 5263.26ms 
iter 5107: loss 2.4487, time 5245.75ms 
iter 5108: loss 2.7259, time 5262.99ms 
iter 5109: loss 2.6285, time 5267.82ms 
iter 5110: loss 2.6859, time 5268.63ms 
iter 5111: loss 2.4352, time 5260.73ms 
iter 5112: loss 2.6200, time 5257.60ms 
iter 5113: loss 2.6016, time 5259.96ms 
iter 5114: loss 2.6435, time 5269.27ms 
iter 5115: loss 2.6737, time 5255.70ms 
iter 5116: loss 2.4508, time 5258.31ms 
iter 5117: loss 2.5846, time 5264.83ms 
iter 5118: loss 2.6774, time 5285.11ms 
iter 5119: loss 2.7445, time 5296.92ms 
iter 5120: loss 2.7644, time 5343.12ms 
iter 5121: loss 2.5045, time 5336.03ms 
iter 5122: loss 2.4010, time 5338.64ms 
iter 5123: loss 2.6248, time 5349.47ms 
iter 5124: loss 2.6195, time 5335.76ms 
iter 5125: loss 2.6812, time 5268.05ms 
iter 5126: loss 2.6330, time 5373.79ms 
iter 5127: loss 2.7828, time 5337.47ms 
iter 5128: loss 2.4236, time 5284.47ms 
iter 5129: loss 2.5042, time 5273.10ms 
iter 5130: loss 2.4991, time 5268.93ms 
iter 5131: loss 2.8732, time 5278.99ms 
iter 5132: loss 2.8575, time 5268.76ms 
iter 5133: loss 2.6372, time 5271.55ms 
iter 5134: loss 2.5843, time 5265.41ms 
iter 5135: loss 2.5966, time 5281.76ms 
iter 5136: loss 2.6412, time 5275.02ms 
iter 5137: loss 2.8349, time 5268.41ms 
iter 5138: loss 2.6374, time 5265.73ms 
iter 5139: loss 2.4778, time 5264.55ms 
iter 5140: loss 2.6152, time 5267.57ms 
iter 5141: loss 2.6146, time 5266.12ms 
iter 5142: loss 2.5787, time 5270.07ms 
iter 5143: loss 2.6557, time 5284.56ms 
iter 5144: loss 2.7235, time 5277.85ms 
iter 5145: loss 2.5359, time 5262.17ms 
iter 5146: loss 2.5548, time 5267.73ms 
iter 5147: loss 2.4604, time 5257.13ms 
iter 5148: loss 2.6674, time 5291.22ms 
iter 5149: loss 2.4599, time 5278.26ms 
step 5150: train loss 2.5843, val loss 2.8465
iter 5150: loss 2.2110, time 20103.33ms 
iter 5151: loss 2.6704, time 5266.23ms 
iter 5152: loss 2.5708, time 5266.31ms 
iter 5153: loss 2.5602, time 5268.85ms 
iter 5154: loss 2.5746, time 5277.78ms 
iter 5155: loss 2.4970, time 5257.65ms 
iter 5156: loss 2.7734, time 5271.17ms 
iter 5157: loss 2.3172, time 5269.08ms 
iter 5158: loss 2.7173, time 5273.45ms 
iter 5159: loss 2.4880, time 5281.96ms 
iter 5160: loss 2.3997, time 5271.46ms 
iter 5161: loss 2.5094, time 5275.16ms 
iter 5162: loss 2.6197, time 5263.08ms 
iter 5163: loss 2.4101, time 5281.75ms 
iter 5164: loss 2.4925, time 5271.25ms 
iter 5165: loss 2.2760, time 5253.73ms 
iter 5166: loss 2.5553, time 5265.90ms 
iter 5167: loss 2.5995, time 5277.17ms 
iter 5168: loss 2.5759, time 5253.52ms 
iter 5169: loss 2.4492, time 5270.82ms 
iter 5170: loss 2.4262, time 5264.66ms 
iter 5171: loss 2.6077, time 5269.31ms 
iter 5172: loss 2.5422, time 5259.58ms 
iter 5173: loss 2.4189, time 5262.04ms 
iter 5174: loss 2.5321, time 5267.74ms 
iter 5175: loss 2.4065, time 5264.53ms 
iter 5176: loss 2.5884, time 5269.92ms 
iter 5177: loss 2.6347, time 5269.47ms 
iter 5178: loss 2.6361, time 5267.95ms 
iter 5179: loss 2.3548, time 5263.73ms 
iter 5180: loss 2.6014, time 5267.93ms 
iter 5181: loss 2.6072, time 5257.60ms 
iter 5182: loss 2.5950, time 5260.75ms 
iter 5183: loss 2.5443, time 5259.92ms 
iter 5184: loss 2.5188, time 5267.93ms 
iter 5185: loss 2.3663, time 5255.79ms 
iter 5186: loss 2.6307, time 5266.10ms 
iter 5187: loss 2.4822, time 5264.96ms 
iter 5188: loss 2.5270, time 5226.63ms 
iter 5189: loss 2.4129, time 5266.72ms 
iter 5190: loss 2.5210, time 5266.04ms 
iter 5191: loss 2.6935, time 5259.35ms 
iter 5192: loss 2.6807, time 5264.69ms 
iter 5193: loss 2.6714, time 5273.11ms 
iter 5194: loss 2.6981, time 5267.04ms 
iter 5195: loss 2.5668, time 5258.55ms 
iter 5196: loss 2.6532, time 5253.80ms 
iter 5197: loss 2.6018, time 5262.60ms 
iter 5198: loss 2.7303, time 5265.69ms 
iter 5199: loss 2.4829, time 5260.70ms 
step 5200: train loss 2.5733, val loss 2.8361
iter 5200: loss 2.4288, time 20079.20ms 
iter 5201: loss 2.4426, time 5260.31ms 
iter 5202: loss 2.5324, time 5337.34ms 
iter 5203: loss 2.5519, time 5276.48ms 
iter 5204: loss 2.6452, time 5256.12ms 
iter 5205: loss 2.6017, time 5260.17ms 
iter 5206: loss 2.4375, time 5256.30ms 
iter 5207: loss 2.5494, time 5271.80ms 
iter 5208: loss 2.7590, time 5263.98ms 
iter 5209: loss 2.4706, time 5254.05ms 
iter 5210: loss 2.5342, time 5264.15ms 
iter 5211: loss 2.6689, time 5260.81ms 
iter 5212: loss 2.5900, time 5267.21ms 
iter 5213: loss 2.7492, time 5255.92ms 
iter 5214: loss 2.4379, time 5227.89ms 
iter 5215: loss 2.7072, time 5255.75ms 
iter 5216: loss 2.5346, time 5264.21ms 
iter 5217: loss 2.6699, time 5279.22ms 
iter 5218: loss 2.3963, time 5260.57ms 
iter 5219: loss 2.6615, time 5262.75ms 
iter 5220: loss 2.7863, time 5270.66ms 
iter 5221: loss 2.6135, time 5277.88ms 
iter 5222: loss 2.5359, time 5262.98ms 
iter 5223: loss 2.5408, time 5262.92ms 
iter 5224: loss 2.4700, time 5232.15ms 
iter 5225: loss 2.5615, time 5263.80ms 
iter 5226: loss 2.4776, time 5263.62ms 
iter 5227: loss 2.5896, time 5266.81ms 
iter 5228: loss 2.7320, time 5259.32ms 
iter 5229: loss 2.7178, time 5263.61ms 
iter 5230: loss 2.7828, time 5265.64ms 
iter 5231: loss 2.5193, time 5262.31ms 
iter 5232: loss 2.7048, time 5257.62ms 
iter 5233: loss 2.4438, time 5268.94ms 
iter 5234: loss 2.4375, time 5269.67ms 
iter 5235: loss 2.4056, time 5260.83ms 
iter 5236: loss 2.5103, time 5285.71ms 
iter 5237: loss 2.5395, time 5280.42ms 
iter 5238: loss 2.4450, time 5288.29ms 
iter 5239: loss 2.5857, time 5280.10ms 
iter 5240: loss 2.5206, time 5253.44ms 
iter 5241: loss 2.5398, time 5287.54ms 
iter 5242: loss 2.6386, time 5283.73ms 
iter 5243: loss 2.5051, time 5272.16ms 
iter 5244: loss 2.6991, time 5260.86ms 
iter 5245: loss 2.4291, time 5207.25ms 
iter 5246: loss 2.6190, time 5205.04ms 
iter 5247: loss 2.6613, time 5257.73ms 
iter 5248: loss 2.5454, time 5266.66ms 
iter 5249: loss 2.5833, time 5266.92ms 
step 5250: train loss 2.5730, val loss 2.8428
iter 5250: loss 2.5116, time 20075.39ms 
iter 5251: loss 2.5947, time 5257.69ms 
iter 5252: loss 2.6290, time 5250.87ms 
iter 5253: loss 2.7482, time 5245.57ms 
iter 5254: loss 2.5342, time 5252.17ms 
iter 5255: loss 2.6872, time 5251.17ms 
iter 5256: loss 2.6664, time 5260.48ms 
iter 5257: loss 2.7822, time 5253.97ms 
iter 5258: loss 2.6144, time 5258.10ms 
iter 5259: loss 2.6478, time 5155.59ms 
iter 5260: loss 2.5495, time 5148.41ms 
iter 5261: loss 2.5199, time 5116.97ms 
iter 5262: loss 2.4509, time 5151.78ms 
iter 5263: loss 2.4434, time 5155.99ms 
iter 5264: loss 2.5048, time 5141.86ms 
iter 5265: loss 2.4717, time 5180.07ms 
iter 5266: loss 2.4406, time 5258.20ms 
iter 5267: loss 2.6612, time 5265.92ms 
iter 5268: loss 2.4851, time 5271.31ms 
iter 5269: loss 2.5058, time 5268.87ms 
iter 5270: loss 2.5887, time 5271.53ms 
iter 5271: loss 2.4969, time 5262.63ms 
iter 5272: loss 2.4579, time 5277.43ms 
iter 5273: loss 2.6605, time 5271.85ms 
iter 5274: loss 2.5140, time 5266.65ms 
iter 5275: loss 2.4929, time 5270.22ms 
iter 5276: loss 2.5488, time 5221.09ms 
iter 5277: loss 2.5020, time 5218.75ms 
iter 5278: loss 2.5570, time 5256.63ms 
iter 5279: loss 2.5442, time 5260.21ms 
iter 5280: loss 2.7091, time 5261.38ms 
iter 5281: loss 2.6149, time 5263.35ms 
iter 5282: loss 2.4234, time 5256.85ms 
iter 5283: loss 2.5572, time 5272.12ms 
iter 5284: loss 2.5798, time 5261.02ms 
iter 5285: loss 2.4421, time 5264.56ms 
iter 5286: loss 2.7712, time 5264.15ms 
iter 5287: loss 2.5596, time 5257.39ms 
iter 5288: loss 2.7525, time 5257.49ms 
iter 5289: loss 2.4735, time 5257.11ms 
iter 5290: loss 2.6719, time 5276.77ms 
iter 5291: loss 2.6368, time 5264.12ms 
iter 5292: loss 2.5072, time 5256.55ms 
iter 5293: loss 2.5891, time 5278.77ms 
iter 5294: loss 2.4402, time 5262.60ms 
iter 5295: loss 2.6086, time 5267.40ms 
iter 5296: loss 2.5745, time 5203.00ms 
iter 5297: loss 2.5954, time 5260.31ms 
iter 5298: loss 2.6436, time 5261.68ms 
iter 5299: loss 2.6459, time 5260.38ms 
step 5300: train loss 2.5812, val loss 2.8409
iter 5300: loss 2.4828, time 20070.70ms 
iter 5301: loss 2.6666, time 5262.99ms 
iter 5302: loss 2.3030, time 5279.68ms 
iter 5303: loss 2.7337, time 5264.39ms 
iter 5304: loss 2.5835, time 5264.15ms 
iter 5305: loss 2.5217, time 5255.53ms 
iter 5306: loss 2.6247, time 5270.38ms 
iter 5307: loss 2.5383, time 5261.66ms 
iter 5308: loss 2.3397, time 5261.15ms 
iter 5309: loss 2.6069, time 5258.88ms 
iter 5310: loss 2.6386, time 5278.43ms 
iter 5311: loss 2.7430, time 5264.95ms 
iter 5312: loss 2.5987, time 5265.68ms 
iter 5313: loss 2.7184, time 5258.45ms 
iter 5314: loss 2.6436, time 5254.74ms 
iter 5315: loss 2.7204, time 5260.42ms 
iter 5316: loss 2.2586, time 5259.11ms 
iter 5317: loss 2.6541, time 5263.28ms 
iter 5318: loss 2.4666, time 5258.18ms 
iter 5319: loss 2.6154, time 5259.22ms 
iter 5320: loss 2.5828, time 5264.94ms 
iter 5321: loss 2.7006, time 5264.43ms 
iter 5322: loss 2.6325, time 5258.48ms 
iter 5323: loss 2.7002, time 5266.46ms 
iter 5324: loss 2.6239, time 5263.02ms 
iter 5325: loss 2.4891, time 5275.85ms 
iter 5326: loss 2.6274, time 5272.65ms 
iter 5327: loss 2.3363, time 5270.62ms 
iter 5328: loss 2.5904, time 5273.99ms 
iter 5329: loss 2.4805, time 5270.62ms 
iter 5330: loss 2.5707, time 5269.96ms 
iter 5331: loss 2.7913, time 5242.07ms 
iter 5332: loss 2.7171, time 5257.23ms 
iter 5333: loss 2.5299, time 5269.36ms 
iter 5334: loss 2.3973, time 5278.29ms 
iter 5335: loss 2.5741, time 5282.28ms 
iter 5336: loss 2.5920, time 5274.05ms 
iter 5337: loss 2.4854, time 5268.39ms 
iter 5338: loss 2.5074, time 5273.24ms 
iter 5339: loss 2.3788, time 5277.49ms 
iter 5340: loss 2.7563, time 5258.80ms 
iter 5341: loss 2.6186, time 5254.42ms 
iter 5342: loss 2.5066, time 5276.93ms 
iter 5343: loss 2.8440, time 5281.43ms 
iter 5344: loss 2.6335, time 5273.50ms 
iter 5345: loss 2.5154, time 5264.57ms 
iter 5346: loss 2.5795, time 5286.80ms 
iter 5347: loss 2.5321, time 5273.33ms 
iter 5348: loss 2.4775, time 5262.90ms 
iter 5349: loss 2.6752, time 5262.64ms 
step 5350: train loss 2.5862, val loss 2.8246
iter 5350: loss 2.6218, time 20012.78ms 
iter 5351: loss 2.7366, time 5268.77ms 
iter 5352: loss 2.4965, time 5274.61ms 
iter 5353: loss 2.3404, time 5282.02ms 
iter 5354: loss 2.6260, time 5270.98ms 
iter 5355: loss 2.6166, time 5264.49ms 
iter 5356: loss 2.6497, time 5272.05ms 
iter 5357: loss 2.5258, time 5270.79ms 
iter 5358: loss 2.6439, time 5266.27ms 
iter 5359: loss 2.6671, time 5267.92ms 
iter 5360: loss 2.5991, time 5273.15ms 
iter 5361: loss 2.4638, time 5260.22ms 
iter 5362: loss 2.6785, time 5270.19ms 
iter 5363: loss 2.4316, time 5275.26ms 
iter 5364: loss 2.5474, time 5265.88ms 
iter 5365: loss 2.7372, time 5281.36ms 
iter 5366: loss 2.5387, time 5276.25ms 
iter 5367: loss 2.5053, time 5257.18ms 
iter 5368: loss 2.7155, time 5261.91ms 
iter 5369: loss 2.7704, time 5273.99ms 
iter 5370: loss 2.6303, time 5268.81ms 
iter 5371: loss 2.4794, time 5258.91ms 
iter 5372: loss 2.6742, time 5256.68ms 
iter 5373: loss 2.5881, time 5265.36ms 
iter 5374: loss 2.3968, time 5265.28ms 
iter 5375: loss 2.6381, time 5258.13ms 
iter 5376: loss 2.7015, time 5262.27ms 
iter 5377: loss 2.6159, time 5263.03ms 
iter 5378: loss 2.3819, time 5275.81ms 
iter 5379: loss 2.6626, time 5269.65ms 
iter 5380: loss 2.4086, time 5274.96ms 
iter 5381: loss 2.5172, time 5269.73ms 
iter 5382: loss 2.5972, time 5279.76ms 
iter 5383: loss 2.5274, time 5277.44ms 
iter 5384: loss 2.5112, time 5265.52ms 
iter 5385: loss 2.5133, time 5269.13ms 
iter 5386: loss 2.5001, time 5272.72ms 
iter 5387: loss 2.6102, time 5258.58ms 
iter 5388: loss 2.7626, time 5254.80ms 
iter 5389: loss 2.4477, time 5225.04ms 
iter 5390: loss 2.8262, time 5270.03ms 
iter 5391: loss 2.4982, time 5273.44ms 
iter 5392: loss 2.5377, time 5268.44ms 
iter 5393: loss 2.4726, time 5264.42ms 
iter 5394: loss 2.7455, time 5279.09ms 
iter 5395: loss 2.4606, time 5270.83ms 
iter 5396: loss 2.4485, time 5279.48ms 
iter 5397: loss 2.6511, time 5257.66ms 
iter 5398: loss 2.5261, time 5270.29ms 
iter 5399: loss 2.5372, time 5272.95ms 
step 5400: train loss 2.5596, val loss 2.8346
iter 5400: loss 2.5558, time 20041.29ms 
iter 5401: loss 2.5084, time 5267.49ms 
iter 5402: loss 2.6694, time 5257.91ms 
iter 5403: loss 2.6861, time 5280.05ms 
iter 5404: loss 2.4945, time 5274.38ms 
iter 5405: loss 2.5023, time 5269.48ms 
iter 5406: loss 2.4999, time 5270.70ms 
iter 5407: loss 2.4115, time 5275.27ms 
iter 5408: loss 2.6281, time 5273.97ms 
iter 5409: loss 2.3968, time 5275.20ms 
iter 5410: loss 2.4768, time 5265.85ms 
iter 5411: loss 2.7809, time 5278.82ms 
iter 5412: loss 2.4999, time 5276.30ms 
iter 5413: loss 2.5404, time 5272.43ms 
iter 5414: loss 2.6359, time 5305.91ms 
iter 5415: loss 2.6815, time 5335.03ms 
iter 5416: loss 2.2591, time 5290.83ms 
iter 5417: loss 2.4045, time 5261.69ms 
iter 5418: loss 2.4840, time 5262.23ms 
iter 5419: loss 2.5337, time 5273.60ms 
iter 5420: loss 2.5383, time 5275.29ms 
iter 5421: loss 2.5353, time 5277.87ms 
iter 5422: loss 2.6006, time 5271.72ms 
iter 5423: loss 2.6054, time 5268.49ms 
iter 5424: loss 2.6156, time 5276.94ms 
iter 5425: loss 2.5592, time 5273.03ms 
iter 5426: loss 2.3747, time 5246.01ms 
iter 5427: loss 2.4027, time 5259.64ms 
iter 5428: loss 2.3589, time 5274.81ms 
iter 5429: loss 2.5896, time 5269.92ms 
iter 5430: loss 2.5381, time 5314.03ms 
iter 5431: loss 2.6026, time 5303.13ms 
iter 5432: loss 2.4827, time 5268.21ms 
iter 5433: loss 2.5169, time 5271.58ms 
iter 5434: loss 2.2474, time 5350.52ms 
iter 5435: loss 2.4412, time 5346.85ms 
iter 5436: loss 2.4410, time 5322.97ms 
iter 5437: loss 2.5933, time 5318.79ms 
iter 5438: loss 2.7081, time 5271.91ms 
iter 5439: loss 2.5924, time 5272.96ms 
iter 5440: loss 2.4023, time 5282.47ms 
iter 5441: loss 2.8114, time 5333.65ms 
iter 5442: loss 2.6290, time 5298.54ms 
iter 5443: loss 2.6515, time 5305.65ms 
iter 5444: loss 2.6297, time 5330.22ms 
iter 5445: loss 2.4976, time 5307.52ms 
iter 5446: loss 2.6031, time 5273.62ms 
iter 5447: loss 2.3337, time 5269.23ms 
iter 5448: loss 2.5736, time 5273.80ms 
iter 5449: loss 2.5298, time 5276.51ms 
step 5450: train loss 2.5762, val loss 2.8377
iter 5450: loss 2.4711, time 20107.43ms 
iter 5451: loss 2.7277, time 5316.94ms 
iter 5452: loss 2.5733, time 5280.00ms 
iter 5453: loss 2.5674, time 5278.24ms 
iter 5454: loss 2.4238, time 5266.17ms 
iter 5455: loss 2.4919, time 5266.01ms 
iter 5456: loss 2.6160, time 5255.54ms 
iter 5457: loss 2.6989, time 5260.70ms 
iter 5458: loss 2.5763, time 5263.53ms 
iter 5459: loss 2.7283, time 5264.48ms 
iter 5460: loss 2.5437, time 5269.12ms 
iter 5461: loss 2.7398, time 5262.61ms 
iter 5462: loss 2.6197, time 5259.24ms 
iter 5463: loss 2.3306, time 5268.27ms 
iter 5464: loss 2.4762, time 5274.54ms 
iter 5465: loss 2.4858, time 5281.33ms 
iter 5466: loss 2.5159, time 5267.15ms 
iter 5467: loss 2.6421, time 5245.48ms 
iter 5468: loss 2.7082, time 5276.99ms 
iter 5469: loss 2.7118, time 5260.34ms 
iter 5470: loss 2.6355, time 5270.66ms 
iter 5471: loss 2.6495, time 5261.05ms 
iter 5472: loss 2.8884, time 5283.29ms 
iter 5473: loss 2.7827, time 5270.03ms 
iter 5474: loss 2.4715, time 5263.27ms 
iter 5475: loss 2.5456, time 5273.41ms 
iter 5476: loss 2.6185, time 5275.77ms 
iter 5477: loss 2.2719, time 5272.52ms 
iter 5478: loss 2.6385, time 5258.56ms 
iter 5479: loss 2.5472, time 5272.00ms 
iter 5480: loss 2.4022, time 5271.52ms 
iter 5481: loss 2.5265, time 5268.05ms 
iter 5482: loss 2.6005, time 5263.53ms 
iter 5483: loss 2.3502, time 5258.94ms 
iter 5484: loss 2.4066, time 5261.68ms 
iter 5485: loss 2.4965, time 5219.34ms 
iter 5486: loss 2.4174, time 5279.94ms 
iter 5487: loss 2.6525, time 5264.67ms 
iter 5488: loss 2.5320, time 5272.96ms 
iter 5489: loss 2.5285, time 5283.03ms 
iter 5490: loss 2.4606, time 5290.58ms 
iter 5491: loss 2.6745, time 5288.97ms 
iter 5492: loss 2.2608, time 5283.01ms 
iter 5493: loss 2.6335, time 5273.68ms 
iter 5494: loss 2.6532, time 5283.93ms 
iter 5495: loss 2.7273, time 5275.95ms 
iter 5496: loss 2.5865, time 5260.12ms 
iter 5497: loss 2.3468, time 5269.50ms 
iter 5498: loss 2.6584, time 5260.01ms 
iter 5499: loss 2.6673, time 5324.93ms 
step 5500: train loss 2.5733, val loss 2.8472
iter 5500: loss 2.6706, time 20128.49ms 
iter 5501: loss 2.6838, time 5316.62ms 
iter 5502: loss 2.7202, time 5338.41ms 
iter 5503: loss 2.5404, time 5302.23ms 
iter 5504: loss 2.4771, time 5280.68ms 
iter 5505: loss 2.6052, time 5259.76ms 
iter 5506: loss 2.5227, time 5267.09ms 
iter 5507: loss 2.5990, time 5259.33ms 
iter 5508: loss 2.5050, time 5255.31ms 
iter 5509: loss 2.6692, time 5266.75ms 
iter 5510: loss 2.5617, time 5262.54ms 
iter 5511: loss 2.4860, time 5253.06ms 
iter 5512: loss 2.7317, time 5260.63ms 
iter 5513: loss 2.7562, time 5266.67ms 
iter 5514: loss 2.6741, time 5265.08ms 
iter 5515: loss 2.8167, time 5256.90ms 
iter 5516: loss 2.7612, time 5265.26ms 
iter 5517: loss 2.6640, time 5258.29ms 
iter 5518: loss 2.4042, time 5287.28ms 
iter 5519: loss 2.7630, time 5260.16ms 
iter 5520: loss 2.5365, time 5265.37ms 
iter 5521: loss 2.5373, time 5260.72ms 
iter 5522: loss 2.2934, time 5272.84ms 
iter 5523: loss 2.5848, time 5351.48ms 
iter 5524: loss 2.5846, time 5347.23ms 
iter 5525: loss 2.5494, time 5339.22ms 
iter 5526: loss 2.5592, time 5338.65ms 
iter 5527: loss 2.7604, time 5312.53ms 
iter 5528: loss 2.7146, time 5323.44ms 
iter 5529: loss 2.7307, time 5339.01ms 
iter 5530: loss 2.4086, time 5332.82ms 
iter 5531: loss 2.5470, time 5274.33ms 
iter 5532: loss 2.7216, time 5221.26ms 
iter 5533: loss 2.7299, time 5228.51ms 
iter 5534: loss 2.6684, time 5264.71ms 
iter 5535: loss 2.5838, time 5258.43ms 
iter 5536: loss 2.7223, time 5260.82ms 
iter 5537: loss 2.6842, time 5207.64ms 
iter 5538: loss 2.5791, time 5152.67ms 
iter 5539: loss 2.5924, time 5257.74ms 
iter 5540: loss 2.5101, time 5276.58ms 
iter 5541: loss 2.5590, time 5234.68ms 
iter 5542: loss 2.6749, time 5264.62ms 
iter 5543: loss 2.4299, time 5239.31ms 
iter 5544: loss 2.5353, time 5240.76ms 
iter 5545: loss 2.5408, time 5195.72ms 
iter 5546: loss 2.6316, time 5223.50ms 
iter 5547: loss 2.5537, time 5258.83ms 
iter 5548: loss 2.5553, time 5258.55ms 
iter 5549: loss 2.4915, time 5262.68ms 
step 5550: train loss 2.5675, val loss 2.8501
iter 5550: loss 2.4154, time 20064.56ms 
iter 5551: loss 2.4636, time 5262.99ms 
iter 5552: loss 2.4676, time 5266.52ms 
iter 5553: loss 2.4316, time 5263.55ms 
iter 5554: loss 2.4356, time 5265.75ms 
iter 5555: loss 2.4028, time 5260.78ms 
iter 5556: loss 2.6057, time 5223.49ms 
iter 5557: loss 2.5925, time 5092.60ms 
iter 5558: loss 2.7258, time 5114.00ms 
iter 5559: loss 2.5220, time 5138.69ms 
iter 5560: loss 2.7753, time 5134.18ms 
iter 5561: loss 2.5936, time 5219.50ms 
iter 5562: loss 2.5163, time 5228.12ms 
iter 5563: loss 2.4821, time 5244.01ms 
iter 5564: loss 2.5599, time 5266.01ms 
iter 5565: loss 2.5437, time 5236.65ms 
iter 5566: loss 2.9190, time 5260.69ms 
iter 5567: loss 2.5438, time 5263.59ms 
iter 5568: loss 2.4421, time 5270.55ms 
iter 5569: loss 2.6270, time 5266.47ms 
iter 5570: loss 2.7448, time 5258.00ms 
iter 5571: loss 2.4915, time 5228.20ms 
iter 5572: loss 2.3892, time 5265.40ms 
iter 5573: loss 2.6337, time 5263.36ms 
iter 5574: loss 2.4656, time 5267.64ms 
iter 5575: loss 2.4752, time 5266.15ms 
iter 5576: loss 2.3244, time 5265.39ms 
iter 5577: loss 2.5913, time 5283.73ms 
iter 5578: loss 2.7062, time 5296.79ms 
iter 5579: loss 2.4842, time 5235.70ms 
iter 5580: loss 2.5708, time 5221.21ms 
iter 5581: loss 2.7365, time 5205.44ms 
iter 5582: loss 2.3969, time 5268.81ms 
iter 5583: loss 2.6607, time 5262.91ms 
iter 5584: loss 2.5834, time 5272.30ms 
iter 5585: loss 2.4667, time 5293.90ms 
iter 5586: loss 2.4367, time 5280.89ms 
iter 5587: loss 2.5994, time 5281.34ms 
iter 5588: loss 2.2214, time 5274.42ms 
iter 5589: loss 2.5439, time 5272.37ms 
iter 5590: loss 2.3136, time 5276.59ms 
iter 5591: loss 2.6797, time 5266.47ms 
iter 5592: loss 2.6901, time 5273.52ms 
iter 5593: loss 2.6030, time 5264.02ms 
iter 5594: loss 2.4186, time 5266.07ms 
iter 5595: loss 2.6545, time 5266.29ms 
iter 5596: loss 2.5404, time 5278.40ms 
iter 5597: loss 2.5008, time 5275.16ms 
iter 5598: loss 2.5035, time 5264.51ms 
iter 5599: loss 2.5684, time 5272.23ms 
step 5600: train loss 2.5438, val loss 2.8284
iter 5600: loss 2.5103, time 20071.14ms 
iter 5601: loss 2.7150, time 5264.22ms 
iter 5602: loss 2.9389, time 5229.72ms 
iter 5603: loss 2.5324, time 5260.55ms 
iter 5604: loss 2.6780, time 5259.71ms 
iter 5605: loss 2.5601, time 5275.52ms 
iter 5606: loss 2.5047, time 5257.30ms 
iter 5607: loss 2.4827, time 5250.69ms 
iter 5608: loss 2.7149, time 5259.41ms 
iter 5609: loss 2.3658, time 5263.59ms 
iter 5610: loss 2.7337, time 5264.72ms 
iter 5611: loss 2.5809, time 5256.32ms 
iter 5612: loss 2.5860, time 5264.90ms 
iter 5613: loss 2.4261, time 5264.28ms 
iter 5614: loss 2.6121, time 5270.29ms 
iter 5615: loss 2.4426, time 5255.05ms 
iter 5616: loss 2.3572, time 5268.17ms 
iter 5617: loss 2.5570, time 5266.82ms 
iter 5618: loss 2.4757, time 5280.09ms 
iter 5619: loss 2.5005, time 5268.06ms 
iter 5620: loss 2.3782, time 5250.52ms 
iter 5621: loss 2.7300, time 5222.88ms 
iter 5622: loss 2.4750, time 5265.22ms 
iter 5623: loss 2.3857, time 5274.86ms 
iter 5624: loss 2.5662, time 5252.79ms 
iter 5625: loss 2.6757, time 5258.75ms 
iter 5626: loss 2.7089, time 5264.61ms 
iter 5627: loss 2.6629, time 5299.85ms 
iter 5628: loss 2.5433, time 5287.83ms 
iter 5629: loss 2.5559, time 5277.90ms 
iter 5630: loss 2.6628, time 5269.84ms 
iter 5631: loss 2.5657, time 5279.49ms 
iter 5632: loss 2.6706, time 5268.31ms 
iter 5633: loss 2.6195, time 5275.16ms 
iter 5634: loss 2.6352, time 5270.30ms 
iter 5635: loss 2.6025, time 5271.20ms 
iter 5636: loss 2.5892, time 5289.47ms 
iter 5637: loss 2.4695, time 5317.00ms 
iter 5638: loss 2.7196, time 5263.86ms 
iter 5639: loss 2.6561, time 5276.30ms 
iter 5640: loss 2.7236, time 5282.65ms 
iter 5641: loss 2.4377, time 5289.30ms 
iter 5642: loss 2.3987, time 5267.39ms 
iter 5643: loss 2.5618, time 5277.42ms 
iter 5644: loss 2.5874, time 5271.60ms 
iter 5645: loss 2.7165, time 5276.09ms 
iter 5646: loss 2.5705, time 5285.18ms 
iter 5647: loss 2.3529, time 5309.36ms 
iter 5648: loss 2.4836, time 5266.75ms 
iter 5649: loss 2.5325, time 5279.83ms 
step 5650: train loss 2.5482, val loss 2.8381
iter 5650: loss 2.6153, time 20094.03ms 
iter 5651: loss 2.4693, time 5277.37ms 
iter 5652: loss 2.6105, time 5269.23ms 
iter 5653: loss 2.4702, time 5265.40ms 
iter 5654: loss 2.6401, time 5272.98ms 
iter 5655: loss 2.8178, time 5267.83ms 
iter 5656: loss 2.6502, time 5266.54ms 
iter 5657: loss 2.5527, time 5280.27ms 
iter 5658: loss 2.6063, time 5268.72ms 
iter 5659: loss 2.4900, time 5280.07ms 
iter 5660: loss 2.8716, time 5265.62ms 
iter 5661: loss 2.7230, time 5275.68ms 
iter 5662: loss 2.4669, time 5263.89ms 
iter 5663: loss 2.7136, time 5336.25ms 
iter 5664: loss 2.7078, time 5340.35ms 
iter 5665: loss 2.4192, time 5312.47ms 
iter 5666: loss 2.4580, time 5324.63ms 
iter 5667: loss 2.6004, time 5331.93ms 
iter 5668: loss 2.4165, time 5345.77ms 
iter 5669: loss 2.5292, time 5263.66ms 
iter 5670: loss 2.6449, time 5255.25ms 
iter 5671: loss 2.6497, time 5260.42ms 
iter 5672: loss 2.4786, time 5264.96ms 
iter 5673: loss 2.4894, time 5258.86ms 
iter 5674: loss 2.4518, time 5255.47ms 
iter 5675: loss 2.6157, time 5257.47ms 
iter 5676: loss 2.6678, time 5275.38ms 
iter 5677: loss 2.4086, time 5265.40ms 
iter 5678: loss 2.6434, time 5261.75ms 
iter 5679: loss 2.5780, time 5262.85ms 
iter 5680: loss 2.4766, time 5138.01ms 
iter 5681: loss 2.3935, time 5278.25ms 
iter 5682: loss 2.6742, time 5240.64ms 
iter 5683: loss 2.5941, time 5262.01ms 
iter 5684: loss 2.5399, time 5275.99ms 
iter 5685: loss 2.4885, time 5326.41ms 
iter 5686: loss 2.3854, time 5276.60ms 
iter 5687: loss 2.7571, time 5269.88ms 
iter 5688: loss 2.6217, time 5267.01ms 
iter 5689: loss 2.4849, time 5276.17ms 
iter 5690: loss 2.5373, time 5237.66ms 
iter 5691: loss 2.5981, time 5259.82ms 
iter 5692: loss 2.8720, time 5264.97ms 
iter 5693: loss 2.5241, time 5277.20ms 
iter 5694: loss 2.5260, time 5329.21ms 
iter 5695: loss 2.5769, time 5294.41ms 
iter 5696: loss 2.5338, time 5309.20ms 
iter 5697: loss 2.5115, time 5324.44ms 
iter 5698: loss 2.3496, time 5321.53ms 
iter 5699: loss 2.6546, time 5308.82ms 
step 5700: train loss 2.5573, val loss 2.8263
iter 5700: loss 2.5863, time 20118.95ms 
iter 5701: loss 2.3952, time 5314.79ms 
iter 5702: loss 2.5147, time 5289.57ms 
iter 5703: loss 2.6207, time 5339.59ms 
iter 5704: loss 2.4812, time 5284.91ms 
iter 5705: loss 2.6010, time 5266.22ms 
iter 5706: loss 2.6204, time 5278.65ms 
iter 5707: loss 2.6977, time 5272.98ms 
iter 5708: loss 2.4274, time 5265.59ms 
iter 5709: loss 2.7167, time 5247.15ms 
iter 5710: loss 2.6470, time 5214.43ms 
iter 5711: loss 2.5026, time 5286.98ms 
iter 5712: loss 2.6290, time 5317.70ms 
iter 5713: loss 2.5877, time 5265.55ms 
iter 5714: loss 2.6018, time 5263.65ms 
iter 5715: loss 2.4434, time 5269.74ms 
iter 5716: loss 2.5077, time 5271.63ms 
iter 5717: loss 2.7052, time 5283.42ms 
iter 5718: loss 2.6651, time 5264.51ms 
iter 5719: loss 2.3606, time 5260.38ms 
iter 5720: loss 2.4605, time 5264.55ms 
iter 5721: loss 2.6554, time 5258.41ms 
iter 5722: loss 2.5943, time 5261.81ms 
iter 5723: loss 2.5583, time 5257.29ms 
iter 5724: loss 2.6905, time 5259.53ms 
iter 5725: loss 2.4893, time 5271.61ms 
iter 5726: loss 2.7408, time 5262.88ms 
iter 5727: loss 2.6270, time 5265.06ms 
iter 5728: loss 2.3700, time 5261.78ms 
iter 5729: loss 2.6049, time 5271.48ms 
iter 5730: loss 2.4666, time 5267.28ms 
iter 5731: loss 2.3869, time 5264.37ms 
iter 5732: loss 2.5996, time 5268.32ms 
iter 5733: loss 2.5673, time 5278.23ms 
iter 5734: loss 2.5563, time 5264.20ms 
iter 5735: loss 2.5084, time 5270.68ms 
iter 5736: loss 2.5363, time 5266.55ms 
iter 5737: loss 2.6697, time 5276.26ms 
iter 5738: loss 2.6746, time 5263.96ms 
iter 5739: loss 2.3336, time 5267.18ms 
iter 5740: loss 2.7089, time 5270.49ms 
iter 5741: loss 2.6694, time 5267.12ms 
iter 5742: loss 2.5617, time 5284.36ms 
iter 5743: loss 2.2872, time 5273.71ms 
iter 5744: loss 2.4453, time 5269.81ms 
iter 5745: loss 2.4015, time 5269.29ms 
iter 5746: loss 2.4867, time 5279.58ms 
iter 5747: loss 2.5415, time 5270.84ms 
iter 5748: loss 2.5534, time 5319.03ms 
iter 5749: loss 2.3192, time 5283.20ms 
step 5750: train loss 2.5549, val loss 2.8481
iter 5750: loss 2.5508, time 20060.49ms 
iter 5751: loss 2.5166, time 5267.79ms 
iter 5752: loss 2.6187, time 5276.06ms 
iter 5753: loss 2.5787, time 5306.55ms 
iter 5754: loss 2.6907, time 5283.67ms 
iter 5755: loss 2.5268, time 5284.36ms 
iter 5756: loss 2.7301, time 5289.34ms 
iter 5757: loss 2.5458, time 5290.41ms 
iter 5758: loss 2.4598, time 5288.79ms 
iter 5759: loss 2.6464, time 5285.29ms 
iter 5760: loss 2.6982, time 5283.71ms 
iter 5761: loss 2.7029, time 5275.65ms 
iter 5762: loss 2.4405, time 5281.32ms 
iter 5763: loss 2.6923, time 5276.53ms 
iter 5764: loss 2.4577, time 5337.01ms 
iter 5765: loss 2.4373, time 5315.11ms 
iter 5766: loss 2.6476, time 5274.67ms 
iter 5767: loss 2.4365, time 5272.08ms 
iter 5768: loss 2.5306, time 5267.24ms 
iter 5769: loss 2.6121, time 5284.87ms 
iter 5770: loss 2.6598, time 5275.13ms 
iter 5771: loss 2.4577, time 5276.63ms 
iter 5772: loss 2.4636, time 5280.25ms 
iter 5773: loss 2.6695, time 5272.33ms 
iter 5774: loss 2.6456, time 5283.40ms 
iter 5775: loss 2.6482, time 5271.43ms 
iter 5776: loss 2.6340, time 5274.83ms 
iter 5777: loss 2.5292, time 5265.01ms 
iter 5778: loss 2.5714, time 5261.38ms 
iter 5779: loss 2.3044, time 5266.48ms 
iter 5780: loss 2.2988, time 5277.61ms 
iter 5781: loss 2.5520, time 5270.14ms 
iter 5782: loss 2.5742, time 5270.21ms 
iter 5783: loss 2.4296, time 5254.10ms 
iter 5784: loss 2.5028, time 5269.43ms 
iter 5785: loss 2.5778, time 5264.90ms 
iter 5786: loss 2.6800, time 5273.28ms 
iter 5787: loss 2.5544, time 5423.30ms 
iter 5788: loss 2.3310, time 5441.99ms 
iter 5789: loss 2.4933, time 5292.63ms 
iter 5790: loss 2.6116, time 5287.49ms 
iter 5791: loss 2.3779, time 5292.54ms 
iter 5792: loss 2.3202, time 5275.15ms 
iter 5793: loss 2.5251, time 5266.63ms 
iter 5794: loss 2.5630, time 5271.45ms 
iter 5795: loss 2.5328, time 5265.98ms 
iter 5796: loss 2.5359, time 5264.27ms 
iter 5797: loss 2.3002, time 5227.00ms 
iter 5798: loss 2.5503, time 5229.36ms 
iter 5799: loss 2.4503, time 5257.83ms 
step 5800: train loss 2.5509, val loss 2.8400
iter 5800: loss 2.4030, time 20065.87ms 
iter 5801: loss 2.5628, time 5279.51ms 
iter 5802: loss 2.4269, time 5287.97ms 
iter 5803: loss 2.4881, time 5265.72ms 
iter 5804: loss 2.5173, time 5257.21ms 
iter 5805: loss 2.5369, time 5254.04ms 
iter 5806: loss 2.5335, time 5286.83ms 
iter 5807: loss 2.6188, time 5265.71ms 
iter 5808: loss 2.4624, time 5270.62ms 
iter 5809: loss 2.6504, time 5266.15ms 
iter 5810: loss 2.3332, time 5265.59ms 
iter 5811: loss 2.4715, time 5265.47ms 
iter 5812: loss 2.3645, time 5262.50ms 
iter 5813: loss 2.5180, time 5262.65ms 
iter 5814: loss 2.6766, time 5255.33ms 
iter 5815: loss 2.5758, time 5273.55ms 
iter 5816: loss 2.5794, time 5256.32ms 
iter 5817: loss 2.6928, time 5291.46ms 
iter 5818: loss 2.5259, time 5257.32ms 
iter 5819: loss 2.5377, time 5322.50ms 
iter 5820: loss 2.6199, time 5277.57ms 
iter 5821: loss 2.6035, time 5348.40ms 
iter 5822: loss 2.5977, time 5338.43ms 
iter 5823: loss 2.4264, time 5321.05ms 
iter 5824: loss 2.5175, time 5291.39ms 
iter 5825: loss 2.5363, time 5262.32ms 
iter 5826: loss 2.9713, time 5263.76ms 
iter 5827: loss 2.3561, time 5263.32ms 
iter 5828: loss 2.5444, time 5270.79ms 
iter 5829: loss 2.7087, time 5286.56ms 
iter 5830: loss 2.5923, time 5337.65ms 
iter 5831: loss 2.7641, time 5168.20ms 
iter 5832: loss 2.6309, time 5226.21ms 
iter 5833: loss 2.6394, time 5228.79ms 
iter 5834: loss 2.5599, time 5213.12ms 
iter 5835: loss 2.3728, time 5246.50ms 
iter 5836: loss 2.7318, time 5244.66ms 
iter 5837: loss 2.4679, time 5211.44ms 
iter 5838: loss 2.5904, time 5247.81ms 
iter 5839: loss 2.6026, time 5238.78ms 
iter 5840: loss 2.4384, time 5266.48ms 
iter 5841: loss 2.4523, time 5261.96ms 
iter 5842: loss 2.6675, time 5233.53ms 
iter 5843: loss 2.5674, time 5266.53ms 
iter 5844: loss 2.4495, time 5224.30ms 
iter 5845: loss 2.5816, time 5232.70ms 
iter 5846: loss 2.6293, time 5124.43ms 
iter 5847: loss 2.4579, time 5100.27ms 
iter 5848: loss 2.4613, time 5163.83ms 
iter 5849: loss 2.6824, time 5248.72ms 
step 5850: train loss 2.5485, val loss 2.8354
iter 5850: loss 2.6818, time 19985.58ms 
iter 5851: loss 2.6539, time 5222.06ms 
iter 5852: loss 2.8689, time 5247.60ms 
iter 5853: loss 2.2817, time 5244.92ms 
iter 5854: loss 2.6685, time 5258.49ms 
iter 5855: loss 2.4216, time 5268.71ms 
iter 5856: loss 2.7592, time 5270.12ms 
iter 5857: loss 2.5647, time 5265.57ms 
iter 5858: loss 2.6532, time 5350.94ms 
iter 5859: loss 2.4465, time 5341.36ms 
iter 5860: loss 2.3884, time 5337.21ms 
iter 5861: loss 2.4755, time 5079.63ms 
iter 5862: loss 2.5486, time 5026.78ms 
iter 5863: loss 2.7166, time 5028.72ms 
iter 5864: loss 2.3719, time 5085.55ms 
iter 5865: loss 2.2406, time 5243.57ms 
iter 5866: loss 2.6422, time 5237.24ms 
iter 5867: loss 2.4944, time 5243.85ms 
iter 5868: loss 2.6015, time 5221.96ms 
iter 5869: loss 2.4943, time 5157.74ms 
iter 5870: loss 2.4055, time 5133.85ms 
iter 5871: loss 2.5875, time 5240.93ms 
iter 5872: loss 2.4464, time 5269.53ms 
iter 5873: loss 2.7599, time 5264.66ms 
iter 5874: loss 2.6615, time 5348.02ms 
iter 5875: loss 2.5068, time 5349.87ms 
iter 5876: loss 2.5301, time 5340.61ms 
iter 5877: loss 2.5244, time 5347.10ms 
iter 5878: loss 2.5514, time 5310.73ms 
iter 5879: loss 2.7658, time 5269.57ms 
iter 5880: loss 2.6302, time 5302.92ms 
iter 5881: loss 2.5744, time 5335.90ms 
iter 5882: loss 2.6079, time 5345.37ms 
iter 5883: loss 2.6053, time 5266.67ms 
iter 5884: loss 2.6365, time 5350.76ms 
iter 5885: loss 2.4132, time 5343.47ms 
iter 5886: loss 2.4094, time 5335.66ms 
iter 5887: loss 2.5926, time 5299.24ms 
iter 5888: loss 2.3636, time 5270.64ms 
iter 5889: loss 2.4291, time 5269.34ms 
iter 5890: loss 2.8800, time 5276.01ms 
iter 5891: loss 2.4993, time 5273.04ms 
iter 5892: loss 2.5669, time 5275.44ms 
iter 5893: loss 2.5737, time 5271.76ms 
iter 5894: loss 2.5966, time 5275.56ms 
iter 5895: loss 2.8469, time 5276.13ms 
iter 5896: loss 2.5088, time 5308.19ms 
iter 5897: loss 2.7150, time 5345.75ms 
iter 5898: loss 2.4880, time 5349.98ms 
iter 5899: loss 2.6539, time 5343.22ms 
step 5900: train loss 2.5550, val loss 2.8204
iter 5900: loss 2.3539, time 20189.86ms 
iter 5901: loss 2.6233, time 5344.98ms 
iter 5902: loss 2.6707, time 5338.87ms 
iter 5903: loss 2.4326, time 5265.12ms 
iter 5904: loss 2.5773, time 5308.94ms 
iter 5905: loss 2.5127, time 5333.70ms 
iter 5906: loss 2.3748, time 5332.99ms 
iter 5907: loss 2.7284, time 5348.28ms 
iter 5908: loss 2.7851, time 5297.19ms 
iter 5909: loss 2.4185, time 5298.28ms 
iter 5910: loss 2.2391, time 5311.29ms 
iter 5911: loss 2.6338, time 5323.39ms 
iter 5912: loss 2.5694, time 5343.02ms 
iter 5913: loss 2.7279, time 5350.95ms 
iter 5914: loss 2.5680, time 5346.61ms 
iter 5915: loss 2.3737, time 5276.60ms 
iter 5916: loss 2.3702, time 5259.18ms 
iter 5917: loss 2.5203, time 5267.27ms 
iter 5918: loss 2.3813, time 5270.63ms 
iter 5919: loss 2.7046, time 5302.73ms 
iter 5920: loss 2.5013, time 5280.11ms 
iter 5921: loss 2.7937, time 5268.13ms 
iter 5922: loss 2.5797, time 5257.00ms 
iter 5923: loss 2.5648, time 5273.69ms 
iter 5924: loss 2.5087, time 5274.60ms 
iter 5925: loss 2.6008, time 5262.08ms 
iter 5926: loss 2.8571, time 5261.05ms 
iter 5927: loss 2.6849, time 5257.40ms 
iter 5928: loss 2.6533, time 5265.11ms 
iter 5929: loss 2.6403, time 5260.88ms 
iter 5930: loss 2.2564, time 5256.17ms 
iter 5931: loss 2.5474, time 5279.72ms 
iter 5932: loss 2.5644, time 5333.98ms 
iter 5933: loss 2.5739, time 5306.16ms 
iter 5934: loss 2.5915, time 5344.00ms 
iter 5935: loss 2.6452, time 5258.96ms 
iter 5936: loss 2.8406, time 5306.93ms 
iter 5937: loss 2.4468, time 5308.45ms 
iter 5938: loss 2.7242, time 5255.76ms 
iter 5939: loss 2.5152, time 5252.26ms 
iter 5940: loss 2.5371, time 5263.75ms 
iter 5941: loss 2.3900, time 5264.10ms 
iter 5942: loss 2.4178, time 5260.99ms 
iter 5943: loss 2.3916, time 5258.58ms 
iter 5944: loss 2.3288, time 5266.68ms 
iter 5945: loss 2.6278, time 5271.14ms 
iter 5946: loss 2.5956, time 5268.48ms 
iter 5947: loss 2.5081, time 5267.82ms 
iter 5948: loss 2.5157, time 5266.10ms 
iter 5949: loss 2.4562, time 5274.42ms 
step 5950: train loss 2.5529, val loss 2.8357
iter 5950: loss 2.6404, time 20087.35ms 
iter 5951: loss 2.4902, time 5280.06ms 
iter 5952: loss 2.4961, time 5260.73ms 
iter 5953: loss 2.7859, time 5263.28ms 
iter 5954: loss 2.6189, time 5269.86ms 
iter 5955: loss 2.4888, time 5299.43ms 
iter 5956: loss 2.3633, time 5332.58ms 
iter 5957: loss 2.4273, time 5276.90ms 
iter 5958: loss 2.4585, time 5334.60ms 
iter 5959: loss 2.6406, time 5333.83ms 
iter 5960: loss 2.4890, time 5275.10ms 
iter 5961: loss 2.4740, time 5256.80ms 
iter 5962: loss 2.6009, time 5266.33ms 
iter 5963: loss 2.5060, time 5261.52ms 
iter 5964: loss 2.7801, time 5257.61ms 
iter 5965: loss 2.8332, time 5254.56ms 
iter 5966: loss 2.6151, time 5271.88ms 
iter 5967: loss 2.7390, time 5265.10ms 
iter 5968: loss 2.5549, time 5279.73ms 
iter 5969: loss 2.6404, time 5273.40ms 
iter 5970: loss 2.3387, time 5292.85ms 
iter 5971: loss 2.6825, time 5282.41ms 
iter 5972: loss 2.7110, time 5221.39ms 
iter 5973: loss 2.4688, time 5307.75ms 
iter 5974: loss 2.6177, time 5289.67ms 
iter 5975: loss 2.4235, time 5280.80ms 
iter 5976: loss 2.2897, time 5335.22ms 
iter 5977: loss 2.4927, time 5298.98ms 
iter 5978: loss 2.8037, time 5306.15ms 
iter 5979: loss 2.5437, time 5306.91ms 
iter 5980: loss 2.3772, time 5301.17ms 
iter 5981: loss 2.6372, time 5273.78ms 
iter 5982: loss 2.8226, time 5290.23ms 
iter 5983: loss 2.4110, time 5280.36ms 
iter 5984: loss 2.6190, time 5266.98ms 
iter 5985: loss 2.6824, time 5315.89ms 
iter 5986: loss 2.4540, time 5274.38ms 
iter 5987: loss 2.4689, time 5284.20ms 
iter 5988: loss 2.6747, time 5274.97ms 
iter 5989: loss 2.5118, time 5283.09ms 
iter 5990: loss 2.5775, time 5266.22ms 
iter 5991: loss 2.5319, time 5265.84ms 
iter 5992: loss 2.6644, time 5263.04ms 
iter 5993: loss 2.5642, time 5234.19ms 
iter 5994: loss 2.4681, time 5264.32ms 
iter 5995: loss 2.5399, time 5277.53ms 
iter 5996: loss 2.5795, time 5275.83ms 
iter 5997: loss 2.4753, time 5274.42ms 
iter 5998: loss 2.5202, time 5276.05ms 
iter 5999: loss 2.6348, time 5279.01ms 
step 6000: train loss 2.5500, val loss 2.8351
iter 6000: loss 2.4393, time 20127.21ms 
iter 6001: loss 2.5248, time 5264.36ms 
iter 6002: loss 2.6305, time 5268.87ms 
iter 6003: loss 2.7187, time 5257.64ms 
iter 6004: loss 2.6146, time 5309.26ms 
iter 6005: loss 2.4991, time 5306.55ms 
iter 6006: loss 2.6845, time 5265.66ms 
iter 6007: loss 2.4565, time 5278.82ms 
iter 6008: loss 2.4413, time 5346.04ms 
iter 6009: loss 2.4364, time 5354.54ms 
iter 6010: loss 2.7213, time 5348.42ms 
iter 6011: loss 2.5331, time 5347.53ms 
iter 6012: loss 2.5071, time 5345.60ms 
iter 6013: loss 2.6059, time 5342.42ms 
iter 6014: loss 2.5857, time 5352.97ms 
iter 6015: loss 2.7641, time 5330.82ms 
iter 6016: loss 2.5610, time 5317.64ms 
iter 6017: loss 2.4471, time 5263.79ms 
iter 6018: loss 2.6044, time 5261.47ms 
iter 6019: loss 2.4779, time 5258.01ms 
iter 6020: loss 2.6655, time 5258.64ms 
iter 6021: loss 2.4188, time 5261.66ms 
iter 6022: loss 2.4364, time 5263.15ms 
iter 6023: loss 2.7940, time 5293.76ms 
iter 6024: loss 2.6254, time 5259.41ms 
iter 6025: loss 2.5701, time 5261.30ms 
iter 6026: loss 2.5228, time 5322.10ms 
iter 6027: loss 2.7765, time 5345.73ms 
iter 6028: loss 2.6552, time 5347.11ms 
iter 6029: loss 2.7484, time 5351.41ms 
iter 6030: loss 2.6317, time 5299.15ms 
iter 6031: loss 2.5042, time 5270.83ms 
iter 6032: loss 2.5339, time 5341.00ms 
iter 6033: loss 2.6002, time 5346.10ms 
iter 6034: loss 2.4649, time 5329.49ms 
iter 6035: loss 2.3857, time 5276.98ms 
iter 6036: loss 2.5132, time 5268.22ms 
iter 6037: loss 2.7333, time 5266.91ms 
iter 6038: loss 2.5827, time 5280.36ms 
iter 6039: loss 2.4335, time 5231.10ms 
iter 6040: loss 2.3610, time 5324.59ms 
iter 6041: loss 2.5894, time 5270.94ms 
iter 6042: loss 2.6643, time 5274.29ms 
iter 6043: loss 2.5714, time 5289.31ms 
iter 6044: loss 2.5244, time 5293.82ms 
iter 6045: loss 2.6835, time 5266.65ms 
iter 6046: loss 2.6614, time 5276.21ms 
iter 6047: loss 2.7606, time 5267.06ms 
iter 6048: loss 2.8572, time 5264.75ms 
iter 6049: loss 2.5282, time 5262.64ms 
step 6050: train loss 2.5406, val loss 2.8322
iter 6050: loss 2.4404, time 20070.23ms 
iter 6051: loss 2.5009, time 5267.93ms 
iter 6052: loss 2.6283, time 5267.62ms 
iter 6053: loss 2.6291, time 5269.21ms 
iter 6054: loss 2.5955, time 5267.25ms 
iter 6055: loss 2.4855, time 5265.49ms 
iter 6056: loss 2.5093, time 5262.18ms 
iter 6057: loss 2.4524, time 5259.51ms 
iter 6058: loss 2.7496, time 5255.21ms 
iter 6059: loss 2.4952, time 5263.71ms 
iter 6060: loss 2.5283, time 5272.52ms 
iter 6061: loss 2.4606, time 5274.15ms 
iter 6062: loss 2.4911, time 5231.37ms 
iter 6063: loss 2.6957, time 5244.81ms 
iter 6064: loss 2.3074, time 5276.76ms 
iter 6065: loss 2.5021, time 5268.95ms 
iter 6066: loss 2.5959, time 5235.66ms 
iter 6067: loss 2.4864, time 5268.27ms 
iter 6068: loss 2.4877, time 5281.10ms 
iter 6069: loss 2.4532, time 5278.61ms 
iter 6070: loss 2.6029, time 5269.78ms 
iter 6071: loss 2.3794, time 5272.85ms 
iter 6072: loss 2.8698, time 5279.05ms 
iter 6073: loss 2.6082, time 5266.38ms 
iter 6074: loss 2.4812, time 5266.84ms 
iter 6075: loss 2.7299, time 5256.60ms 
iter 6076: loss 2.7694, time 5261.87ms 
iter 6077: loss 2.4592, time 5257.50ms 
iter 6078: loss 2.5829, time 5246.46ms 
iter 6079: loss 2.3911, time 5243.76ms 
iter 6080: loss 2.6924, time 5173.59ms 
iter 6081: loss 2.4793, time 5223.68ms 
iter 6082: loss 2.4706, time 5254.44ms 
iter 6083: loss 2.5953, time 5239.44ms 
iter 6084: loss 2.5392, time 5251.65ms 
iter 6085: loss 2.6361, time 5250.10ms 
iter 6086: loss 2.3749, time 5237.97ms 
iter 6087: loss 2.6959, time 5140.74ms 
iter 6088: loss 2.5720, time 5125.66ms 
iter 6089: loss 2.5134, time 5253.38ms 
iter 6090: loss 2.6554, time 5238.73ms 
iter 6091: loss 2.5772, time 5257.94ms 
iter 6092: loss 2.6227, time 5255.91ms 
iter 6093: loss 2.5259, time 5246.98ms 
iter 6094: loss 2.5243, time 5239.37ms 
iter 6095: loss 2.2935, time 5205.84ms 
iter 6096: loss 2.7321, time 5258.29ms 
iter 6097: loss 2.3191, time 5248.50ms 
iter 6098: loss 2.5113, time 5248.31ms 
iter 6099: loss 2.4109, time 5238.78ms 
step 6100: train loss 2.5555, val loss 2.8456
iter 6100: loss 2.5123, time 20073.01ms 
iter 6101: loss 2.3108, time 5272.36ms 
iter 6102: loss 2.4040, time 5235.22ms 
iter 6103: loss 2.5754, time 5256.92ms 
iter 6104: loss 2.5797, time 5264.39ms 
iter 6105: loss 2.3526, time 5266.97ms 
iter 6106: loss 2.4758, time 5222.67ms 
iter 6107: loss 2.2057, time 5249.84ms 
iter 6108: loss 2.5007, time 5249.38ms 
iter 6109: loss 2.4346, time 5261.69ms 
iter 6110: loss 2.5699, time 5252.67ms 
iter 6111: loss 2.3589, time 5232.41ms 
iter 6112: loss 2.4225, time 5252.33ms 
iter 6113: loss 2.7271, time 5253.04ms 
iter 6114: loss 2.5859, time 5246.14ms 
iter 6115: loss 2.3100, time 5199.18ms 
iter 6116: loss 2.6558, time 5126.21ms 
iter 6117: loss 2.3830, time 5147.06ms 
iter 6118: loss 2.6179, time 5147.26ms 
iter 6119: loss 2.6234, time 5157.47ms 
iter 6120: loss 2.5132, time 5167.04ms 
iter 6121: loss 2.4737, time 5136.67ms 
iter 6122: loss 2.4775, time 5205.56ms 
iter 6123: loss 2.4919, time 5136.44ms 
iter 6124: loss 2.4335, time 5244.59ms 
iter 6125: loss 2.5596, time 5142.48ms 
iter 6126: loss 2.6913, time 5126.41ms 
iter 6127: loss 2.6315, time 5243.19ms 
iter 6128: loss 2.5041, time 5248.70ms 
iter 6129: loss 2.4669, time 5176.08ms 
iter 6130: loss 2.6848, time 5186.58ms 
iter 6131: loss 2.3848, time 5220.69ms 
iter 6132: loss 2.4797, time 5257.74ms 
iter 6133: loss 2.6225, time 5259.79ms 
iter 6134: loss 2.7306, time 5264.23ms 
iter 6135: loss 2.3728, time 5259.81ms 
iter 6136: loss 2.4975, time 5253.63ms 
iter 6137: loss 2.6277, time 5247.00ms 
iter 6138: loss 2.4964, time 5223.20ms 
iter 6139: loss 2.4704, time 5161.71ms 
iter 6140: loss 2.5013, time 5126.21ms 
iter 6141: loss 2.4060, time 5129.47ms 
iter 6142: loss 2.4126, time 5144.79ms 
iter 6143: loss 2.4803, time 5114.93ms 
iter 6144: loss 2.8173, time 5086.30ms 
iter 6145: loss 2.7644, time 5070.83ms 
iter 6146: loss 2.5727, time 5119.82ms 
iter 6147: loss 2.5504, time 5108.56ms 
iter 6148: loss 2.6542, time 5161.34ms 
iter 6149: loss 2.5541, time 5307.11ms 
step 6150: train loss 2.5535, val loss 2.8450
iter 6150: loss 2.4170, time 20084.78ms 
iter 6151: loss 2.5402, time 5259.51ms 
iter 6152: loss 2.6057, time 5312.82ms 
iter 6153: loss 2.3824, time 5261.66ms 
iter 6154: loss 2.6697, time 5256.41ms 
iter 6155: loss 2.6106, time 5256.51ms 
iter 6156: loss 2.6599, time 5262.51ms 
iter 6157: loss 2.4509, time 5266.54ms 
iter 6158: loss 2.3664, time 5257.60ms 
iter 6159: loss 2.6434, time 5261.88ms 
iter 6160: loss 2.4214, time 5264.96ms 
iter 6161: loss 2.6273, time 5227.99ms 
iter 6162: loss 2.5059, time 5230.08ms 
iter 6163: loss 2.5807, time 5224.93ms 
iter 6164: loss 2.6127, time 5235.70ms 
iter 6165: loss 2.6070, time 5260.09ms 
iter 6166: loss 2.4761, time 5239.72ms 
iter 6167: loss 2.5803, time 5241.56ms 
iter 6168: loss 2.4917, time 5121.74ms 
iter 6169: loss 2.4806, time 5099.95ms 
iter 6170: loss 2.4192, time 5180.40ms 
iter 6171: loss 2.5132, time 5242.77ms 
iter 6172: loss 2.4895, time 5246.83ms 
iter 6173: loss 2.5072, time 5248.06ms 
iter 6174: loss 2.3918, time 5246.89ms 
iter 6175: loss 2.6542, time 5241.87ms 
iter 6176: loss 2.5307, time 5252.42ms 
iter 6177: loss 2.5889, time 5273.32ms 
iter 6178: loss 2.5796, time 5269.87ms 
iter 6179: loss 2.5006, time 5296.67ms 
iter 6180: loss 2.7178, time 5294.44ms 
iter 6181: loss 2.5700, time 5345.52ms 
iter 6182: loss 2.4588, time 5277.03ms 
iter 6183: loss 2.3990, time 5251.46ms 
iter 6184: loss 2.2074, time 5227.38ms 
iter 6185: loss 2.6308, time 5260.77ms 
iter 6186: loss 2.5955, time 5255.27ms 
iter 6187: loss 2.5076, time 5169.72ms 
iter 6188: loss 2.3784, time 5184.35ms 
iter 6189: loss 2.5190, time 5172.97ms 
iter 6190: loss 2.6888, time 5246.91ms 
iter 6191: loss 2.5318, time 5242.84ms 
iter 6192: loss 2.6925, time 5250.68ms 
iter 6193: loss 2.4198, time 5258.61ms 
iter 6194: loss 2.3963, time 5243.14ms 
iter 6195: loss 2.7493, time 5138.45ms 
iter 6196: loss 2.8166, time 5239.12ms 
iter 6197: loss 2.5892, time 5247.53ms 
iter 6198: loss 2.5038, time 5261.40ms 
iter 6199: loss 2.6783, time 5254.85ms 
step 6200: train loss 2.5561, val loss 2.8481
iter 6200: loss 2.6869, time 20036.77ms 
iter 6201: loss 2.4848, time 5224.42ms 
iter 6202: loss 2.5886, time 5164.53ms 
iter 6203: loss 2.3720, time 5247.59ms 
iter 6204: loss 2.3859, time 5245.79ms 
iter 6205: loss 2.4668, time 5245.08ms 
iter 6206: loss 2.3746, time 5232.55ms 
iter 6207: loss 2.4038, time 5127.55ms 
iter 6208: loss 2.5234, time 5171.83ms 
iter 6209: loss 2.2511, time 5252.70ms 
iter 6210: loss 2.4707, time 5247.39ms 
iter 6211: loss 2.4070, time 5238.76ms 
iter 6212: loss 2.5565, time 5261.90ms 
iter 6213: loss 2.6463, time 5261.77ms 
iter 6214: loss 2.6203, time 5282.10ms 
iter 6215: loss 2.5272, time 5260.02ms 
iter 6216: loss 2.7044, time 5262.35ms 
iter 6217: loss 2.4417, time 5286.03ms 
iter 6218: loss 2.5394, time 5283.39ms 
iter 6219: loss 2.4543, time 5259.59ms 
iter 6220: loss 2.5656, time 5245.83ms 
iter 6221: loss 2.6149, time 5255.80ms 
iter 6222: loss 2.6230, time 5284.12ms 
iter 6223: loss 2.5533, time 5244.70ms 
iter 6224: loss 2.6615, time 5261.27ms 
iter 6225: loss 2.3605, time 5208.56ms 
iter 6226: loss 2.4716, time 5206.19ms 
iter 6227: loss 2.7108, time 5182.92ms 
iter 6228: loss 2.5897, time 5148.80ms 
iter 6229: loss 2.5174, time 5199.97ms 
iter 6230: loss 2.5653, time 5246.51ms 
iter 6231: loss 2.4633, time 5274.21ms 
iter 6232: loss 2.5616, time 5352.50ms 
iter 6233: loss 2.5726, time 5346.47ms 
iter 6234: loss 2.5387, time 5360.47ms 
iter 6235: loss 2.6069, time 5346.40ms 
iter 6236: loss 2.4216, time 5346.53ms 
iter 6237: loss 2.6089, time 5321.15ms 
iter 6238: loss 2.5510, time 5329.64ms 
iter 6239: loss 2.5630, time 5273.14ms 
iter 6240: loss 2.5360, time 5308.68ms 
iter 6241: loss 2.7369, time 5346.69ms 
iter 6242: loss 2.6300, time 5345.33ms 
iter 6243: loss 2.5332, time 5348.42ms 
iter 6244: loss 2.4655, time 5349.79ms 
iter 6245: loss 2.7107, time 5309.49ms 
iter 6246: loss 2.6184, time 5342.69ms 
iter 6247: loss 2.4751, time 5342.35ms 
iter 6248: loss 2.6603, time 5340.77ms 
iter 6249: loss 2.6381, time 5341.34ms 
step 6250: train loss 2.5297, val loss 2.8384
iter 6250: loss 2.2492, time 20215.11ms 
iter 6251: loss 2.5402, time 5347.75ms 
iter 6252: loss 2.6886, time 5344.16ms 
iter 6253: loss 2.6383, time 5346.64ms 
iter 6254: loss 2.4905, time 5277.34ms 
iter 6255: loss 2.5447, time 5274.02ms 
iter 6256: loss 2.4513, time 5268.97ms 
iter 6257: loss 2.4583, time 5278.88ms 
iter 6258: loss 2.5277, time 5276.76ms 
iter 6259: loss 2.8271, time 5276.34ms 
iter 6260: loss 2.4810, time 5279.69ms 
iter 6261: loss 2.7072, time 5282.04ms 
iter 6262: loss 2.3906, time 5270.95ms 
iter 6263: loss 2.6651, time 5233.11ms 
iter 6264: loss 2.5882, time 5243.77ms 
iter 6265: loss 2.7664, time 5253.63ms 
iter 6266: loss 2.3682, time 5243.55ms 
iter 6267: loss 2.6922, time 5244.06ms 
iter 6268: loss 2.5170, time 5250.21ms 
iter 6269: loss 2.6910, time 5250.16ms 
iter 6270: loss 2.4951, time 5251.10ms 
iter 6271: loss 2.6359, time 5250.69ms 
iter 6272: loss 2.4783, time 5243.27ms 
iter 6273: loss 2.5138, time 5251.20ms 
iter 6274: loss 2.3769, time 5259.39ms 
iter 6275: loss 2.8014, time 5259.56ms 
iter 6276: loss 2.5837, time 5195.79ms 
iter 6277: loss 2.6200, time 5249.87ms 
iter 6278: loss 2.3604, time 5277.81ms 
iter 6279: loss 2.6307, time 5263.44ms 
iter 6280: loss 2.4140, time 5258.93ms 
iter 6281: loss 2.4791, time 5278.30ms 
iter 6282: loss 2.2491, time 5321.34ms 
iter 6283: loss 2.5560, time 5306.11ms 
iter 6284: loss 2.5567, time 5348.44ms 
iter 6285: loss 2.4710, time 5341.04ms 
iter 6286: loss 2.6124, time 5312.19ms 
iter 6287: loss 2.6270, time 5287.90ms 
iter 6288: loss 2.6060, time 5341.54ms 
iter 6289: loss 2.4624, time 5286.98ms 
iter 6290: loss 2.4567, time 5282.58ms 
iter 6291: loss 2.4777, time 5280.22ms 
iter 6292: loss 2.5879, time 5271.49ms 
iter 6293: loss 2.6742, time 5273.89ms 
iter 6294: loss 2.6224, time 5262.68ms 
iter 6295: loss 2.4220, time 5274.82ms 
iter 6296: loss 2.5063, time 5290.12ms 
iter 6297: loss 2.6240, time 5278.76ms 
iter 6298: loss 2.4573, time 5260.72ms 
iter 6299: loss 2.4589, time 5269.12ms 
step 6300: train loss 2.5276, val loss 2.8353
iter 6300: loss 2.6391, time 20054.96ms 
iter 6301: loss 2.3067, time 5271.16ms 
iter 6302: loss 2.6920, time 5317.19ms 
iter 6303: loss 2.5487, time 5269.84ms 
iter 6304: loss 2.5355, time 5241.69ms 
iter 6305: loss 2.5070, time 5265.18ms 
iter 6306: loss 2.6719, time 5257.40ms 
iter 6307: loss 2.4359, time 5278.96ms 
iter 6308: loss 2.5631, time 5306.33ms 
iter 6309: loss 2.4852, time 5276.52ms 
iter 6310: loss 2.4638, time 5261.22ms 
iter 6311: loss 2.5415, time 5320.31ms 
iter 6312: loss 2.6254, time 5352.74ms 
iter 6313: loss 2.6120, time 5312.50ms 
iter 6314: loss 2.4118, time 5297.89ms 
iter 6315: loss 2.5868, time 5275.15ms 
iter 6316: loss 2.4452, time 5255.85ms 
iter 6317: loss 2.5850, time 5230.55ms 
iter 6318: loss 2.4878, time 5260.20ms 
iter 6319: loss 2.4560, time 5271.94ms 
iter 6320: loss 2.4104, time 5267.04ms 
iter 6321: loss 2.5188, time 5262.50ms 
iter 6322: loss 2.5193, time 5257.17ms 
iter 6323: loss 2.5139, time 5259.52ms 
iter 6324: loss 2.6638, time 5283.95ms 
iter 6325: loss 2.5517, time 5243.57ms 
iter 6326: loss 2.7065, time 5242.75ms 
iter 6327: loss 2.5772, time 5257.63ms 
iter 6328: loss 2.5247, time 5267.72ms 
iter 6329: loss 2.5134, time 5268.30ms 
iter 6330: loss 2.4839, time 5273.51ms 
iter 6331: loss 2.3820, time 5311.49ms 
iter 6332: loss 2.5887, time 5268.95ms 
iter 6333: loss 2.4058, time 5278.41ms 
iter 6334: loss 2.5235, time 5304.85ms 
iter 6335: loss 2.6005, time 5270.41ms 
iter 6336: loss 2.5239, time 5273.72ms 
iter 6337: loss 2.6452, time 5271.56ms 
iter 6338: loss 2.6849, time 5267.40ms 
iter 6339: loss 2.6649, time 5277.69ms 
iter 6340: loss 2.4426, time 5274.80ms 
iter 6341: loss 2.5639, time 5264.23ms 
iter 6342: loss 2.3865, time 5265.14ms 
iter 6343: loss 2.7289, time 5271.21ms 
iter 6344: loss 2.4636, time 5275.87ms 
iter 6345: loss 2.5930, time 5249.75ms 
iter 6346: loss 2.6956, time 5263.30ms 
iter 6347: loss 2.2398, time 5259.20ms 
iter 6348: loss 2.5020, time 5278.97ms 
iter 6349: loss 2.5860, time 5288.36ms 
step 6350: train loss 2.5400, val loss 2.8520
iter 6350: loss 2.6035, time 20121.82ms 
iter 6351: loss 2.5145, time 5268.37ms 
iter 6352: loss 2.5247, time 5285.16ms 
iter 6353: loss 2.5259, time 5281.01ms 
iter 6354: loss 2.5667, time 5307.73ms 
iter 6355: loss 2.4999, time 5319.43ms 
iter 6356: loss 2.4092, time 5297.99ms 
iter 6357: loss 2.3584, time 5331.70ms 
iter 6358: loss 2.5154, time 5357.00ms 
iter 6359: loss 2.6148, time 5350.36ms 
iter 6360: loss 2.4820, time 5348.68ms 
iter 6361: loss 2.4575, time 5345.45ms 
iter 6362: loss 2.6251, time 5285.50ms 
iter 6363: loss 2.5652, time 5280.17ms 
iter 6364: loss 2.6212, time 5316.72ms 
iter 6365: loss 2.6542, time 5283.50ms 
iter 6366: loss 2.2713, time 5276.89ms 
iter 6367: loss 2.5450, time 5339.19ms 
iter 6368: loss 2.4529, time 5307.88ms 
iter 6369: loss 2.7380, time 5317.25ms 
iter 6370: loss 2.6572, time 5257.60ms 
iter 6371: loss 2.6998, time 5259.21ms 
iter 6372: loss 2.5783, time 5275.10ms 
iter 6373: loss 2.6997, time 5276.93ms 
iter 6374: loss 2.3665, time 5267.09ms 
iter 6375: loss 2.4517, time 5274.95ms 
iter 6376: loss 2.6484, time 5276.11ms 
iter 6377: loss 2.5545, time 5276.75ms 
iter 6378: loss 2.5235, time 5273.68ms 
iter 6379: loss 2.5152, time 5279.64ms 
iter 6380: loss 2.4240, time 5287.89ms 
iter 6381: loss 2.4494, time 5275.25ms 
iter 6382: loss 2.6869, time 5300.66ms 
iter 6383: loss 2.4324, time 5304.48ms 
iter 6384: loss 2.3375, time 5316.75ms 
iter 6385: loss 2.4397, time 5313.36ms 
iter 6386: loss 2.5692, time 5312.90ms 
iter 6387: loss 2.6331, time 5285.61ms 
iter 6388: loss 2.5592, time 5275.12ms 
iter 6389: loss 2.4682, time 5270.59ms 
iter 6390: loss 2.3960, time 5284.51ms 
iter 6391: loss 2.5901, time 5266.48ms 
iter 6392: loss 2.4624, time 5272.39ms 
iter 6393: loss 2.6188, time 5259.99ms 
iter 6394: loss 2.4666, time 5284.06ms 
iter 6395: loss 2.5349, time 5266.92ms 
iter 6396: loss 2.6747, time 5274.98ms 
iter 6397: loss 2.6076, time 5256.01ms 
iter 6398: loss 2.6235, time 5261.97ms 
iter 6399: loss 2.4581, time 5236.74ms 
step 6400: train loss 2.5257, val loss 2.8413
iter 6400: loss 2.4715, time 20106.13ms 
iter 6401: loss 2.5682, time 5345.76ms 
iter 6402: loss 2.7214, time 5324.06ms 
iter 6403: loss 2.6323, time 5289.74ms 
iter 6404: loss 2.7147, time 5290.55ms 
iter 6405: loss 2.8235, time 5306.58ms 
iter 6406: loss 2.4500, time 5270.48ms 
iter 6407: loss 2.7625, time 5347.66ms 
iter 6408: loss 2.3591, time 5251.49ms 
iter 6409: loss 2.4606, time 5276.11ms 
iter 6410: loss 2.5982, time 5264.34ms 
iter 6411: loss 2.5499, time 5274.55ms 
iter 6412: loss 2.3223, time 5264.35ms 
iter 6413: loss 2.5895, time 5278.52ms 
iter 6414: loss 2.6278, time 5237.15ms 
iter 6415: loss 2.3488, time 5269.74ms 
iter 6416: loss 2.5801, time 5264.74ms 
iter 6417: loss 2.6178, time 5235.87ms 
iter 6418: loss 2.4490, time 5252.55ms 
iter 6419: loss 2.4137, time 5268.14ms 
iter 6420: loss 2.7649, time 5262.43ms 
iter 6421: loss 2.5746, time 5255.05ms 
iter 6422: loss 2.6717, time 5268.87ms 
iter 6423: loss 2.5812, time 5255.77ms 
iter 6424: loss 2.4788, time 5264.45ms 
iter 6425: loss 2.6431, time 5264.80ms 
iter 6426: loss 2.7328, time 5266.90ms 
iter 6427: loss 2.5210, time 5255.12ms 
iter 6428: loss 2.7918, time 5267.49ms 
iter 6429: loss 2.5321, time 5254.10ms 
iter 6430: loss 2.4609, time 5257.99ms 
iter 6431: loss 2.6799, time 5257.78ms 
iter 6432: loss 2.6035, time 5263.15ms 
iter 6433: loss 2.4386, time 5258.33ms 
iter 6434: loss 2.6028, time 5246.08ms 
iter 6435: loss 2.4121, time 5260.56ms 
iter 6436: loss 2.5935, time 5267.10ms 
iter 6437: loss 2.5746, time 5261.97ms 
iter 6438: loss 2.6791, time 5266.30ms 
iter 6439: loss 2.7055, time 5269.05ms 
iter 6440: loss 2.4577, time 5273.18ms 
iter 6441: loss 2.6250, time 5274.25ms 
iter 6442: loss 2.3306, time 5282.39ms 
iter 6443: loss 2.3826, time 5337.54ms 
iter 6444: loss 2.5960, time 5271.79ms 
iter 6445: loss 2.7021, time 5263.93ms 
iter 6446: loss 2.5945, time 5267.40ms 
iter 6447: loss 2.5395, time 5275.48ms 
iter 6448: loss 2.4161, time 5276.38ms 
iter 6449: loss 2.3753, time 5262.47ms 
step 6450: train loss 2.5190, val loss 2.8335
iter 6450: loss 2.6119, time 19990.57ms 
iter 6451: loss 2.7113, time 5268.72ms 
iter 6452: loss 2.5043, time 5347.82ms 
iter 6453: loss 2.3349, time 5296.68ms 
iter 6454: loss 2.7631, time 5288.77ms 
iter 6455: loss 2.3607, time 5269.05ms 
iter 6456: loss 2.2519, time 5283.51ms 
iter 6457: loss 2.8635, time 5278.78ms 
iter 6458: loss 2.6068, time 5264.86ms 
iter 6459: loss 2.4833, time 5272.88ms 
iter 6460: loss 2.4707, time 5287.72ms 
iter 6461: loss 2.6082, time 5292.54ms 
iter 6462: loss 2.8125, time 5317.11ms 
iter 6463: loss 2.4822, time 5341.85ms 
iter 6464: loss 2.5806, time 5339.03ms 
iter 6465: loss 2.4545, time 5347.00ms 
iter 6466: loss 2.6010, time 5327.86ms 
iter 6467: loss 2.4887, time 5256.73ms 
iter 6468: loss 2.4389, time 5279.98ms 
iter 6469: loss 2.4756, time 5268.17ms 
iter 6470: loss 2.5509, time 5274.96ms 
iter 6471: loss 2.4978, time 5267.96ms 
iter 6472: loss 2.5738, time 5252.74ms 
iter 6473: loss 2.2414, time 5264.76ms 
iter 6474: loss 2.3354, time 5259.32ms 
iter 6475: loss 2.5665, time 5264.13ms 
iter 6476: loss 2.6177, time 5288.67ms 
iter 6477: loss 2.4613, time 5262.36ms 
iter 6478: loss 2.6001, time 5264.09ms 
iter 6479: loss 2.7753, time 5262.40ms 
iter 6480: loss 2.6215, time 5265.92ms 
iter 6481: loss 2.4644, time 5260.87ms 
iter 6482: loss 2.8218, time 5265.71ms 
iter 6483: loss 2.4733, time 5255.35ms 
iter 6484: loss 2.4706, time 5265.98ms 
iter 6485: loss 2.5460, time 5270.81ms 
iter 6486: loss 2.5606, time 5257.83ms 
iter 6487: loss 2.4242, time 5266.13ms 
iter 6488: loss 2.5304, time 5267.92ms 
iter 6489: loss 2.7413, time 5270.41ms 
iter 6490: loss 2.4678, time 5263.18ms 
iter 6491: loss 2.5610, time 5270.17ms 
iter 6492: loss 2.5178, time 5256.02ms 
iter 6493: loss 2.5138, time 5274.48ms 
iter 6494: loss 2.5817, time 5254.53ms 
iter 6495: loss 2.4275, time 5278.38ms 
iter 6496: loss 2.4256, time 5260.76ms 
iter 6497: loss 2.7033, time 5285.42ms 
iter 6498: loss 2.5382, time 5246.95ms 
iter 6499: loss 2.3418, time 5259.63ms 
step 6500: train loss 2.5252, val loss 2.8453
iter 6500: loss 2.5029, time 20087.69ms 
iter 6501: loss 2.4807, time 5274.34ms 
iter 6502: loss 2.6609, time 5248.27ms 
iter 6503: loss 2.6322, time 5260.60ms 
iter 6504: loss 2.5811, time 5273.80ms 
iter 6505: loss 2.6155, time 5271.20ms 
iter 6506: loss 2.4789, time 5268.93ms 
iter 6507: loss 2.5743, time 5260.66ms 
iter 6508: loss 2.5387, time 5225.14ms 
iter 6509: loss 2.6623, time 5275.75ms 
iter 6510: loss 2.4901, time 5276.04ms 
iter 6511: loss 2.4529, time 5268.07ms 
iter 6512: loss 2.5428, time 5272.41ms 
iter 6513: loss 2.6613, time 5259.32ms 
iter 6514: loss 2.2979, time 5271.33ms 
iter 6515: loss 2.7361, time 5274.26ms 
iter 6516: loss 2.6976, time 5270.24ms 
iter 6517: loss 2.5091, time 5272.24ms 
iter 6518: loss 2.3733, time 5278.00ms 
iter 6519: loss 2.4390, time 5273.34ms 
iter 6520: loss 2.6067, time 5258.00ms 
iter 6521: loss 2.5687, time 5278.99ms 
iter 6522: loss 2.5461, time 5283.57ms 
iter 6523: loss 2.4428, time 5286.27ms 
iter 6524: loss 2.6416, time 5282.94ms 
iter 6525: loss 2.4091, time 5262.00ms 
iter 6526: loss 2.5655, time 5265.65ms 
iter 6527: loss 2.5194, time 5268.90ms 
iter 6528: loss 2.4747, time 5257.85ms 
iter 6529: loss 2.3755, time 5261.55ms 
iter 6530: loss 2.4509, time 5271.43ms 
iter 6531: loss 2.4394, time 5282.49ms 
iter 6532: loss 2.4546, time 5261.71ms 
iter 6533: loss 2.4428, time 5259.91ms 
iter 6534: loss 2.5278, time 5261.32ms 
iter 6535: loss 2.5534, time 5285.63ms 
iter 6536: loss 2.3165, time 5261.88ms 
iter 6537: loss 2.4274, time 5258.22ms 
iter 6538: loss 2.3809, time 5268.98ms 
iter 6539: loss 2.4802, time 5266.55ms 
iter 6540: loss 2.4798, time 5260.60ms 
iter 6541: loss 2.5011, time 5261.12ms 
iter 6542: loss 2.6185, time 5265.39ms 
iter 6543: loss 2.3226, time 5265.72ms 
iter 6544: loss 2.1916, time 5264.67ms 
iter 6545: loss 2.3933, time 5263.88ms 
iter 6546: loss 2.5783, time 5274.38ms 
iter 6547: loss 2.6282, time 5264.87ms 
iter 6548: loss 2.6197, time 5270.40ms 
iter 6549: loss 2.3929, time 5260.27ms 
step 6550: train loss 2.5144, val loss 2.8358
iter 6550: loss 2.5795, time 20058.53ms 
iter 6551: loss 2.5430, time 5268.27ms 
iter 6552: loss 2.4267, time 5278.05ms 
iter 6553: loss 2.6145, time 5271.97ms 
iter 6554: loss 2.3985, time 5265.98ms 
iter 6555: loss 2.4600, time 5262.18ms 
iter 6556: loss 2.5579, time 5256.86ms 
iter 6557: loss 2.4539, time 5272.03ms 
iter 6558: loss 2.3901, time 5278.53ms 
iter 6559: loss 2.4919, time 5274.18ms 
iter 6560: loss 2.4757, time 5272.60ms 
iter 6561: loss 2.6062, time 5275.26ms 
iter 6562: loss 2.5262, time 5269.33ms 
iter 6563: loss 2.4551, time 5276.23ms 
iter 6564: loss 2.6687, time 5275.67ms 
iter 6565: loss 2.6006, time 5256.63ms 
iter 6566: loss 2.7297, time 5258.57ms 
iter 6567: loss 2.4969, time 5262.53ms 
iter 6568: loss 2.3398, time 5259.90ms 
iter 6569: loss 2.4366, time 5258.10ms 
iter 6570: loss 2.5539, time 5257.47ms 
iter 6571: loss 2.7167, time 5258.16ms 
iter 6572: loss 2.4342, time 5265.19ms 
iter 6573: loss 2.4935, time 5267.15ms 
iter 6574: loss 2.5628, time 5262.27ms 
iter 6575: loss 2.5478, time 5254.68ms 
iter 6576: loss 2.6527, time 5261.79ms 
iter 6577: loss 2.5760, time 5266.80ms 
iter 6578: loss 2.2712, time 5261.12ms 
iter 6579: loss 2.5999, time 5260.95ms 
iter 6580: loss 2.4727, time 5267.86ms 
iter 6581: loss 2.2209, time 5269.70ms 
iter 6582: loss 2.5623, time 5254.70ms 
iter 6583: loss 2.5320, time 5256.33ms 
iter 6584: loss 2.4692, time 5270.80ms 
iter 6585: loss 2.4434, time 5265.13ms 
iter 6586: loss 2.6417, time 5264.95ms 
iter 6587: loss 2.4355, time 5262.26ms 
iter 6588: loss 2.5308, time 5264.42ms 
iter 6589: loss 2.4201, time 5267.68ms 
iter 6590: loss 2.6489, time 5259.55ms 
iter 6591: loss 2.5702, time 5259.97ms 
iter 6592: loss 2.5378, time 5264.05ms 
iter 6593: loss 2.5854, time 5274.64ms 
iter 6594: loss 2.6781, time 5257.24ms 
iter 6595: loss 2.2031, time 5255.81ms 
iter 6596: loss 2.4772, time 5264.95ms 
iter 6597: loss 2.5088, time 5275.51ms 
iter 6598: loss 2.5670, time 5265.12ms 
iter 6599: loss 2.4435, time 5267.20ms 
step 6600: train loss 2.5250, val loss 2.8497
iter 6600: loss 2.6343, time 20067.73ms 
iter 6601: loss 2.5801, time 5279.73ms 
iter 6602: loss 2.4074, time 5259.54ms 
iter 6603: loss 2.3490, time 5273.60ms 
iter 6604: loss 2.7512, time 5266.73ms 
iter 6605: loss 2.6768, time 5265.93ms 
iter 6606: loss 2.7305, time 5267.40ms 
iter 6607: loss 2.6824, time 5269.36ms 
iter 6608: loss 2.4335, time 5264.93ms 
iter 6609: loss 2.4389, time 5278.41ms 
iter 6610: loss 2.5399, time 5282.23ms 
iter 6611: loss 2.4313, time 5268.51ms 
iter 6612: loss 2.5983, time 5261.52ms 
iter 6613: loss 2.6379, time 5270.65ms 
iter 6614: loss 2.5427, time 5275.36ms 
iter 6615: loss 2.5240, time 5270.46ms 
iter 6616: loss 2.2170, time 5269.42ms 
iter 6617: loss 2.3749, time 5302.13ms 
iter 6618: loss 2.6203, time 5312.34ms 
iter 6619: loss 2.5308, time 5287.33ms 
iter 6620: loss 2.4688, time 5318.27ms 
iter 6621: loss 2.6577, time 5266.68ms 
iter 6622: loss 2.5701, time 5271.30ms 
iter 6623: loss 2.4818, time 5274.40ms 
iter 6624: loss 2.6447, time 5277.28ms 
iter 6625: loss 2.3360, time 5265.88ms 
iter 6626: loss 2.6120, time 5282.01ms 
iter 6627: loss 2.6083, time 5254.44ms 
iter 6628: loss 2.6949, time 5267.86ms 
iter 6629: loss 2.4702, time 5270.17ms 
iter 6630: loss 2.4956, time 5281.88ms 
iter 6631: loss 2.4346, time 5244.56ms 
iter 6632: loss 2.6513, time 5260.58ms 
iter 6633: loss 2.6071, time 5275.43ms 
iter 6634: loss 2.7373, time 5262.12ms 
iter 6635: loss 2.7803, time 5264.01ms 
iter 6636: loss 2.6214, time 5238.32ms 
iter 6637: loss 2.4697, time 5257.61ms 
iter 6638: loss 2.5839, time 5267.75ms 
iter 6639: loss 2.5690, time 5271.44ms 
iter 6640: loss 2.4500, time 5253.76ms 
iter 6641: loss 2.4791, time 5265.53ms 
iter 6642: loss 2.4137, time 5275.60ms 
iter 6643: loss 2.5619, time 5276.81ms 
iter 6644: loss 2.6975, time 5255.87ms 
iter 6645: loss 2.5853, time 5259.81ms 
iter 6646: loss 2.4402, time 5275.92ms 
iter 6647: loss 2.6279, time 5263.71ms 
iter 6648: loss 2.4905, time 5254.11ms 
iter 6649: loss 2.5231, time 5256.48ms 
step 6650: train loss 2.5331, val loss 2.8462
iter 6650: loss 2.3299, time 20090.58ms 
iter 6651: loss 2.4784, time 5322.31ms 
iter 6652: loss 2.7937, time 5315.29ms 
iter 6653: loss 2.3452, time 5269.29ms 
iter 6654: loss 2.6989, time 5272.65ms 
iter 6655: loss 2.6638, time 5264.83ms 
iter 6656: loss 2.7071, time 5258.56ms 
iter 6657: loss 2.5776, time 5256.69ms 
iter 6658: loss 2.6875, time 5258.92ms 
iter 6659: loss 2.4859, time 5271.60ms 
iter 6660: loss 2.5243, time 5270.56ms 
iter 6661: loss 2.6558, time 5219.12ms 
iter 6662: loss 2.5185, time 5260.97ms 
iter 6663: loss 2.2542, time 5295.91ms 
iter 6664: loss 2.5445, time 5264.88ms 
iter 6665: loss 2.7021, time 5259.79ms 
iter 6666: loss 2.5271, time 5268.04ms 
iter 6667: loss 2.5923, time 5282.27ms 
iter 6668: loss 2.5108, time 5280.31ms 
iter 6669: loss 2.4002, time 5281.52ms 
iter 6670: loss 2.4803, time 5241.16ms 
iter 6671: loss 2.5310, time 5306.52ms 
iter 6672: loss 2.4839, time 5282.95ms 
iter 6673: loss 2.6525, time 5262.95ms 
iter 6674: loss 2.5631, time 5259.22ms 
iter 6675: loss 2.5865, time 5272.36ms 
iter 6676: loss 2.5309, time 5279.92ms 
iter 6677: loss 2.5179, time 5261.52ms 
iter 6678: loss 2.3136, time 5258.67ms 
iter 6679: loss 2.5331, time 5281.92ms 
iter 6680: loss 2.6703, time 5267.06ms 
iter 6681: loss 2.5855, time 5297.55ms 
iter 6682: loss 2.4768, time 5268.86ms 
iter 6683: loss 2.5089, time 5222.28ms 
iter 6684: loss 2.5740, time 5256.29ms 
iter 6685: loss 2.6046, time 5260.72ms 
iter 6686: loss 2.5175, time 5342.47ms 
iter 6687: loss 2.6246, time 5323.42ms 
iter 6688: loss 2.4363, time 5274.26ms 
iter 6689: loss 2.5732, time 5266.82ms 
iter 6690: loss 2.5344, time 5268.23ms 
iter 6691: loss 2.5734, time 5299.05ms 
iter 6692: loss 2.4526, time 5279.24ms 
iter 6693: loss 2.4722, time 5264.06ms 
iter 6694: loss 2.4335, time 5243.01ms 
iter 6695: loss 2.3911, time 5248.60ms 
iter 6696: loss 2.4178, time 5269.61ms 
iter 6697: loss 2.3058, time 5262.06ms 
iter 6698: loss 2.4040, time 5258.77ms 
iter 6699: loss 2.5983, time 5268.56ms 
step 6700: train loss 2.5384, val loss 2.8405
iter 6700: loss 2.5388, time 20093.24ms 
iter 6701: loss 2.7177, time 5267.51ms 
iter 6702: loss 2.6734, time 5269.09ms 
iter 6703: loss 2.3831, time 5280.96ms 
iter 6704: loss 2.5032, time 5273.60ms 
iter 6705: loss 2.6574, time 5282.28ms 
iter 6706: loss 2.4690, time 5225.98ms 
iter 6707: loss 2.5648, time 5240.26ms 
iter 6708: loss 2.2804, time 5261.00ms 
iter 6709: loss 2.4078, time 5264.61ms 
iter 6710: loss 2.6099, time 5255.89ms 
iter 6711: loss 2.2499, time 5257.68ms 
iter 6712: loss 2.4431, time 5265.02ms 
iter 6713: loss 2.5284, time 5260.05ms 
iter 6714: loss 2.5454, time 5259.98ms 
iter 6715: loss 2.2693, time 5244.16ms 
iter 6716: loss 2.6908, time 5257.21ms 
iter 6717: loss 2.4172, time 5258.62ms 
iter 6718: loss 2.4865, time 5247.66ms 
iter 6719: loss 2.6214, time 5269.15ms 
iter 6720: loss 2.5574, time 5271.97ms 
iter 6721: loss 2.5315, time 5275.10ms 
iter 6722: loss 2.5646, time 5280.55ms 
iter 6723: loss 2.6087, time 5273.04ms 
iter 6724: loss 2.4523, time 5270.43ms 
iter 6725: loss 2.5677, time 5250.00ms 
iter 6726: loss 2.5151, time 5265.65ms 
iter 6727: loss 2.5902, time 5151.69ms 
iter 6728: loss 2.5452, time 5309.81ms 
iter 6729: loss 2.5464, time 5284.22ms 
iter 6730: loss 2.6264, time 5276.26ms 
iter 6731: loss 2.4477, time 5267.36ms 
iter 6732: loss 2.5270, time 5270.48ms 
iter 6733: loss 2.7231, time 5261.63ms 
iter 6734: loss 2.4572, time 5273.16ms 
iter 6735: loss 2.4707, time 5262.83ms 
iter 6736: loss 2.6750, time 5249.38ms 
iter 6737: loss 2.5554, time 5257.12ms 
iter 6738: loss 2.3973, time 5273.89ms 
iter 6739: loss 2.4742, time 5269.75ms 
iter 6740: loss 2.6028, time 5268.16ms 
iter 6741: loss 2.5251, time 5278.08ms 
iter 6742: loss 2.7010, time 5271.91ms 
iter 6743: loss 2.4490, time 5243.74ms 
iter 6744: loss 2.5955, time 5270.64ms 
iter 6745: loss 2.7279, time 5271.24ms 
iter 6746: loss 2.6200, time 5345.61ms 
iter 6747: loss 2.2248, time 5311.39ms 
iter 6748: loss 2.4135, time 5273.42ms 
iter 6749: loss 2.4834, time 5280.58ms 
step 6750: train loss 2.5171, val loss 2.8244
iter 6750: loss 2.5895, time 20090.43ms 
iter 6751: loss 2.5227, time 5273.30ms 
iter 6752: loss 2.5449, time 5277.67ms 
iter 6753: loss 2.5775, time 5279.85ms 
iter 6754: loss 2.2800, time 5283.69ms 
iter 6755: loss 2.7263, time 5285.26ms 
iter 6756: loss 1.9835, time 5282.81ms 
iter 6757: loss 2.4438, time 5284.22ms 
iter 6758: loss 2.3662, time 5307.33ms 
iter 6759: loss 2.6151, time 5281.76ms 
iter 6760: loss 2.7313, time 5292.90ms 
iter 6761: loss 2.5992, time 5272.22ms 
iter 6762: loss 2.6630, time 5282.94ms 
iter 6763: loss 2.6697, time 5259.59ms 
iter 6764: loss 2.5177, time 5259.49ms 
iter 6765: loss 2.8076, time 5267.81ms 
iter 6766: loss 2.7282, time 5281.04ms 
iter 6767: loss 2.5565, time 5245.19ms 
iter 6768: loss 2.4523, time 5260.82ms 
iter 6769: loss 2.6259, time 5173.26ms 
iter 6770: loss 2.5596, time 5265.77ms 
iter 6771: loss 2.7633, time 5256.22ms 
iter 6772: loss 2.5031, time 5219.27ms 
iter 6773: loss 2.4220, time 5259.58ms 
iter 6774: loss 2.6886, time 5219.51ms 
iter 6775: loss 2.4067, time 5273.40ms 
iter 6776: loss 2.7100, time 5256.65ms 
iter 6777: loss 2.6053, time 5266.66ms 
iter 6778: loss 2.6961, time 5270.15ms 
iter 6779: loss 2.3701, time 5269.26ms 
iter 6780: loss 2.6360, time 5256.69ms 
iter 6781: loss 2.5828, time 5262.66ms 
iter 6782: loss 2.5058, time 5278.28ms 
iter 6783: loss 2.6432, time 5280.85ms 
iter 6784: loss 2.2992, time 5269.08ms 
iter 6785: loss 2.4572, time 5267.86ms 
iter 6786: loss 2.4318, time 5276.71ms 
iter 6787: loss 2.6365, time 5275.57ms 
iter 6788: loss 2.5236, time 5270.46ms 
iter 6789: loss 2.4737, time 5280.53ms 
iter 6790: loss 2.4367, time 5314.97ms 
iter 6791: loss 2.5048, time 5295.65ms 
iter 6792: loss 2.5224, time 5285.08ms 
iter 6793: loss 2.4539, time 5278.65ms 
iter 6794: loss 2.6339, time 5286.57ms 
iter 6795: loss 2.5232, time 5269.90ms 
iter 6796: loss 2.7434, time 5235.32ms 
iter 6797: loss 2.3101, time 5260.07ms 
iter 6798: loss 2.6263, time 5288.89ms 
iter 6799: loss 2.5379, time 5275.07ms 
step 6800: train loss 2.5201, val loss 2.8448
iter 6800: loss 2.6021, time 19979.84ms 
iter 6801: loss 2.6204, time 5238.46ms 
iter 6802: loss 2.4496, time 5267.08ms 
iter 6803: loss 2.7613, time 5269.87ms 
iter 6804: loss 2.4492, time 5260.14ms 
iter 6805: loss 2.5700, time 5266.73ms 
iter 6806: loss 2.5749, time 5260.54ms 
iter 6807: loss 2.6471, time 5268.81ms 
iter 6808: loss 2.6451, time 5257.61ms 
iter 6809: loss 2.5497, time 5257.85ms 
iter 6810: loss 2.4797, time 5261.23ms 
iter 6811: loss 2.3269, time 5279.40ms 
iter 6812: loss 2.3017, time 5260.81ms 
iter 6813: loss 2.5565, time 5257.16ms 
iter 6814: loss 2.7014, time 5277.73ms 
iter 6815: loss 2.2939, time 5285.68ms 
iter 6816: loss 2.4379, time 5268.52ms 
iter 6817: loss 2.4480, time 5267.72ms 
iter 6818: loss 2.6299, time 5272.54ms 
iter 6819: loss 2.5229, time 5271.58ms 
iter 6820: loss 2.7400, time 5261.74ms 
iter 6821: loss 2.3954, time 5264.23ms 
iter 6822: loss 2.7175, time 5260.56ms 
iter 6823: loss 2.6168, time 5305.02ms 
iter 6824: loss 2.4738, time 5264.24ms 
iter 6825: loss 2.4262, time 5269.16ms 
iter 6826: loss 2.2718, time 5264.17ms 
iter 6827: loss 2.5865, time 5273.52ms 
iter 6828: loss 2.5869, time 5293.18ms 
iter 6829: loss 2.5719, time 5268.36ms 
iter 6830: loss 2.3969, time 5269.79ms 
iter 6831: loss 2.4983, time 5272.13ms 
iter 6832: loss 2.6364, time 5258.85ms 
iter 6833: loss 2.4245, time 5260.18ms 
iter 6834: loss 2.4701, time 5265.64ms 
iter 6835: loss 2.3063, time 5287.54ms 
iter 6836: loss 2.4528, time 5275.34ms 
iter 6837: loss 2.5604, time 5271.77ms 
iter 6838: loss 2.5031, time 5267.63ms 
iter 6839: loss 2.4901, time 5322.55ms 
iter 6840: loss 2.5131, time 5317.50ms 
iter 6841: loss 2.6399, time 5293.78ms 
iter 6842: loss 2.5559, time 5345.16ms 
iter 6843: loss 2.4291, time 5344.04ms 
iter 6844: loss 2.3698, time 5328.35ms 
iter 6845: loss 2.6316, time 5325.78ms 
iter 6846: loss 2.6036, time 5335.47ms 
iter 6847: loss 2.4635, time 5280.98ms 
iter 6848: loss 2.3604, time 5276.79ms 
iter 6849: loss 2.4455, time 5262.61ms 
step 6850: train loss 2.5264, val loss 2.8436
iter 6850: loss 2.6440, time 20096.70ms 
iter 6851: loss 2.8344, time 5332.54ms 
iter 6852: loss 2.9213, time 5287.05ms 
iter 6853: loss 2.2709, time 5288.57ms 
iter 6854: loss 2.4814, time 5308.07ms 
iter 6855: loss 2.5482, time 5280.24ms 
iter 6856: loss 2.6747, time 5268.43ms 
iter 6857: loss 2.4855, time 5235.35ms 
iter 6858: loss 2.5606, time 5274.98ms 
iter 6859: loss 2.4973, time 5261.02ms 
iter 6860: loss 2.5556, time 5257.96ms 
iter 6861: loss 2.6537, time 5276.17ms 
iter 6862: loss 2.4999, time 5259.80ms 
iter 6863: loss 2.5747, time 5254.91ms 
iter 6864: loss 2.4638, time 5265.86ms 
iter 6865: loss 2.4252, time 5267.36ms 
iter 6866: loss 2.4091, time 5258.35ms 
iter 6867: loss 2.4919, time 5257.06ms 
iter 6868: loss 2.4028, time 5264.93ms 
iter 6869: loss 2.6837, time 5271.97ms 
iter 6870: loss 2.5111, time 5256.93ms 
iter 6871: loss 2.4864, time 5257.78ms 
iter 6872: loss 2.5149, time 5263.16ms 
iter 6873: loss 2.4295, time 5266.89ms 
iter 6874: loss 2.5798, time 5264.12ms 
iter 6875: loss 2.6300, time 5257.49ms 
iter 6876: loss 2.3621, time 5267.29ms 
iter 6877: loss 2.7783, time 5267.45ms 
iter 6878: loss 2.4678, time 5257.74ms 
iter 6879: loss 2.4609, time 5264.31ms 
iter 6880: loss 2.4990, time 5263.84ms 
iter 6881: loss 2.3763, time 5309.57ms 
iter 6882: loss 2.3194, time 5292.11ms 
iter 6883: loss 2.3520, time 5257.88ms 
iter 6884: loss 2.5163, time 5247.06ms 
iter 6885: loss 2.5035, time 5267.53ms 
iter 6886: loss 2.5909, time 5257.76ms 
iter 6887: loss 2.3281, time 5265.43ms 
iter 6888: loss 2.6466, time 5263.05ms 
iter 6889: loss 2.4805, time 5268.97ms 
iter 6890: loss 2.4784, time 5259.77ms 
iter 6891: loss 2.4744, time 5267.93ms 
iter 6892: loss 2.4362, time 5267.50ms 
iter 6893: loss 2.4068, time 5263.06ms 
iter 6894: loss 2.3211, time 5261.05ms 
iter 6895: loss 2.7776, time 5261.24ms 
iter 6896: loss 2.5067, time 5271.78ms 
iter 6897: loss 2.6395, time 5274.43ms 
iter 6898: loss 2.4930, time 5273.32ms 
iter 6899: loss 2.4440, time 5272.42ms 
step 6900: train loss 2.5129, val loss 2.8461
iter 6900: loss 2.5974, time 20118.02ms 
iter 6901: loss 2.6189, time 5285.16ms 
iter 6902: loss 2.5274, time 5281.87ms 
iter 6903: loss 2.5853, time 5284.58ms 
iter 6904: loss 2.4578, time 5287.01ms 
iter 6905: loss 2.5234, time 5270.22ms 
iter 6906: loss 2.5701, time 5260.17ms 
iter 6907: loss 2.6119, time 5256.76ms 
iter 6908: loss 2.6447, time 5260.03ms 
iter 6909: loss 2.6231, time 5269.83ms 
iter 6910: loss 2.7657, time 5262.06ms 
iter 6911: loss 2.5268, time 5260.61ms 
iter 6912: loss 2.7163, time 5257.57ms 
iter 6913: loss 2.6301, time 5275.58ms 
iter 6914: loss 2.3618, time 5268.24ms 
iter 6915: loss 2.2917, time 5272.92ms 
iter 6916: loss 2.5587, time 5270.05ms 
iter 6917: loss 2.6929, time 5277.53ms 
iter 6918: loss 2.5505, time 5264.77ms 
iter 6919: loss 2.4603, time 5249.47ms 
iter 6920: loss 2.5837, time 5283.42ms 
iter 6921: loss 2.6689, time 5356.98ms 
iter 6922: loss 2.6376, time 5289.80ms 
iter 6923: loss 2.4927, time 5286.03ms 
iter 6924: loss 2.3245, time 5275.69ms 
iter 6925: loss 2.6080, time 5294.94ms 
iter 6926: loss 2.4374, time 5328.80ms 
iter 6927: loss 2.4308, time 5264.42ms 
iter 6928: loss 2.6476, time 5268.24ms 
iter 6929: loss 2.4292, time 5271.81ms 
iter 6930: loss 2.4485, time 5237.84ms 
iter 6931: loss 2.5150, time 5257.50ms 
iter 6932: loss 2.6124, time 5312.21ms 
iter 6933: loss 2.4053, time 5258.73ms 
iter 6934: loss 2.4682, time 5327.67ms 
iter 6935: loss 2.4536, time 5301.51ms 
iter 6936: loss 2.6329, time 5306.62ms 
iter 6937: loss 2.4951, time 5270.84ms 
iter 6938: loss 2.3919, time 5261.87ms 
iter 6939: loss 2.5305, time 5259.38ms 
iter 6940: loss 2.4442, time 5284.25ms 
iter 6941: loss 2.4055, time 5276.25ms 
iter 6942: loss 2.4599, time 5274.92ms 
iter 6943: loss 2.4819, time 5249.62ms 
iter 6944: loss 2.4632, time 5282.28ms 
iter 6945: loss 2.7236, time 5274.52ms 
iter 6946: loss 2.7105, time 5271.29ms 
iter 6947: loss 2.5354, time 5271.43ms 
iter 6948: loss 2.4388, time 5275.35ms 
iter 6949: loss 2.6132, time 5306.67ms 
step 6950: train loss 2.5305, val loss 2.8498
iter 6950: loss 2.3476, time 20178.67ms 
iter 6951: loss 2.5693, time 5346.80ms 
iter 6952: loss 2.5097, time 5295.63ms 
iter 6953: loss 2.3242, time 5266.12ms 
iter 6954: loss 2.7849, time 5264.42ms 
iter 6955: loss 2.5200, time 5273.34ms 
iter 6956: loss 2.5333, time 5286.84ms 
iter 6957: loss 2.5374, time 5268.49ms 
iter 6958: loss 2.5217, time 5274.50ms 
iter 6959: loss 2.4881, time 5189.61ms 
iter 6960: loss 2.6222, time 5226.84ms 
iter 6961: loss 2.5341, time 5156.54ms 
iter 6962: loss 2.6209, time 5201.21ms 
iter 6963: loss 2.6416, time 5163.00ms 
iter 6964: loss 2.3216, time 5147.61ms 
iter 6965: loss 2.3761, time 5229.06ms 
iter 6966: loss 2.5922, time 5238.38ms 
iter 6967: loss 2.4911, time 5174.51ms 
iter 6968: loss 2.4899, time 5172.47ms 
iter 6969: loss 2.3712, time 5164.85ms 
iter 6970: loss 2.5673, time 5266.46ms 
iter 6971: loss 2.5370, time 5266.02ms 
iter 6972: loss 2.4079, time 5270.66ms 
iter 6973: loss 2.6216, time 5259.03ms 
iter 6974: loss 2.5131, time 5194.95ms 
iter 6975: loss 2.2998, time 5234.19ms 
iter 6976: loss 2.4821, time 5155.81ms 
iter 6977: loss 2.4150, time 5208.68ms 
iter 6978: loss 2.6997, time 5265.01ms 
iter 6979: loss 2.3828, time 5306.95ms 
iter 6980: loss 2.2704, time 5263.70ms 
iter 6981: loss 2.4001, time 5276.77ms 
iter 6982: loss 2.5134, time 5263.13ms 
iter 6983: loss 2.4401, time 5280.14ms 
iter 6984: loss 2.4610, time 5263.46ms 
iter 6985: loss 2.6226, time 5260.95ms 
iter 6986: loss 2.3539, time 5235.81ms 
iter 6987: loss 2.5094, time 5271.84ms 
iter 6988: loss 2.5720, time 5267.31ms 
iter 6989: loss 2.5039, time 5264.31ms 
iter 6990: loss 2.6194, time 5322.11ms 
iter 6991: loss 2.5437, time 5343.76ms 
iter 6992: loss 2.4051, time 5292.88ms 
iter 6993: loss 2.3784, time 5229.87ms 
iter 6994: loss 2.5267, time 5180.13ms 
iter 6995: loss 2.6563, time 5201.94ms 
iter 6996: loss 2.3484, time 5252.18ms 
iter 6997: loss 2.5628, time 5252.76ms 
iter 6998: loss 2.4935, time 5251.27ms 
iter 6999: loss 2.5973, time 5244.61ms 
step 7000: train loss 2.5002, val loss 2.8424
iter 7000: loss 2.5965, time 19970.20ms 
iter 7001: loss 2.5735, time 5220.93ms 
iter 7002: loss 2.5743, time 5201.48ms 
iter 7003: loss 2.6372, time 5236.51ms 
iter 7004: loss 2.4476, time 5247.06ms 
iter 7005: loss 2.4616, time 5225.64ms 
iter 7006: loss 2.2266, time 5205.62ms 
iter 7007: loss 2.5742, time 5197.41ms 
iter 7008: loss 2.3822, time 5208.93ms 
iter 7009: loss 2.5356, time 5139.73ms 
iter 7010: loss 2.4387, time 5156.55ms 
iter 7011: loss 2.5908, time 5173.95ms 
iter 7012: loss 2.5265, time 5206.05ms 
iter 7013: loss 2.3535, time 5264.68ms 
iter 7014: loss 2.5258, time 5265.20ms 
iter 7015: loss 2.3934, time 5270.29ms 
iter 7016: loss 2.3397, time 5270.23ms 
iter 7017: loss 2.4317, time 5222.80ms 
iter 7018: loss 2.7011, time 5258.89ms 
iter 7019: loss 2.6801, time 5267.30ms 
iter 7020: loss 2.5725, time 5265.63ms 
iter 7021: loss 2.5751, time 5264.27ms 
iter 7022: loss 2.4966, time 5272.68ms 
iter 7023: loss 2.5220, time 5294.20ms 
iter 7024: loss 2.5998, time 5283.58ms 
iter 7025: loss 2.4227, time 5262.18ms 
iter 7026: loss 2.5483, time 5267.80ms 
iter 7027: loss 2.4908, time 5252.46ms 
iter 7028: loss 2.5634, time 5265.07ms 
iter 7029: loss 2.5784, time 5261.69ms 
iter 7030: loss 2.5620, time 5171.80ms 
iter 7031: loss 2.5983, time 5188.51ms 
iter 7032: loss 2.5097, time 5219.52ms 
iter 7033: loss 2.5005, time 5257.14ms 
iter 7034: loss 2.6440, time 5264.00ms 
iter 7035: loss 2.4296, time 5270.34ms 
iter 7036: loss 2.6269, time 5269.23ms 
iter 7037: loss 2.3628, time 5255.59ms 
iter 7038: loss 2.5218, time 5331.20ms 
iter 7039: loss 2.6041, time 5331.80ms 
iter 7040: loss 2.7199, time 5303.38ms 
iter 7041: loss 2.6536, time 5283.06ms 
iter 7042: loss 2.4511, time 5278.52ms 
iter 7043: loss 2.5898, time 5306.27ms 
iter 7044: loss 2.3488, time 5299.57ms 
iter 7045: loss 2.5513, time 5329.89ms 
iter 7046: loss 2.6187, time 5278.86ms 
iter 7047: loss 2.6490, time 5284.97ms 
iter 7048: loss 2.5340, time 5310.04ms 
iter 7049: loss 2.7429, time 5285.78ms 
step 7050: train loss 2.5156, val loss 2.8536
iter 7050: loss 2.6953, time 20049.81ms 
iter 7051: loss 2.2669, time 5325.38ms 
iter 7052: loss 2.3616, time 5308.57ms 
iter 7053: loss 2.5548, time 5348.06ms 
iter 7054: loss 2.6568, time 5348.09ms 
iter 7055: loss 2.4540, time 5305.02ms 
iter 7056: loss 2.4770, time 5266.35ms 
iter 7057: loss 2.5642, time 5264.23ms 
iter 7058: loss 2.4473, time 5270.81ms 
iter 7059: loss 2.5834, time 5272.96ms 
iter 7060: loss 2.7398, time 5270.85ms 
iter 7061: loss 2.6396, time 5275.39ms 
iter 7062: loss 2.6846, time 5261.44ms 
iter 7063: loss 2.4485, time 5275.92ms 
iter 7064: loss 2.5745, time 5308.83ms 
iter 7065: loss 2.6538, time 5308.70ms 
iter 7066: loss 2.5105, time 5281.00ms 
iter 7067: loss 2.4563, time 5313.64ms 
iter 7068: loss 2.5362, time 5272.82ms 
iter 7069: loss 2.5004, time 5278.41ms 
iter 7070: loss 2.6549, time 5265.88ms 
iter 7071: loss 2.7437, time 5262.79ms 
iter 7072: loss 2.5484, time 5230.96ms 
iter 7073: loss 2.3413, time 5256.53ms 
iter 7074: loss 2.6395, time 5268.61ms 
iter 7075: loss 2.5733, time 5265.89ms 
iter 7076: loss 2.3667, time 5263.16ms 
iter 7077: loss 2.7543, time 5264.65ms 
iter 7078: loss 2.5266, time 5269.30ms 
iter 7079: loss 2.4912, time 5266.42ms 
iter 7080: loss 2.2775, time 5263.52ms 
iter 7081: loss 2.4754, time 5255.24ms 
iter 7082: loss 2.5074, time 5279.01ms 
iter 7083: loss 2.6340, time 5275.80ms 
iter 7084: loss 2.5162, time 5267.31ms 
iter 7085: loss 2.6239, time 5263.31ms 
iter 7086: loss 2.5869, time 5278.99ms 
iter 7087: loss 2.5447, time 5265.47ms 
iter 7088: loss 2.5868, time 5257.27ms 
iter 7089: loss 2.5109, time 5254.08ms 
iter 7090: loss 2.5444, time 5274.45ms 
iter 7091: loss 2.4708, time 5255.95ms 
iter 7092: loss 2.7215, time 5254.27ms 
iter 7093: loss 2.5877, time 5259.56ms 
iter 7094: loss 2.2877, time 5271.04ms 
iter 7095: loss 2.6065, time 5262.06ms 
iter 7096: loss 2.6796, time 5255.99ms 
iter 7097: loss 2.4790, time 5289.39ms 
iter 7098: loss 2.4546, time 5346.03ms 
iter 7099: loss 2.5568, time 5346.07ms 
step 7100: train loss 2.5178, val loss 2.8311
iter 7100: loss 2.6697, time 20164.42ms 
iter 7101: loss 2.4968, time 5311.18ms 
iter 7102: loss 2.4868, time 5266.51ms 
iter 7103: loss 2.6041, time 5295.19ms 
iter 7104: loss 2.4534, time 5300.43ms 
iter 7105: loss 2.4790, time 5297.53ms 
iter 7106: loss 2.5514, time 5271.87ms 
iter 7107: loss 2.4107, time 5277.32ms 
iter 7108: loss 2.4690, time 5268.67ms 
iter 7109: loss 2.3870, time 5331.59ms 
iter 7110: loss 2.4404, time 5306.25ms 
iter 7111: loss 2.6020, time 5272.09ms 
iter 7112: loss 2.6904, time 5300.78ms 
iter 7113: loss 2.5168, time 5318.15ms 
iter 7114: loss 2.3268, time 5313.95ms 
iter 7115: loss 2.8231, time 5323.75ms 
iter 7116: loss 2.5868, time 5332.44ms 
iter 7117: loss 2.6292, time 5309.04ms 
iter 7118: loss 2.2861, time 5347.04ms 
iter 7119: loss 2.4883, time 5338.03ms 
iter 7120: loss 2.5543, time 5287.59ms 
iter 7121: loss 2.4355, time 5262.10ms 
iter 7122: loss 2.5164, time 5259.18ms 
iter 7123: loss 2.5685, time 5258.78ms 
iter 7124: loss 2.3576, time 5268.83ms 
iter 7125: loss 2.2522, time 5273.78ms 
iter 7126: loss 2.6283, time 5273.69ms 
iter 7127: loss 2.4497, time 5265.78ms 
iter 7128: loss 2.3831, time 5266.18ms 
iter 7129: loss 2.5373, time 5265.78ms 
iter 7130: loss 2.5150, time 5261.94ms 
iter 7131: loss 2.5504, time 5256.25ms 
iter 7132: loss 2.5822, time 5256.75ms 
iter 7133: loss 2.5805, time 5273.24ms 
iter 7134: loss 2.6006, time 5257.58ms 
iter 7135: loss 2.3437, time 5235.44ms 
iter 7136: loss 2.5979, time 5249.50ms 
iter 7137: loss 2.5889, time 5322.55ms 
iter 7138: loss 2.6736, time 5345.67ms 
iter 7139: loss 2.5782, time 5319.10ms 
iter 7140: loss 2.4654, time 5269.83ms 
iter 7141: loss 2.3778, time 5289.83ms 
iter 7142: loss 2.4311, time 5282.31ms 
iter 7143: loss 2.3033, time 5277.68ms 
iter 7144: loss 2.4316, time 5271.54ms 
iter 7145: loss 2.5828, time 5285.33ms 
iter 7146: loss 2.4976, time 5274.57ms 
iter 7147: loss 2.4246, time 5293.80ms 
iter 7148: loss 2.6259, time 5306.76ms 
iter 7149: loss 2.4697, time 5273.97ms 
step 7150: train loss 2.5033, val loss 2.8528
iter 7150: loss 2.6084, time 20070.01ms 
iter 7151: loss 2.5219, time 5270.72ms 
iter 7152: loss 2.4991, time 5272.87ms 
iter 7153: loss 2.4354, time 5276.38ms 
iter 7154: loss 2.5575, time 5283.18ms 
iter 7155: loss 2.4285, time 5283.21ms 
iter 7156: loss 2.4576, time 5267.58ms 
iter 7157: loss 2.6071, time 5283.20ms 
iter 7158: loss 2.4536, time 5264.80ms 
iter 7159: loss 2.4393, time 5248.96ms 
iter 7160: loss 2.4357, time 5262.61ms 
iter 7161: loss 2.6971, time 5272.61ms 
iter 7162: loss 2.6607, time 5253.93ms 
iter 7163: loss 2.4398, time 5264.72ms 
iter 7164: loss 2.2409, time 5263.63ms 
iter 7165: loss 2.5001, time 5317.52ms 
iter 7166: loss 2.4084, time 5300.09ms 
iter 7167: loss 2.5401, time 5272.22ms 
iter 7168: loss 2.4485, time 5278.65ms 
iter 7169: loss 2.2141, time 5289.69ms 
iter 7170: loss 2.4224, time 5279.12ms 
iter 7171: loss 2.3975, time 5270.48ms 
iter 7172: loss 2.5160, time 5275.37ms 
iter 7173: loss 2.4785, time 5319.49ms 
iter 7174: loss 2.3004, time 5312.52ms 
iter 7175: loss 2.4513, time 5330.90ms 
iter 7176: loss 2.5654, time 5272.11ms 
iter 7177: loss 2.3667, time 5273.23ms 
iter 7178: loss 2.2311, time 5276.67ms 
iter 7179: loss 2.4751, time 5252.15ms 
iter 7180: loss 2.6992, time 5268.58ms 
iter 7181: loss 2.4414, time 5210.26ms 
iter 7182: loss 2.5602, time 5179.08ms 
iter 7183: loss 2.5081, time 5213.99ms 
iter 7184: loss 2.4891, time 5231.34ms 
iter 7185: loss 2.5848, time 5227.84ms 
iter 7186: loss 2.7215, time 5192.62ms 
iter 7187: loss 2.4355, time 5182.55ms 
iter 7188: loss 2.4835, time 5143.03ms 
iter 7189: loss 2.6702, time 5193.29ms 
iter 7190: loss 2.5549, time 5153.70ms 
iter 7191: loss 2.4333, time 5199.80ms 
iter 7192: loss 2.7678, time 5194.65ms 
iter 7193: loss 2.5736, time 5210.95ms 
iter 7194: loss 2.3685, time 5182.22ms 
iter 7195: loss 2.7755, time 5189.11ms 
iter 7196: loss 2.6037, time 5141.09ms 
iter 7197: loss 2.5302, time 5125.99ms 
iter 7198: loss 2.4319, time 5107.15ms 
iter 7199: loss 2.6902, time 5155.02ms 
step 7200: train loss 2.5120, val loss 2.8628
iter 7200: loss 2.3948, time 20052.68ms 
iter 7201: loss 2.2456, time 5273.00ms 
iter 7202: loss 2.4749, time 5267.79ms 
iter 7203: loss 2.4105, time 5274.74ms 
iter 7204: loss 2.4415, time 5283.60ms 
iter 7205: loss 2.5009, time 5274.64ms 
iter 7206: loss 2.4697, time 5260.07ms 
iter 7207: loss 2.4337, time 5271.89ms 
iter 7208: loss 2.4108, time 5248.56ms 
iter 7209: loss 2.6312, time 5268.23ms 
iter 7210: loss 2.3979, time 5269.95ms 
iter 7211: loss 2.3683, time 5274.00ms 
iter 7212: loss 2.2246, time 5262.68ms 
iter 7213: loss 2.4018, time 5266.64ms 
iter 7214: loss 2.4263, time 5270.35ms 
iter 7215: loss 2.6049, time 5279.14ms 
iter 7216: loss 2.5171, time 5271.23ms 
iter 7217: loss 2.2431, time 5269.91ms 
iter 7218: loss 2.5142, time 5275.71ms 
iter 7219: loss 2.5754, time 5268.25ms 
iter 7220: loss 2.5056, time 5275.41ms 
iter 7221: loss 2.5818, time 5260.52ms 
iter 7222: loss 2.5518, time 5272.52ms 
iter 7223: loss 2.5377, time 5273.74ms 
iter 7224: loss 2.3250, time 5278.18ms 
iter 7225: loss 2.7136, time 5261.49ms 
iter 7226: loss 2.3242, time 5267.50ms 
iter 7227: loss 2.6612, time 5267.05ms 
iter 7228: loss 2.6182, time 5258.17ms 
iter 7229: loss 2.5845, time 5261.33ms 
iter 7230: loss 2.4084, time 5269.98ms 
iter 7231: loss 2.6672, time 5265.28ms 
iter 7232: loss 2.6618, time 5268.53ms 
iter 7233: loss 2.4749, time 5276.47ms 
iter 7234: loss 2.4154, time 5229.89ms 
iter 7235: loss 2.6112, time 5243.65ms 
iter 7236: loss 2.6148, time 5257.49ms 
iter 7237: loss 2.3982, time 5277.74ms 
iter 7238: loss 2.3175, time 5264.13ms 
iter 7239: loss 2.5644, time 5259.25ms 
iter 7240: loss 2.3757, time 5268.85ms 
iter 7241: loss 2.4719, time 5282.18ms 
iter 7242: loss 2.5641, time 5267.54ms 
iter 7243: loss 2.5121, time 5270.19ms 
iter 7244: loss 2.5100, time 5271.76ms 
iter 7245: loss 2.5610, time 5264.28ms 
iter 7246: loss 2.4042, time 5266.54ms 
iter 7247: loss 2.4065, time 5279.52ms 
iter 7248: loss 2.5137, time 5267.70ms 
iter 7249: loss 2.2046, time 5266.97ms 
step 7250: train loss 2.5033, val loss 2.8456
iter 7250: loss 2.4741, time 20022.94ms 
iter 7251: loss 2.1048, time 5278.45ms 
iter 7252: loss 2.4312, time 5282.06ms 
iter 7253: loss 2.5226, time 5260.43ms 
iter 7254: loss 2.7110, time 5260.73ms 
iter 7255: loss 2.5044, time 5254.69ms 
iter 7256: loss 2.3890, time 5269.64ms 
iter 7257: loss 2.5230, time 5263.83ms 
iter 7258: loss 2.5815, time 5154.99ms 
iter 7259: loss 2.4548, time 5038.97ms 
iter 7260: loss 2.2781, time 5031.51ms 
iter 7261: loss 2.4000, time 5215.45ms 
iter 7262: loss 2.3507, time 5266.75ms 
iter 7263: loss 2.4337, time 5274.77ms 
iter 7264: loss 2.6800, time 5289.19ms 
iter 7265: loss 2.3670, time 5268.11ms 
iter 7266: loss 2.4524, time 5275.15ms 
iter 7267: loss 2.3285, time 5266.40ms 
iter 7268: loss 2.3367, time 5273.96ms 
iter 7269: loss 2.6541, time 5262.18ms 
iter 7270: loss 2.4670, time 5267.15ms 
iter 7271: loss 2.6122, time 5259.05ms 
iter 7272: loss 2.5575, time 5279.37ms 
iter 7273: loss 2.3305, time 5262.57ms 
iter 7274: loss 2.3535, time 5270.03ms 
iter 7275: loss 2.5045, time 5286.82ms 
iter 7276: loss 2.4905, time 5200.39ms 
iter 7277: loss 2.5886, time 5266.45ms 
iter 7278: loss 2.4162, time 5309.30ms 
iter 7279: loss 2.4856, time 5286.57ms 
iter 7280: loss 2.7382, time 5271.24ms 
iter 7281: loss 2.6057, time 5268.55ms 
iter 7282: loss 2.4213, time 5270.45ms 
iter 7283: loss 2.4561, time 5265.78ms 
iter 7284: loss 2.5539, time 5292.66ms 
iter 7285: loss 2.4564, time 5281.16ms 
iter 7286: loss 2.4991, time 5287.81ms 
iter 7287: loss 2.6471, time 5273.90ms 
iter 7288: loss 2.5812, time 5291.97ms 
iter 7289: loss 2.6073, time 5279.77ms 
iter 7290: loss 2.5735, time 5261.47ms 
iter 7291: loss 2.2805, time 5266.76ms 
iter 7292: loss 2.3019, time 5269.10ms 
iter 7293: loss 2.5942, time 5262.33ms 
iter 7294: loss 2.6368, time 5262.85ms 
iter 7295: loss 2.5203, time 5283.35ms 
iter 7296: loss 2.7310, time 5281.85ms 
iter 7297: loss 2.5785, time 5276.92ms 
iter 7298: loss 2.5129, time 5281.51ms 
iter 7299: loss 2.6747, time 5285.49ms 
step 7300: train loss 2.5073, val loss 2.8470
iter 7300: loss 2.5530, time 20087.40ms 
iter 7301: loss 2.8114, time 5277.64ms 
iter 7302: loss 2.4393, time 5260.86ms 
iter 7303: loss 2.4210, time 5278.48ms 
iter 7304: loss 2.6093, time 5278.01ms 
iter 7305: loss 2.5672, time 5259.16ms 
iter 7306: loss 2.7002, time 5259.08ms 
iter 7307: loss 2.3254, time 5275.43ms 
iter 7308: loss 2.5029, time 5267.57ms 
iter 7309: loss 2.7183, time 5266.22ms 
iter 7310: loss 2.5747, time 5265.62ms 
iter 7311: loss 2.4386, time 5275.06ms 
iter 7312: loss 2.4306, time 5261.83ms 
iter 7313: loss 2.6827, time 5260.86ms 
iter 7314: loss 2.3338, time 5264.92ms 
iter 7315: loss 2.4111, time 5266.68ms 
iter 7316: loss 2.3614, time 5268.61ms 
iter 7317: loss 2.5984, time 5268.78ms 
iter 7318: loss 2.1480, time 5272.62ms 
iter 7319: loss 2.2823, time 5271.18ms 
iter 7320: loss 2.4656, time 5265.08ms 
iter 7321: loss 2.5379, time 5276.39ms 
iter 7322: loss 2.4995, time 5279.79ms 
iter 7323: loss 2.4368, time 5272.75ms 
iter 7324: loss 2.3197, time 5257.98ms 
iter 7325: loss 2.2464, time 5259.80ms 
iter 7326: loss 2.6786, time 5283.72ms 
iter 7327: loss 2.8300, time 5255.86ms 
iter 7328: loss 2.4728, time 5288.77ms 
iter 7329: loss 2.4382, time 5270.55ms 
iter 7330: loss 2.4344, time 5285.59ms 
iter 7331: loss 2.3841, time 5274.06ms 
iter 7332: loss 2.5963, time 5273.23ms 
iter 7333: loss 2.4162, time 5272.23ms 
iter 7334: loss 2.7896, time 5285.00ms 
iter 7335: loss 2.4914, time 5276.16ms 
iter 7336: loss 2.4327, time 5280.95ms 
iter 7337: loss 2.7076, time 5281.62ms 
iter 7338: loss 2.6141, time 5276.61ms 
iter 7339: loss 2.2882, time 5262.98ms 
iter 7340: loss 2.2830, time 5257.72ms 
iter 7341: loss 2.6220, time 5235.68ms 
iter 7342: loss 2.3720, time 5260.54ms 
iter 7343: loss 2.5956, time 5226.16ms 
iter 7344: loss 2.4592, time 5256.81ms 
iter 7345: loss 2.5337, time 5283.53ms 
iter 7346: loss 2.8313, time 5260.09ms 
iter 7347: loss 2.4480, time 5260.69ms 
iter 7348: loss 2.4884, time 5269.26ms 
iter 7349: loss 2.5502, time 5296.05ms 
step 7350: train loss 2.5105, val loss 2.8316
iter 7350: loss 2.6743, time 20106.63ms 
iter 7351: loss 2.2969, time 5268.86ms 
iter 7352: loss 2.5578, time 5274.75ms 
iter 7353: loss 2.3855, time 5281.38ms 
iter 7354: loss 2.5288, time 5269.58ms 
iter 7355: loss 2.3482, time 5271.59ms 
iter 7356: loss 2.7781, time 5273.53ms 
iter 7357: loss 2.3959, time 5268.17ms 
iter 7358: loss 2.6138, time 5262.24ms 
iter 7359: loss 2.4924, time 5263.31ms 
iter 7360: loss 2.8085, time 5274.13ms 
iter 7361: loss 2.5442, time 5257.82ms 
iter 7362: loss 2.4582, time 5268.28ms 
iter 7363: loss 2.4714, time 5274.07ms 
iter 7364: loss 2.4689, time 5273.54ms 
iter 7365: loss 2.2494, time 5260.49ms 
iter 7366: loss 2.3706, time 5265.64ms 
iter 7367: loss 2.7399, time 5279.66ms 
iter 7368: loss 2.5578, time 5270.46ms 
iter 7369: loss 2.4175, time 5267.41ms 
iter 7370: loss 2.3529, time 5262.32ms 
iter 7371: loss 2.6482, time 5273.60ms 
iter 7372: loss 2.5860, time 5262.67ms 
iter 7373: loss 2.4217, time 5257.48ms 
iter 7374: loss 2.4815, time 5257.76ms 
iter 7375: loss 2.5553, time 5276.84ms 
iter 7376: loss 2.3106, time 5273.89ms 
iter 7377: loss 2.3903, time 5268.61ms 
iter 7378: loss 2.7273, time 5276.63ms 
iter 7379: loss 2.5766, time 5270.81ms 
iter 7380: loss 2.6155, time 5272.34ms 
iter 7381: loss 2.4121, time 5262.42ms 
iter 7382: loss 2.5459, time 5261.83ms 
iter 7383: loss 2.4383, time 5265.02ms 
iter 7384: loss 2.5357, time 5261.67ms 
iter 7385: loss 2.5112, time 5271.79ms 
iter 7386: loss 2.4886, time 5265.40ms 
iter 7387: loss 2.4801, time 5256.63ms 
iter 7388: loss 2.6047, time 5255.46ms 
iter 7389: loss 2.7091, time 5269.01ms 
iter 7390: loss 2.3777, time 5279.98ms 
iter 7391: loss 2.6126, time 5271.19ms 
iter 7392: loss 2.3711, time 5274.26ms 
iter 7393: loss 2.7188, time 5277.39ms 
iter 7394: loss 2.5034, time 5261.51ms 
iter 7395: loss 2.5208, time 5276.11ms 
iter 7396: loss 2.5463, time 5271.69ms 
iter 7397: loss 2.3953, time 5275.82ms 
iter 7398: loss 2.4417, time 5264.33ms 
iter 7399: loss 2.4426, time 5264.95ms 
step 7400: train loss 2.5085, val loss 2.8662
iter 7400: loss 2.3944, time 20071.98ms 
iter 7401: loss 2.5191, time 5265.41ms 
iter 7402: loss 2.7255, time 5269.86ms 
iter 7403: loss 2.4231, time 5266.93ms 
iter 7404: loss 2.4578, time 5278.37ms 
iter 7405: loss 2.6495, time 5271.66ms 
iter 7406: loss 2.3160, time 5273.50ms 
iter 7407: loss 2.5232, time 5261.49ms 
iter 7408: loss 2.3285, time 5268.76ms 
iter 7409: loss 2.5439, time 5269.01ms 
iter 7410: loss 2.4619, time 5257.60ms 
iter 7411: loss 2.5915, time 5268.49ms 
iter 7412: loss 2.5260, time 5278.61ms 
iter 7413: loss 2.3232, time 5227.83ms 
iter 7414: loss 2.3652, time 5264.29ms 
iter 7415: loss 2.2229, time 5263.81ms 
iter 7416: loss 2.2642, time 5268.39ms 
iter 7417: loss 2.6431, time 5267.54ms 
iter 7418: loss 2.4874, time 5259.64ms 
iter 7419: loss 2.5741, time 5255.29ms 
iter 7420: loss 2.5536, time 5265.41ms 
iter 7421: loss 2.2856, time 5273.10ms 
iter 7422: loss 2.5916, time 5271.61ms 
iter 7423: loss 2.5575, time 5261.02ms 
iter 7424: loss 2.6657, time 5277.66ms 
iter 7425: loss 2.5608, time 5269.68ms 
iter 7426: loss 2.4600, time 5264.73ms 
iter 7427: loss 2.6981, time 5262.75ms 
iter 7428: loss 2.5922, time 5270.47ms 
iter 7429: loss 2.6278, time 5266.46ms 
iter 7430: loss 2.2198, time 5264.40ms 
iter 7431: loss 2.6879, time 5266.11ms 
iter 7432: loss 2.4372, time 5271.52ms 
iter 7433: loss 2.7171, time 5266.71ms 
iter 7434: loss 2.6725, time 5226.44ms 
iter 7435: loss 2.5027, time 5270.48ms 
iter 7436: loss 2.4151, time 5264.59ms 
iter 7437: loss 2.2467, time 5258.88ms 
iter 7438: loss 2.4962, time 5264.39ms 
iter 7439: loss 2.4359, time 5276.18ms 
iter 7440: loss 2.5976, time 5261.61ms 
iter 7441: loss 2.4571, time 5258.72ms 
iter 7442: loss 2.2931, time 5205.01ms 
iter 7443: loss 2.4789, time 5260.80ms 
iter 7444: loss 2.3466, time 5265.11ms 
iter 7445: loss 2.6085, time 5283.85ms 
iter 7446: loss 2.5821, time 5267.92ms 
iter 7447: loss 2.3942, time 5260.88ms 
iter 7448: loss 2.6600, time 5266.49ms 
iter 7449: loss 2.5792, time 5270.01ms 
step 7450: train loss 2.5078, val loss 2.8348
iter 7450: loss 2.3009, time 19984.58ms 
iter 7451: loss 2.3249, time 5268.41ms 
iter 7452: loss 2.4782, time 5256.43ms 
iter 7453: loss 2.5897, time 5247.70ms 
iter 7454: loss 2.6210, time 5262.02ms 
iter 7455: loss 2.5575, time 5260.08ms 
iter 7456: loss 2.3578, time 5258.91ms 
iter 7457: loss 2.6878, time 5266.30ms 
iter 7458: loss 2.7343, time 5273.54ms 
iter 7459: loss 2.4475, time 5257.88ms 
iter 7460: loss 2.6528, time 5267.95ms 
iter 7461: loss 2.4683, time 5268.30ms 
iter 7462: loss 2.5103, time 5267.83ms 
iter 7463: loss 2.5557, time 5233.48ms 
iter 7464: loss 2.5535, time 5266.06ms 
iter 7465: loss 2.3042, time 5263.64ms 
iter 7466: loss 2.6097, time 5251.73ms 
iter 7467: loss 2.8185, time 5257.93ms 
iter 7468: loss 2.5891, time 5269.26ms 
iter 7469: loss 2.5759, time 5263.29ms 
iter 7470: loss 2.6303, time 5254.68ms 
iter 7471: loss 2.4836, time 5247.38ms 
iter 7472: loss 2.5997, time 5261.55ms 
iter 7473: loss 2.5965, time 5277.74ms 
iter 7474: loss 2.6632, time 5266.40ms 
iter 7475: loss 2.6403, time 5267.98ms 
iter 7476: loss 2.4479, time 5268.61ms 
iter 7477: loss 2.2581, time 5263.06ms 
iter 7478: loss 2.4973, time 5264.42ms 
iter 7479: loss 2.4372, time 5270.06ms 
iter 7480: loss 2.5101, time 5272.35ms 
iter 7481: loss 2.3827, time 5264.08ms 
iter 7482: loss 2.4344, time 5268.04ms 
iter 7483: loss 2.6187, time 5280.14ms 
iter 7484: loss 2.5108, time 5279.00ms 
iter 7485: loss 2.3954, time 5272.16ms 
iter 7486: loss 2.5723, time 5264.90ms 
iter 7487: loss 2.2985, time 5275.89ms 
iter 7488: loss 2.5422, time 5270.76ms 
iter 7489: loss 2.6732, time 5274.15ms 
iter 7490: loss 2.7241, time 5259.99ms 
iter 7491: loss 2.6470, time 5264.73ms 
iter 7492: loss 2.5364, time 5270.13ms 
iter 7493: loss 2.5085, time 5256.02ms 
iter 7494: loss 2.5061, time 5266.54ms 
iter 7495: loss 2.5503, time 5264.64ms 
iter 7496: loss 2.3493, time 5258.87ms 
iter 7497: loss 2.5014, time 5268.19ms 
iter 7498: loss 2.6429, time 5282.55ms 
iter 7499: loss 2.4534, time 5255.13ms 
step 7500: train loss 2.5062, val loss 2.8463
iter 7500: loss 2.3297, time 20065.69ms 
iter 7501: loss 2.2043, time 5273.86ms 
iter 7502: loss 2.5254, time 5267.46ms 
iter 7503: loss 2.6909, time 5270.92ms 
iter 7504: loss 2.5483, time 5263.12ms 
iter 7505: loss 2.4792, time 5273.62ms 
iter 7506: loss 2.6167, time 5262.81ms 
iter 7507: loss 2.4055, time 5265.32ms 
iter 7508: loss 2.4716, time 5268.29ms 
iter 7509: loss 2.6319, time 5279.22ms 
iter 7510: loss 2.3841, time 5269.02ms 
iter 7511: loss 2.5394, time 5263.02ms 
iter 7512: loss 2.3837, time 5266.10ms 
iter 7513: loss 2.4936, time 5281.93ms 
iter 7514: loss 2.5194, time 5268.10ms 
iter 7515: loss 2.3586, time 5261.62ms 
iter 7516: loss 2.4174, time 5278.35ms 
iter 7517: loss 2.3242, time 5257.00ms 
iter 7518: loss 2.5774, time 5268.49ms 
iter 7519: loss 2.5324, time 5237.90ms 
iter 7520: loss 2.4742, time 5265.33ms 
iter 7521: loss 2.5261, time 5223.74ms 
iter 7522: loss 2.4359, time 5256.75ms 
iter 7523: loss 2.4818, time 5262.85ms 
iter 7524: loss 2.5520, time 5263.30ms 
iter 7525: loss 2.6043, time 5260.70ms 
iter 7526: loss 2.4092, time 5253.06ms 
iter 7527: loss 2.5058, time 5275.83ms 
iter 7528: loss 2.2625, time 5257.72ms 
iter 7529: loss 2.4436, time 5255.96ms 
iter 7530: loss 2.4196, time 5271.38ms 
iter 7531: loss 2.5543, time 5226.07ms 
iter 7532: loss 2.5271, time 5251.61ms 
iter 7533: loss 2.4194, time 5256.07ms 
iter 7534: loss 2.7020, time 5269.20ms 
iter 7535: loss 2.4323, time 5270.45ms 
iter 7536: loss 2.5642, time 5254.36ms 
iter 7537: loss 2.8545, time 5259.19ms 
iter 7538: loss 2.5324, time 5268.98ms 
iter 7539: loss 2.5561, time 5253.12ms 
iter 7540: loss 2.5440, time 5260.40ms 
iter 7541: loss 2.6191, time 5258.90ms 
iter 7542: loss 2.5489, time 5264.29ms 
iter 7543: loss 2.4434, time 5255.24ms 
iter 7544: loss 2.5481, time 5260.78ms 
iter 7545: loss 2.5363, time 5267.98ms 
iter 7546: loss 2.5235, time 5264.32ms 
iter 7547: loss 2.5441, time 5260.96ms 
iter 7548: loss 2.5574, time 5260.38ms 
iter 7549: loss 2.5371, time 5269.98ms 
step 7550: train loss 2.4968, val loss 2.8480
iter 7550: loss 2.4933, time 20070.48ms 
iter 7551: loss 2.4168, time 5259.91ms 
iter 7552: loss 2.5660, time 5257.41ms 
iter 7553: loss 2.5438, time 5262.83ms 
iter 7554: loss 2.3829, time 5261.88ms 
iter 7555: loss 2.4766, time 5260.99ms 
iter 7556: loss 2.5176, time 5257.83ms 
iter 7557: loss 2.5031, time 5278.88ms 
iter 7558: loss 2.5626, time 5263.81ms 
iter 7559: loss 2.6096, time 5264.86ms 
iter 7560: loss 2.4825, time 5266.28ms 
iter 7561: loss 2.5971, time 5279.54ms 
iter 7562: loss 2.5328, time 5271.80ms 
iter 7563: loss 2.5864, time 5267.20ms 
iter 7564: loss 2.4319, time 5261.32ms 
iter 7565: loss 2.5695, time 5273.07ms 
iter 7566: loss 2.4365, time 5266.48ms 
iter 7567: loss 2.2269, time 5268.42ms 
iter 7568: loss 2.7123, time 5268.93ms 
iter 7569: loss 2.4575, time 5268.07ms 
iter 7570: loss 2.5252, time 5254.86ms 
iter 7571: loss 2.5054, time 5228.45ms 
iter 7572: loss 2.5389, time 5267.66ms 
iter 7573: loss 2.6210, time 5259.10ms 
iter 7574: loss 2.4815, time 5256.21ms 
iter 7575: loss 2.4986, time 5265.09ms 
iter 7576: loss 2.1882, time 5269.57ms 
iter 7577: loss 2.2945, time 5260.63ms 
iter 7578: loss 2.3585, time 5266.06ms 
iter 7579: loss 2.4223, time 5268.69ms 
iter 7580: loss 2.6635, time 5275.92ms 
iter 7581: loss 2.5302, time 5265.17ms 
iter 7582: loss 2.4461, time 5276.47ms 
iter 7583: loss 2.4178, time 5264.67ms 
iter 7584: loss 2.3198, time 5262.71ms 
iter 7585: loss 2.5565, time 5253.55ms 
iter 7586: loss 2.3736, time 5272.57ms 
iter 7587: loss 2.4678, time 5266.00ms 
iter 7588: loss 2.2757, time 5258.24ms 
iter 7589: loss 2.5803, time 5258.09ms 
iter 7590: loss 2.5500, time 5279.42ms 
iter 7591: loss 2.4987, time 5229.68ms 
iter 7592: loss 2.3788, time 5232.46ms 
iter 7593: loss 2.7696, time 5272.54ms 
iter 7594: loss 2.4597, time 5262.49ms 
iter 7595: loss 2.5052, time 5257.61ms 
iter 7596: loss 2.5210, time 5258.43ms 
iter 7597: loss 2.2663, time 5273.11ms 
iter 7598: loss 2.3148, time 5259.53ms 
iter 7599: loss 2.5788, time 5258.01ms 
step 7600: train loss 2.4944, val loss 2.8554
iter 7600: loss 2.3643, time 20082.51ms 
iter 7601: loss 2.6436, time 5265.39ms 
iter 7602: loss 2.6398, time 5268.10ms 
iter 7603: loss 2.6658, time 5261.51ms 
iter 7604: loss 2.5230, time 5267.79ms 
iter 7605: loss 2.2753, time 5257.57ms 
iter 7606: loss 2.4873, time 5268.68ms 
iter 7607: loss 2.6090, time 5276.33ms 
iter 7608: loss 2.2444, time 5282.47ms 
iter 7609: loss 2.7015, time 5272.90ms 
iter 7610: loss 2.4209, time 5274.34ms 
iter 7611: loss 2.4619, time 5270.50ms 
iter 7612: loss 2.2617, time 5260.28ms 
iter 7613: loss 2.7444, time 5262.01ms 
iter 7614: loss 2.4708, time 5239.58ms 
iter 7615: loss 2.4283, time 5263.05ms 
iter 7616: loss 2.5535, time 5270.95ms 
iter 7617: loss 2.1513, time 5262.50ms 
iter 7618: loss 2.6817, time 5259.64ms 
iter 7619: loss 2.5733, time 5260.22ms 
iter 7620: loss 2.6592, time 5248.35ms 
iter 7621: loss 2.4668, time 5258.73ms 
iter 7622: loss 2.2991, time 5263.48ms 
iter 7623: loss 2.5683, time 5264.33ms 
iter 7624: loss 2.5611, time 5264.00ms 
iter 7625: loss 2.7380, time 5277.28ms 
iter 7626: loss 2.5424, time 5271.36ms 
iter 7627: loss 2.6793, time 5281.64ms 
iter 7628: loss 2.4697, time 5283.51ms 
iter 7629: loss 2.3844, time 5261.77ms 
iter 7630: loss 2.5018, time 5282.66ms 
iter 7631: loss 2.5162, time 5260.95ms 
iter 7632: loss 2.3353, time 5266.51ms 
iter 7633: loss 2.4726, time 5271.95ms 
iter 7634: loss 2.7765, time 5282.24ms 
iter 7635: loss 2.6317, time 5293.21ms 
iter 7636: loss 2.4564, time 5286.40ms 
iter 7637: loss 2.3818, time 5303.10ms 
iter 7638: loss 2.5576, time 5262.25ms 
iter 7639: loss 2.5680, time 5256.40ms 
iter 7640: loss 2.4466, time 5267.90ms 
iter 7641: loss 2.4774, time 5279.07ms 
iter 7642: loss 2.4364, time 5262.40ms 
iter 7643: loss 2.5791, time 5276.86ms 
iter 7644: loss 2.3528, time 5277.20ms 
iter 7645: loss 2.5732, time 5265.43ms 
iter 7646: loss 2.5335, time 5277.02ms 
iter 7647: loss 2.5231, time 5284.48ms 
iter 7648: loss 2.5597, time 5287.25ms 
iter 7649: loss 2.3572, time 5262.33ms 
step 7650: train loss 2.5040, val loss 2.8327
iter 7650: loss 2.4755, time 20046.95ms 
iter 7651: loss 2.4805, time 5279.70ms 
iter 7652: loss 2.5914, time 5261.41ms 
iter 7653: loss 2.4527, time 5271.76ms 
iter 7654: loss 2.3799, time 5266.26ms 
iter 7655: loss 2.4621, time 5281.96ms 
iter 7656: loss 2.4955, time 5262.55ms 
iter 7657: loss 2.5077, time 5268.09ms 
iter 7658: loss 2.5700, time 5269.75ms 
iter 7659: loss 2.5043, time 5276.72ms 
iter 7660: loss 2.3689, time 5282.85ms 
iter 7661: loss 2.5782, time 5269.14ms 
iter 7662: loss 2.5439, time 5286.18ms 
iter 7663: loss 2.5092, time 5275.11ms 
iter 7664: loss 2.4785, time 5256.70ms 
iter 7665: loss 2.5688, time 5268.53ms 
iter 7666: loss 2.4960, time 5260.95ms 
iter 7667: loss 2.2716, time 5257.86ms 
iter 7668: loss 2.4051, time 5258.78ms 
iter 7669: loss 2.3974, time 5275.15ms 
iter 7670: loss 2.5252, time 5268.44ms 
iter 7671: loss 2.3303, time 5262.42ms 
iter 7672: loss 2.6424, time 5271.11ms 
iter 7673: loss 2.5498, time 5277.71ms 
iter 7674: loss 2.4483, time 5260.76ms 
iter 7675: loss 2.2290, time 5269.47ms 
iter 7676: loss 2.6547, time 5272.29ms 
iter 7677: loss 2.5410, time 5264.54ms 
iter 7678: loss 2.4879, time 5258.24ms 
iter 7679: loss 2.7037, time 5267.63ms 
iter 7680: loss 2.1754, time 5266.85ms 
iter 7681: loss 2.3577, time 5255.73ms 
iter 7682: loss 2.4672, time 5265.99ms 
iter 7683: loss 2.5160, time 5273.19ms 
iter 7684: loss 2.4542, time 5269.05ms 
iter 7685: loss 2.5550, time 5274.34ms 
iter 7686: loss 2.6485, time 5279.91ms 
iter 7687: loss 2.6272, time 5251.16ms 
iter 7688: loss 2.6370, time 5273.46ms 
iter 7689: loss 2.2938, time 5270.67ms 
iter 7690: loss 2.3644, time 5269.18ms 
iter 7691: loss 2.4788, time 5264.40ms 
iter 7692: loss 2.4814, time 5265.79ms 
iter 7693: loss 2.8671, time 5264.67ms 
iter 7694: loss 2.5568, time 5268.57ms 
iter 7695: loss 2.6809, time 5272.69ms 
iter 7696: loss 2.5292, time 5278.92ms 
iter 7697: loss 2.5033, time 5264.69ms 
iter 7698: loss 2.7147, time 5271.74ms 
iter 7699: loss 2.2584, time 5260.82ms 
step 7700: train loss 2.4892, val loss 2.8504
iter 7700: loss 2.3914, time 20043.71ms 
iter 7701: loss 2.3560, time 5262.55ms 
iter 7702: loss 2.7210, time 5258.63ms 
iter 7703: loss 2.4669, time 5268.10ms 
iter 7704: loss 2.5289, time 5267.63ms 
iter 7705: loss 2.4439, time 5276.16ms 
iter 7706: loss 2.4233, time 5276.00ms 
iter 7707: loss 2.6076, time 5269.06ms 
iter 7708: loss 2.5138, time 5276.23ms 
iter 7709: loss 2.4404, time 5285.45ms 
iter 7710: loss 2.6752, time 5279.83ms 
iter 7711: loss 2.4071, time 5268.08ms 
iter 7712: loss 2.5298, time 5231.43ms 
iter 7713: loss 2.5092, time 5225.62ms 
iter 7714: loss 2.3637, time 5261.36ms 
iter 7715: loss 2.6346, time 5256.51ms 
iter 7716: loss 2.6047, time 5270.93ms 
iter 7717: loss 2.4182, time 5264.71ms 
iter 7718: loss 2.5591, time 5261.07ms 
iter 7719: loss 2.4368, time 5267.58ms 
iter 7720: loss 2.4453, time 5278.26ms 
iter 7721: loss 2.4007, time 5265.85ms 
iter 7722: loss 2.2728, time 5272.88ms 
iter 7723: loss 2.4731, time 5265.38ms 
iter 7724: loss 2.3055, time 5263.75ms 
iter 7725: loss 2.5899, time 5255.70ms 
iter 7726: loss 2.5838, time 5260.44ms 
iter 7727: loss 2.3645, time 5263.64ms 
iter 7728: loss 2.4957, time 5220.76ms 
iter 7729: loss 2.2162, time 5234.86ms 
iter 7730: loss 2.4040, time 5236.57ms 
iter 7731: loss 2.6020, time 5257.06ms 
iter 7732: loss 2.7552, time 5247.92ms 
iter 7733: loss 2.3422, time 5247.06ms 
iter 7734: loss 2.6870, time 5245.22ms 
iter 7735: loss 2.4803, time 5251.67ms 
iter 7736: loss 2.4046, time 5242.32ms 
iter 7737: loss 2.6375, time 5240.27ms 
iter 7738: loss 2.4790, time 5275.56ms 
iter 7739: loss 2.5395, time 5274.96ms 
iter 7740: loss 2.5530, time 5270.26ms 
iter 7741: loss 2.4295, time 5266.30ms 
iter 7742: loss 2.4928, time 5274.50ms 
iter 7743: loss 2.6184, time 5281.06ms 
iter 7744: loss 2.4813, time 5268.49ms 
iter 7745: loss 2.4570, time 5250.10ms 
iter 7746: loss 2.4133, time 5282.45ms 
iter 7747: loss 2.4627, time 5278.48ms 
iter 7748: loss 2.5990, time 5277.62ms 
iter 7749: loss 2.5497, time 5278.53ms 
step 7750: train loss 2.4837, val loss 2.8313
iter 7750: loss 2.4226, time 20055.55ms 
iter 7751: loss 2.5057, time 5264.29ms 
iter 7752: loss 2.6080, time 5260.72ms 
iter 7753: loss 2.3560, time 5263.42ms 
iter 7754: loss 2.5417, time 5267.51ms 
iter 7755: loss 2.5166, time 5260.92ms 
iter 7756: loss 2.3083, time 5234.93ms 
iter 7757: loss 2.3548, time 5234.95ms 
iter 7758: loss 2.6578, time 5253.90ms 
iter 7759: loss 2.6739, time 5270.26ms 
iter 7760: loss 2.4886, time 5256.60ms 
iter 7761: loss 2.5188, time 5253.83ms 
iter 7762: loss 2.5269, time 5236.83ms 
iter 7763: loss 2.6493, time 5255.62ms 
iter 7764: loss 2.5365, time 5249.35ms 
iter 7765: loss 2.6500, time 5217.42ms 
iter 7766: loss 2.5759, time 5170.53ms 
iter 7767: loss 2.4130, time 5158.73ms 
iter 7768: loss 2.5214, time 5241.03ms 
iter 7769: loss 2.5755, time 5175.74ms 
iter 7770: loss 2.2174, time 5280.53ms 
iter 7771: loss 2.5326, time 5267.74ms 
iter 7772: loss 2.4646, time 5274.45ms 
iter 7773: loss 2.4911, time 5274.84ms 
iter 7774: loss 2.6912, time 5275.50ms 
iter 7775: loss 2.7657, time 5274.97ms 
iter 7776: loss 2.4065, time 5235.36ms 
iter 7777: loss 2.5587, time 5265.07ms 
iter 7778: loss 2.6138, time 5263.07ms 
iter 7779: loss 2.4701, time 5257.29ms 
iter 7780: loss 2.5205, time 5261.47ms 
iter 7781: loss 2.6741, time 5252.79ms 
iter 7782: loss 2.6885, time 5161.62ms 
iter 7783: loss 2.5843, time 5187.41ms 
iter 7784: loss 2.2951, time 5191.28ms 
iter 7785: loss 2.3984, time 5169.92ms 
iter 7786: loss 2.4080, time 5204.49ms 
iter 7787: loss 2.6433, time 5165.92ms 
iter 7788: loss 2.7393, time 5215.72ms 
iter 7789: loss 2.3933, time 5189.55ms 
iter 7790: loss 2.3676, time 5200.92ms 
iter 7791: loss 2.5711, time 5233.94ms 
iter 7792: loss 2.2620, time 5261.73ms 
iter 7793: loss 2.5598, time 5260.96ms 
iter 7794: loss 2.3906, time 5229.44ms 
iter 7795: loss 2.4682, time 5264.20ms 
iter 7796: loss 2.2830, time 5257.10ms 
iter 7797: loss 2.4111, time 5250.50ms 
iter 7798: loss 2.4666, time 5228.99ms 
iter 7799: loss 2.2161, time 5265.26ms 
step 7800: train loss 2.4832, val loss 2.8351
iter 7800: loss 2.4943, time 20055.75ms 
iter 7801: loss 2.1911, time 5255.64ms 
iter 7802: loss 2.3818, time 5262.23ms 
iter 7803: loss 2.5162, time 5225.56ms 
iter 7804: loss 2.8040, time 5260.85ms 
iter 7805: loss 2.5297, time 5266.49ms 
iter 7806: loss 2.5262, time 5259.26ms 
iter 7807: loss 2.4545, time 5264.32ms 
iter 7808: loss 2.5827, time 5261.25ms 
iter 7809: loss 2.6544, time 5265.91ms 
iter 7810: loss 2.4782, time 5260.60ms 
iter 7811: loss 2.3873, time 5266.50ms 
iter 7812: loss 2.5316, time 5249.95ms 
iter 7813: loss 2.5289, time 5268.82ms 
iter 7814: loss 2.7905, time 5263.95ms 
iter 7815: loss 2.2530, time 5302.27ms 
iter 7816: loss 2.3005, time 5314.92ms 
iter 7817: loss 2.6923, time 5255.17ms 
iter 7818: loss 2.5272, time 5262.33ms 
iter 7819: loss 2.4037, time 5271.39ms 
iter 7820: loss 2.3230, time 5272.32ms 
iter 7821: loss 2.6017, time 5261.42ms 
iter 7822: loss 2.5097, time 5283.00ms 
iter 7823: loss 2.3528, time 5265.35ms 
iter 7824: loss 2.4868, time 5260.82ms 
iter 7825: loss 2.3524, time 5244.46ms 
iter 7826: loss 2.2706, time 5261.78ms 
iter 7827: loss 2.4254, time 5283.31ms 
iter 7828: loss 2.5705, time 5268.90ms 
iter 7829: loss 2.4971, time 5237.23ms 
iter 7830: loss 2.3639, time 5280.26ms 
iter 7831: loss 2.5651, time 5282.60ms 
iter 7832: loss 2.4559, time 5283.54ms 
iter 7833: loss 2.3448, time 5279.54ms 
iter 7834: loss 2.5413, time 5283.92ms 
iter 7835: loss 2.2566, time 5275.24ms 
iter 7836: loss 2.5049, time 5278.03ms 
iter 7837: loss 2.4873, time 5265.49ms 
iter 7838: loss 2.2720, time 5261.55ms 
iter 7839: loss 2.4909, time 5274.12ms 
iter 7840: loss 2.3114, time 5266.61ms 
iter 7841: loss 2.4896, time 5271.84ms 
iter 7842: loss 2.2302, time 5264.38ms 
iter 7843: loss 2.3858, time 5278.54ms 
iter 7844: loss 2.5411, time 5273.46ms 
iter 7845: loss 2.5538, time 5270.91ms 
iter 7846: loss 2.6648, time 5269.38ms 
iter 7847: loss 2.7038, time 5273.43ms 
iter 7848: loss 2.4965, time 5277.89ms 
iter 7849: loss 2.5049, time 5268.40ms 
step 7850: train loss 2.4951, val loss 2.8484
iter 7850: loss 2.4172, time 20078.73ms 
iter 7851: loss 2.6258, time 5255.25ms 
iter 7852: loss 2.6706, time 5275.05ms 
iter 7853: loss 2.4853, time 5263.26ms 
iter 7854: loss 2.5134, time 5264.67ms 
iter 7855: loss 2.6537, time 5278.01ms 
iter 7856: loss 2.6293, time 5265.13ms 
iter 7857: loss 2.6512, time 5255.54ms 
iter 7858: loss 2.5994, time 5258.18ms 
iter 7859: loss 2.5649, time 5261.07ms 
iter 7860: loss 2.5150, time 5264.07ms 
iter 7861: loss 2.2820, time 5271.73ms 
iter 7862: loss 2.4362, time 5269.75ms 
iter 7863: loss 2.7245, time 5267.52ms 
iter 7864: loss 2.5121, time 5268.04ms 
iter 7865: loss 2.3615, time 5261.29ms 
iter 7866: loss 2.5784, time 5264.98ms 
iter 7867: loss 2.3966, time 5262.18ms 
iter 7868: loss 2.4538, time 5271.47ms 
iter 7869: loss 2.3972, time 5273.21ms 
iter 7870: loss 2.4457, time 5270.86ms 
iter 7871: loss 2.5579, time 5267.29ms 
iter 7872: loss 2.3735, time 5264.44ms 
iter 7873: loss 2.5495, time 5263.21ms 
iter 7874: loss 2.6649, time 5274.50ms 
iter 7875: loss 2.6049, time 5246.80ms 
iter 7876: loss 2.3819, time 5254.21ms 
iter 7877: loss 2.5182, time 5229.20ms 
iter 7878: loss 2.3388, time 5277.49ms 
iter 7879: loss 2.4710, time 5220.39ms 
iter 7880: loss 2.5768, time 5260.38ms 
iter 7881: loss 2.5360, time 5267.26ms 
iter 7882: loss 2.5501, time 5265.38ms 
iter 7883: loss 2.2585, time 5277.42ms 
iter 7884: loss 2.6448, time 5271.18ms 
iter 7885: loss 2.4344, time 5274.26ms 
iter 7886: loss 2.4241, time 5281.34ms 
iter 7887: loss 2.3435, time 5274.60ms 
iter 7888: loss 2.4320, time 5275.49ms 
iter 7889: loss 2.3702, time 5278.95ms 
iter 7890: loss 2.4583, time 5258.48ms 
iter 7891: loss 2.4194, time 5232.04ms 
iter 7892: loss 2.3632, time 5276.64ms 
iter 7893: loss 2.5345, time 5282.83ms 
iter 7894: loss 2.7061, time 5255.68ms 
iter 7895: loss 2.4490, time 5265.70ms 
iter 7896: loss 2.4976, time 5254.55ms 
iter 7897: loss 2.7256, time 5272.47ms 
iter 7898: loss 2.4285, time 5262.28ms 
iter 7899: loss 2.4549, time 5266.94ms 
step 7900: train loss 2.4926, val loss 2.8350
iter 7900: loss 2.5387, time 20059.30ms 
iter 7901: loss 2.6479, time 5240.20ms 
iter 7902: loss 2.4410, time 5267.78ms 
iter 7903: loss 2.4817, time 5257.11ms 
iter 7904: loss 2.6256, time 5238.79ms 
iter 7905: loss 2.3197, time 5272.07ms 
iter 7906: loss 2.5074, time 5275.18ms 
iter 7907: loss 2.5729, time 5265.42ms 
iter 7908: loss 2.4153, time 5269.56ms 
iter 7909: loss 2.4003, time 5269.39ms 
iter 7910: loss 2.4510, time 5261.08ms 
iter 7911: loss 2.5580, time 5271.48ms 
iter 7912: loss 2.4957, time 5281.13ms 
iter 7913: loss 2.3859, time 5279.66ms 
iter 7914: loss 2.2439, time 5276.13ms 
iter 7915: loss 2.3658, time 5272.49ms 
iter 7916: loss 2.4663, time 5293.92ms 
iter 7917: loss 2.4655, time 5272.04ms 
iter 7918: loss 2.3868, time 5259.97ms 
iter 7919: loss 2.6085, time 5264.43ms 
iter 7920: loss 2.4088, time 5264.09ms 
iter 7921: loss 2.6842, time 5265.86ms 
iter 7922: loss 2.4655, time 5263.88ms 
iter 7923: loss 2.3313, time 5280.91ms 
iter 7924: loss 2.4561, time 5283.51ms 
iter 7925: loss 2.3767, time 5262.92ms 
iter 7926: loss 2.4204, time 5259.95ms 
iter 7927: loss 2.5337, time 5275.57ms 
iter 7928: loss 2.6058, time 5240.17ms 
iter 7929: loss 2.4457, time 5268.43ms 
iter 7930: loss 2.6168, time 5271.43ms 
iter 7931: loss 2.3243, time 5277.86ms 
iter 7932: loss 2.8410, time 5270.89ms 
iter 7933: loss 2.5174, time 5262.77ms 
iter 7934: loss 2.6383, time 5264.50ms 
iter 7935: loss 2.6309, time 5263.98ms 
iter 7936: loss 2.4776, time 5256.26ms 
iter 7937: loss 2.4475, time 5261.76ms 
iter 7938: loss 2.6799, time 5273.25ms 
iter 7939: loss 2.3343, time 5258.37ms 
iter 7940: loss 2.6137, time 5261.91ms 
iter 7941: loss 2.3002, time 5269.03ms 
iter 7942: loss 2.1929, time 5270.75ms 
iter 7943: loss 2.7215, time 5260.03ms 
iter 7944: loss 2.6062, time 5273.39ms 
iter 7945: loss 2.5578, time 5278.29ms 
iter 7946: loss 2.6315, time 5265.39ms 
iter 7947: loss 2.6361, time 5268.38ms 
iter 7948: loss 2.5123, time 5269.44ms 
iter 7949: loss 2.4201, time 5276.61ms 
step 7950: train loss 2.4813, val loss 2.8287
iter 7950: loss 2.5264, time 20050.94ms 
iter 7951: loss 2.5373, time 5257.78ms 
iter 7952: loss 2.6898, time 5259.34ms 
iter 7953: loss 2.4027, time 5269.26ms 
iter 7954: loss 2.3464, time 5268.28ms 
iter 7955: loss 2.4018, time 5261.16ms 
iter 7956: loss 2.5803, time 5270.21ms 
iter 7957: loss 2.2958, time 5266.71ms 
iter 7958: loss 2.4911, time 5259.77ms 
iter 7959: loss 2.3797, time 5259.00ms 
iter 7960: loss 2.4431, time 5271.68ms 
iter 7961: loss 2.5198, time 5261.04ms 
iter 7962: loss 2.6929, time 5262.91ms 
iter 7963: loss 2.3544, time 5258.99ms 
iter 7964: loss 2.4972, time 5280.16ms 
iter 7965: loss 2.6216, time 5253.38ms 
iter 7966: loss 2.3703, time 5258.28ms 
iter 7967: loss 2.1525, time 5267.20ms 
iter 7968: loss 2.5186, time 5259.09ms 
iter 7969: loss 2.3172, time 5257.50ms 
iter 7970: loss 2.6167, time 5258.58ms 
iter 7971: loss 2.5004, time 5258.02ms 
iter 7972: loss 2.4625, time 5266.29ms 
iter 7973: loss 2.4692, time 5260.08ms 
iter 7974: loss 2.4584, time 5258.36ms 
iter 7975: loss 2.4661, time 5265.46ms 
iter 7976: loss 2.5162, time 5283.40ms 
iter 7977: loss 2.5051, time 5250.38ms 
iter 7978: loss 2.5216, time 5262.41ms 
iter 7979: loss 2.6871, time 5268.33ms 
iter 7980: loss 2.5635, time 5257.42ms 
iter 7981: loss 2.3921, time 5262.53ms 
iter 7982: loss 2.7201, time 5256.06ms 
iter 7983: loss 2.5976, time 5280.43ms 
iter 7984: loss 2.4040, time 5272.29ms 
iter 7985: loss 2.4973, time 5266.08ms 
iter 7986: loss 2.5061, time 5258.97ms 
iter 7987: loss 2.3279, time 5264.55ms 
iter 7988: loss 2.6029, time 5265.61ms 
iter 7989: loss 2.6524, time 5259.48ms 
iter 7990: loss 2.5603, time 5258.39ms 
iter 7991: loss 2.5207, time 5277.00ms 
iter 7992: loss 2.5843, time 5272.22ms 
iter 7993: loss 2.4441, time 5271.24ms 
iter 7994: loss 2.5461, time 5260.53ms 
iter 7995: loss 2.3103, time 5283.96ms 
iter 7996: loss 2.3822, time 5266.63ms 
iter 7997: loss 2.6456, time 5263.91ms 
iter 7998: loss 2.5419, time 5276.27ms 
iter 7999: loss 2.4474, time 5270.82ms 
step 8000: train loss 2.5110, val loss 2.8559
iter 8000: loss 2.6935, time 20062.06ms 
iter 8001: loss 2.4112, time 5269.13ms 
iter 8002: loss 2.3547, time 5264.48ms 
iter 8003: loss 2.4594, time 5273.91ms 
iter 8004: loss 2.7774, time 5273.73ms 
iter 8005: loss 2.1877, time 5265.03ms 
iter 8006: loss 2.3291, time 5275.14ms 
iter 8007: loss 2.2953, time 5276.93ms 
iter 8008: loss 2.4904, time 5265.18ms 
iter 8009: loss 2.4193, time 5265.78ms 
iter 8010: loss 2.6026, time 5277.95ms 
iter 8011: loss 2.6918, time 5270.74ms 
iter 8012: loss 2.7758, time 5262.58ms 
iter 8013: loss 2.5345, time 5266.77ms 
iter 8014: loss 2.5447, time 5283.46ms 
iter 8015: loss 2.5892, time 5276.68ms 
iter 8016: loss 2.8076, time 5275.27ms 
iter 8017: loss 2.5489, time 5283.37ms 
iter 8018: loss 2.3975, time 5274.83ms 
iter 8019: loss 2.3225, time 5274.76ms 
iter 8020: loss 2.3520, time 5282.74ms 
iter 8021: loss 2.5428, time 5279.89ms 
iter 8022: loss 2.4642, time 5261.70ms 
iter 8023: loss 2.6030, time 5261.95ms 
iter 8024: loss 2.4222, time 5223.20ms 
iter 8025: loss 2.3553, time 5259.45ms 
iter 8026: loss 2.4607, time 5266.87ms 
iter 8027: loss 2.5956, time 5261.69ms 
iter 8028: loss 2.5179, time 5263.80ms 
iter 8029: loss 2.2733, time 5265.46ms 
iter 8030: loss 2.6544, time 5264.99ms 
iter 8031: loss 2.3883, time 5258.95ms 
iter 8032: loss 2.4763, time 5270.57ms 
iter 8033: loss 2.5249, time 5270.26ms 
iter 8034: loss 2.5971, time 5262.54ms 
iter 8035: loss 2.4513, time 5259.90ms 
iter 8036: loss 2.5994, time 5272.47ms 
iter 8037: loss 2.5113, time 5264.32ms 
iter 8038: loss 2.5021, time 5220.01ms 
iter 8039: loss 2.4124, time 5227.04ms 
iter 8040: loss 2.6103, time 5269.16ms 
iter 8041: loss 2.6623, time 5259.36ms 
iter 8042: loss 2.1882, time 5255.78ms 
iter 8043: loss 2.4973, time 5240.67ms 
iter 8044: loss 2.6453, time 5279.42ms 
iter 8045: loss 2.4206, time 5264.86ms 
iter 8046: loss 2.5387, time 5272.83ms 
iter 8047: loss 2.5263, time 5279.63ms 
iter 8048: loss 2.4073, time 5273.77ms 
iter 8049: loss 2.5154, time 5281.33ms 
step 8050: train loss 2.4774, val loss 2.8449
iter 8050: loss 2.5857, time 20105.78ms 
iter 8051: loss 2.7973, time 5278.68ms 
iter 8052: loss 2.3694, time 5263.02ms 
iter 8053: loss 2.5541, time 5284.88ms 
iter 8054: loss 2.5374, time 5234.89ms 
iter 8055: loss 2.5545, time 5277.27ms 
iter 8056: loss 2.5358, time 5245.36ms 
iter 8057: loss 2.4993, time 5266.26ms 
iter 8058: loss 2.5457, time 5249.07ms 
iter 8059: loss 2.3950, time 5277.99ms 
iter 8060: loss 2.3573, time 5255.85ms 
iter 8061: loss 2.4876, time 5264.24ms 
iter 8062: loss 2.4390, time 5276.09ms 
iter 8063: loss 2.5251, time 5262.42ms 
iter 8064: loss 2.4881, time 5269.88ms 
iter 8065: loss 2.6700, time 5273.08ms 
iter 8066: loss 2.5537, time 5272.65ms 
iter 8067: loss 2.3855, time 5259.13ms 
iter 8068: loss 2.6794, time 5260.04ms 
iter 8069: loss 2.6049, time 5255.07ms 
iter 8070: loss 2.5856, time 5264.50ms 
iter 8071: loss 2.5339, time 5257.83ms 
iter 8072: loss 2.6535, time 5270.37ms 
iter 8073: loss 2.3310, time 5266.65ms 
iter 8074: loss 2.1791, time 5256.28ms 
iter 8075: loss 2.6096, time 5256.22ms 
iter 8076: loss 2.5729, time 5272.45ms 
iter 8077: loss 2.6593, time 5261.87ms 
iter 8078: loss 2.2869, time 5266.97ms 
iter 8079: loss 2.6124, time 5265.21ms 
iter 8080: loss 2.5054, time 5275.30ms 
iter 8081: loss 2.2015, time 5256.46ms 
iter 8082: loss 2.4210, time 5257.05ms 
iter 8083: loss 2.4916, time 5266.92ms 
iter 8084: loss 2.5123, time 5262.20ms 
iter 8085: loss 2.4422, time 5258.71ms 
iter 8086: loss 2.5691, time 5259.77ms 
iter 8087: loss 2.4660, time 5277.08ms 
iter 8088: loss 2.3471, time 5261.41ms 
iter 8089: loss 2.7074, time 5275.72ms 
iter 8090: loss 2.2932, time 5278.99ms 
iter 8091: loss 2.6283, time 5271.00ms 
iter 8092: loss 2.5351, time 5257.40ms 
iter 8093: loss 2.5848, time 5268.43ms 
iter 8094: loss 2.4751, time 5274.63ms 
iter 8095: loss 2.4244, time 5269.87ms 
iter 8096: loss 2.5401, time 5265.47ms 
iter 8097: loss 2.4019, time 5270.63ms 
iter 8098: loss 2.5375, time 5279.93ms 
iter 8099: loss 2.3807, time 5275.46ms 
step 8100: train loss 2.4875, val loss 2.8480
iter 8100: loss 2.4642, time 20070.69ms 
iter 8101: loss 2.5307, time 5279.70ms 
iter 8102: loss 2.4646, time 5266.44ms 
iter 8103: loss 2.4355, time 5271.00ms 
iter 8104: loss 2.4202, time 5265.57ms 
iter 8105: loss 2.3158, time 5266.75ms 
iter 8106: loss 2.4451, time 5260.89ms 
iter 8107: loss 2.5172, time 5257.64ms 
iter 8108: loss 2.6926, time 5255.52ms 
iter 8109: loss 2.3639, time 5270.29ms 
iter 8110: loss 2.2779, time 5259.10ms 
iter 8111: loss 2.7307, time 5255.38ms 
iter 8112: loss 2.5348, time 5255.93ms 
iter 8113: loss 2.4992, time 5274.29ms 
iter 8114: loss 2.5014, time 5254.73ms 
iter 8115: loss 2.4829, time 5255.75ms 
iter 8116: loss 2.5424, time 5227.83ms 
iter 8117: loss 2.4151, time 5273.62ms 
iter 8118: loss 2.3342, time 5257.29ms 
iter 8119: loss 2.4914, time 5251.90ms 
iter 8120: loss 2.5152, time 5264.07ms 
iter 8121: loss 2.5830, time 5272.44ms 
iter 8122: loss 2.4681, time 5260.87ms 
iter 8123: loss 2.6125, time 5257.64ms 
iter 8124: loss 2.5326, time 5256.65ms 
iter 8125: loss 2.4760, time 5275.37ms 
iter 8126: loss 2.5023, time 5274.02ms 
iter 8127: loss 2.6028, time 5275.28ms 
iter 8128: loss 2.5935, time 5274.42ms 
iter 8129: loss 2.4398, time 5275.18ms 
iter 8130: loss 2.3840, time 5259.38ms 
iter 8131: loss 2.6444, time 5257.60ms 
iter 8132: loss 2.2446, time 5270.66ms 
iter 8133: loss 2.3788, time 5267.64ms 
iter 8134: loss 2.5247, time 5254.95ms 
iter 8135: loss 2.4274, time 5227.88ms 
iter 8136: loss 2.4316, time 5282.20ms 
iter 8137: loss 2.4399, time 5261.15ms 
iter 8138: loss 2.7621, time 5265.31ms 
iter 8139: loss 2.3971, time 5261.99ms 
iter 8140: loss 2.2731, time 5270.67ms 
iter 8141: loss 2.3980, time 5264.35ms 
iter 8142: loss 2.5087, time 5257.81ms 
iter 8143: loss 2.6692, time 5258.80ms 
iter 8144: loss 2.4251, time 5217.53ms 
iter 8145: loss 2.4639, time 5254.90ms 
iter 8146: loss 2.5387, time 5272.23ms 
iter 8147: loss 2.8309, time 5271.21ms 
iter 8148: loss 2.5353, time 5266.09ms 
iter 8149: loss 2.4324, time 5260.97ms 
step 8150: train loss 2.4822, val loss 2.8479
iter 8150: loss 2.6954, time 20062.46ms 
iter 8151: loss 2.6007, time 5237.83ms 
iter 8152: loss 2.4811, time 5175.24ms 
iter 8153: loss 2.5958, time 5244.61ms 
iter 8154: loss 2.5737, time 5234.11ms 
iter 8155: loss 2.6193, time 5256.26ms 
iter 8156: loss 2.3431, time 5211.67ms 
iter 8157: loss 2.6435, time 5259.22ms 
iter 8158: loss 2.3258, time 5268.04ms 
iter 8159: loss 2.4597, time 5244.92ms 
iter 8160: loss 2.5229, time 5197.32ms 
iter 8161: loss 2.3900, time 5241.95ms 
iter 8162: loss 2.4520, time 5218.05ms 
iter 8163: loss 2.3539, time 5229.84ms 
iter 8164: loss 2.4475, time 5218.16ms 
iter 8165: loss 2.6215, time 5266.00ms 
iter 8166: loss 2.2873, time 5260.97ms 
iter 8167: loss 2.5535, time 5261.33ms 
iter 8168: loss 2.4350, time 5210.82ms 
iter 8169: loss 2.4139, time 5207.79ms 
iter 8170: loss 2.5478, time 5214.93ms 
iter 8171: loss 2.5776, time 5221.74ms 
iter 8172: loss 2.3364, time 5194.37ms 
iter 8173: loss 2.4995, time 5239.32ms 
iter 8174: loss 2.6942, time 5217.71ms 
iter 8175: loss 2.6822, time 5186.42ms 
iter 8176: loss 2.5758, time 5204.08ms 
iter 8177: loss 2.3016, time 5224.22ms 
iter 8178: loss 2.7626, time 5205.66ms 
iter 8179: loss 2.1900, time 5196.72ms 
iter 8180: loss 2.2954, time 5243.94ms 
iter 8181: loss 2.5370, time 5231.21ms 
iter 8182: loss 2.5818, time 5221.88ms 
iter 8183: loss 2.3412, time 5265.75ms 
iter 8184: loss 2.4627, time 5268.48ms 
iter 8185: loss 2.5812, time 5259.55ms 
iter 8186: loss 2.4812, time 5276.29ms 
iter 8187: loss 2.6580, time 5282.59ms 
iter 8188: loss 2.4570, time 5268.86ms 
iter 8189: loss 2.4138, time 5267.94ms 
iter 8190: loss 2.4891, time 5268.36ms 
iter 8191: loss 2.6390, time 5279.47ms 
iter 8192: loss 2.4327, time 5256.67ms 
iter 8193: loss 2.6454, time 5264.83ms 
iter 8194: loss 2.3264, time 5268.72ms 
iter 8195: loss 2.3925, time 5249.93ms 
iter 8196: loss 2.2010, time 5264.99ms 
iter 8197: loss 2.5098, time 5267.87ms 
iter 8198: loss 2.5498, time 5282.03ms 
iter 8199: loss 2.5801, time 5258.30ms 
step 8200: train loss 2.4779, val loss 2.8494
iter 8200: loss 2.5454, time 20069.43ms 
iter 8201: loss 2.4035, time 5266.84ms 
iter 8202: loss 2.1643, time 5284.16ms 
iter 8203: loss 2.5727, time 5269.22ms 
iter 8204: loss 2.6308, time 5262.38ms 
iter 8205: loss 2.3765, time 5239.25ms 
iter 8206: loss 2.6547, time 5277.37ms 
iter 8207: loss 2.5526, time 5275.06ms 
iter 8208: loss 2.3435, time 5263.74ms 
iter 8209: loss 2.2208, time 5225.89ms 
iter 8210: loss 2.4389, time 5266.45ms 
iter 8211: loss 2.3577, time 5259.28ms 
iter 8212: loss 2.5158, time 5273.25ms 
iter 8213: loss 2.4903, time 5277.93ms 
iter 8214: loss 2.4558, time 5267.61ms 
iter 8215: loss 2.4753, time 5279.27ms 
iter 8216: loss 2.2837, time 5263.09ms 
iter 8217: loss 2.4377, time 5274.91ms 
iter 8218: loss 2.4310, time 5269.12ms 
iter 8219: loss 2.5316, time 5256.58ms 
iter 8220: loss 2.4183, time 5291.88ms 
iter 8221: loss 2.4774, time 5272.40ms 
iter 8222: loss 2.7340, time 5326.43ms 
iter 8223: loss 2.6317, time 5283.68ms 
iter 8224: loss 2.4054, time 5279.02ms 
iter 8225: loss 2.5057, time 5269.77ms 
iter 8226: loss 2.4813, time 5263.76ms 
iter 8227: loss 2.4206, time 5274.49ms 
iter 8228: loss 2.4624, time 5276.55ms 
iter 8229: loss 2.4092, time 5272.42ms 
iter 8230: loss 2.3238, time 5262.21ms 
iter 8231: loss 2.3383, time 5329.87ms 
iter 8232: loss 2.5264, time 5318.72ms 
iter 8233: loss 2.5209, time 5301.07ms 
iter 8234: loss 2.3689, time 5283.30ms 
iter 8235: loss 2.5294, time 5228.59ms 
iter 8236: loss 2.5630, time 5260.74ms 
iter 8237: loss 2.3607, time 5267.09ms 
iter 8238: loss 2.5859, time 5271.29ms 
iter 8239: loss 2.6880, time 5265.64ms 
iter 8240: loss 2.4765, time 5260.60ms 
iter 8241: loss 2.5010, time 5264.10ms 
iter 8242: loss 2.4578, time 5273.78ms 
iter 8243: loss 2.5778, time 5260.31ms 
iter 8244: loss 2.6404, time 5262.60ms 
iter 8245: loss 2.2783, time 5282.30ms 
iter 8246: loss 2.4733, time 5315.11ms 
iter 8247: loss 2.5574, time 5258.51ms 
iter 8248: loss 2.5550, time 5268.13ms 
iter 8249: loss 2.5260, time 5269.65ms 
step 8250: train loss 2.4910, val loss 2.8462
iter 8250: loss 2.3476, time 20113.62ms 
iter 8251: loss 2.4993, time 5318.11ms 
iter 8252: loss 2.4650, time 5266.95ms 
iter 8253: loss 2.4214, time 5276.47ms 
iter 8254: loss 2.6006, time 5264.62ms 
iter 8255: loss 2.5454, time 5282.78ms 
iter 8256: loss 2.4909, time 5267.28ms 
iter 8257: loss 2.5087, time 5280.83ms 
iter 8258: loss 2.3780, time 5267.85ms 
iter 8259: loss 2.3145, time 5268.46ms 
iter 8260: loss 2.2634, time 5264.58ms 
iter 8261: loss 2.6386, time 5275.42ms 
iter 8262: loss 2.4983, time 5268.44ms 
iter 8263: loss 2.4465, time 5266.21ms 
iter 8264: loss 2.3257, time 5273.00ms 
iter 8265: loss 2.5306, time 5280.08ms 
iter 8266: loss 2.4302, time 5264.37ms 
iter 8267: loss 2.5044, time 5274.91ms 
iter 8268: loss 2.5555, time 5258.78ms 
iter 8269: loss 2.5237, time 5276.87ms 
iter 8270: loss 2.4174, time 5261.81ms 
iter 8271: loss 2.4407, time 5268.63ms 
iter 8272: loss 2.4209, time 5285.30ms 
iter 8273: loss 2.5421, time 5280.18ms 
iter 8274: loss 2.7907, time 5273.00ms 
iter 8275: loss 2.4393, time 5272.34ms 
iter 8276: loss 2.4086, time 5274.90ms 
iter 8277: loss 2.5356, time 5266.55ms 
iter 8278: loss 2.4744, time 5263.14ms 
iter 8279: loss 2.4416, time 5266.33ms 
iter 8280: loss 2.4372, time 5260.66ms 
iter 8281: loss 2.3338, time 5267.74ms 
iter 8282: loss 2.3430, time 5263.54ms 
iter 8283: loss 2.4369, time 5258.02ms 
iter 8284: loss 2.3648, time 5273.30ms 
iter 8285: loss 2.3941, time 5258.48ms 
iter 8286: loss 2.6924, time 5268.04ms 
iter 8287: loss 2.5696, time 5260.76ms 
iter 8288: loss 2.3947, time 5265.24ms 
iter 8289: loss 2.6485, time 5282.94ms 
iter 8290: loss 2.5139, time 5349.38ms 
iter 8291: loss 2.5725, time 5245.68ms 
iter 8292: loss 2.3370, time 5264.87ms 
iter 8293: loss 2.2335, time 5252.60ms 
iter 8294: loss 2.5162, time 5305.93ms 
iter 8295: loss 2.3760, time 5265.74ms 
iter 8296: loss 2.4416, time 5260.60ms 
iter 8297: loss 2.6252, time 5300.13ms 
iter 8298: loss 2.7267, time 5255.35ms 
iter 8299: loss 2.4170, time 5258.55ms 
step 8300: train loss 2.4920, val loss 2.8540
iter 8300: loss 2.4700, time 19897.41ms 
iter 8301: loss 2.5445, time 5264.80ms 
iter 8302: loss 2.4617, time 5260.64ms 
iter 8303: loss 2.5894, time 5265.47ms 
iter 8304: loss 2.5568, time 5273.07ms 
iter 8305: loss 2.3272, time 5268.59ms 
iter 8306: loss 2.4886, time 5274.41ms 
iter 8307: loss 2.4498, time 5279.47ms 
iter 8308: loss 2.6133, time 5265.26ms 
iter 8309: loss 2.3844, time 5266.60ms 
iter 8310: loss 2.4508, time 5261.13ms 
iter 8311: loss 2.4507, time 5265.15ms 
iter 8312: loss 2.2698, time 5261.13ms 
iter 8313: loss 2.4255, time 5262.64ms 
iter 8314: loss 2.2755, time 5278.59ms 
iter 8315: loss 2.5330, time 5266.47ms 
iter 8316: loss 2.3396, time 5262.25ms 
iter 8317: loss 2.4054, time 5257.92ms 
iter 8318: loss 2.4299, time 5269.62ms 
iter 8319: loss 2.4321, time 5266.69ms 
iter 8320: loss 2.3115, time 5262.57ms 
iter 8321: loss 2.5657, time 5271.17ms 
iter 8322: loss 2.3495, time 5268.78ms 
iter 8323: loss 2.3604, time 5248.67ms 
iter 8324: loss 2.2095, time 5267.56ms 
iter 8325: loss 2.5585, time 5264.52ms 
iter 8326: loss 2.3912, time 5275.37ms 
iter 8327: loss 2.6581, time 5266.74ms 
iter 8328: loss 2.4544, time 5267.44ms 
iter 8329: loss 2.2048, time 5314.08ms 
iter 8330: loss 2.3292, time 5282.56ms 
iter 8331: loss 2.8237, time 5289.36ms 
iter 8332: loss 2.3945, time 5272.97ms 
iter 8333: loss 2.4844, time 5261.84ms 
iter 8334: loss 2.4381, time 5279.57ms 
iter 8335: loss 2.5084, time 5260.02ms 
iter 8336: loss 2.6778, time 5268.96ms 
iter 8337: loss 2.5809, time 5291.06ms 
iter 8338: loss 2.3389, time 5272.06ms 
iter 8339: loss 2.3861, time 5259.01ms 
iter 8340: loss 2.5299, time 5279.05ms 
iter 8341: loss 2.5862, time 5260.55ms 
iter 8342: loss 2.2195, time 5272.25ms 
iter 8343: loss 2.5341, time 5266.50ms 
iter 8344: loss 2.4707, time 5266.17ms 
iter 8345: loss 2.7808, time 5262.37ms 
iter 8346: loss 2.4674, time 5263.06ms 
iter 8347: loss 2.1990, time 5287.66ms 
iter 8348: loss 2.3293, time 5289.87ms 
iter 8349: loss 2.5636, time 5299.32ms 
step 8350: train loss 2.4672, val loss 2.8347
iter 8350: loss 2.5760, time 20085.62ms 
iter 8351: loss 2.1729, time 5281.65ms 
iter 8352: loss 2.4265, time 5280.43ms 
iter 8353: loss 2.2681, time 5277.19ms 
iter 8354: loss 2.3679, time 5283.40ms 
iter 8355: loss 2.4588, time 5223.73ms 
iter 8356: loss 2.3659, time 5201.77ms 
iter 8357: loss 2.6595, time 5284.64ms 
iter 8358: loss 2.5442, time 5227.68ms 
iter 8359: loss 2.4402, time 5263.95ms 
iter 8360: loss 2.6213, time 5300.26ms 
iter 8361: loss 2.3476, time 5236.52ms 
iter 8362: loss 2.5148, time 5303.21ms 
iter 8363: loss 2.4270, time 5270.16ms 
iter 8364: loss 2.5720, time 5277.86ms 
iter 8365: loss 2.5922, time 5263.49ms 
iter 8366: loss 2.4496, time 5267.29ms 
iter 8367: loss 2.3584, time 5269.51ms 
iter 8368: loss 2.2469, time 5266.98ms 
iter 8369: loss 2.5066, time 5263.04ms 
iter 8370: loss 2.3452, time 5273.01ms 
iter 8371: loss 2.8476, time 5282.99ms 
iter 8372: loss 2.5652, time 5274.17ms 
iter 8373: loss 2.5187, time 5233.50ms 
iter 8374: loss 2.2556, time 5264.48ms 
iter 8375: loss 2.4234, time 5267.32ms 
iter 8376: loss 2.7269, time 5262.06ms 
iter 8377: loss 2.4125, time 5263.09ms 
iter 8378: loss 2.4113, time 5275.78ms 
iter 8379: loss 2.5615, time 5266.59ms 
iter 8380: loss 2.4172, time 5266.05ms 
iter 8381: loss 2.3811, time 5270.81ms 
iter 8382: loss 2.6916, time 5280.03ms 
iter 8383: loss 2.4820, time 5266.02ms 
iter 8384: loss 2.2601, time 5268.06ms 
iter 8385: loss 2.6995, time 5282.58ms 
iter 8386: loss 2.3142, time 5287.58ms 
iter 8387: loss 2.2937, time 5272.43ms 
iter 8388: loss 2.6134, time 5221.55ms 
iter 8389: loss 2.5746, time 5263.64ms 
iter 8390: loss 2.5012, time 5269.37ms 
iter 8391: loss 2.4443, time 5269.80ms 
iter 8392: loss 2.4350, time 5224.76ms 
iter 8393: loss 2.5076, time 5263.21ms 
iter 8394: loss 2.6077, time 5260.50ms 
iter 8395: loss 2.2834, time 5263.92ms 
iter 8396: loss 2.3678, time 5277.98ms 
iter 8397: loss 2.4822, time 5262.88ms 
iter 8398: loss 2.4320, time 5273.25ms 
iter 8399: loss 2.4366, time 5260.25ms 
step 8400: train loss 2.4734, val loss 2.8493
iter 8400: loss 2.4778, time 20109.32ms 
iter 8401: loss 2.4448, time 5267.21ms 
iter 8402: loss 2.4754, time 5273.43ms 
iter 8403: loss 2.3668, time 5279.43ms 
iter 8404: loss 2.4636, time 5268.66ms 
iter 8405: loss 2.3206, time 5270.19ms 
iter 8406: loss 2.6186, time 5275.32ms 
iter 8407: loss 2.4204, time 5260.17ms 
iter 8408: loss 2.7733, time 5273.49ms 
iter 8409: loss 2.4682, time 5251.46ms 
iter 8410: loss 2.2515, time 5263.73ms 
iter 8411: loss 2.4430, time 5274.13ms 
iter 8412: loss 2.3772, time 5272.08ms 
iter 8413: loss 2.4166, time 5270.98ms 
iter 8414: loss 2.3258, time 5268.12ms 
iter 8415: loss 2.6926, time 5274.40ms 
iter 8416: loss 2.3743, time 5264.88ms 
iter 8417: loss 2.4433, time 5268.53ms 
iter 8418: loss 2.4846, time 5285.49ms 
iter 8419: loss 2.3417, time 5286.49ms 
iter 8420: loss 2.6696, time 5271.91ms 
iter 8421: loss 2.6913, time 5271.61ms 
iter 8422: loss 2.5239, time 5279.86ms 
iter 8423: loss 2.5201, time 5281.55ms 
iter 8424: loss 2.4626, time 5267.70ms 
iter 8425: loss 2.4557, time 5290.20ms 
iter 8426: loss 2.4980, time 5285.15ms 
iter 8427: loss 2.3871, time 5265.61ms 
iter 8428: loss 2.4751, time 5258.48ms 
iter 8429: loss 2.6372, time 5271.46ms 
iter 8430: loss 2.5356, time 5253.00ms 
iter 8431: loss 2.2547, time 5260.18ms 
iter 8432: loss 2.4934, time 5265.19ms 
iter 8433: loss 2.4616, time 5264.45ms 
iter 8434: loss 2.4329, time 5260.53ms 
iter 8435: loss 2.6795, time 5264.90ms 
iter 8436: loss 2.4038, time 5275.78ms 
iter 8437: loss 2.3356, time 5270.67ms 
iter 8438: loss 2.5268, time 5275.78ms 
iter 8439: loss 2.5474, time 5283.22ms 
iter 8440: loss 2.4726, time 5277.41ms 
iter 8441: loss 2.6519, time 5272.45ms 
iter 8442: loss 2.3559, time 5278.48ms 
iter 8443: loss 2.3947, time 5281.05ms 
iter 8444: loss 2.4003, time 5262.83ms 
iter 8445: loss 2.5276, time 5265.43ms 
iter 8446: loss 2.6062, time 5266.19ms 
iter 8447: loss 2.2580, time 5264.14ms 
iter 8448: loss 2.3717, time 5272.06ms 
iter 8449: loss 2.3365, time 5267.92ms 
step 8450: train loss 2.4941, val loss 2.8490
iter 8450: loss 2.4255, time 20098.24ms 
iter 8451: loss 2.5713, time 5261.06ms 
iter 8452: loss 2.3372, time 5268.67ms 
iter 8453: loss 2.2275, time 5263.40ms 
iter 8454: loss 2.5758, time 5279.82ms 
iter 8455: loss 2.4625, time 5265.57ms 
iter 8456: loss 2.5384, time 5321.05ms 
iter 8457: loss 2.5123, time 5265.19ms 
iter 8458: loss 2.7275, time 5279.92ms 
iter 8459: loss 2.3502, time 5270.03ms 
iter 8460: loss 2.6410, time 5264.87ms 
iter 8461: loss 2.2741, time 5268.94ms 
iter 8462: loss 2.4943, time 5271.12ms 
iter 8463: loss 2.5544, time 5258.31ms 
iter 8464: loss 2.3467, time 5266.69ms 
iter 8465: loss 2.4750, time 5273.58ms 
iter 8466: loss 2.6276, time 5272.12ms 
iter 8467: loss 2.5290, time 5268.14ms 
iter 8468: loss 2.2845, time 5279.80ms 
iter 8469: loss 2.3896, time 5270.66ms 
iter 8470: loss 2.5049, time 5261.70ms 
iter 8471: loss 2.2750, time 5262.12ms 
iter 8472: loss 2.5759, time 5266.25ms 
iter 8473: loss 2.3405, time 5262.93ms 
iter 8474: loss 2.7624, time 5275.03ms 
iter 8475: loss 2.3423, time 5281.53ms 
iter 8476: loss 2.6820, time 5217.20ms 
iter 8477: loss 2.5589, time 5264.35ms 
iter 8478: loss 2.3021, time 5276.39ms 
iter 8479: loss 2.5099, time 5269.72ms 
iter 8480: loss 2.4085, time 5263.06ms 
iter 8481: loss 2.4080, time 5259.35ms 
iter 8482: loss 2.5740, time 5284.61ms 
iter 8483: loss 2.6111, time 5268.78ms 
iter 8484: loss 2.2145, time 5266.59ms 
iter 8485: loss 2.3995, time 5267.81ms 
iter 8486: loss 2.5058, time 5270.88ms 
iter 8487: loss 2.6764, time 5257.26ms 
iter 8488: loss 2.4044, time 5261.26ms 
iter 8489: loss 2.4677, time 5270.00ms 
iter 8490: loss 2.5399, time 5258.81ms 
iter 8491: loss 2.4916, time 5271.34ms 
iter 8492: loss 2.3334, time 5276.12ms 
iter 8493: loss 2.3695, time 5280.83ms 
iter 8494: loss 2.4399, time 5270.57ms 
iter 8495: loss 2.4949, time 5246.18ms 
iter 8496: loss 2.6643, time 5283.28ms 
iter 8497: loss 2.4264, time 5269.55ms 
iter 8498: loss 2.5706, time 5266.39ms 
iter 8499: loss 2.6957, time 5281.77ms 
step 8500: train loss 2.4651, val loss 2.8388
iter 8500: loss 2.6076, time 20089.24ms 
iter 8501: loss 2.5522, time 5264.47ms 
iter 8502: loss 2.4559, time 5274.24ms 
iter 8503: loss 2.6332, time 5270.66ms 
iter 8504: loss 2.5073, time 5262.16ms 
iter 8505: loss 2.3210, time 5269.31ms 
iter 8506: loss 2.6789, time 5276.26ms 
iter 8507: loss 2.4188, time 5259.46ms 
iter 8508: loss 2.4040, time 5271.23ms 
iter 8509: loss 2.6881, time 5276.64ms 
iter 8510: loss 2.4078, time 5283.77ms 
iter 8511: loss 2.4097, time 5282.52ms 
iter 8512: loss 2.4507, time 5278.57ms 
iter 8513: loss 2.3827, time 5285.05ms 
iter 8514: loss 2.4865, time 5278.85ms 
iter 8515: loss 2.2243, time 5278.51ms 
iter 8516: loss 2.4787, time 5290.93ms 
iter 8517: loss 2.5446, time 5278.07ms 
iter 8518: loss 2.4707, time 5261.07ms 
iter 8519: loss 2.6840, time 5264.89ms 
iter 8520: loss 2.4569, time 5265.67ms 
iter 8521: loss 2.5074, time 5260.59ms 
iter 8522: loss 2.5129, time 5268.36ms 
iter 8523: loss 2.4747, time 5281.10ms 
iter 8524: loss 2.3773, time 5266.15ms 
iter 8525: loss 2.6447, time 5272.31ms 
iter 8526: loss 2.5156, time 5276.32ms 
iter 8527: loss 2.2683, time 5251.09ms 
iter 8528: loss 2.5250, time 5267.78ms 
iter 8529: loss 2.3234, time 5259.90ms 
iter 8530: loss 2.2397, time 5271.21ms 
iter 8531: loss 2.5302, time 5259.64ms 
iter 8532: loss 2.4178, time 5260.43ms 
iter 8533: loss 2.2769, time 5270.67ms 
iter 8534: loss 2.5322, time 5274.96ms 
iter 8535: loss 2.5057, time 5258.87ms 
iter 8536: loss 2.5714, time 5259.65ms 
iter 8537: loss 2.4014, time 5273.08ms 
iter 8538: loss 2.5483, time 5268.97ms 
iter 8539: loss 2.4262, time 5272.35ms 
iter 8540: loss 2.2333, time 5259.33ms 
iter 8541: loss 2.5362, time 5315.60ms 
iter 8542: loss 2.2804, time 5300.36ms 
iter 8543: loss 2.4674, time 5269.05ms 
iter 8544: loss 2.2442, time 5299.88ms 
iter 8545: loss 2.4824, time 5270.02ms 
iter 8546: loss 2.5210, time 5269.54ms 
iter 8547: loss 2.4246, time 5284.65ms 
iter 8548: loss 2.5331, time 5276.68ms 
iter 8549: loss 2.6841, time 5269.96ms 
step 8550: train loss 2.4614, val loss 2.8460
iter 8550: loss 2.3184, time 20072.62ms 
iter 8551: loss 2.5591, time 5308.18ms 
iter 8552: loss 2.6007, time 5334.67ms 
iter 8553: loss 2.7440, time 5275.48ms 
iter 8554: loss 2.5783, time 5271.44ms 
iter 8555: loss 2.5270, time 5283.24ms 
iter 8556: loss 2.3689, time 5337.17ms 
iter 8557: loss 2.6517, time 5340.35ms 
iter 8558: loss 2.5553, time 5335.97ms 
iter 8559: loss 2.6389, time 5297.66ms 
iter 8560: loss 2.5819, time 5322.68ms 
iter 8561: loss 2.2844, time 5329.89ms 
iter 8562: loss 2.4058, time 5275.18ms 
iter 8563: loss 2.2739, time 5291.36ms 
iter 8564: loss 2.4189, time 5303.77ms 
iter 8565: loss 2.5671, time 5284.63ms 
iter 8566: loss 2.6429, time 5276.85ms 
iter 8567: loss 2.4354, time 5301.22ms 
iter 8568: loss 2.3821, time 5272.22ms 
iter 8569: loss 2.4059, time 5285.60ms 
iter 8570: loss 2.4386, time 5281.47ms 
iter 8571: loss 2.3898, time 5270.67ms 
iter 8572: loss 2.4350, time 5273.83ms 
iter 8573: loss 2.5846, time 5272.68ms 
iter 8574: loss 2.5013, time 5268.53ms 
iter 8575: loss 2.5058, time 5268.69ms 
iter 8576: loss 2.6869, time 5279.10ms 
iter 8577: loss 2.6305, time 5276.44ms 
iter 8578: loss 2.3832, time 5262.51ms 
iter 8579: loss 2.4176, time 5267.48ms 
iter 8580: loss 2.4773, time 5263.68ms 
iter 8581: loss 2.3641, time 5259.52ms 
iter 8582: loss 2.2010, time 5259.83ms 
iter 8583: loss 2.5996, time 5268.81ms 
iter 8584: loss 2.6096, time 5261.28ms 
iter 8585: loss 2.6029, time 5259.72ms 
iter 8586: loss 2.5026, time 5277.37ms 
iter 8587: loss 2.6048, time 5241.55ms 
iter 8588: loss 2.4172, time 5264.87ms 
iter 8589: loss 2.5087, time 5281.88ms 
iter 8590: loss 2.4403, time 5298.12ms 
iter 8591: loss 2.5216, time 5273.76ms 
iter 8592: loss 2.3850, time 5278.56ms 
iter 8593: loss 2.5460, time 5289.89ms 
iter 8594: loss 2.3058, time 5283.90ms 
iter 8595: loss 2.4912, time 5292.05ms 
iter 8596: loss 2.4721, time 5296.59ms 
iter 8597: loss 2.2187, time 5291.77ms 
iter 8598: loss 2.6763, time 5270.11ms 
iter 8599: loss 2.4928, time 5279.13ms 
step 8600: train loss 2.4743, val loss 2.8469
iter 8600: loss 2.5961, time 20003.71ms 
iter 8601: loss 2.2681, time 5331.68ms 
iter 8602: loss 2.3606, time 5341.07ms 
iter 8603: loss 2.4148, time 5342.92ms 
iter 8604: loss 2.5564, time 5351.07ms 
iter 8605: loss 2.5268, time 5278.03ms 
iter 8606: loss 2.4605, time 5255.45ms 
iter 8607: loss 2.6027, time 5284.29ms 
iter 8608: loss 2.4688, time 5276.10ms 
iter 8609: loss 2.4847, time 5267.66ms 
iter 8610: loss 2.5666, time 5275.60ms 
iter 8611: loss 2.4642, time 5266.27ms 
iter 8612: loss 2.4686, time 5270.26ms 
iter 8613: loss 2.4063, time 5265.43ms 
iter 8614: loss 2.5512, time 5267.13ms 
iter 8615: loss 2.5572, time 5276.05ms 
iter 8616: loss 2.4528, time 5252.87ms 
iter 8617: loss 2.5909, time 5211.01ms 
iter 8618: loss 2.5296, time 5208.09ms 
iter 8619: loss 2.3491, time 5182.19ms 
iter 8620: loss 2.4276, time 5212.18ms 
iter 8621: loss 2.4411, time 5263.08ms 
iter 8622: loss 2.3680, time 5262.48ms 
iter 8623: loss 2.4402, time 5164.63ms 
iter 8624: loss 2.4833, time 5251.66ms 
iter 8625: loss 2.8359, time 5261.28ms 
iter 8626: loss 2.6058, time 5264.11ms 
iter 8627: loss 2.6457, time 5301.78ms 
iter 8628: loss 2.5123, time 5308.06ms 
iter 8629: loss 2.7322, time 5321.49ms 
iter 8630: loss 2.6388, time 5273.31ms 
iter 8631: loss 2.5122, time 5315.43ms 
iter 8632: loss 2.5548, time 5268.92ms 
iter 8633: loss 2.5825, time 5232.20ms 
iter 8634: loss 2.2131, time 5275.18ms 
iter 8635: loss 2.6353, time 5272.36ms 
iter 8636: loss 2.7132, time 5274.00ms 
iter 8637: loss 2.4939, time 5266.33ms 
iter 8638: loss 2.4371, time 5279.75ms 
iter 8639: loss 2.5651, time 5267.15ms 
iter 8640: loss 2.6511, time 5263.74ms 
iter 8641: loss 2.4901, time 5263.89ms 
iter 8642: loss 2.3084, time 5274.11ms 
iter 8643: loss 2.5032, time 5260.05ms 
iter 8644: loss 2.6065, time 5261.47ms 
iter 8645: loss 2.5978, time 5273.55ms 
iter 8646: loss 2.5230, time 5259.94ms 
iter 8647: loss 2.4371, time 5253.70ms 
iter 8648: loss 2.3547, time 5201.38ms 
iter 8649: loss 2.2793, time 5276.97ms 
step 8650: train loss 2.4674, val loss 2.8423
iter 8650: loss 2.3797, time 19947.15ms 
iter 8651: loss 2.3164, time 5267.86ms 
iter 8652: loss 2.6701, time 5344.81ms 
iter 8653: loss 2.3151, time 5256.10ms 
iter 8654: loss 2.2109, time 5260.22ms 
iter 8655: loss 2.3876, time 5274.58ms 
iter 8656: loss 2.6012, time 5258.54ms 
iter 8657: loss 2.2827, time 5264.20ms 
iter 8658: loss 2.3199, time 5265.88ms 
iter 8659: loss 2.4472, time 5301.26ms 
iter 8660: loss 2.4236, time 5304.93ms 
iter 8661: loss 2.5259, time 5274.65ms 
iter 8662: loss 2.4978, time 5280.43ms 
iter 8663: loss 2.7809, time 5330.63ms 
iter 8664: loss 2.3667, time 5318.33ms 
iter 8665: loss 2.4801, time 5274.53ms 
iter 8666: loss 2.5615, time 5263.41ms 
iter 8667: loss 2.4193, time 5277.75ms 
iter 8668: loss 2.3635, time 5283.30ms 
iter 8669: loss 2.5732, time 5288.08ms 
iter 8670: loss 2.4203, time 5278.13ms 
iter 8671: loss 2.4292, time 5267.09ms 
iter 8672: loss 2.4327, time 5284.11ms 
iter 8673: loss 2.5005, time 5267.91ms 
iter 8674: loss 2.4983, time 5283.53ms 
iter 8675: loss 2.4065, time 5317.25ms 
iter 8676: loss 2.4266, time 5280.13ms 
iter 8677: loss 2.5100, time 5244.23ms 
iter 8678: loss 2.5283, time 5252.28ms 
iter 8679: loss 2.4119, time 5245.75ms 
iter 8680: loss 2.4712, time 5246.71ms 
iter 8681: loss 2.6721, time 5242.21ms 
iter 8682: loss 2.4333, time 5251.54ms 
iter 8683: loss 2.4881, time 5269.44ms 
iter 8684: loss 2.4654, time 5274.56ms 
iter 8685: loss 2.3455, time 5260.06ms 
iter 8686: loss 2.4891, time 5262.01ms 
iter 8687: loss 2.4045, time 5268.96ms 
iter 8688: loss 2.4992, time 5266.83ms 
iter 8689: loss 2.4218, time 5311.48ms 
iter 8690: loss 2.4947, time 5258.04ms 
iter 8691: loss 2.5608, time 5263.76ms 
iter 8692: loss 2.5592, time 5266.60ms 
iter 8693: loss 2.4263, time 5265.91ms 
iter 8694: loss 2.5051, time 5263.77ms 
iter 8695: loss 2.4324, time 5282.20ms 
iter 8696: loss 2.7363, time 5270.82ms 
iter 8697: loss 2.3738, time 5258.25ms 
iter 8698: loss 2.3612, time 5255.61ms 
iter 8699: loss 2.6507, time 5272.49ms 
step 8700: train loss 2.4760, val loss 2.8406
iter 8700: loss 2.4751, time 20125.01ms 
iter 8701: loss 2.7565, time 5328.35ms 
iter 8702: loss 2.5326, time 5343.86ms 
iter 8703: loss 2.5239, time 5332.69ms 
iter 8704: loss 2.3063, time 5346.53ms 
iter 8705: loss 2.6022, time 5290.59ms 
iter 8706: loss 2.2870, time 5266.70ms 
iter 8707: loss 2.5828, time 5271.46ms 
iter 8708: loss 2.5110, time 5263.69ms 
iter 8709: loss 2.4186, time 5231.71ms 
iter 8710: loss 2.6736, time 5271.93ms 
iter 8711: loss 2.6495, time 5273.45ms 
iter 8712: loss 2.2456, time 5287.14ms 
iter 8713: loss 2.5602, time 5244.14ms 
iter 8714: loss 2.5364, time 5286.13ms 
iter 8715: loss 2.4522, time 5286.70ms 
iter 8716: loss 2.4528, time 5259.12ms 
iter 8717: loss 2.4231, time 5271.68ms 
iter 8718: loss 2.4715, time 5285.44ms 
iter 8719: loss 2.4268, time 5272.47ms 
iter 8720: loss 2.4723, time 5305.96ms 
iter 8721: loss 2.4900, time 5318.40ms 
iter 8722: loss 2.3701, time 5314.26ms 
iter 8723: loss 2.1693, time 5270.45ms 
iter 8724: loss 2.4564, time 5293.38ms 
iter 8725: loss 2.3867, time 5215.43ms 
iter 8726: loss 2.6963, time 5246.50ms 
iter 8727: loss 2.6687, time 5272.04ms 
iter 8728: loss 2.5138, time 5275.48ms 
iter 8729: loss 2.5802, time 5266.67ms 
iter 8730: loss 2.5326, time 5262.01ms 
iter 8731: loss 2.4052, time 5262.33ms 
iter 8732: loss 2.3313, time 5281.27ms 
iter 8733: loss 2.2153, time 5254.73ms 
iter 8734: loss 2.4901, time 5250.52ms 
iter 8735: loss 2.4363, time 5273.16ms 
iter 8736: loss 2.1793, time 5273.77ms 
iter 8737: loss 2.6913, time 5276.21ms 
iter 8738: loss 2.4526, time 5267.82ms 
iter 8739: loss 2.5534, time 5263.53ms 
iter 8740: loss 2.4551, time 5268.25ms 
iter 8741: loss 2.4752, time 5261.58ms 
iter 8742: loss 2.6760, time 5279.82ms 
iter 8743: loss 2.4640, time 5294.11ms 
iter 8744: loss 2.1661, time 5256.09ms 
iter 8745: loss 2.3053, time 5268.05ms 
iter 8746: loss 2.5154, time 5262.49ms 
iter 8747: loss 2.2476, time 5259.97ms 
iter 8748: loss 2.5991, time 5260.87ms 
iter 8749: loss 2.7543, time 5265.08ms 
step 8750: train loss 2.4783, val loss 2.8684
iter 8750: loss 2.4633, time 20063.79ms 
iter 8751: loss 2.4458, time 5265.04ms 
iter 8752: loss 2.1955, time 5276.79ms 
iter 8753: loss 2.4474, time 5282.47ms 
iter 8754: loss 2.4494, time 5338.03ms 
iter 8755: loss 2.4237, time 5243.32ms 
iter 8756: loss 2.6698, time 5274.11ms 
iter 8757: loss 2.5826, time 5290.51ms 
iter 8758: loss 2.4527, time 5263.34ms 
iter 8759: loss 2.4309, time 5262.41ms 
iter 8760: loss 2.4571, time 5272.94ms 
iter 8761: loss 2.4123, time 5264.91ms 
iter 8762: loss 2.6137, time 5270.79ms 
iter 8763: loss 2.5328, time 5274.90ms 
iter 8764: loss 2.4294, time 5293.59ms 
iter 8765: loss 2.3746, time 5264.37ms 
iter 8766: loss 2.6221, time 5269.58ms 
iter 8767: loss 2.2409, time 5274.12ms 
iter 8768: loss 2.5800, time 5279.54ms 
iter 8769: loss 2.3918, time 5261.61ms 
iter 8770: loss 2.6076, time 5276.51ms 
iter 8771: loss 2.4058, time 5275.95ms 
iter 8772: loss 2.4503, time 5288.43ms 
iter 8773: loss 2.5625, time 5269.84ms 
iter 8774: loss 2.6661, time 5310.07ms 
iter 8775: loss 2.2622, time 5282.92ms 
iter 8776: loss 2.4677, time 5281.06ms 
iter 8777: loss 2.4124, time 5288.86ms 
iter 8778: loss 2.4870, time 5274.67ms 
iter 8779: loss 2.5865, time 5264.18ms 
iter 8780: loss 2.4723, time 5276.95ms 
iter 8781: loss 2.3856, time 5273.40ms 
iter 8782: loss 2.6067, time 5282.65ms 
iter 8783: loss 2.4977, time 5271.86ms 
iter 8784: loss 2.4446, time 5270.50ms 
iter 8785: loss 2.3259, time 5258.94ms 
iter 8786: loss 2.6166, time 5309.75ms 
iter 8787: loss 2.6228, time 5239.96ms 
iter 8788: loss 2.4654, time 5307.41ms 
iter 8789: loss 2.5142, time 5319.39ms 
iter 8790: loss 2.4532, time 5311.35ms 
iter 8791: loss 2.4422, time 5301.70ms 
iter 8792: loss 2.3481, time 5282.37ms 
iter 8793: loss 2.3523, time 5225.36ms 
iter 8794: loss 2.4486, time 5097.18ms 
iter 8795: loss 2.4483, time 5170.86ms 
iter 8796: loss 2.2851, time 5334.65ms 
iter 8797: loss 2.5658, time 5148.06ms 
iter 8798: loss 2.2432, time 5163.40ms 
iter 8799: loss 2.3522, time 5259.08ms 
step 8800: train loss 2.4716, val loss 2.8479
iter 8800: loss 2.4984, time 20075.97ms 
iter 8801: loss 2.7437, time 5266.19ms 
iter 8802: loss 2.6062, time 5271.18ms 
iter 8803: loss 2.4319, time 5264.48ms 
iter 8804: loss 2.4128, time 5263.29ms 
iter 8805: loss 2.7035, time 5262.72ms 
iter 8806: loss 2.3922, time 5255.52ms 
iter 8807: loss 2.4293, time 5236.98ms 
iter 8808: loss 2.4281, time 5135.55ms 
iter 8809: loss 2.5175, time 5140.18ms 
iter 8810: loss 2.6957, time 5117.69ms 
iter 8811: loss 2.3852, time 5098.47ms 
iter 8812: loss 2.4552, time 5158.00ms 
iter 8813: loss 2.4263, time 5182.66ms 
iter 8814: loss 2.3022, time 5219.88ms 
iter 8815: loss 2.2714, time 5219.54ms 
iter 8816: loss 2.4937, time 5262.90ms 
iter 8817: loss 2.4094, time 5267.56ms 
iter 8818: loss 2.3288, time 5262.33ms 
iter 8819: loss 2.6190, time 5268.37ms 
iter 8820: loss 2.3365, time 5260.71ms 
iter 8821: loss 2.3539, time 5276.78ms 
iter 8822: loss 2.4880, time 5266.19ms 
iter 8823: loss 2.4555, time 5258.50ms 
iter 8824: loss 2.4492, time 5261.76ms 
iter 8825: loss 2.4069, time 5259.23ms 
iter 8826: loss 2.5560, time 5254.35ms 
iter 8827: loss 2.3264, time 5250.66ms 
iter 8828: loss 2.6552, time 5263.43ms 
iter 8829: loss 2.3001, time 5263.11ms 
iter 8830: loss 2.3216, time 5248.65ms 
iter 8831: loss 2.4782, time 5141.51ms 
iter 8832: loss 2.3900, time 5125.82ms 
iter 8833: loss 2.4005, time 5138.27ms 
iter 8834: loss 2.4615, time 5098.17ms 
iter 8835: loss 2.5087, time 5084.77ms 
iter 8836: loss 2.5866, time 5132.83ms 
iter 8837: loss 2.3954, time 5129.65ms 
iter 8838: loss 2.4379, time 5167.50ms 
iter 8839: loss 2.5041, time 5262.98ms 
iter 8840: loss 2.4471, time 5247.22ms 
iter 8841: loss 2.5309, time 5261.35ms 
iter 8842: loss 2.4552, time 5251.33ms 
iter 8843: loss 2.4038, time 5253.89ms 
iter 8844: loss 2.4443, time 5228.00ms 
iter 8845: loss 2.4559, time 5248.15ms 
iter 8846: loss 2.7260, time 5222.86ms 
iter 8847: loss 2.0774, time 5253.45ms 
iter 8848: loss 2.7739, time 5206.11ms 
iter 8849: loss 2.5887, time 5140.05ms 
step 8850: train loss 2.4559, val loss 2.8361
iter 8850: loss 2.5489, time 20117.42ms 
iter 8851: loss 2.4454, time 5270.67ms 
iter 8852: loss 2.3579, time 5227.14ms 
iter 8853: loss 2.5884, time 5258.42ms 
iter 8854: loss 2.4078, time 5276.05ms 
iter 8855: loss 2.6398, time 5266.94ms 
iter 8856: loss 2.5369, time 5260.52ms 
iter 8857: loss 2.4173, time 5270.96ms 
iter 8858: loss 2.3427, time 5274.60ms 
iter 8859: loss 2.3679, time 5267.45ms 
iter 8860: loss 2.5907, time 5268.15ms 
iter 8861: loss 2.5420, time 5243.10ms 
iter 8862: loss 2.3894, time 5259.80ms 
iter 8863: loss 2.4023, time 5263.37ms 
iter 8864: loss 2.4955, time 5268.32ms 
iter 8865: loss 2.3513, time 5265.81ms 
iter 8866: loss 2.3971, time 5260.70ms 
iter 8867: loss 2.4740, time 5259.34ms 
iter 8868: loss 2.2874, time 5265.04ms 
iter 8869: loss 2.5041, time 5262.56ms 
iter 8870: loss 2.3325, time 5256.63ms 
iter 8871: loss 2.3649, time 5265.24ms 
iter 8872: loss 2.4351, time 5280.39ms 
iter 8873: loss 2.3435, time 5270.42ms 
iter 8874: loss 2.4432, time 5269.26ms 
iter 8875: loss 2.5910, time 5268.82ms 
iter 8876: loss 2.2770, time 5268.46ms 
iter 8877: loss 2.7477, time 5256.51ms 
iter 8878: loss 2.6258, time 5257.22ms 
iter 8879: loss 2.5864, time 5268.11ms 
iter 8880: loss 2.2198, time 5260.90ms 
iter 8881: loss 2.5666, time 5264.41ms 
iter 8882: loss 2.7520, time 5258.67ms 
iter 8883: loss 2.2799, time 5270.66ms 
iter 8884: loss 2.5387, time 5259.67ms 
iter 8885: loss 2.6058, time 5258.90ms 
iter 8886: loss 2.5739, time 5268.65ms 
iter 8887: loss 2.4684, time 5259.77ms 
iter 8888: loss 2.5727, time 5258.77ms 
iter 8889: loss 2.6178, time 5264.55ms 
iter 8890: loss 2.5679, time 5265.05ms 
iter 8891: loss 2.5700, time 5269.02ms 
iter 8892: loss 2.3569, time 5258.62ms 
iter 8893: loss 2.4133, time 5320.87ms 
iter 8894: loss 2.4733, time 5282.37ms 
iter 8895: loss 2.4056, time 5217.18ms 
iter 8896: loss 2.4387, time 5271.92ms 
iter 8897: loss 2.5635, time 5273.73ms 
iter 8898: loss 2.3664, time 5277.53ms 
iter 8899: loss 2.5433, time 5275.60ms 
step 8900: train loss 2.4712, val loss 2.8528
iter 8900: loss 2.5810, time 20110.99ms 
iter 8901: loss 2.3865, time 5283.25ms 
iter 8902: loss 2.4398, time 5260.86ms 
iter 8903: loss 2.5392, time 5266.07ms 
iter 8904: loss 2.3534, time 5272.66ms 
iter 8905: loss 2.2251, time 5273.56ms 
iter 8906: loss 2.4438, time 5269.87ms 
iter 8907: loss 2.4200, time 5266.96ms 
iter 8908: loss 2.6774, time 5268.99ms 
iter 8909: loss 2.5974, time 5266.01ms 
iter 8910: loss 2.3036, time 5261.43ms 
iter 8911: loss 2.6304, time 5281.75ms 
iter 8912: loss 2.6201, time 5269.86ms 
iter 8913: loss 2.5389, time 5257.19ms 
iter 8914: loss 2.5074, time 5260.19ms 
iter 8915: loss 2.4954, time 5280.66ms 
iter 8916: loss 2.4525, time 5269.90ms 
iter 8917: loss 2.3960, time 5353.65ms 
iter 8918: loss 2.4688, time 5353.15ms 
iter 8919: loss 2.5151, time 5241.55ms 
iter 8920: loss 2.6328, time 5276.71ms 
iter 8921: loss 2.3870, time 5286.87ms 
iter 8922: loss 2.5099, time 5289.37ms 
iter 8923: loss 2.5171, time 5283.97ms 
iter 8924: loss 2.4448, time 5283.27ms 
iter 8925: loss 2.3460, time 5271.47ms 
iter 8926: loss 2.5945, time 5271.32ms 
iter 8927: loss 2.4831, time 5274.00ms 
iter 8928: loss 2.3789, time 5273.60ms 
iter 8929: loss 2.3862, time 5284.19ms 
iter 8930: loss 2.5696, time 5274.08ms 
iter 8931: loss 2.4283, time 5281.37ms 
iter 8932: loss 2.4298, time 5272.27ms 
iter 8933: loss 2.4653, time 5279.40ms 
iter 8934: loss 2.3733, time 5231.38ms 
iter 8935: loss 2.3455, time 5265.77ms 
iter 8936: loss 2.4280, time 5277.60ms 
iter 8937: loss 2.3722, time 5271.34ms 
iter 8938: loss 2.4191, time 5231.69ms 
iter 8939: loss 2.4128, time 5259.86ms 
iter 8940: loss 2.6197, time 5286.92ms 
iter 8941: loss 2.4547, time 5273.16ms 
iter 8942: loss 2.4521, time 5324.46ms 
iter 8943: loss 2.5192, time 5294.38ms 
iter 8944: loss 2.6061, time 5269.26ms 
iter 8945: loss 2.5105, time 5266.21ms 
iter 8946: loss 2.4204, time 5258.79ms 
iter 8947: loss 2.6141, time 5272.70ms 
iter 8948: loss 2.5819, time 5264.98ms 
iter 8949: loss 2.3969, time 5267.84ms 
step 8950: train loss 2.4575, val loss 2.8417
iter 8950: loss 2.4583, time 20090.21ms 
iter 8951: loss 2.6535, time 5268.11ms 
iter 8952: loss 2.5046, time 5324.85ms 
iter 8953: loss 2.5281, time 5333.86ms 
iter 8954: loss 2.6559, time 5279.74ms 
iter 8955: loss 2.3688, time 5268.29ms 
iter 8956: loss 2.3204, time 5271.63ms 
iter 8957: loss 2.6788, time 5274.09ms 
iter 8958: loss 2.5533, time 5262.76ms 
iter 8959: loss 2.4724, time 5260.11ms 
iter 8960: loss 2.3105, time 5264.51ms 
iter 8961: loss 2.5637, time 5273.69ms 
iter 8962: loss 2.4189, time 5258.08ms 
iter 8963: loss 2.4987, time 5259.54ms 
iter 8964: loss 2.4817, time 5259.62ms 
iter 8965: loss 2.5009, time 5267.82ms 
iter 8966: loss 2.3333, time 5261.04ms 
iter 8967: loss 2.2736, time 5259.58ms 
iter 8968: loss 2.3904, time 5272.63ms 
iter 8969: loss 2.3683, time 5271.64ms 
iter 8970: loss 2.5348, time 5254.57ms 
iter 8971: loss 2.5578, time 5259.51ms 
iter 8972: loss 2.4435, time 5270.04ms 
iter 8973: loss 2.6354, time 5285.43ms 
iter 8974: loss 2.5925, time 5267.68ms 
iter 8975: loss 2.6288, time 5271.74ms 
iter 8976: loss 2.3180, time 5279.66ms 
iter 8977: loss 2.5577, time 5265.14ms 
iter 8978: loss 2.3823, time 5263.41ms 
iter 8979: loss 2.5433, time 5225.19ms 
iter 8980: loss 2.3955, time 5279.93ms 
iter 8981: loss 2.2191, time 5267.43ms 
iter 8982: loss 2.6346, time 5260.50ms 
iter 8983: loss 2.5224, time 5265.23ms 
iter 8984: loss 2.4520, time 5266.54ms 
iter 8985: loss 2.5063, time 5258.94ms 
iter 8986: loss 2.3557, time 5266.25ms 
iter 8987: loss 2.2790, time 5274.93ms 
iter 8988: loss 2.1646, time 5268.83ms 
iter 8989: loss 2.3721, time 5267.93ms 
iter 8990: loss 2.4820, time 5264.64ms 
iter 8991: loss 2.4058, time 5273.10ms 
iter 8992: loss 2.6331, time 5236.90ms 
iter 8993: loss 2.4105, time 5259.10ms 
iter 8994: loss 2.5635, time 5272.25ms 
iter 8995: loss 2.3516, time 5261.20ms 
iter 8996: loss 2.3546, time 5262.06ms 
iter 8997: loss 2.5071, time 5287.91ms 
iter 8998: loss 2.5013, time 5269.98ms 
iter 8999: loss 2.2746, time 5267.66ms 
step 9000: train loss 2.4667, val loss 2.8562
iter 9000: loss 2.5939, time 20068.47ms 
iter 9001: loss 2.5076, time 5266.47ms 
iter 9002: loss 2.4225, time 5265.27ms 
iter 9003: loss 2.6307, time 5260.38ms 
iter 9004: loss 2.4145, time 5273.45ms 
iter 9005: loss 2.4755, time 5264.08ms 
iter 9006: loss 2.5028, time 5273.12ms 
iter 9007: loss 2.5097, time 5267.06ms 
iter 9008: loss 2.4353, time 5273.08ms 
iter 9009: loss 2.7603, time 5261.79ms 
iter 9010: loss 2.7033, time 5257.33ms 
iter 9011: loss 2.4820, time 5269.97ms 
iter 9012: loss 2.4970, time 5268.98ms 
iter 9013: loss 2.5772, time 5267.59ms 
iter 9014: loss 2.4792, time 5253.83ms 
iter 9015: loss 2.4529, time 5265.59ms 
iter 9016: loss 2.4841, time 5263.04ms 
iter 9017: loss 2.6723, time 5270.05ms 
iter 9018: loss 2.5093, time 5271.91ms 
iter 9019: loss 2.4638, time 5265.69ms 
iter 9020: loss 2.6098, time 5260.73ms 
iter 9021: loss 2.4555, time 5259.87ms 
iter 9022: loss 2.5014, time 5270.73ms 
iter 9023: loss 2.2075, time 5260.33ms 
iter 9024: loss 2.3897, time 5259.10ms 
iter 9025: loss 2.3836, time 5263.45ms 
iter 9026: loss 2.3715, time 5272.42ms 
iter 9027: loss 2.4934, time 5266.59ms 
iter 9028: loss 2.4497, time 5265.12ms 
iter 9029: loss 2.2867, time 5262.39ms 
iter 9030: loss 2.4216, time 5236.83ms 
iter 9031: loss 2.0698, time 5258.40ms 
iter 9032: loss 2.5825, time 5270.13ms 
iter 9033: loss 2.6377, time 5269.38ms 
iter 9034: loss 2.4243, time 5259.91ms 
iter 9035: loss 2.1086, time 5256.50ms 
iter 9036: loss 2.3471, time 5269.63ms 
iter 9037: loss 2.2055, time 5271.25ms 
iter 9038: loss 2.2529, time 5263.94ms 
iter 9039: loss 2.5848, time 5271.30ms 
iter 9040: loss 2.6122, time 5273.88ms 
iter 9041: loss 2.6627, time 5262.28ms 
iter 9042: loss 2.3141, time 5262.20ms 
iter 9043: loss 2.2072, time 5265.88ms 
iter 9044: loss 2.4672, time 5265.23ms 
iter 9045: loss 2.6366, time 5264.53ms 
iter 9046: loss 2.5342, time 5269.59ms 
iter 9047: loss 2.2942, time 5284.00ms 
iter 9048: loss 2.3282, time 5269.01ms 
iter 9049: loss 2.4953, time 5261.30ms 
step 9050: train loss 2.4629, val loss 2.8529
iter 9050: loss 2.5785, time 20077.17ms 
iter 9051: loss 2.4803, time 5264.91ms 
iter 9052: loss 2.5638, time 5257.77ms 
iter 9053: loss 2.4627, time 5262.88ms 
iter 9054: loss 2.5758, time 5270.63ms 
iter 9055: loss 2.4061, time 5259.78ms 
iter 9056: loss 2.5282, time 5257.40ms 
iter 9057: loss 2.5530, time 5271.36ms 
iter 9058: loss 2.3909, time 5259.19ms 
iter 9059: loss 2.7000, time 5265.19ms 
iter 9060: loss 2.5456, time 5251.66ms 
iter 9061: loss 2.1559, time 5263.57ms 
iter 9062: loss 2.5631, time 5257.27ms 
iter 9063: loss 2.4363, time 5258.24ms 
iter 9064: loss 2.4980, time 5287.28ms 
iter 9065: loss 2.3396, time 5273.35ms 
iter 9066: loss 2.3377, time 5272.49ms 
iter 9067: loss 2.5044, time 5278.66ms 
iter 9068: loss 2.3617, time 5285.37ms 
iter 9069: loss 2.3753, time 5276.24ms 
iter 9070: loss 2.2830, time 5263.38ms 
iter 9071: loss 2.3940, time 5273.98ms 
iter 9072: loss 2.5816, time 5281.89ms 
iter 9073: loss 2.6821, time 5264.01ms 
iter 9074: loss 2.4675, time 5272.32ms 
iter 9075: loss 2.3677, time 5271.98ms 
iter 9076: loss 2.2853, time 5268.75ms 
iter 9077: loss 2.4346, time 5261.40ms 
iter 9078: loss 2.5452, time 5287.69ms 
iter 9079: loss 2.4520, time 5265.22ms 
iter 9080: loss 2.3993, time 5264.19ms 
iter 9081: loss 2.4784, time 5272.37ms 
iter 9082: loss 2.4228, time 5262.81ms 
iter 9083: loss 2.3805, time 5270.94ms 
iter 9084: loss 2.6029, time 5252.65ms 
iter 9085: loss 2.5459, time 5223.50ms 
iter 9086: loss 2.6578, time 5271.77ms 
iter 9087: loss 2.5484, time 5260.43ms 
iter 9088: loss 2.4022, time 5263.11ms 
iter 9089: loss 2.6851, time 5267.69ms 
iter 9090: loss 2.5411, time 5255.18ms 
iter 9091: loss 2.6173, time 5261.99ms 
iter 9092: loss 2.5555, time 5271.80ms 
iter 9093: loss 2.5487, time 5264.15ms 
iter 9094: loss 2.6839, time 5261.57ms 
iter 9095: loss 2.5590, time 5262.90ms 
iter 9096: loss 2.6180, time 5256.49ms 
iter 9097: loss 2.6303, time 5270.62ms 
iter 9098: loss 2.5693, time 5258.63ms 
iter 9099: loss 2.1509, time 5274.27ms 
step 9100: train loss 2.4442, val loss 2.8619
iter 9100: loss 2.4889, time 20066.45ms 
iter 9101: loss 2.1746, time 5261.43ms 
iter 9102: loss 2.3985, time 5259.90ms 
iter 9103: loss 2.3945, time 5271.00ms 
iter 9104: loss 2.6064, time 5265.00ms 
iter 9105: loss 2.5171, time 5257.56ms 
iter 9106: loss 2.4735, time 5257.41ms 
iter 9107: loss 2.4345, time 5272.55ms 
iter 9108: loss 2.3446, time 5261.66ms 
iter 9109: loss 2.5224, time 5262.70ms 
iter 9110: loss 2.4342, time 5277.88ms 
iter 9111: loss 2.5882, time 5328.82ms 
iter 9112: loss 2.5701, time 5302.08ms 
iter 9113: loss 2.4363, time 5268.68ms 
iter 9114: loss 2.5081, time 5279.14ms 
iter 9115: loss 2.6170, time 5269.86ms 
iter 9116: loss 2.5516, time 5265.19ms 
iter 9117: loss 2.3081, time 5259.18ms 
iter 9118: loss 2.7385, time 5274.73ms 
iter 9119: loss 2.1569, time 5276.65ms 
iter 9120: loss 2.5953, time 5262.40ms 
iter 9121: loss 2.5868, time 5267.00ms 
iter 9122: loss 2.3012, time 5271.02ms 
iter 9123: loss 2.2969, time 5259.59ms 
iter 9124: loss 2.4243, time 5277.31ms 
iter 9125: loss 2.6684, time 5281.57ms 
iter 9126: loss 2.4422, time 5265.65ms 
iter 9127: loss 2.1226, time 5265.86ms 
iter 9128: loss 2.1414, time 5272.75ms 
iter 9129: loss 2.3592, time 5271.58ms 
iter 9130: loss 2.4647, time 5257.45ms 
iter 9131: loss 2.4725, time 5255.73ms 
iter 9132: loss 2.5284, time 5271.08ms 
iter 9133: loss 2.2626, time 5299.69ms 
iter 9134: loss 2.3575, time 5270.50ms 
iter 9135: loss 2.2899, time 5335.34ms 
iter 9136: loss 2.3741, time 5344.44ms 
iter 9137: loss 2.5485, time 5345.58ms 
iter 9138: loss 2.5112, time 5316.96ms 
iter 9139: loss 2.5800, time 5348.32ms 
iter 9140: loss 2.3788, time 5331.91ms 
iter 9141: loss 2.2725, time 5270.71ms 
iter 9142: loss 2.6558, time 5275.79ms 
iter 9143: loss 2.5791, time 5270.97ms 
iter 9144: loss 2.4760, time 5276.50ms 
iter 9145: loss 2.4732, time 5260.30ms 
iter 9146: loss 2.4340, time 5274.90ms 
iter 9147: loss 2.3679, time 5257.99ms 
iter 9148: loss 2.6742, time 5252.96ms 
iter 9149: loss 2.5940, time 5262.91ms 
step 9150: train loss 2.4732, val loss 2.8671
iter 9150: loss 2.3697, time 20061.21ms 
iter 9151: loss 2.4063, time 5264.16ms 
iter 9152: loss 2.3827, time 5294.45ms 
iter 9153: loss 2.2515, time 5280.37ms 
iter 9154: loss 2.3291, time 5255.51ms 
iter 9155: loss 2.6268, time 5260.59ms 
iter 9156: loss 2.6000, time 5271.16ms 
iter 9157: loss 2.5214, time 5270.40ms 
iter 9158: loss 2.3394, time 5300.43ms 
iter 9159: loss 2.5199, time 5273.07ms 
iter 9160: loss 2.3706, time 5275.03ms 
iter 9161: loss 2.4236, time 5253.75ms 
iter 9162: loss 2.3023, time 5296.03ms 
iter 9163: loss 2.4041, time 5273.21ms 
iter 9164: loss 2.3406, time 5266.25ms 
iter 9165: loss 2.3376, time 5287.43ms 
iter 9166: loss 2.6717, time 5232.45ms 
iter 9167: loss 2.4860, time 5294.10ms 
iter 9168: loss 2.5014, time 5315.13ms 
iter 9169: loss 2.5747, time 5268.62ms 
iter 9170: loss 2.4311, time 5262.89ms 
iter 9171: loss 2.2851, time 5276.27ms 
iter 9172: loss 2.4811, time 5221.88ms 
iter 9173: loss 2.3911, time 5259.71ms 
iter 9174: loss 2.5631, time 5270.89ms 
iter 9175: loss 2.3072, time 5278.67ms 
iter 9176: loss 2.5608, time 5263.58ms 
iter 9177: loss 2.5453, time 5268.60ms 
iter 9178: loss 2.6642, time 5276.41ms 
iter 9179: loss 2.2540, time 5265.61ms 
iter 9180: loss 2.6209, time 5265.27ms 
iter 9181: loss 2.2976, time 5252.71ms 
iter 9182: loss 2.5483, time 5228.66ms 
iter 9183: loss 2.3977, time 5259.48ms 
iter 9184: loss 2.3649, time 5280.77ms 
iter 9185: loss 2.3296, time 5277.12ms 
iter 9186: loss 2.4943, time 5269.57ms 
iter 9187: loss 2.4708, time 5268.34ms 
iter 9188: loss 2.5341, time 5280.09ms 
iter 9189: loss 2.7567, time 5262.84ms 
iter 9190: loss 2.4271, time 5264.72ms 
iter 9191: loss 2.6034, time 5274.63ms 
iter 9192: loss 2.4798, time 5273.37ms 
iter 9193: loss 2.4788, time 5269.45ms 
iter 9194: loss 2.3603, time 5267.19ms 
iter 9195: loss 2.6149, time 5275.37ms 
iter 9196: loss 2.4766, time 5275.72ms 
iter 9197: loss 2.4862, time 5278.20ms 
iter 9198: loss 2.3792, time 5326.05ms 
iter 9199: loss 2.4933, time 5348.86ms 
step 9200: train loss 2.4528, val loss 2.8304
iter 9200: loss 2.4864, time 20100.30ms 
iter 9201: loss 2.4177, time 5267.97ms 
iter 9202: loss 2.5785, time 5285.85ms 
iter 9203: loss 2.4222, time 5299.21ms 
iter 9204: loss 2.4615, time 5267.79ms 
iter 9205: loss 2.3625, time 5283.24ms 
iter 9206: loss 2.3506, time 5326.42ms 
iter 9207: loss 2.4240, time 5301.54ms 
iter 9208: loss 2.4992, time 5336.69ms 
iter 9209: loss 2.3488, time 5267.14ms 
iter 9210: loss 2.6313, time 5262.04ms 
iter 9211: loss 2.4233, time 5258.63ms 
iter 9212: loss 2.2950, time 5276.96ms 
iter 9213: loss 2.4510, time 5311.54ms 
iter 9214: loss 2.4538, time 5342.90ms 
iter 9215: loss 2.5790, time 5341.30ms 
iter 9216: loss 2.4024, time 5334.33ms 
iter 9217: loss 2.3402, time 5345.55ms 
iter 9218: loss 2.4846, time 5310.00ms 
iter 9219: loss 2.5526, time 5300.82ms 
iter 9220: loss 2.2914, time 5301.49ms 
iter 9221: loss 2.6457, time 5283.44ms 
iter 9222: loss 2.5962, time 5269.91ms 
iter 9223: loss 2.5266, time 5291.76ms 
iter 9224: loss 2.4160, time 5283.54ms 
iter 9225: loss 2.6720, time 5282.30ms 
iter 9226: loss 2.6030, time 5285.74ms 
iter 9227: loss 2.2736, time 5284.84ms 
iter 9228: loss 2.3523, time 5265.78ms 
iter 9229: loss 2.6136, time 5281.36ms 
iter 9230: loss 2.2827, time 5285.44ms 
iter 9231: loss 2.5388, time 5273.91ms 
iter 9232: loss 2.4822, time 5269.66ms 
iter 9233: loss 2.5456, time 5281.57ms 
iter 9234: loss 2.4295, time 5277.91ms 
iter 9235: loss 2.2873, time 5279.45ms 
iter 9236: loss 2.5038, time 5287.60ms 
iter 9237: loss 2.5074, time 5271.60ms 
iter 9238: loss 2.1861, time 5270.75ms 
iter 9239: loss 2.4801, time 5271.69ms 
iter 9240: loss 2.6679, time 5283.77ms 
iter 9241: loss 2.5653, time 5285.21ms 
iter 9242: loss 2.4894, time 5254.98ms 
iter 9243: loss 2.6485, time 5257.33ms 
iter 9244: loss 2.4917, time 5270.29ms 
iter 9245: loss 2.4267, time 5266.73ms 
iter 9246: loss 2.5825, time 5259.00ms 
iter 9247: loss 2.4784, time 5263.83ms 
iter 9248: loss 2.2540, time 5273.89ms 
iter 9249: loss 2.5551, time 5266.10ms 
step 9250: train loss 2.4531, val loss 2.8572
iter 9250: loss 2.4239, time 20068.93ms 
iter 9251: loss 2.5936, time 5266.40ms 
iter 9252: loss 2.6531, time 5265.41ms 
iter 9253: loss 2.6777, time 5262.46ms 
iter 9254: loss 2.3121, time 5256.68ms 
iter 9255: loss 2.4935, time 5283.63ms 
iter 9256: loss 2.3626, time 5259.27ms 
iter 9257: loss 2.4983, time 5272.65ms 
iter 9258: loss 2.2504, time 5275.14ms 
iter 9259: loss 2.3799, time 5269.99ms 
iter 9260: loss 2.4175, time 5260.82ms 
iter 9261: loss 2.5572, time 5270.14ms 
iter 9262: loss 2.7911, time 5273.03ms 
iter 9263: loss 2.4784, time 5259.59ms 
iter 9264: loss 2.1332, time 5259.14ms 
iter 9265: loss 2.1823, time 5276.32ms 
iter 9266: loss 2.4238, time 5275.95ms 
iter 9267: loss 2.5086, time 5230.43ms 
iter 9268: loss 2.5365, time 5261.80ms 
iter 9269: loss 2.3886, time 5281.02ms 
iter 9270: loss 2.5470, time 5267.71ms 
iter 9271: loss 2.3116, time 5250.68ms 
iter 9272: loss 2.4193, time 5265.97ms 
iter 9273: loss 2.3216, time 5270.14ms 
iter 9274: loss 2.4585, time 5259.53ms 
iter 9275: loss 2.3831, time 5278.50ms 
iter 9276: loss 2.5277, time 5269.45ms 
iter 9277: loss 2.4680, time 5257.65ms 
iter 9278: loss 2.4714, time 5259.80ms 
iter 9279: loss 2.4386, time 5285.96ms 
iter 9280: loss 2.1634, time 5268.23ms 
iter 9281: loss 2.5318, time 5273.45ms 
iter 9282: loss 2.4948, time 5265.60ms 
iter 9283: loss 2.4169, time 5284.52ms 
iter 9284: loss 2.6839, time 5265.80ms 
iter 9285: loss 2.4633, time 5261.50ms 
iter 9286: loss 2.2614, time 5281.16ms 
iter 9287: loss 2.5477, time 5267.46ms 
iter 9288: loss 2.3698, time 5261.45ms 
iter 9289: loss 2.2971, time 5269.66ms 
iter 9290: loss 2.5474, time 5279.29ms 
iter 9291: loss 2.4808, time 5260.09ms 
iter 9292: loss 2.3030, time 5266.90ms 
iter 9293: loss 2.4168, time 5267.84ms 
iter 9294: loss 2.3556, time 5252.95ms 
iter 9295: loss 2.4692, time 5268.47ms 
iter 9296: loss 2.4984, time 5271.24ms 
iter 9297: loss 2.6012, time 5274.72ms 
iter 9298: loss 2.4335, time 5267.93ms 
iter 9299: loss 2.4649, time 5275.69ms 
step 9300: train loss 2.4521, val loss 2.8535
iter 9300: loss 2.5515, time 20057.19ms 
iter 9301: loss 2.3509, time 5278.10ms 
iter 9302: loss 2.3584, time 5276.50ms 
iter 9303: loss 2.4845, time 5273.04ms 
iter 9304: loss 2.0743, time 5292.94ms 
iter 9305: loss 2.4928, time 5282.21ms 
iter 9306: loss 2.7380, time 5334.39ms 
iter 9307: loss 2.4147, time 5288.44ms 
iter 9308: loss 2.6381, time 5275.93ms 
iter 9309: loss 2.5765, time 5314.16ms 
iter 9310: loss 2.4629, time 5279.13ms 
iter 9311: loss 2.1302, time 5291.12ms 
iter 9312: loss 2.2326, time 5275.47ms 
iter 9313: loss 2.6483, time 5276.82ms 
iter 9314: loss 2.3851, time 5232.89ms 
iter 9315: loss 2.5195, time 5263.23ms 
iter 9316: loss 2.4020, time 5267.11ms 
iter 9317: loss 2.6548, time 5277.75ms 
iter 9318: loss 2.3825, time 5265.77ms 
iter 9319: loss 2.3923, time 5264.89ms 
iter 9320: loss 2.4735, time 5276.62ms 
iter 9321: loss 2.2642, time 5278.99ms 
iter 9322: loss 2.3841, time 5258.72ms 
iter 9323: loss 2.3945, time 5265.67ms 
iter 9324: loss 2.4364, time 5269.95ms 
iter 9325: loss 2.3070, time 5263.05ms 
iter 9326: loss 2.6515, time 5262.10ms 
iter 9327: loss 2.5903, time 5270.50ms 
iter 9328: loss 2.5705, time 5265.70ms 
iter 9329: loss 2.5310, time 5236.07ms 
iter 9330: loss 2.1269, time 5259.19ms 
iter 9331: loss 2.5590, time 5296.72ms 
iter 9332: loss 2.5392, time 5275.26ms 
iter 9333: loss 2.5908, time 5263.42ms 
iter 9334: loss 2.4013, time 5267.60ms 
iter 9335: loss 2.4201, time 5268.71ms 
iter 9336: loss 2.2753, time 5247.08ms 
iter 9337: loss 2.6294, time 5258.06ms 
iter 9338: loss 2.4711, time 5273.64ms 
iter 9339: loss 2.4554, time 5219.42ms 
iter 9340: loss 2.5697, time 5259.53ms 
iter 9341: loss 2.3875, time 5278.34ms 
iter 9342: loss 2.1128, time 5261.34ms 
iter 9343: loss 2.6466, time 5258.95ms 
iter 9344: loss 2.4291, time 5265.41ms 
iter 9345: loss 2.4258, time 5291.19ms 
iter 9346: loss 2.6404, time 5262.81ms 
iter 9347: loss 2.6434, time 5268.74ms 
iter 9348: loss 2.6023, time 5306.74ms 
iter 9349: loss 2.7641, time 5273.53ms 
step 9350: train loss 2.4638, val loss 2.8469
iter 9350: loss 2.7501, time 20088.98ms 
iter 9351: loss 2.4865, time 5282.72ms 
iter 9352: loss 2.5398, time 5266.21ms 
iter 9353: loss 2.5606, time 5265.74ms 
iter 9354: loss 2.5016, time 5294.59ms 
iter 9355: loss 2.2465, time 5290.30ms 
iter 9356: loss 2.2231, time 5277.28ms 
iter 9357: loss 2.3588, time 5281.54ms 
iter 9358: loss 2.5879, time 5282.42ms 
iter 9359: loss 2.3260, time 5263.71ms 
iter 9360: loss 2.4104, time 5263.27ms 
iter 9361: loss 2.7007, time 5280.14ms 
iter 9362: loss 2.7243, time 5277.67ms 
iter 9363: loss 2.3690, time 5266.65ms 
iter 9364: loss 2.4492, time 5270.72ms 
iter 9365: loss 2.3982, time 5267.44ms 
iter 9366: loss 2.5167, time 5255.57ms 
iter 9367: loss 2.4709, time 5274.04ms 
iter 9368: loss 2.4969, time 5289.74ms 
iter 9369: loss 2.4387, time 5276.14ms 
iter 9370: loss 2.5873, time 5260.53ms 
iter 9371: loss 2.3444, time 5270.02ms 
iter 9372: loss 2.1752, time 5250.86ms 
iter 9373: loss 2.4486, time 5264.52ms 
iter 9374: loss 2.3523, time 5275.49ms 
iter 9375: loss 2.6316, time 5264.33ms 
iter 9376: loss 2.3713, time 5270.25ms 
iter 9377: loss 2.5252, time 5257.11ms 
iter 9378: loss 2.4539, time 5277.67ms 
iter 9379: loss 2.3075, time 5267.39ms 
iter 9380: loss 2.3822, time 5267.83ms 
iter 9381: loss 2.2244, time 5261.04ms 
iter 9382: loss 2.3685, time 5279.26ms 
iter 9383: loss 2.5346, time 5262.06ms 
iter 9384: loss 2.4752, time 5258.22ms 
iter 9385: loss 2.1531, time 5277.02ms 
iter 9386: loss 2.3977, time 5267.31ms 
iter 9387: loss 2.5394, time 5275.37ms 
iter 9388: loss 2.5199, time 5273.64ms 
iter 9389: loss 2.3663, time 5290.50ms 
iter 9390: loss 2.5278, time 5273.72ms 
iter 9391: loss 2.3812, time 5261.29ms 
iter 9392: loss 2.4760, time 5259.47ms 
iter 9393: loss 2.4135, time 5273.79ms 
iter 9394: loss 2.1598, time 5222.49ms 
iter 9395: loss 2.4059, time 5262.58ms 
iter 9396: loss 2.3876, time 5266.18ms 
iter 9397: loss 2.7344, time 5272.54ms 
iter 9398: loss 2.5733, time 5270.24ms 
iter 9399: loss 2.5063, time 5266.81ms 
step 9400: train loss 2.4606, val loss 2.8566
iter 9400: loss 2.6295, time 20078.03ms 
iter 9401: loss 2.5007, time 5269.98ms 
iter 9402: loss 2.5885, time 5260.28ms 
iter 9403: loss 2.4877, time 5266.07ms 
iter 9404: loss 2.3789, time 5274.13ms 
iter 9405: loss 2.6198, time 5258.36ms 
iter 9406: loss 2.3891, time 5263.19ms 
iter 9407: loss 2.5264, time 5270.01ms 
iter 9408: loss 2.2120, time 5266.30ms 
iter 9409: loss 2.5647, time 5262.50ms 
iter 9410: loss 2.4834, time 5272.74ms 
iter 9411: loss 2.3961, time 5268.97ms 
iter 9412: loss 2.3311, time 5264.14ms 
iter 9413: loss 2.4533, time 5239.70ms 
iter 9414: loss 2.3634, time 5272.09ms 
iter 9415: loss 2.3959, time 5262.85ms 
iter 9416: loss 2.4523, time 5265.76ms 
iter 9417: loss 2.1897, time 5268.92ms 
iter 9418: loss 2.5775, time 5262.30ms 
iter 9419: loss 2.4735, time 5263.39ms 
iter 9420: loss 2.4297, time 5264.03ms 
iter 9421: loss 2.6014, time 5265.94ms 
iter 9422: loss 2.5006, time 5274.44ms 
iter 9423: loss 2.6108, time 5261.64ms 
iter 9424: loss 2.3730, time 5269.73ms 
iter 9425: loss 2.3492, time 5260.77ms 
iter 9426: loss 2.5026, time 5258.85ms 
iter 9427: loss 2.5265, time 5274.04ms 
iter 9428: loss 2.6422, time 5277.36ms 
iter 9429: loss 2.4656, time 5263.06ms 
iter 9430: loss 2.4562, time 5274.49ms 
iter 9431: loss 2.5470, time 5269.89ms 
iter 9432: loss 2.4859, time 5270.22ms 
iter 9433: loss 2.6804, time 5239.74ms 
iter 9434: loss 2.4489, time 5280.05ms 
iter 9435: loss 2.2772, time 5263.20ms 
iter 9436: loss 2.3314, time 5266.36ms 
iter 9437: loss 2.5532, time 5271.62ms 
iter 9438: loss 2.4098, time 5273.69ms 
iter 9439: loss 2.3964, time 5264.30ms 
iter 9440: loss 2.3681, time 5266.55ms 
iter 9441: loss 2.1802, time 5281.50ms 
iter 9442: loss 2.5817, time 5262.83ms 
iter 9443: loss 2.5309, time 5271.53ms 
iter 9444: loss 2.5664, time 5269.90ms 
iter 9445: loss 2.3766, time 5274.42ms 
iter 9446: loss 2.1769, time 5276.26ms 
iter 9447: loss 2.2446, time 5265.72ms 
iter 9448: loss 2.5093, time 5282.40ms 
iter 9449: loss 2.2027, time 5258.54ms 
step 9450: train loss 2.4494, val loss 2.8447
iter 9450: loss 2.2504, time 20082.48ms 
iter 9451: loss 2.5976, time 5267.25ms 
iter 9452: loss 2.3271, time 5265.61ms 
iter 9453: loss 2.5253, time 5253.00ms 
iter 9454: loss 2.4980, time 5263.58ms 
iter 9455: loss 2.3963, time 5257.12ms 
iter 9456: loss 2.3855, time 5262.62ms 
iter 9457: loss 2.2373, time 5260.91ms 
iter 9458: loss 2.4666, time 5273.34ms 
iter 9459: loss 2.5368, time 5234.06ms 
iter 9460: loss 2.5743, time 5264.53ms 
iter 9461: loss 2.6015, time 5270.99ms 
iter 9462: loss 2.4083, time 5246.60ms 
iter 9463: loss 2.4005, time 5266.44ms 
iter 9464: loss 2.4665, time 5262.25ms 
iter 9465: loss 2.4781, time 5274.04ms 
iter 9466: loss 2.3685, time 5272.62ms 
iter 9467: loss 2.5076, time 5297.58ms 
iter 9468: loss 2.6191, time 5334.36ms 
iter 9469: loss 2.3195, time 5266.95ms 
iter 9470: loss 2.4184, time 5340.02ms 
iter 9471: loss 2.3765, time 5332.99ms 
iter 9472: loss 2.4770, time 5343.56ms 
iter 9473: loss 2.5760, time 5275.89ms 
iter 9474: loss 2.4072, time 5280.04ms 
iter 9475: loss 2.5100, time 5304.94ms 
iter 9476: loss 2.3948, time 5238.05ms 
iter 9477: loss 2.6422, time 5255.67ms 
iter 9478: loss 2.5779, time 5288.85ms 
iter 9479: loss 2.3873, time 5267.87ms 
iter 9480: loss 2.3932, time 5272.17ms 
iter 9481: loss 2.4924, time 5275.43ms 
iter 9482: loss 2.3998, time 5272.65ms 
iter 9483: loss 2.4162, time 5259.85ms 
iter 9484: loss 2.1827, time 5262.34ms 
iter 9485: loss 2.3502, time 5231.80ms 
iter 9486: loss 2.4258, time 5266.06ms 
iter 9487: loss 2.2896, time 5264.90ms 
iter 9488: loss 2.5037, time 5272.18ms 
iter 9489: loss 2.3883, time 5268.33ms 
iter 9490: loss 2.3725, time 5261.49ms 
iter 9491: loss 2.5714, time 5263.76ms 
iter 9492: loss 2.5595, time 5266.32ms 
iter 9493: loss 2.4473, time 5272.79ms 
iter 9494: loss 2.6455, time 5262.37ms 
iter 9495: loss 2.3096, time 5277.54ms 
iter 9496: loss 2.4286, time 5260.11ms 
iter 9497: loss 2.4909, time 5258.52ms 
iter 9498: loss 2.5336, time 5269.56ms 
iter 9499: loss 2.5761, time 5254.50ms 
step 9500: train loss 2.4627, val loss 2.8512
iter 9500: loss 2.5373, time 20052.49ms 
iter 9501: loss 2.7444, time 5264.70ms 
iter 9502: loss 2.3803, time 5266.83ms 
iter 9503: loss 2.3266, time 5259.44ms 
iter 9504: loss 2.5249, time 5259.60ms 
iter 9505: loss 2.4527, time 5272.71ms 
iter 9506: loss 2.2848, time 5229.90ms 
iter 9507: loss 2.3340, time 5259.47ms 
iter 9508: loss 2.3540, time 5276.42ms 
iter 9509: loss 2.4399, time 5275.07ms 
iter 9510: loss 2.4696, time 5230.10ms 
iter 9511: loss 2.5803, time 5271.97ms 
iter 9512: loss 2.4401, time 5280.86ms 
iter 9513: loss 2.2221, time 5272.40ms 
iter 9514: loss 2.4946, time 5259.45ms 
iter 9515: loss 2.5585, time 5278.74ms 
iter 9516: loss 2.3395, time 5267.36ms 
iter 9517: loss 2.3451, time 5261.43ms 
iter 9518: loss 2.4627, time 5275.08ms 
iter 9519: loss 2.5943, time 5283.04ms 
iter 9520: loss 2.4330, time 5267.65ms 
iter 9521: loss 2.6602, time 5267.91ms 
iter 9522: loss 2.4055, time 5285.12ms 
iter 9523: loss 2.5814, time 5274.87ms 
iter 9524: loss 2.5315, time 5262.07ms 
iter 9525: loss 2.6561, time 5258.88ms 
iter 9526: loss 2.3225, time 5265.94ms 
iter 9527: loss 2.6687, time 5262.75ms 
iter 9528: loss 2.4622, time 5267.03ms 
iter 9529: loss 2.3134, time 5327.40ms 
iter 9530: loss 2.5947, time 5336.10ms 
iter 9531: loss 2.7246, time 5336.32ms 
iter 9532: loss 2.3599, time 5259.68ms 
iter 9533: loss 2.3438, time 5274.52ms 
iter 9534: loss 2.3726, time 5264.95ms 
iter 9535: loss 2.4928, time 5259.60ms 
iter 9536: loss 2.5413, time 5270.07ms 
iter 9537: loss 2.3908, time 5276.43ms 
iter 9538: loss 2.2599, time 5303.46ms 
iter 9539: loss 2.3388, time 5260.11ms 
iter 9540: loss 2.4865, time 5300.94ms 
iter 9541: loss 2.5840, time 5278.77ms 
iter 9542: loss 2.4461, time 5276.60ms 
iter 9543: loss 2.1358, time 5288.52ms 
iter 9544: loss 2.4337, time 5284.92ms 
iter 9545: loss 2.4487, time 5271.20ms 
iter 9546: loss 2.4841, time 5264.00ms 
iter 9547: loss 2.5051, time 5265.81ms 
iter 9548: loss 2.3211, time 5271.07ms 
iter 9549: loss 2.5387, time 5257.61ms 
step 9550: train loss 2.4597, val loss 2.8473
iter 9550: loss 2.7109, time 19968.76ms 
iter 9551: loss 2.6291, time 5262.83ms 
iter 9552: loss 2.4419, time 5259.61ms 
iter 9553: loss 2.6394, time 5268.29ms 
iter 9554: loss 2.4977, time 5266.07ms 
iter 9555: loss 2.6326, time 5273.79ms 
iter 9556: loss 2.7687, time 5284.46ms 
iter 9557: loss 2.6313, time 5270.76ms 
iter 9558: loss 2.4050, time 5259.90ms 
iter 9559: loss 2.4527, time 5271.53ms 
iter 9560: loss 2.4101, time 5274.52ms 
iter 9561: loss 2.2507, time 5269.45ms 
iter 9562: loss 2.6409, time 5272.67ms 
iter 9563: loss 2.5381, time 5286.65ms 
iter 9564: loss 2.4630, time 5268.96ms 
iter 9565: loss 2.1900, time 5281.75ms 
iter 9566: loss 2.5999, time 5271.84ms 
iter 9567: loss 2.5027, time 5275.11ms 
iter 9568: loss 2.7241, time 5318.78ms 
iter 9569: loss 2.2565, time 5275.67ms 
iter 9570: loss 2.3788, time 5260.74ms 
iter 9571: loss 2.6500, time 5291.06ms 
iter 9572: loss 2.4114, time 5274.80ms 
iter 9573: loss 2.3984, time 5276.49ms 
iter 9574: loss 2.4682, time 5276.95ms 
iter 9575: loss 2.4773, time 5283.26ms 
iter 9576: loss 2.3327, time 5275.55ms 
iter 9577: loss 2.3186, time 5280.18ms 
iter 9578: loss 2.5388, time 5258.01ms 
iter 9579: loss 2.3675, time 5270.14ms 
iter 9580: loss 2.4748, time 5285.84ms 
iter 9581: loss 2.2953, time 5280.66ms 
iter 9582: loss 2.7237, time 5262.27ms 
iter 9583: loss 2.4776, time 5271.09ms 
iter 9584: loss 2.5182, time 5274.92ms 
iter 9585: loss 2.5246, time 5279.81ms 
iter 9586: loss 2.4374, time 5319.42ms 
iter 9587: loss 2.4706, time 5308.61ms 
iter 9588: loss 2.5346, time 5268.48ms 
iter 9589: loss 2.4027, time 5270.70ms 
iter 9590: loss 2.4219, time 5271.13ms 
iter 9591: loss 2.4481, time 5273.25ms 
iter 9592: loss 2.4047, time 5263.67ms 
iter 9593: loss 2.3118, time 5265.65ms 
iter 9594: loss 2.4039, time 5291.16ms 
iter 9595: loss 2.6076, time 5262.19ms 
iter 9596: loss 2.4830, time 5260.02ms 
iter 9597: loss 2.2745, time 5274.37ms 
iter 9598: loss 2.3722, time 5269.49ms 
iter 9599: loss 2.1897, time 5276.50ms 
step 9600: train loss 2.4469, val loss 2.8487
iter 9600: loss 2.4913, time 20037.78ms 
iter 9601: loss 2.4004, time 5267.95ms 
iter 9602: loss 2.2922, time 5262.49ms 
iter 9603: loss 2.2574, time 5264.80ms 
iter 9604: loss 2.3860, time 5266.30ms 
iter 9605: loss 2.2372, time 5261.69ms 
iter 9606: loss 2.6582, time 5271.87ms 
iter 9607: loss 2.4694, time 5257.81ms 
iter 9608: loss 2.4120, time 5262.69ms 
iter 9609: loss 2.5967, time 5271.46ms 
iter 9610: loss 2.3302, time 5290.31ms 
iter 9611: loss 2.4132, time 5264.01ms 
iter 9612: loss 2.3554, time 5280.28ms 
iter 9613: loss 2.4658, time 5277.17ms 
iter 9614: loss 2.4855, time 5263.63ms 
iter 9615: loss 2.5296, time 5244.84ms 
iter 9616: loss 2.5518, time 5246.64ms 
iter 9617: loss 2.4552, time 5272.04ms 
iter 9618: loss 2.4681, time 5264.25ms 
iter 9619: loss 2.3674, time 5281.04ms 
iter 9620: loss 2.2247, time 5287.12ms 
iter 9621: loss 2.2203, time 5261.63ms 
iter 9622: loss 2.5663, time 5260.23ms 
iter 9623: loss 2.3183, time 5266.27ms 
iter 9624: loss 2.4618, time 5276.41ms 
iter 9625: loss 2.6444, time 5258.14ms 
iter 9626: loss 2.3929, time 5277.54ms 
iter 9627: loss 2.4777, time 5290.34ms 
iter 9628: loss 2.5284, time 5285.80ms 
iter 9629: loss 2.1918, time 5278.80ms 
iter 9630: loss 2.5642, time 5305.01ms 
iter 9631: loss 2.2696, time 5257.90ms 
iter 9632: loss 2.4861, time 5231.12ms 
iter 9633: loss 2.7743, time 5278.25ms 
iter 9634: loss 2.2051, time 5259.25ms 
iter 9635: loss 2.5175, time 5262.74ms 
iter 9636: loss 2.2194, time 5270.20ms 
iter 9637: loss 2.6056, time 5268.37ms 
iter 9638: loss 2.5787, time 5269.07ms 
iter 9639: loss 2.4846, time 5267.74ms 
iter 9640: loss 2.1360, time 5300.11ms 
iter 9641: loss 2.5081, time 5257.63ms 
iter 9642: loss 2.4658, time 5221.81ms 
iter 9643: loss 2.3276, time 5285.12ms 
iter 9644: loss 2.2373, time 5269.91ms 
iter 9645: loss 2.6527, time 5297.66ms 
iter 9646: loss 2.4929, time 5280.29ms 
iter 9647: loss 2.3187, time 5278.41ms 
iter 9648: loss 2.3612, time 5264.96ms 
iter 9649: loss 2.5310, time 5277.62ms 
step 9650: train loss 2.4601, val loss 2.8586
iter 9650: loss 2.4668, time 20095.14ms 
iter 9651: loss 2.5736, time 5260.13ms 
iter 9652: loss 2.6656, time 5269.77ms 
iter 9653: loss 2.3053, time 5266.05ms 
iter 9654: loss 2.3535, time 5267.77ms 
iter 9655: loss 2.5175, time 5261.46ms 
iter 9656: loss 2.3956, time 5275.91ms 
iter 9657: loss 2.3453, time 5264.27ms 
iter 9658: loss 2.6271, time 5247.05ms 
iter 9659: loss 2.1730, time 5239.99ms 
iter 9660: loss 2.3116, time 5262.39ms 
iter 9661: loss 2.3853, time 5263.61ms 
iter 9662: loss 2.2193, time 5265.24ms 
iter 9663: loss 2.6952, time 5267.31ms 
iter 9664: loss 2.4076, time 5261.16ms 
iter 9665: loss 2.4919, time 5260.81ms 
iter 9666: loss 2.4367, time 5273.23ms 
iter 9667: loss 2.5375, time 5230.14ms 
iter 9668: loss 2.5219, time 5259.66ms 
iter 9669: loss 2.4835, time 5280.46ms 
iter 9670: loss 2.3480, time 5264.96ms 
iter 9671: loss 2.3994, time 5280.34ms 
iter 9672: loss 2.4440, time 5327.40ms 
iter 9673: loss 2.5659, time 5304.15ms 
iter 9674: loss 2.4069, time 5278.99ms 
iter 9675: loss 2.4349, time 5263.80ms 
iter 9676: loss 2.3828, time 5260.53ms 
iter 9677: loss 2.2954, time 5249.54ms 
iter 9678: loss 2.4193, time 5185.42ms 
iter 9679: loss 2.4021, time 5278.66ms 
iter 9680: loss 2.3377, time 5297.28ms 
iter 9681: loss 2.5669, time 5261.53ms 
iter 9682: loss 2.6696, time 5254.06ms 
iter 9683: loss 2.4924, time 5281.44ms 
iter 9684: loss 2.2191, time 5267.37ms 
iter 9685: loss 2.6478, time 5259.16ms 
iter 9686: loss 2.3795, time 5268.31ms 
iter 9687: loss 2.3969, time 5273.58ms 
iter 9688: loss 2.5683, time 5250.62ms 
iter 9689: loss 2.4496, time 5277.07ms 
iter 9690: loss 2.3456, time 5277.68ms 
iter 9691: loss 2.5375, time 5270.74ms 
iter 9692: loss 2.3918, time 5262.75ms 
iter 9693: loss 2.5596, time 5281.23ms 
iter 9694: loss 2.6600, time 5268.61ms 
iter 9695: loss 2.7608, time 5261.81ms 
iter 9696: loss 2.4008, time 5287.35ms 
iter 9697: loss 2.4094, time 5275.56ms 
iter 9698: loss 2.4128, time 5258.02ms 
iter 9699: loss 2.3406, time 5262.63ms 
step 9700: train loss 2.4543, val loss 2.8497
iter 9700: loss 2.1465, time 20047.48ms 
iter 9701: loss 2.1363, time 5244.82ms 
iter 9702: loss 2.4198, time 5262.43ms 
iter 9703: loss 2.6282, time 5279.20ms 
iter 9704: loss 2.5646, time 5265.05ms 
iter 9705: loss 2.5265, time 5237.95ms 
iter 9706: loss 2.5060, time 5264.26ms 
iter 9707: loss 2.3868, time 5281.06ms 
iter 9708: loss 2.5705, time 5263.18ms 
iter 9709: loss 2.4872, time 5253.14ms 
iter 9710: loss 2.3985, time 5254.58ms 
iter 9711: loss 2.3967, time 5247.15ms 
iter 9712: loss 2.4173, time 5265.94ms 
iter 9713: loss 2.5287, time 5263.23ms 
iter 9714: loss 2.6168, time 5265.99ms 
iter 9715: loss 2.3630, time 5257.80ms 
iter 9716: loss 2.3317, time 5264.20ms 
iter 9717: loss 2.4859, time 5272.80ms 
iter 9718: loss 2.5078, time 5261.98ms 
iter 9719: loss 2.7006, time 5269.25ms 
iter 9720: loss 2.4222, time 5271.34ms 
iter 9721: loss 2.5050, time 5262.26ms 
iter 9722: loss 2.6214, time 5260.34ms 
iter 9723: loss 2.3193, time 5290.80ms 
iter 9724: loss 2.2050, time 5171.50ms 
iter 9725: loss 2.5061, time 5264.55ms 
iter 9726: loss 2.4137, time 5267.68ms 
iter 9727: loss 2.4837, time 5267.95ms 
iter 9728: loss 2.5029, time 5261.06ms 
iter 9729: loss 2.2904, time 5260.65ms 
iter 9730: loss 2.2213, time 5280.27ms 
iter 9731: loss 2.4590, time 5271.43ms 
iter 9732: loss 2.2224, time 5273.87ms 
iter 9733: loss 2.4880, time 5284.74ms 
iter 9734: loss 2.6713, time 5267.27ms 
iter 9735: loss 2.4050, time 5255.06ms 
iter 9736: loss 2.1588, time 5265.48ms 
iter 9737: loss 2.6512, time 5283.37ms 
iter 9738: loss 2.3298, time 5274.20ms 
iter 9739: loss 2.5320, time 5270.72ms 
iter 9740: loss 2.4560, time 5280.68ms 
iter 9741: loss 2.2545, time 5269.71ms 
iter 9742: loss 2.6180, time 5276.34ms 
iter 9743: loss 2.2856, time 5282.10ms 
iter 9744: loss 2.4478, time 5271.81ms 
iter 9745: loss 2.4245, time 5271.45ms 
iter 9746: loss 2.3993, time 5268.12ms 
iter 9747: loss 2.2693, time 5278.99ms 
iter 9748: loss 2.5159, time 5264.73ms 
iter 9749: loss 2.2652, time 5272.47ms 
step 9750: train loss 2.4505, val loss 2.8610
iter 9750: loss 2.3704, time 20052.66ms 
iter 9751: loss 2.4917, time 5274.10ms 
iter 9752: loss 2.5266, time 5308.89ms 
iter 9753: loss 2.3281, time 5287.93ms 
iter 9754: loss 2.5859, time 5227.51ms 
iter 9755: loss 2.3125, time 5294.45ms 
iter 9756: loss 2.5399, time 5301.80ms 
iter 9757: loss 2.3697, time 5233.05ms 
iter 9758: loss 2.5408, time 5270.30ms 
iter 9759: loss 2.2551, time 5274.86ms 
iter 9760: loss 2.4164, time 5262.39ms 
iter 9761: loss 2.7606, time 5264.25ms 
iter 9762: loss 2.4052, time 5272.21ms 
iter 9763: loss 2.2645, time 5282.18ms 
iter 9764: loss 2.3472, time 5267.83ms 
iter 9765: loss 2.4706, time 5288.77ms 
iter 9766: loss 2.4640, time 5216.16ms 
iter 9767: loss 2.5518, time 5154.85ms 
iter 9768: loss 2.4373, time 5181.85ms 
iter 9769: loss 2.4815, time 5186.12ms 
iter 9770: loss 2.3882, time 5233.61ms 
iter 9771: loss 2.3813, time 5230.13ms 
iter 9772: loss 2.2964, time 5219.31ms 
iter 9773: loss 2.3662, time 5210.27ms 
iter 9774: loss 2.3487, time 5215.46ms 
iter 9775: loss 2.2446, time 5239.41ms 
iter 9776: loss 2.2759, time 5252.21ms 
iter 9777: loss 2.3506, time 5243.27ms 
iter 9778: loss 2.1967, time 5263.38ms 
iter 9779: loss 2.4206, time 5286.81ms 
iter 9780: loss 2.5472, time 5273.49ms 
iter 9781: loss 2.5710, time 5265.00ms 
iter 9782: loss 2.4955, time 5289.36ms 
iter 9783: loss 2.5371, time 5224.42ms 
iter 9784: loss 2.4854, time 5092.09ms 
iter 9785: loss 2.4591, time 5186.18ms 
iter 9786: loss 2.7160, time 5153.03ms 
iter 9787: loss 2.1256, time 5250.52ms 
iter 9788: loss 2.1181, time 5233.40ms 
iter 9789: loss 2.5413, time 5103.61ms 
iter 9790: loss 2.5997, time 5238.99ms 
iter 9791: loss 2.5482, time 5239.04ms 
iter 9792: loss 2.4858, time 5286.09ms 
iter 9793: loss 2.3403, time 5286.15ms 
iter 9794: loss 2.3609, time 5273.76ms 
iter 9795: loss 2.2392, time 5282.37ms 
iter 9796: loss 2.5288, time 5270.72ms 
iter 9797: loss 2.3830, time 5249.68ms 
iter 9798: loss 2.4915, time 5285.87ms 
iter 9799: loss 2.4433, time 5275.64ms 
step 9800: train loss 2.4549, val loss 2.8444
iter 9800: loss 2.2399, time 20142.95ms 
iter 9801: loss 2.3362, time 5255.83ms 
iter 9802: loss 2.4014, time 5216.29ms 
iter 9803: loss 2.3901, time 5126.31ms 
iter 9804: loss 2.5593, time 5096.56ms 
iter 9805: loss 2.6347, time 5107.38ms 
iter 9806: loss 2.3825, time 5090.00ms 
iter 9807: loss 2.5212, time 5133.07ms 
iter 9808: loss 2.4740, time 5124.51ms 
iter 9809: loss 2.5485, time 5225.28ms 
iter 9810: loss 2.2578, time 5279.78ms 
iter 9811: loss 2.5659, time 5287.96ms 
iter 9812: loss 2.6010, time 5272.91ms 
iter 9813: loss 2.3826, time 5269.85ms 
iter 9814: loss 2.3956, time 5252.83ms 
iter 9815: loss 2.6520, time 5276.31ms 
iter 9816: loss 2.6859, time 5278.27ms 
iter 9817: loss 2.7574, time 5265.18ms 
iter 9818: loss 2.2307, time 5238.84ms 
iter 9819: loss 2.5263, time 5272.48ms 
iter 9820: loss 2.4692, time 5276.68ms 
iter 9821: loss 2.3844, time 5277.82ms 
iter 9822: loss 2.5032, time 5272.72ms 
iter 9823: loss 2.3368, time 5245.03ms 
iter 9824: loss 2.5313, time 5240.48ms 
iter 9825: loss 2.4913, time 5260.69ms 
iter 9826: loss 2.5554, time 5245.80ms 
iter 9827: loss 2.4341, time 5258.04ms 
iter 9828: loss 2.4740, time 5214.68ms 
iter 9829: loss 2.4174, time 5199.04ms 
iter 9830: loss 2.4095, time 5183.79ms 
iter 9831: loss 2.4826, time 5263.67ms 
iter 9832: loss 2.0222, time 5254.71ms 
iter 9833: loss 2.4390, time 5129.75ms 
iter 9834: loss 2.2450, time 5221.87ms 
iter 9835: loss 2.5029, time 5176.61ms 
iter 9836: loss 2.2948, time 5154.50ms 
iter 9837: loss 2.6533, time 5154.57ms 
iter 9838: loss 2.4795, time 5150.92ms 
iter 9839: loss 2.7345, time 5137.46ms 
iter 9840: loss 2.4785, time 5093.31ms 
iter 9841: loss 2.1089, time 5091.65ms 
iter 9842: loss 2.7582, time 5086.59ms 
iter 9843: loss 2.4805, time 5109.65ms 
iter 9844: loss 2.5865, time 5145.35ms 
iter 9845: loss 2.4076, time 5119.38ms 
iter 9846: loss 2.2951, time 5114.32ms 
iter 9847: loss 2.5001, time 5092.10ms 
iter 9848: loss 2.3129, time 5115.92ms 
iter 9849: loss 2.6691, time 5099.68ms 
step 9850: train loss 2.4479, val loss 2.8452
iter 9850: loss 2.2571, time 19730.91ms 
iter 9851: loss 2.3442, time 5136.39ms 
iter 9852: loss 2.3904, time 5080.82ms 
iter 9853: loss 2.3186, time 5091.27ms 
iter 9854: loss 2.6725, time 5079.10ms 
iter 9855: loss 2.4301, time 5142.89ms 
iter 9856: loss 2.6048, time 5236.41ms 
iter 9857: loss 2.2623, time 5089.44ms 
iter 9858: loss 2.3904, time 5170.53ms 
iter 9859: loss 2.3921, time 5281.56ms 
iter 9860: loss 2.3740, time 5258.14ms 
iter 9861: loss 2.4324, time 5263.43ms 
iter 9862: loss 2.4702, time 5266.87ms 
iter 9863: loss 2.3095, time 5279.40ms 
iter 9864: loss 2.3558, time 5258.77ms 
iter 9865: loss 2.4524, time 5258.18ms 
iter 9866: loss 2.6262, time 5294.48ms 
iter 9867: loss 2.5238, time 5265.63ms 
iter 9868: loss 2.6236, time 5265.05ms 
iter 9869: loss 2.4210, time 5268.70ms 
iter 9870: loss 2.5531, time 5275.31ms 
iter 9871: loss 2.5471, time 5222.22ms 
iter 9872: loss 2.2049, time 5273.54ms 
iter 9873: loss 2.3684, time 5278.38ms 
iter 9874: loss 2.3793, time 5274.89ms 
iter 9875: loss 2.3399, time 5274.67ms 
iter 9876: loss 2.3031, time 5210.62ms 
iter 9877: loss 2.5143, time 5245.53ms 
iter 9878: loss 2.5249, time 5261.00ms 
iter 9879: loss 2.5802, time 5269.41ms 
iter 9880: loss 2.2689, time 5266.40ms 
iter 9881: loss 2.3028, time 5253.80ms 
iter 9882: loss 2.5452, time 5273.71ms 
iter 9883: loss 2.3687, time 5275.18ms 
iter 9884: loss 2.5341, time 5264.20ms 
iter 9885: loss 2.5424, time 5219.59ms 
iter 9886: loss 2.4294, time 5265.66ms 
iter 9887: loss 2.6433, time 5150.21ms 
iter 9888: loss 2.2018, time 5211.75ms 
iter 9889: loss 2.3575, time 5260.32ms 
iter 9890: loss 2.4135, time 5271.36ms 
iter 9891: loss 2.4574, time 5262.81ms 
iter 9892: loss 2.4234, time 5274.76ms 
iter 9893: loss 2.4062, time 5308.99ms 
iter 9894: loss 2.6076, time 5286.70ms 
iter 9895: loss 2.1599, time 5340.25ms 
iter 9896: loss 2.2899, time 5268.43ms 
iter 9897: loss 2.5184, time 5263.44ms 
iter 9898: loss 2.4676, time 5268.18ms 
iter 9899: loss 2.5761, time 5266.65ms 
step 9900: train loss 2.4496, val loss 2.8502
iter 9900: loss 2.3036, time 20084.37ms 
iter 9901: loss 2.4905, time 5261.61ms 
iter 9902: loss 2.4439, time 5257.73ms 
iter 9903: loss 2.5978, time 5299.73ms 
iter 9904: loss 2.3729, time 5266.56ms 
iter 9905: loss 2.4832, time 5302.89ms 
iter 9906: loss 2.4295, time 5277.62ms 
iter 9907: loss 2.4862, time 5269.61ms 
iter 9908: loss 2.5704, time 5255.39ms 
iter 9909: loss 2.4394, time 5276.29ms 
iter 9910: loss 2.3009, time 5283.17ms 
iter 9911: loss 2.5057, time 5265.36ms 
iter 9912: loss 2.5296, time 5268.23ms 
iter 9913: loss 2.3950, time 5243.93ms 
iter 9914: loss 2.5509, time 5273.67ms 
iter 9915: loss 2.6250, time 5268.18ms 
iter 9916: loss 2.2845, time 5266.86ms 
iter 9917: loss 2.3416, time 5283.53ms 
iter 9918: loss 2.3608, time 5264.09ms 
iter 9919: loss 2.5091, time 5263.26ms 
iter 9920: loss 2.6243, time 5335.62ms 
iter 9921: loss 2.3027, time 5247.31ms 
iter 9922: loss 2.4623, time 5285.50ms 
iter 9923: loss 2.3421, time 5268.86ms 
iter 9924: loss 2.6666, time 5277.71ms 
iter 9925: loss 2.7808, time 5263.35ms 
iter 9926: loss 2.5724, time 5258.92ms 
iter 9927: loss 2.3714, time 5290.19ms 
iter 9928: loss 2.5386, time 5270.65ms 
iter 9929: loss 2.3639, time 5278.86ms 
iter 9930: loss 2.4805, time 5292.97ms 
iter 9931: loss 2.4773, time 5264.97ms 
iter 9932: loss 2.4688, time 5264.13ms 
iter 9933: loss 2.2082, time 5258.64ms 
iter 9934: loss 2.2618, time 5274.16ms 
iter 9935: loss 2.3559, time 5240.01ms 
iter 9936: loss 2.1465, time 5256.86ms 
iter 9937: loss 2.5559, time 5277.61ms 
iter 9938: loss 2.6937, time 5273.49ms 
iter 9939: loss 2.5319, time 5262.52ms 
iter 9940: loss 2.3878, time 5284.97ms 
iter 9941: loss 2.5403, time 5268.66ms 
iter 9942: loss 2.4125, time 5263.48ms 
iter 9943: loss 2.5969, time 5264.46ms 
iter 9944: loss 2.3911, time 5302.54ms 
iter 9945: loss 2.3807, time 5239.63ms 
iter 9946: loss 2.4972, time 5342.02ms 
iter 9947: loss 2.4977, time 5289.14ms 
iter 9948: loss 2.1555, time 5282.79ms 
iter 9949: loss 2.4686, time 5257.43ms 
step 9950: train loss 2.4473, val loss 2.8569
iter 9950: loss 2.3709, time 20069.21ms 
iter 9951: loss 2.5217, time 5264.73ms 
iter 9952: loss 2.4029, time 5267.14ms 
iter 9953: loss 2.5021, time 5313.57ms 
iter 9954: loss 2.2953, time 5267.17ms 
iter 9955: loss 2.3724, time 5259.06ms 
iter 9956: loss 2.2414, time 5271.50ms 
iter 9957: loss 2.4835, time 5285.11ms 
iter 9958: loss 2.6541, time 5272.42ms 
iter 9959: loss 2.4474, time 5275.36ms 
iter 9960: loss 2.5338, time 5270.52ms 
iter 9961: loss 2.4612, time 5278.83ms 
iter 9962: loss 2.4656, time 5241.86ms 
iter 9963: loss 2.3715, time 5250.44ms 
iter 9964: loss 2.5873, time 5282.54ms 
iter 9965: loss 2.1799, time 5266.50ms 
iter 9966: loss 2.6794, time 5276.67ms 
iter 9967: loss 2.5359, time 5271.87ms 
iter 9968: loss 2.3201, time 5282.95ms 
iter 9969: loss 2.5850, time 5276.64ms 
iter 9970: loss 2.5139, time 5268.39ms 
iter 9971: loss 2.4141, time 5275.90ms 
iter 9972: loss 2.5297, time 5258.48ms 
iter 9973: loss 2.3510, time 5258.88ms 
iter 9974: loss 2.2951, time 5268.95ms 
iter 9975: loss 2.3655, time 5269.71ms 
iter 9976: loss 2.5112, time 5259.23ms 
iter 9977: loss 2.4324, time 5262.14ms 
iter 9978: loss 2.4943, time 5266.82ms 
iter 9979: loss 1.9798, time 5257.59ms 
iter 9980: loss 2.3278, time 5257.12ms 
iter 9981: loss 2.3618, time 5274.15ms 
iter 9982: loss 2.2867, time 5260.16ms 
iter 9983: loss 2.4724, time 5259.23ms 
iter 9984: loss 2.6793, time 5266.75ms 
iter 9985: loss 2.4580, time 5263.55ms 
iter 9986: loss 2.4013, time 5277.05ms 
iter 9987: loss 2.3939, time 5267.51ms 
iter 9988: loss 2.5300, time 5272.34ms 
iter 9989: loss 2.5670, time 5267.74ms 
iter 9990: loss 2.3087, time 5268.66ms 
iter 9991: loss 2.2121, time 5275.80ms 
iter 9992: loss 2.4214, time 5273.23ms 
iter 9993: loss 2.4327, time 5264.46ms 
iter 9994: loss 2.3900, time 5268.97ms 
iter 9995: loss 2.4441, time 5271.15ms 
iter 9996: loss 2.4670, time 5258.87ms 
iter 9997: loss 2.4111, time 5258.00ms 
iter 9998: loss 2.4695, time 5274.43ms 
iter 9999: loss 2.3821, time 5266.46ms 
step 10000: train loss 2.4342, val loss 2.8624
iter 10000: loss 2.5610, time 20047.48ms 
iter 10001: loss 2.3718, time 5307.19ms 
iter 10002: loss 2.4269, time 5323.63ms 
iter 10003: loss 2.6560, time 5290.52ms 
iter 10004: loss 2.4073, time 5276.89ms 
iter 10005: loss 2.5778, time 5286.44ms 
iter 10006: loss 2.2296, time 5260.18ms 
iter 10007: loss 2.5323, time 5270.48ms 
iter 10008: loss 2.2500, time 5283.15ms 
iter 10009: loss 2.5256, time 5263.94ms 
iter 10010: loss 2.4225, time 5261.69ms 
iter 10011: loss 2.3319, time 5271.08ms 
iter 10012: loss 2.4154, time 5276.89ms 
iter 10013: loss 2.4113, time 5259.50ms 
iter 10014: loss 2.5088, time 5263.98ms 
iter 10015: loss 2.4001, time 5281.06ms 
iter 10016: loss 2.5471, time 5266.73ms 
iter 10017: loss 2.5144, time 5286.52ms 
iter 10018: loss 2.5458, time 5274.92ms 
iter 10019: loss 2.4313, time 5265.24ms 
iter 10020: loss 2.3447, time 5230.06ms 
iter 10021: loss 2.5613, time 5259.21ms 
iter 10022: loss 2.3267, time 5279.66ms 
iter 10023: loss 2.7022, time 5262.36ms 
iter 10024: loss 2.4460, time 5261.16ms 
iter 10025: loss 2.5115, time 5289.80ms 
iter 10026: loss 2.3803, time 5280.82ms 
iter 10027: loss 2.4361, time 5286.80ms 
iter 10028: loss 2.3397, time 5291.44ms 
iter 10029: loss 2.5433, time 5305.87ms 
iter 10030: loss 2.2942, time 5281.18ms 
iter 10031: loss 2.4377, time 5276.79ms 
iter 10032: loss 2.6250, time 5273.45ms 
iter 10033: loss 2.3863, time 5330.89ms 
iter 10034: loss 2.3417, time 5261.42ms 
iter 10035: loss 2.4494, time 5268.68ms 
iter 10036: loss 2.4363, time 5265.77ms 
iter 10037: loss 2.5709, time 5261.92ms 
iter 10038: loss 2.4108, time 5262.49ms 
iter 10039: loss 2.4796, time 5275.67ms 
iter 10040: loss 2.4576, time 5258.99ms 
iter 10041: loss 2.5332, time 5245.39ms 
iter 10042: loss 2.4899, time 5272.81ms 
iter 10043: loss 2.6025, time 5269.26ms 
iter 10044: loss 2.6471, time 5260.82ms 
iter 10045: loss 2.4997, time 5274.74ms 
iter 10046: loss 2.3554, time 5278.32ms 
iter 10047: loss 2.4683, time 5259.35ms 
iter 10048: loss 2.5175, time 5258.53ms 
iter 10049: loss 2.2961, time 5280.09ms 
step 10050: train loss 2.4326, val loss 2.8453
iter 10050: loss 2.3128, time 20097.24ms 
iter 10051: loss 2.2737, time 5323.33ms 
iter 10052: loss 2.4066, time 5302.33ms 
iter 10053: loss 2.5908, time 5255.89ms 
iter 10054: loss 2.5849, time 5256.28ms 
iter 10055: loss 2.1934, time 5273.76ms 
iter 10056: loss 2.4718, time 5265.82ms 
iter 10057: loss 2.5261, time 5260.93ms 
iter 10058: loss 2.3504, time 5294.63ms 
iter 10059: loss 2.3162, time 5340.26ms 
iter 10060: loss 2.5119, time 5317.10ms 
iter 10061: loss 2.6191, time 5256.57ms 
iter 10062: loss 2.3980, time 5272.94ms 
iter 10063: loss 2.5474, time 5265.94ms 
iter 10064: loss 2.4453, time 5267.54ms 
iter 10065: loss 2.5987, time 5270.18ms 
iter 10066: loss 2.4979, time 5273.80ms 
iter 10067: loss 2.2208, time 5269.34ms 
iter 10068: loss 2.3994, time 5269.94ms 
iter 10069: loss 2.3748, time 5275.57ms 
iter 10070: loss 2.5966, time 5274.24ms 
iter 10071: loss 2.3433, time 5288.47ms 
iter 10072: loss 2.1510, time 5281.35ms 
iter 10073: loss 2.3531, time 5262.36ms 
iter 10074: loss 2.3176, time 5257.54ms 
iter 10075: loss 2.6329, time 5265.29ms 
iter 10076: loss 2.3740, time 5269.87ms 
iter 10077: loss 2.4005, time 5260.92ms 
iter 10078: loss 2.4544, time 5262.61ms 
iter 10079: loss 2.3974, time 5278.12ms 
iter 10080: loss 2.5346, time 5263.32ms 
iter 10081: loss 2.4512, time 5266.97ms 
iter 10082: loss 2.4050, time 5279.24ms 
iter 10083: loss 2.4850, time 5271.65ms 
iter 10084: loss 2.4225, time 5272.58ms 
iter 10085: loss 2.4588, time 5242.43ms 
iter 10086: loss 2.5194, time 5283.60ms 
iter 10087: loss 2.5092, time 5266.26ms 
iter 10088: loss 2.2597, time 5262.37ms 
iter 10089: loss 2.2554, time 5275.21ms 
iter 10090: loss 2.4359, time 5295.06ms 
iter 10091: loss 2.5055, time 5308.21ms 
iter 10092: loss 2.5578, time 5333.89ms 
iter 10093: loss 2.3538, time 5272.72ms 
iter 10094: loss 2.4875, time 5271.02ms 
iter 10095: loss 2.6518, time 5260.10ms 
iter 10096: loss 2.4375, time 5271.63ms 
iter 10097: loss 2.3326, time 5286.92ms 
iter 10098: loss 2.7426, time 5272.55ms 
iter 10099: loss 2.5596, time 5223.69ms 
step 10100: train loss 2.4312, val loss 2.8589
iter 10100: loss 2.4420, time 20045.38ms 
iter 10101: loss 2.6155, time 5263.13ms 
iter 10102: loss 2.3969, time 5258.76ms 
iter 10103: loss 2.2725, time 5256.43ms 
iter 10104: loss 2.2073, time 5265.25ms 
iter 10105: loss 2.3205, time 5263.13ms 
iter 10106: loss 2.6215, time 5257.35ms 
iter 10107: loss 2.4949, time 5261.17ms 
iter 10108: loss 2.4850, time 5269.95ms 
iter 10109: loss 2.3782, time 5266.01ms 
iter 10110: loss 2.3828, time 5264.54ms 
iter 10111: loss 2.4705, time 5278.45ms 
iter 10112: loss 2.5438, time 5262.30ms 
iter 10113: loss 2.3443, time 5264.45ms 
iter 10114: loss 2.3673, time 5273.87ms 
iter 10115: loss 2.3790, time 5272.73ms 
iter 10116: loss 2.3576, time 5270.13ms 
iter 10117: loss 2.3350, time 5275.76ms 
iter 10118: loss 2.5923, time 5264.63ms 
iter 10119: loss 2.5643, time 5251.31ms 
iter 10120: loss 2.5172, time 5259.00ms 
iter 10121: loss 2.4811, time 5278.55ms 
iter 10122: loss 2.2940, time 5251.85ms 
iter 10123: loss 2.5191, time 5260.04ms 
iter 10124: loss 2.4383, time 5265.04ms 
iter 10125: loss 2.3496, time 5263.10ms 
iter 10126: loss 2.6131, time 5265.55ms 
iter 10127: loss 2.4490, time 5255.63ms 
iter 10128: loss 2.5009, time 5273.53ms 
iter 10129: loss 2.4847, time 5253.42ms 
iter 10130: loss 2.6090, time 5259.14ms 
iter 10131: loss 2.6274, time 5277.27ms 
iter 10132: loss 2.4265, time 5272.14ms 
iter 10133: loss 2.5524, time 5257.26ms 
iter 10134: loss 2.3974, time 5268.09ms 
iter 10135: loss 2.6409, time 5273.86ms 
iter 10136: loss 2.3977, time 5280.87ms 
iter 10137: loss 2.4306, time 5304.99ms 
iter 10138: loss 2.6506, time 5274.87ms 
iter 10139: loss 2.4668, time 5310.46ms 
iter 10140: loss 2.2373, time 5257.09ms 
iter 10141: loss 2.2832, time 5269.10ms 
iter 10142: loss 2.5009, time 5261.15ms 
iter 10143: loss 2.4396, time 5260.68ms 
iter 10144: loss 2.6265, time 5270.49ms 
iter 10145: loss 2.5359, time 5262.40ms 
iter 10146: loss 2.6062, time 5254.24ms 
iter 10147: loss 2.6910, time 5262.02ms 
iter 10148: loss 2.4349, time 5275.62ms 
iter 10149: loss 2.3965, time 5258.36ms 
step 10150: train loss 2.4425, val loss 2.8513
iter 10150: loss 2.3450, time 20054.83ms 
iter 10151: loss 2.4641, time 5273.55ms 
iter 10152: loss 2.4022, time 5258.85ms 
iter 10153: loss 2.5867, time 5255.82ms 
iter 10154: loss 2.3309, time 5266.64ms 
iter 10155: loss 2.4077, time 5263.68ms 
iter 10156: loss 2.6868, time 5256.29ms 
iter 10157: loss 2.4614, time 5258.65ms 
iter 10158: loss 2.5136, time 5275.39ms 
iter 10159: loss 2.5437, time 5257.30ms 
iter 10160: loss 2.2469, time 5258.41ms 
iter 10161: loss 2.4489, time 5317.09ms 
iter 10162: loss 2.4513, time 5278.74ms 
iter 10163: loss 2.3428, time 5302.01ms 
iter 10164: loss 2.3619, time 5282.13ms 
iter 10165: loss 2.3070, time 5327.71ms 
iter 10166: loss 2.3517, time 5296.09ms 
iter 10167: loss 2.4075, time 5344.92ms 
iter 10168: loss 2.2587, time 5330.44ms 
iter 10169: loss 2.6952, time 5272.13ms 
iter 10170: loss 2.5302, time 5272.02ms 
iter 10171: loss 2.4375, time 5313.08ms 
iter 10172: loss 2.4432, time 5324.96ms 
iter 10173: loss 2.4407, time 5310.31ms 
iter 10174: loss 2.4876, time 5262.73ms 
iter 10175: loss 2.2994, time 5265.51ms 
iter 10176: loss 2.5128, time 5262.27ms 
iter 10177: loss 2.5058, time 5255.84ms 
iter 10178: loss 2.2527, time 5274.13ms 
iter 10179: loss 2.4340, time 5251.04ms 
iter 10180: loss 2.5233, time 5262.51ms 
iter 10181: loss 2.6396, time 5302.51ms 
iter 10182: loss 2.7511, time 5338.05ms 
iter 10183: loss 2.6600, time 5325.67ms 
iter 10184: loss 2.3051, time 5300.63ms 
iter 10185: loss 2.1286, time 5303.65ms 
iter 10186: loss 2.3009, time 5271.51ms 
iter 10187: loss 2.4458, time 5272.72ms 
iter 10188: loss 2.3091, time 5294.55ms 
iter 10189: loss 2.6010, time 5277.13ms 
iter 10190: loss 2.6670, time 5270.73ms 
iter 10191: loss 2.7315, time 5277.89ms 
iter 10192: loss 2.5081, time 5262.52ms 
iter 10193: loss 2.3686, time 5268.37ms 
iter 10194: loss 2.3494, time 5265.31ms 
iter 10195: loss 2.4037, time 5276.32ms 
iter 10196: loss 2.5806, time 5268.31ms 
iter 10197: loss 2.3304, time 5300.82ms 
iter 10198: loss 2.4304, time 5282.95ms 
iter 10199: loss 2.2578, time 5272.98ms 
step 10200: train loss 2.4208, val loss 2.8252
iter 10200: loss 2.6107, time 20003.26ms 
iter 10201: loss 2.4420, time 5276.50ms 
iter 10202: loss 2.4762, time 5263.69ms 
iter 10203: loss 2.3727, time 5266.71ms 
iter 10204: loss 2.3687, time 5286.16ms 
iter 10205: loss 2.3594, time 5271.20ms 
iter 10206: loss 2.4612, time 5277.21ms 
iter 10207: loss 2.3710, time 5267.16ms 
iter 10208: loss 2.3527, time 5259.06ms 
iter 10209: loss 2.3109, time 5260.69ms 
iter 10210: loss 2.6505, time 5272.64ms 
iter 10211: loss 2.4931, time 5303.83ms 
iter 10212: loss 2.4139, time 5255.24ms 
iter 10213: loss 2.4141, time 5259.96ms 
iter 10214: loss 2.6932, time 5268.49ms 
iter 10215: loss 2.6397, time 5263.35ms 
iter 10216: loss 2.6674, time 5255.98ms 
iter 10217: loss 2.4584, time 5282.54ms 
iter 10218: loss 2.4449, time 5325.83ms 
iter 10219: loss 2.3338, time 5298.65ms 
iter 10220: loss 2.3078, time 5268.86ms 
iter 10221: loss 2.4302, time 5259.89ms 
iter 10222: loss 2.7050, time 5270.96ms 
iter 10223: loss 2.7516, time 5255.37ms 
iter 10224: loss 2.4758, time 5271.23ms 
iter 10225: loss 2.4652, time 5291.52ms 
iter 10226: loss 2.5935, time 5254.87ms 
iter 10227: loss 2.4313, time 5258.54ms 
iter 10228: loss 2.4419, time 5279.25ms 
iter 10229: loss 2.4435, time 5272.20ms 
iter 10230: loss 2.5496, time 5275.26ms 
iter 10231: loss 2.5608, time 5341.42ms 
iter 10232: loss 2.4490, time 5312.92ms 
iter 10233: loss 2.5056, time 5274.66ms 
iter 10234: loss 2.3072, time 5323.88ms 
iter 10235: loss 2.2695, time 5298.83ms 
iter 10236: loss 2.4936, time 5336.63ms 
iter 10237: loss 2.5101, time 5341.49ms 
iter 10238: loss 2.2431, time 5337.62ms 
iter 10239: loss 2.3490, time 5287.45ms 
iter 10240: loss 2.3625, time 5267.35ms 
iter 10241: loss 2.2675, time 5266.92ms 
iter 10242: loss 2.4066, time 5269.32ms 
iter 10243: loss 2.4325, time 5271.81ms 
iter 10244: loss 2.2849, time 5259.03ms 
iter 10245: loss 2.5851, time 5265.00ms 
iter 10246: loss 2.4188, time 5238.64ms 
iter 10247: loss 2.1771, time 5259.45ms 
iter 10248: loss 2.4224, time 5274.78ms 
iter 10249: loss 2.5394, time 5281.49ms 
step 10250: train loss 2.4362, val loss 2.8671
iter 10250: loss 2.6031, time 19934.58ms 
iter 10251: loss 2.5883, time 5334.38ms 
iter 10252: loss 2.4593, time 5311.67ms 
iter 10253: loss 2.3552, time 5254.71ms 
iter 10254: loss 2.4688, time 5259.89ms 
iter 10255: loss 2.4782, time 5265.26ms 
iter 10256: loss 2.1900, time 5255.70ms 
iter 10257: loss 2.2977, time 5257.71ms 
iter 10258: loss 2.5559, time 5259.49ms 
iter 10259: loss 2.3407, time 5268.71ms 
iter 10260: loss 2.3689, time 5256.51ms 
iter 10261: loss 2.6569, time 5258.49ms 
iter 10262: loss 2.3649, time 5265.72ms 
iter 10263: loss 2.2529, time 5263.55ms 
iter 10264: loss 2.3014, time 5267.05ms 
iter 10265: loss 2.3558, time 5252.12ms 
iter 10266: loss 2.3877, time 5277.39ms 
iter 10267: loss 2.5652, time 5283.35ms 
iter 10268: loss 2.5537, time 5270.51ms 
iter 10269: loss 2.4736, time 5286.21ms 
iter 10270: loss 2.4713, time 5262.13ms 
iter 10271: loss 2.6148, time 5284.90ms 
iter 10272: loss 2.4266, time 5302.86ms 
iter 10273: loss 2.3505, time 5346.71ms 
iter 10274: loss 2.5195, time 5343.76ms 
iter 10275: loss 2.3691, time 5334.32ms 
iter 10276: loss 2.5804, time 5289.25ms 
iter 10277: loss 2.1929, time 5267.39ms 
iter 10278: loss 2.6999, time 5268.82ms 
iter 10279: loss 2.5173, time 5269.00ms 
iter 10280: loss 2.1656, time 5265.13ms 
iter 10281: loss 2.4496, time 5268.33ms 
iter 10282: loss 2.6507, time 5274.51ms 
iter 10283: loss 2.4868, time 5268.70ms 
iter 10284: loss 2.5986, time 5264.10ms 
iter 10285: loss 2.3708, time 5269.64ms 
iter 10286: loss 2.5221, time 5266.41ms 
iter 10287: loss 2.3456, time 5258.49ms 
iter 10288: loss 2.5991, time 5257.83ms 
iter 10289: loss 2.4486, time 5273.31ms 
iter 10290: loss 2.3562, time 5271.20ms 
iter 10291: loss 2.4621, time 5262.39ms 
iter 10292: loss 2.5164, time 5274.19ms 
iter 10293: loss 2.3706, time 5277.73ms 
iter 10294: loss 2.5236, time 5344.82ms 
iter 10295: loss 2.4367, time 5279.79ms 
iter 10296: loss 2.4005, time 5278.17ms 
iter 10297: loss 2.4447, time 5258.20ms 
iter 10298: loss 2.5557, time 5259.39ms 
iter 10299: loss 2.5228, time 5263.82ms 
step 10300: train loss 2.4528, val loss 2.8625
iter 10300: loss 2.6272, time 20050.63ms 
iter 10301: loss 2.3437, time 5256.86ms 
iter 10302: loss 2.5875, time 5269.06ms 
iter 10303: loss 2.2144, time 5267.55ms 
iter 10304: loss 2.5098, time 5261.89ms 
iter 10305: loss 2.5188, time 5261.09ms 
iter 10306: loss 2.4223, time 5273.73ms 
iter 10307: loss 2.5652, time 5257.65ms 
iter 10308: loss 2.0352, time 5257.74ms 
iter 10309: loss 2.6219, time 5279.06ms 
iter 10310: loss 2.5257, time 5263.50ms 
iter 10311: loss 2.4597, time 5260.01ms 
iter 10312: loss 2.2560, time 5267.00ms 
iter 10313: loss 2.4912, time 5267.15ms 
iter 10314: loss 2.5040, time 5260.57ms 
iter 10315: loss 2.3744, time 5263.69ms 
iter 10316: loss 2.4587, time 5268.71ms 
iter 10317: loss 2.5851, time 5261.47ms 
iter 10318: loss 2.1806, time 5260.60ms 
iter 10319: loss 2.6024, time 5274.53ms 
iter 10320: loss 2.3655, time 5266.91ms 
iter 10321: loss 2.4597, time 5277.06ms 
iter 10322: loss 2.5729, time 5275.69ms 
iter 10323: loss 2.3132, time 5276.37ms 
iter 10324: loss 2.3352, time 5279.60ms 
iter 10325: loss 2.5108, time 5276.00ms 
iter 10326: loss 2.5067, time 5276.24ms 
iter 10327: loss 2.4864, time 5261.51ms 
iter 10328: loss 2.5878, time 5258.49ms 
iter 10329: loss 2.3176, time 5278.76ms 
iter 10330: loss 2.6043, time 5268.93ms 
iter 10331: loss 2.4986, time 5265.06ms 
iter 10332: loss 2.5227, time 5269.17ms 
iter 10333: loss 2.3713, time 5262.51ms 
iter 10334: loss 2.5407, time 5258.23ms 
iter 10335: loss 2.6846, time 5262.01ms 
iter 10336: loss 2.4821, time 5274.91ms 
iter 10337: loss 2.5699, time 5256.25ms 
iter 10338: loss 2.1912, time 5260.84ms 
iter 10339: loss 2.5216, time 5283.72ms 
iter 10340: loss 2.0534, time 5262.29ms 
iter 10341: loss 2.6600, time 5261.33ms 
iter 10342: loss 2.6382, time 5292.38ms 
iter 10343: loss 2.4278, time 5222.15ms 
iter 10344: loss 2.3701, time 5263.39ms 
iter 10345: loss 2.0952, time 5268.21ms 
iter 10346: loss 2.4681, time 5270.51ms 
iter 10347: loss 2.3550, time 5259.71ms 
iter 10348: loss 2.5482, time 5241.00ms 
iter 10349: loss 2.5094, time 5271.57ms 
step 10350: train loss 2.4491, val loss 2.8562
iter 10350: loss 2.4148, time 20068.08ms 
iter 10351: loss 2.5585, time 5258.53ms 
iter 10352: loss 2.3426, time 5273.47ms 
iter 10353: loss 2.5144, time 5258.51ms 
iter 10354: loss 2.4569, time 5258.10ms 
iter 10355: loss 2.4691, time 5269.39ms 
iter 10356: loss 2.6523, time 5260.19ms 
iter 10357: loss 2.6633, time 5260.69ms 
iter 10358: loss 2.5515, time 5273.92ms 
iter 10359: loss 2.4407, time 5273.16ms 
iter 10360: loss 2.5154, time 5247.36ms 
iter 10361: loss 2.3436, time 5269.82ms 
iter 10362: loss 2.5625, time 5257.03ms 
iter 10363: loss 2.3917, time 5261.17ms 
iter 10364: loss 2.4834, time 5267.17ms 
iter 10365: loss 2.5929, time 5276.18ms 
iter 10366: loss 2.5502, time 5260.68ms 
iter 10367: loss 2.4959, time 5262.86ms 
iter 10368: loss 2.4784, time 5262.40ms 
iter 10369: loss 2.4314, time 5264.20ms 
iter 10370: loss 2.4313, time 5256.73ms 
iter 10371: loss 2.5890, time 5254.51ms 
iter 10372: loss 2.5089, time 5269.09ms 
iter 10373: loss 2.5985, time 5267.92ms 
iter 10374: loss 2.5127, time 5267.19ms 
iter 10375: loss 2.4281, time 5303.00ms 
iter 10376: loss 2.5110, time 5285.00ms 
iter 10377: loss 2.4887, time 5272.30ms 
iter 10378: loss 2.4243, time 5309.99ms 
iter 10379: loss 2.3123, time 5271.12ms 
iter 10380: loss 2.5864, time 5252.97ms 
iter 10381: loss 2.4070, time 5258.33ms 
iter 10382: loss 2.3953, time 5259.93ms 
iter 10383: loss 2.6823, time 5269.23ms 
iter 10384: loss 2.5002, time 5291.48ms 
iter 10385: loss 2.5287, time 5121.84ms 
iter 10386: loss 2.4283, time 5265.41ms 
iter 10387: loss 2.5036, time 5268.48ms 
iter 10388: loss 2.3905, time 5266.28ms 
iter 10389: loss 2.4040, time 5269.26ms 
iter 10390: loss 2.3559, time 5280.65ms 
iter 10391: loss 2.4008, time 5269.10ms 
iter 10392: loss 2.3823, time 5263.81ms 
iter 10393: loss 2.2389, time 5287.04ms 
iter 10394: loss 2.5095, time 5268.98ms 
iter 10395: loss 2.0773, time 5269.31ms 
iter 10396: loss 2.3216, time 5274.45ms 
iter 10397: loss 2.3734, time 5270.76ms 
iter 10398: loss 2.2703, time 5270.79ms 
iter 10399: loss 2.5039, time 5281.27ms 
step 10400: train loss 2.4498, val loss 2.8535
iter 10400: loss 2.2886, time 20045.67ms 
iter 10401: loss 2.4606, time 5261.75ms 
iter 10402: loss 2.6307, time 5307.28ms 
iter 10403: loss 2.4539, time 5340.85ms 
iter 10404: loss 2.5435, time 5309.40ms 
iter 10405: loss 2.1740, time 5259.85ms 
iter 10406: loss 2.4476, time 5275.28ms 
iter 10407: loss 2.3408, time 5260.73ms 
iter 10408: loss 2.5782, time 5265.65ms 
iter 10409: loss 2.4846, time 5279.79ms 
iter 10410: loss 2.3966, time 5269.59ms 
iter 10411: loss 2.5189, time 5263.79ms 
iter 10412: loss 2.2502, time 5265.37ms 
iter 10413: loss 2.7037, time 5265.89ms 
iter 10414: loss 2.2055, time 5251.80ms 
iter 10415: loss 2.5054, time 5263.65ms 
iter 10416: loss 2.4834, time 5275.44ms 
iter 10417: loss 2.4469, time 5259.43ms 
iter 10418: loss 2.0750, time 5258.18ms 
iter 10419: loss 2.4595, time 5279.03ms 
iter 10420: loss 2.4303, time 5265.68ms 
iter 10421: loss 2.2441, time 5270.62ms 
iter 10422: loss 2.4249, time 5276.48ms 
iter 10423: loss 2.3486, time 5274.16ms 
iter 10424: loss 2.4038, time 5266.34ms 
iter 10425: loss 2.3747, time 5272.56ms 
iter 10426: loss 2.3998, time 5244.68ms 
iter 10427: loss 2.5228, time 5284.72ms 
iter 10428: loss 2.3484, time 5295.96ms 
iter 10429: loss 2.5066, time 5275.33ms 
iter 10430: loss 2.5306, time 5264.70ms 
iter 10431: loss 2.4496, time 5260.08ms 
iter 10432: loss 2.4508, time 5272.24ms 
iter 10433: loss 2.5862, time 5274.44ms 
iter 10434: loss 2.2588, time 5269.55ms 
iter 10435: loss 2.7264, time 5267.24ms 
iter 10436: loss 2.4516, time 5274.89ms 
iter 10437: loss 2.3711, time 5260.34ms 
iter 10438: loss 2.4916, time 5266.63ms 
iter 10439: loss 2.5207, time 5266.29ms 
iter 10440: loss 2.4608, time 5262.31ms 
iter 10441: loss 2.4748, time 5261.06ms 
iter 10442: loss 2.4850, time 5274.80ms 
iter 10443: loss 2.2034, time 5270.95ms 
iter 10444: loss 2.4070, time 5257.66ms 
iter 10445: loss 2.4736, time 5258.75ms 
iter 10446: loss 2.4411, time 5278.96ms 
iter 10447: loss 2.3586, time 5260.97ms 
iter 10448: loss 2.4975, time 5305.67ms 
iter 10449: loss 2.4108, time 5280.99ms 
step 10450: train loss 2.4467, val loss 2.8624
iter 10450: loss 2.3403, time 20039.31ms 
iter 10451: loss 2.4951, time 5259.61ms 
iter 10452: loss 2.4045, time 5283.43ms 
iter 10453: loss 2.3836, time 5274.04ms 
iter 10454: loss 2.5415, time 5262.96ms 
iter 10455: loss 2.4866, time 5263.06ms 
iter 10456: loss 2.4544, time 5267.77ms 
iter 10457: loss 2.4733, time 5262.79ms 
iter 10458: loss 2.5603, time 5266.32ms 
iter 10459: loss 2.6726, time 5280.36ms 
iter 10460: loss 2.5559, time 5258.81ms 
iter 10461: loss 2.2749, time 5263.94ms 
iter 10462: loss 2.4008, time 5285.60ms 
iter 10463: loss 2.3415, time 5220.12ms 
iter 10464: loss 2.4471, time 5261.31ms 
iter 10465: loss 2.3434, time 5296.87ms 
iter 10466: loss 2.2423, time 5337.67ms 
iter 10467: loss 2.3036, time 5310.46ms 
iter 10468: loss 2.3097, time 5282.72ms 
iter 10469: loss 2.4724, time 5274.32ms 
iter 10470: loss 2.5561, time 5253.16ms 
iter 10471: loss 2.4157, time 5265.31ms 
iter 10472: loss 2.5473, time 5283.34ms 
iter 10473: loss 2.6550, time 5260.54ms 
iter 10474: loss 2.1796, time 5263.98ms 
iter 10475: loss 2.3594, time 5256.83ms 
iter 10476: loss 2.6817, time 5261.45ms 
iter 10477: loss 2.4103, time 5258.23ms 
iter 10478: loss 2.4254, time 5271.92ms 
iter 10479: loss 2.6288, time 5258.01ms 
iter 10480: loss 2.7525, time 5231.38ms 
iter 10481: loss 2.4032, time 5272.55ms 
iter 10482: loss 2.3476, time 5260.66ms 
iter 10483: loss 2.3143, time 5261.19ms 
iter 10484: loss 2.4174, time 5270.86ms 
iter 10485: loss 2.3777, time 5267.63ms 
iter 10486: loss 2.5865, time 5260.23ms 
iter 10487: loss 2.7344, time 5274.93ms 
iter 10488: loss 2.4189, time 5280.21ms 
iter 10489: loss 2.4766, time 5274.08ms 
iter 10490: loss 2.5211, time 5267.43ms 
iter 10491: loss 2.6743, time 5281.85ms 
iter 10492: loss 2.6574, time 5260.02ms 
iter 10493: loss 2.3290, time 5259.45ms 
iter 10494: loss 2.4431, time 5271.23ms 
iter 10495: loss 2.3274, time 5261.93ms 
iter 10496: loss 2.2293, time 5261.56ms 
iter 10497: loss 2.3670, time 5244.39ms 
iter 10498: loss 2.3269, time 5266.93ms 
iter 10499: loss 2.7018, time 5256.39ms 
step 10500: train loss 2.4519, val loss 2.8605
iter 10500: loss 2.5344, time 20085.09ms 
iter 10501: loss 2.7593, time 5260.54ms 
iter 10502: loss 2.3692, time 5260.31ms 
iter 10503: loss 2.4953, time 5268.81ms 
iter 10504: loss 2.4512, time 5265.39ms 
iter 10505: loss 2.4420, time 5262.96ms 
iter 10506: loss 2.5817, time 5259.48ms 
iter 10507: loss 2.4251, time 5275.13ms 
iter 10508: loss 2.5414, time 5260.31ms 
iter 10509: loss 2.4675, time 5239.26ms 
iter 10510: loss 2.2910, time 5279.66ms 
iter 10511: loss 2.4117, time 5265.67ms 
iter 10512: loss 2.5060, time 5275.39ms 
iter 10513: loss 2.4765, time 5277.20ms 
iter 10514: loss 2.2318, time 5265.66ms 
iter 10515: loss 2.4877, time 5263.25ms 
iter 10516: loss 2.5540, time 5261.57ms 
iter 10517: loss 2.2430, time 5273.99ms 
iter 10518: loss 2.6197, time 5261.62ms 
iter 10519: loss 2.3960, time 5266.02ms 
iter 10520: loss 2.2404, time 5274.66ms 
iter 10521: loss 2.2479, time 5265.83ms 
iter 10522: loss 2.3480, time 5256.45ms 
iter 10523: loss 2.4622, time 5259.67ms 
iter 10524: loss 2.6200, time 5270.38ms 
iter 10525: loss 2.3162, time 5261.74ms 
iter 10526: loss 2.6349, time 5268.44ms 
iter 10527: loss 2.3555, time 5286.83ms 
iter 10528: loss 2.4265, time 5272.91ms 
iter 10529: loss 2.5656, time 5271.78ms 
iter 10530: loss 2.2523, time 5260.94ms 
iter 10531: loss 2.3300, time 5280.80ms 
iter 10532: loss 2.4544, time 5265.60ms 
iter 10533: loss 2.5203, time 5269.92ms 
iter 10534: loss 2.5451, time 5267.90ms 
iter 10535: loss 2.5080, time 5238.71ms 
iter 10536: loss 2.4881, time 5259.42ms 
iter 10537: loss 2.2333, time 5260.70ms 
iter 10538: loss 2.2014, time 5288.39ms 
iter 10539: loss 2.5433, time 5326.46ms 
iter 10540: loss 2.4608, time 5302.37ms 
iter 10541: loss 2.5410, time 5289.14ms 
iter 10542: loss 2.3723, time 5284.20ms 
iter 10543: loss 2.4599, time 5273.68ms 
iter 10544: loss 2.6573, time 5274.22ms 
iter 10545: loss 2.1693, time 5269.22ms 
iter 10546: loss 2.3326, time 5266.71ms 
iter 10547: loss 2.5778, time 5280.72ms 
iter 10548: loss 2.4777, time 5260.17ms 
iter 10549: loss 2.6038, time 5263.55ms 
step 10550: train loss 2.4579, val loss 2.8830
iter 10550: loss 2.4296, time 20080.94ms 
iter 10551: loss 2.3880, time 5280.37ms 
iter 10552: loss 2.4949, time 5313.22ms 
iter 10553: loss 2.2266, time 5310.92ms 
iter 10554: loss 2.3734, time 5313.23ms 
iter 10555: loss 2.3736, time 5327.05ms 
iter 10556: loss 2.3623, time 5315.34ms 
iter 10557: loss 2.4060, time 5305.85ms 
iter 10558: loss 2.5656, time 5309.03ms 
iter 10559: loss 2.5524, time 5292.77ms 
iter 10560: loss 2.3548, time 5317.55ms 
iter 10561: loss 2.4168, time 5273.16ms 
iter 10562: loss 2.3300, time 5266.72ms 
iter 10563: loss 2.5349, time 5303.90ms 
iter 10564: loss 2.4700, time 5272.02ms 
iter 10565: loss 2.5536, time 5271.42ms 
iter 10566: loss 2.5228, time 5267.10ms 
iter 10567: loss 2.2929, time 5265.20ms 
iter 10568: loss 2.6132, time 5325.60ms 
iter 10569: loss 2.3070, time 5325.08ms 
iter 10570: loss 2.5473, time 5333.27ms 
iter 10571: loss 2.3150, time 5311.86ms 
iter 10572: loss 2.3940, time 5260.47ms 
iter 10573: loss 2.6349, time 5276.00ms 
iter 10574: loss 2.2628, time 5277.54ms 
iter 10575: loss 2.4774, time 5314.39ms 
iter 10576: loss 2.3518, time 5313.63ms 
iter 10577: loss 2.4689, time 5304.28ms 
iter 10578: loss 2.3260, time 5277.09ms 
iter 10579: loss 2.4013, time 5266.71ms 
iter 10580: loss 2.5698, time 5236.89ms 
iter 10581: loss 2.2917, time 5261.49ms 
iter 10582: loss 2.6353, time 5259.79ms 
iter 10583: loss 2.2214, time 5276.25ms 
iter 10584: loss 2.4834, time 5261.01ms 
iter 10585: loss 2.4984, time 5255.97ms 
iter 10586: loss 2.4946, time 5233.05ms 
iter 10587: loss 2.2934, time 5262.20ms 
iter 10588: loss 2.2673, time 5259.40ms 
iter 10589: loss 2.5115, time 5272.01ms 
iter 10590: loss 2.3473, time 5243.36ms 
iter 10591: loss 2.4460, time 5261.64ms 
iter 10592: loss 2.5654, time 5233.25ms 
iter 10593: loss 2.3512, time 5344.94ms 
iter 10594: loss 2.4823, time 5261.27ms 
iter 10595: loss 2.6282, time 5255.94ms 
iter 10596: loss 2.3780, time 5288.22ms 
iter 10597: loss 2.4494, time 5268.04ms 
iter 10598: loss 2.5941, time 5320.41ms 
iter 10599: loss 2.4454, time 5295.93ms 
step 10600: train loss 2.4313, val loss 2.8480
iter 10600: loss 2.6378, time 20074.01ms 
iter 10601: loss 2.4376, time 5268.85ms 
iter 10602: loss 2.3275, time 5310.69ms 
iter 10603: loss 2.4554, time 5267.61ms 
iter 10604: loss 2.4610, time 5274.82ms 
iter 10605: loss 2.3987, time 5302.73ms 
iter 10606: loss 2.5107, time 5299.90ms 
iter 10607: loss 2.5939, time 5266.38ms 
iter 10608: loss 2.5417, time 5279.97ms 
iter 10609: loss 2.3752, time 5246.27ms 
iter 10610: loss 2.0984, time 5351.27ms 
iter 10611: loss 2.5922, time 5313.54ms 
iter 10612: loss 2.3825, time 5273.78ms 
iter 10613: loss 2.3952, time 5257.23ms 
iter 10614: loss 2.5063, time 5257.25ms 
iter 10615: loss 2.3539, time 5275.80ms 
iter 10616: loss 2.5409, time 5261.29ms 
iter 10617: loss 2.6861, time 5261.01ms 
iter 10618: loss 2.3729, time 5321.90ms 
iter 10619: loss 2.6089, time 5343.37ms 
iter 10620: loss 2.3678, time 5322.81ms 
iter 10621: loss 2.4473, time 5342.23ms 
iter 10622: loss 2.2986, time 5287.70ms 
iter 10623: loss 2.5335, time 5267.71ms 
iter 10624: loss 2.5130, time 5271.73ms 
iter 10625: loss 2.5843, time 5270.16ms 
iter 10626: loss 2.5820, time 5267.94ms 
iter 10627: loss 2.3565, time 5277.59ms 
iter 10628: loss 2.4886, time 5295.67ms 
iter 10629: loss 2.5186, time 5270.08ms 
iter 10630: loss 2.2493, time 5331.51ms 
iter 10631: loss 2.3060, time 5338.74ms 
iter 10632: loss 2.2289, time 5300.25ms 
iter 10633: loss 2.2340, time 5305.86ms 
iter 10634: loss 2.2557, time 5278.85ms 
iter 10635: loss 2.3779, time 5277.82ms 
iter 10636: loss 2.5622, time 5255.10ms 
iter 10637: loss 2.4453, time 5272.89ms 
iter 10638: loss 2.7876, time 5332.31ms 
iter 10639: loss 2.3923, time 5340.77ms 
iter 10640: loss 2.4775, time 5323.71ms 
iter 10641: loss 2.5268, time 5285.38ms 
iter 10642: loss 2.5146, time 5305.34ms 
iter 10643: loss 2.4955, time 5269.02ms 
iter 10644: loss 2.5358, time 5282.07ms 
iter 10645: loss 2.4849, time 5270.18ms 
iter 10646: loss 2.2331, time 5270.44ms 
iter 10647: loss 2.3907, time 5276.20ms 
iter 10648: loss 2.4763, time 5247.54ms 
iter 10649: loss 2.2184, time 5263.19ms 
