tokens per iteration will be: 491,520
Initializing a new model from scratch
config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.75,
    2.0,
    2.25
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    4,
    4,
    4,
    4,
    5,
    5
  ],
  "num_query_heads": [
    10,
    12,
    12,
    14,
    16,
    18,
    18,
    20
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.5,
    1.75,
    2.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 954,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    5
  ],
  "num_query_heads": [
    6,
    6,
    6,
    6,
    8,
    8,
    8,
    10
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

num decayed parameter tensors: 33, with 87,875,802 parameters
num non-decayed parameter tensors: 33, with 17,242 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
number of parameters: 87.89M
number of transformer parameters: 39.95M
step 0: train loss 11.5433, val loss 11.5212
iter 0: loss 11.6493, time 53942.03ms 
iter 1: loss 11.6434, time 5200.26ms 
iter 2: loss 11.5153, time 5255.28ms 
iter 3: loss 11.2922, time 5252.08ms 
iter 4: loss 11.2060, time 5258.22ms 
iter 5: loss 10.7959, time 5252.12ms 
iter 6: loss 10.3452, time 5325.75ms 
iter 7: loss 9.8849, time 5310.71ms 
iter 8: loss 9.4886, time 5331.64ms 
iter 9: loss 8.6256, time 5307.33ms 
iter 10: loss 8.2251, time 5295.91ms 
iter 11: loss 7.6794, time 5258.15ms 
iter 12: loss 7.5529, time 5252.91ms 
iter 13: loss 7.2108, time 5269.53ms 
iter 14: loss 7.2674, time 5325.13ms 
iter 15: loss 7.0501, time 5270.23ms 
iter 16: loss 7.2781, time 5279.94ms 
iter 17: loss 7.1987, time 5265.15ms 
iter 18: loss 7.1800, time 5291.56ms 
iter 19: loss 6.9245, time 5243.67ms 
iter 20: loss 6.9672, time 5267.19ms 
iter 21: loss 6.9656, time 5246.54ms 
iter 22: loss 6.8791, time 5254.75ms 
iter 23: loss 6.7974, time 5248.59ms 
iter 24: loss 6.6137, time 5248.61ms 
iter 25: loss 6.7296, time 5246.85ms 
iter 26: loss 6.4877, time 5257.39ms 
iter 27: loss 6.4148, time 5268.69ms 
iter 28: loss 6.3650, time 5250.14ms 
iter 29: loss 6.4998, time 5262.75ms 
iter 30: loss 6.2816, time 5326.28ms 
iter 31: loss 6.8570, time 5270.50ms 
iter 32: loss 6.2473, time 5241.46ms 
iter 33: loss 6.2595, time 5252.98ms 
iter 34: loss 6.2433, time 5245.75ms 
iter 35: loss 6.3435, time 5246.99ms 
iter 36: loss 6.6845, time 5245.77ms 
iter 37: loss 6.1741, time 5250.85ms 
iter 38: loss 6.4528, time 5289.36ms 
iter 39: loss 6.1453, time 5300.10ms 
iter 40: loss 6.1089, time 5247.69ms 
iter 41: loss 6.3420, time 5287.11ms 
iter 42: loss 6.0136, time 5251.18ms 
iter 43: loss 6.3047, time 5265.06ms 
iter 44: loss 5.9039, time 5292.63ms 
iter 45: loss 5.6820, time 5289.68ms 
iter 46: loss 5.9598, time 5332.36ms 
iter 47: loss 5.9458, time 5327.40ms 
iter 48: loss 6.0673, time 5281.75ms 
iter 49: loss 6.1324, time 5325.89ms 
step 50: train loss 5.9398, val loss 5.8811
iter 50: loss 5.9663, time 19986.98ms 
iter 51: loss 5.8749, time 5248.74ms 
iter 52: loss 5.8933, time 5251.83ms 
iter 53: loss 5.9416, time 5219.72ms 
iter 54: loss 5.5721, time 5244.78ms 
iter 55: loss 5.6418, time 5248.80ms 
iter 56: loss 5.6214, time 5248.81ms 
iter 57: loss 5.5754, time 5252.41ms 
iter 58: loss 6.0307, time 5252.82ms 
iter 59: loss 5.7026, time 5252.23ms 
iter 60: loss 5.9001, time 5252.51ms 
iter 61: loss 5.9611, time 5324.21ms 
iter 62: loss 5.8298, time 5293.44ms 
iter 63: loss 5.7934, time 5242.98ms 
iter 64: loss 5.9780, time 5243.37ms 
iter 65: loss 5.5367, time 5249.22ms 
iter 66: loss 5.5431, time 5253.44ms 
iter 67: loss 5.8943, time 5254.52ms 
iter 68: loss 5.3925, time 5251.75ms 
iter 69: loss 5.7499, time 5243.32ms 
iter 70: loss 5.7528, time 5251.36ms 
iter 71: loss 5.5912, time 5249.92ms 
iter 72: loss 5.4860, time 5247.21ms 
iter 73: loss 5.6646, time 5243.39ms 
iter 74: loss 5.3684, time 5261.25ms 
iter 75: loss 5.2993, time 5216.81ms 
iter 76: loss 5.4098, time 5287.14ms 
iter 77: loss 5.3674, time 5249.03ms 
iter 78: loss 5.2025, time 5246.53ms 
iter 79: loss 5.4533, time 5248.30ms 
iter 80: loss 5.4203, time 5283.62ms 
iter 81: loss 5.1554, time 5247.67ms 
iter 82: loss 5.3225, time 5245.70ms 
iter 83: loss 5.5347, time 5248.43ms 
iter 84: loss 5.3376, time 5245.14ms 
iter 85: loss 5.2311, time 5249.94ms 
iter 86: loss 5.4027, time 5245.97ms 
iter 87: loss 5.3104, time 5249.00ms 
iter 88: loss 5.2289, time 5249.92ms 
iter 89: loss 5.1688, time 5249.21ms 
iter 90: loss 5.3086, time 5287.68ms 
iter 91: loss 5.2795, time 5277.64ms 
iter 92: loss 5.2239, time 5225.92ms 
iter 93: loss 5.1487, time 5242.22ms 
iter 94: loss 4.9920, time 5112.33ms 
iter 95: loss 5.0155, time 5195.42ms 
iter 96: loss 5.2197, time 5240.51ms 
iter 97: loss 5.3231, time 5249.19ms 
iter 98: loss 4.9684, time 5146.73ms 
iter 99: loss 5.0068, time 5235.33ms 
step 100: train loss 5.0625, val loss 5.0054
iter 100: loss 4.9722, time 19940.37ms 
iter 101: loss 5.3097, time 5201.56ms 
iter 102: loss 5.1145, time 5166.66ms 
iter 103: loss 5.1880, time 5211.37ms 
iter 104: loss 4.9469, time 5249.56ms 
iter 105: loss 4.7715, time 5243.29ms 
iter 106: loss 4.9514, time 5211.01ms 
iter 107: loss 4.7285, time 5240.21ms 
iter 108: loss 4.7022, time 5244.06ms 
iter 109: loss 4.6808, time 5251.56ms 
iter 110: loss 4.7527, time 5245.45ms 
iter 111: loss 4.8477, time 5254.61ms 
iter 112: loss 4.9648, time 5248.10ms 
iter 113: loss 4.8816, time 5257.67ms 
iter 114: loss 4.6850, time 5254.21ms 
iter 115: loss 4.7711, time 5243.10ms 
iter 116: loss 4.7717, time 5241.43ms 
iter 117: loss 4.6560, time 5243.44ms 
iter 118: loss 4.9156, time 5247.87ms 
iter 119: loss 5.0142, time 5243.55ms 
iter 120: loss 4.9317, time 5261.36ms 
iter 121: loss 4.6749, time 5270.06ms 
iter 122: loss 4.7141, time 5262.76ms 
iter 123: loss 4.8544, time 5239.15ms 
iter 124: loss 4.7399, time 5244.86ms 
iter 125: loss 4.7003, time 5252.94ms 
iter 126: loss 4.7273, time 5255.01ms 
iter 127: loss 4.5465, time 5265.82ms 
iter 128: loss 4.4682, time 5241.14ms 
iter 129: loss 4.5968, time 5255.66ms 
iter 130: loss 4.6619, time 5250.67ms 
iter 131: loss 4.5345, time 5253.16ms 
iter 132: loss 4.4568, time 5245.61ms 
iter 133: loss 4.5418, time 5256.42ms 
iter 134: loss 4.2782, time 5287.41ms 
iter 135: loss 4.6803, time 5255.93ms 
iter 136: loss 4.3710, time 5267.55ms 
iter 137: loss 4.5547, time 5290.17ms 
iter 138: loss 4.4704, time 5267.26ms 
iter 139: loss 4.4715, time 5253.49ms 
iter 140: loss 4.6723, time 5254.68ms 
iter 141: loss 4.3958, time 5250.79ms 
iter 142: loss 4.3428, time 5247.55ms 
iter 143: loss 4.3149, time 5246.02ms 
iter 144: loss 4.5205, time 5294.96ms 
iter 145: loss 4.3444, time 5021.72ms 
iter 146: loss 4.3022, time 5169.93ms 
iter 147: loss 4.2527, time 5246.65ms 
iter 148: loss 4.3819, time 5251.61ms 
iter 149: loss 4.4557, time 5249.84ms 
step 150: train loss 4.3735, val loss 4.2852
iter 150: loss 4.1976, time 19984.28ms 
iter 151: loss 4.2134, time 5246.22ms 
iter 152: loss 4.1107, time 5243.03ms 
iter 153: loss 4.1886, time 5244.61ms 
iter 154: loss 4.1305, time 5248.86ms 
iter 155: loss 4.2948, time 5246.16ms 
iter 156: loss 4.3275, time 5223.86ms 
iter 157: loss 4.2787, time 5247.58ms 
iter 158: loss 4.2816, time 5243.06ms 
iter 159: loss 4.4085, time 5243.18ms 
iter 160: loss 4.3606, time 5247.05ms 
iter 161: loss 4.1522, time 5245.71ms 
iter 162: loss 4.2566, time 5242.33ms 
iter 163: loss 4.5563, time 5254.80ms 
iter 164: loss 4.2247, time 5254.78ms 
iter 165: loss 4.1960, time 5248.70ms 
iter 166: loss 4.1924, time 5246.18ms 
iter 167: loss 4.1703, time 5251.63ms 
iter 168: loss 4.2380, time 5245.18ms 
iter 169: loss 4.2042, time 5247.52ms 
iter 170: loss 4.4379, time 5255.87ms 
iter 171: loss 4.0130, time 5250.31ms 
iter 172: loss 4.2938, time 5242.67ms 
iter 173: loss 4.1044, time 5253.02ms 
iter 174: loss 4.1287, time 5245.44ms 
iter 175: loss 4.2104, time 5273.26ms 
iter 176: loss 4.1216, time 5282.61ms 
iter 177: loss 4.3511, time 5241.94ms 
iter 178: loss 4.1709, time 5243.25ms 
iter 179: loss 4.0454, time 5249.81ms 
iter 180: loss 4.3864, time 5262.87ms 
iter 181: loss 4.2936, time 5247.20ms 
iter 182: loss 4.2726, time 5244.29ms 
iter 183: loss 4.2245, time 5278.55ms 
iter 184: loss 4.0465, time 5251.93ms 
iter 185: loss 3.9607, time 5258.05ms 
iter 186: loss 4.2054, time 5234.24ms 
iter 187: loss 4.0272, time 5260.15ms 
iter 188: loss 4.0509, time 5246.54ms 
iter 189: loss 4.0250, time 5262.22ms 
iter 190: loss 3.9058, time 5265.64ms 
iter 191: loss 4.0060, time 5270.35ms 
iter 192: loss 4.0530, time 5266.60ms 
iter 193: loss 4.0988, time 5269.08ms 
iter 194: loss 3.8113, time 5252.66ms 
iter 195: loss 3.9937, time 5247.39ms 
iter 196: loss 3.9081, time 5325.45ms 
iter 197: loss 4.0016, time 5318.03ms 
iter 198: loss 3.8462, time 5323.30ms 
iter 199: loss 3.9068, time 5242.65ms 
step 200: train loss 4.0466, val loss 3.9652
iter 200: loss 3.8426, time 20013.38ms 
iter 201: loss 3.7625, time 5258.48ms 
iter 202: loss 3.9311, time 5277.80ms 
iter 203: loss 3.9632, time 5256.26ms 
iter 204: loss 3.9033, time 5256.64ms 
iter 205: loss 3.8628, time 5242.15ms 
iter 206: loss 4.1113, time 5261.22ms 
iter 207: loss 4.0762, time 5264.08ms 
iter 208: loss 3.9208, time 5278.90ms 
iter 209: loss 4.2424, time 5244.93ms 
iter 210: loss 4.1474, time 5238.61ms 
iter 211: loss 3.9831, time 5252.16ms 
iter 212: loss 3.9301, time 5245.18ms 
iter 213: loss 3.9501, time 5247.40ms 
iter 214: loss 4.0119, time 5248.65ms 
iter 215: loss 3.8266, time 5245.65ms 
iter 216: loss 4.0223, time 5255.93ms 
iter 217: loss 3.8784, time 5246.22ms 
iter 218: loss 3.8938, time 5241.63ms 
iter 219: loss 4.1094, time 5242.19ms 
iter 220: loss 4.0135, time 5248.91ms 
iter 221: loss 3.8546, time 5250.52ms 
iter 222: loss 3.9146, time 5246.73ms 
iter 223: loss 4.0419, time 5250.36ms 
iter 224: loss 4.0418, time 5246.94ms 
iter 225: loss 3.8873, time 5244.03ms 
iter 226: loss 4.1718, time 5244.17ms 
iter 227: loss 3.8907, time 5240.51ms 
iter 228: loss 4.0393, time 5252.14ms 
iter 229: loss 3.9718, time 5254.73ms 
iter 230: loss 3.9655, time 5240.58ms 
iter 231: loss 3.9520, time 5242.82ms 
iter 232: loss 3.7754, time 5244.21ms 
iter 233: loss 4.0804, time 5243.10ms 
iter 234: loss 4.0171, time 5241.01ms 
iter 235: loss 3.9572, time 5245.23ms 
iter 236: loss 4.0995, time 5244.60ms 
iter 237: loss 3.6391, time 5246.33ms 
iter 238: loss 3.9097, time 5251.98ms 
iter 239: loss 3.8137, time 5241.29ms 
iter 240: loss 3.9366, time 5246.61ms 
iter 241: loss 4.0940, time 5243.78ms 
iter 242: loss 4.0316, time 5246.31ms 
iter 243: loss 4.0057, time 5241.35ms 
iter 244: loss 3.8558, time 5246.76ms 
iter 245: loss 3.8277, time 5249.66ms 
iter 246: loss 4.0172, time 5239.26ms 
iter 247: loss 3.8573, time 5241.38ms 
iter 248: loss 4.0298, time 5249.70ms 
iter 249: loss 4.0629, time 5250.34ms 
step 250: train loss 3.8856, val loss 3.8361
iter 250: loss 3.7079, time 19919.72ms 
iter 251: loss 3.8310, time 5247.32ms 
iter 252: loss 3.8510, time 5249.03ms 
iter 253: loss 4.2946, time 5254.57ms 
iter 254: loss 3.9597, time 5251.18ms 
iter 255: loss 3.7503, time 5251.80ms 
iter 256: loss 4.0994, time 5257.58ms 
iter 257: loss 3.7201, time 5250.12ms 
iter 258: loss 3.8990, time 5245.96ms 
iter 259: loss 3.7620, time 5250.03ms 
iter 260: loss 3.7787, time 5245.48ms 
iter 261: loss 3.8567, time 5255.95ms 
iter 262: loss 4.1255, time 5262.00ms 
iter 263: loss 3.7573, time 5261.67ms 
iter 264: loss 3.7663, time 5253.00ms 
iter 265: loss 3.6552, time 5249.78ms 
iter 266: loss 3.7519, time 5250.15ms 
iter 267: loss 3.7858, time 5246.90ms 
iter 268: loss 3.7657, time 5248.43ms 
iter 269: loss 3.6923, time 5255.25ms 
iter 270: loss 3.7699, time 5246.39ms 
iter 271: loss 3.7616, time 5260.46ms 
iter 272: loss 3.8034, time 5252.34ms 
iter 273: loss 3.7830, time 5243.13ms 
iter 274: loss 3.7793, time 5259.29ms 
iter 275: loss 3.7597, time 5262.09ms 
iter 276: loss 3.9590, time 5258.08ms 
iter 277: loss 3.9256, time 5252.82ms 
iter 278: loss 3.9520, time 5282.57ms 
iter 279: loss 3.6776, time 5258.48ms 
iter 280: loss 3.7267, time 5248.81ms 
iter 281: loss 3.8669, time 5241.29ms 
iter 282: loss 4.0771, time 5265.85ms 
iter 283: loss 4.0928, time 5246.95ms 
iter 284: loss 3.6956, time 5253.69ms 
iter 285: loss 3.7767, time 5259.62ms 
iter 286: loss 3.8638, time 5254.48ms 
iter 287: loss 3.6785, time 5248.41ms 
iter 288: loss 3.7617, time 5251.60ms 
iter 289: loss 3.6463, time 5276.44ms 
iter 290: loss 3.8915, time 5296.13ms 
iter 291: loss 3.7671, time 5140.03ms 
iter 292: loss 3.7671, time 5170.83ms 
iter 293: loss 3.7691, time 5241.86ms 
iter 294: loss 4.0211, time 5242.29ms 
iter 295: loss 3.8263, time 5255.16ms 
iter 296: loss 3.9683, time 5246.28ms 
iter 297: loss 3.8368, time 5251.18ms 
iter 298: loss 3.9031, time 5138.94ms 
iter 299: loss 3.7825, time 5134.11ms 
step 300: train loss 3.7694, val loss 3.7445
iter 300: loss 3.7631, time 19900.79ms 
iter 301: loss 3.8388, time 5106.04ms 
iter 302: loss 3.8740, time 5144.39ms 
iter 303: loss 3.9285, time 5248.14ms 
iter 304: loss 3.6325, time 5256.79ms 
iter 305: loss 3.6483, time 5244.89ms 
iter 306: loss 3.6071, time 5230.15ms 
iter 307: loss 3.7150, time 5260.69ms 
iter 308: loss 3.7199, time 5322.40ms 
iter 309: loss 3.5671, time 5253.17ms 
iter 310: loss 3.9822, time 5309.34ms 
iter 311: loss 3.8702, time 5257.63ms 
iter 312: loss 3.9906, time 5225.68ms 
iter 313: loss 3.7930, time 5251.43ms 
iter 314: loss 3.8134, time 5257.69ms 
iter 315: loss 3.8339, time 5254.98ms 
iter 316: loss 3.9444, time 5250.60ms 
iter 317: loss 3.7170, time 5115.64ms 
iter 318: loss 4.0155, time 5208.29ms 
iter 319: loss 3.6695, time 5225.92ms 
iter 320: loss 3.8115, time 5257.85ms 
iter 321: loss 3.4985, time 5251.01ms 
iter 322: loss 3.5976, time 5251.46ms 
iter 323: loss 3.7376, time 5248.13ms 
iter 324: loss 3.6844, time 5246.57ms 
iter 325: loss 3.7355, time 5242.60ms 
iter 326: loss 3.6004, time 5232.99ms 
iter 327: loss 3.7107, time 5220.26ms 
iter 328: loss 3.6733, time 5241.60ms 
iter 329: loss 3.6355, time 5241.47ms 
iter 330: loss 3.5994, time 5226.88ms 
iter 331: loss 3.9525, time 5238.45ms 
iter 332: loss 3.6671, time 5245.58ms 
iter 333: loss 3.5313, time 5246.07ms 
iter 334: loss 3.6953, time 5225.11ms 
iter 335: loss 3.9793, time 5233.52ms 
iter 336: loss 3.5798, time 5127.57ms 
iter 337: loss 3.8999, time 5095.24ms 
iter 338: loss 3.6032, time 5069.27ms 
iter 339: loss 3.9979, time 5076.71ms 
iter 340: loss 3.8935, time 5070.82ms 
iter 341: loss 3.7158, time 5101.74ms 
iter 342: loss 3.7570, time 5109.28ms 
iter 343: loss 3.6501, time 5144.41ms 
iter 344: loss 3.7714, time 5131.84ms 
iter 345: loss 3.6529, time 5190.95ms 
iter 346: loss 3.7316, time 5115.56ms 
iter 347: loss 3.6802, time 5093.77ms 
iter 348: loss 3.8537, time 5090.16ms 
iter 349: loss 3.6392, time 5127.16ms 
step 350: train loss 3.7014, val loss 3.6632
iter 350: loss 3.8061, time 19958.64ms 
iter 351: loss 3.9125, time 5095.85ms 
iter 352: loss 3.6816, time 5115.83ms 
iter 353: loss 3.7123, time 5138.44ms 
iter 354: loss 3.8550, time 5072.86ms 
iter 355: loss 3.6732, time 5242.16ms 
iter 356: loss 3.7264, time 5299.77ms 
iter 357: loss 3.7454, time 5275.46ms 
iter 358: loss 3.6235, time 5324.43ms 
iter 359: loss 3.5638, time 5306.08ms 
iter 360: loss 3.5943, time 5279.23ms 
iter 361: loss 3.4595, time 5234.15ms 
iter 362: loss 3.8586, time 5254.23ms 
iter 363: loss 3.4767, time 5245.94ms 
iter 364: loss 3.6756, time 5254.74ms 
iter 365: loss 3.9796, time 5246.25ms 
iter 366: loss 3.9135, time 5224.49ms 
iter 367: loss 3.5812, time 5239.97ms 
iter 368: loss 3.7916, time 5243.57ms 
iter 369: loss 3.7949, time 5256.70ms 
iter 370: loss 3.6254, time 5249.05ms 
iter 371: loss 3.7253, time 5256.56ms 
iter 372: loss 3.8199, time 5246.98ms 
iter 373: loss 3.5750, time 5186.52ms 
iter 374: loss 3.7494, time 5172.03ms 
iter 375: loss 3.7353, time 5131.12ms 
iter 376: loss 3.6065, time 5205.71ms 
iter 377: loss 3.8006, time 5150.71ms 
iter 378: loss 3.5324, time 5198.77ms 
iter 379: loss 3.7386, time 5146.68ms 
iter 380: loss 3.7107, time 5177.16ms 
iter 381: loss 3.6563, time 5152.98ms 
iter 382: loss 3.6759, time 5125.49ms 
iter 383: loss 3.4969, time 5200.65ms 
iter 384: loss 3.6085, time 5223.15ms 
iter 385: loss 3.7530, time 5199.05ms 
iter 386: loss 3.6059, time 5210.22ms 
iter 387: loss 3.5871, time 5247.83ms 
iter 388: loss 3.5230, time 5252.89ms 
iter 389: loss 3.7712, time 5252.83ms 
iter 390: loss 3.6694, time 5244.88ms 
iter 391: loss 3.7990, time 5229.20ms 
iter 392: loss 3.6869, time 5289.11ms 
iter 393: loss 3.5971, time 5253.56ms 
iter 394: loss 3.6074, time 5246.40ms 
iter 395: loss 3.5299, time 5250.65ms 
iter 396: loss 3.5012, time 5244.74ms 
iter 397: loss 3.6853, time 5252.00ms 
iter 398: loss 3.6905, time 5266.73ms 
iter 399: loss 3.5158, time 5290.76ms 
step 400: train loss 3.6283, val loss 3.5888
iter 400: loss 3.6295, time 19985.86ms 
iter 401: loss 3.7907, time 5251.60ms 
iter 402: loss 3.6418, time 5247.19ms 
iter 403: loss 3.5489, time 5247.62ms 
iter 404: loss 3.5583, time 5275.10ms 
iter 405: loss 3.7423, time 5179.63ms 
iter 406: loss 3.7762, time 5266.35ms 
iter 407: loss 3.5774, time 5302.65ms 
iter 408: loss 3.5017, time 5246.09ms 
iter 409: loss 3.7430, time 5257.20ms 
iter 410: loss 3.4720, time 5263.16ms 
iter 411: loss 3.6432, time 5273.89ms 
iter 412: loss 3.5510, time 5313.02ms 
iter 413: loss 3.5689, time 5250.38ms 
iter 414: loss 3.5603, time 5233.15ms 
iter 415: loss 3.5344, time 5232.76ms 
iter 416: loss 3.5964, time 5235.21ms 
iter 417: loss 3.7810, time 5248.78ms 
iter 418: loss 3.7421, time 5264.86ms 
iter 419: loss 3.5396, time 5250.81ms 
iter 420: loss 3.4602, time 5176.46ms 
iter 421: loss 3.6392, time 5115.12ms 
iter 422: loss 3.5614, time 5232.72ms 
iter 423: loss 3.6271, time 5236.07ms 
iter 424: loss 3.6613, time 5244.85ms 
iter 425: loss 3.7032, time 5213.92ms 
iter 426: loss 3.4706, time 5193.59ms 
iter 427: loss 3.4338, time 5215.05ms 
iter 428: loss 3.5842, time 5234.21ms 
iter 429: loss 3.7082, time 5139.28ms 
iter 430: loss 3.4180, time 5203.58ms 
iter 431: loss 3.6402, time 5245.68ms 
iter 432: loss 3.5194, time 5236.77ms 
iter 433: loss 3.4622, time 5258.20ms 
iter 434: loss 3.7352, time 5251.65ms 
iter 435: loss 3.7272, time 5247.65ms 
iter 436: loss 3.6373, time 5238.18ms 
iter 437: loss 3.5619, time 5137.55ms 
iter 438: loss 3.4999, time 5250.36ms 
iter 439: loss 3.5959, time 5251.32ms 
iter 440: loss 3.4693, time 5305.09ms 
iter 441: loss 3.6193, time 5273.93ms 
iter 442: loss 3.4839, time 5278.12ms 
iter 443: loss 3.4421, time 5303.37ms 
iter 444: loss 3.7122, time 5263.34ms 
iter 445: loss 3.5621, time 5278.00ms 
iter 446: loss 3.4138, time 5297.41ms 
iter 447: loss 3.6773, time 5310.98ms 
iter 448: loss 3.2550, time 5287.15ms 
iter 449: loss 3.4091, time 5266.70ms 
step 450: train loss 3.5674, val loss 3.5368
iter 450: loss 3.5370, time 20062.21ms 
iter 451: loss 3.5450, time 5252.96ms 
iter 452: loss 3.6699, time 5252.04ms 
iter 453: loss 3.6459, time 5254.77ms 
iter 454: loss 3.5987, time 5253.99ms 
iter 455: loss 3.5105, time 5248.66ms 
iter 456: loss 3.2958, time 5251.44ms 
iter 457: loss 3.3944, time 5253.54ms 
iter 458: loss 3.5713, time 5260.08ms 
iter 459: loss 3.6524, time 5313.29ms 
iter 460: loss 3.6734, time 5298.85ms 
iter 461: loss 3.7220, time 5249.29ms 
iter 462: loss 3.5028, time 5319.93ms 
iter 463: loss 3.5457, time 5314.16ms 
iter 464: loss 3.8129, time 5266.87ms 
iter 465: loss 3.5381, time 5257.99ms 
iter 466: loss 3.5304, time 5249.84ms 
iter 467: loss 3.4997, time 5225.17ms 
iter 468: loss 3.5094, time 5253.15ms 
iter 469: loss 3.3634, time 5286.79ms 
iter 470: loss 3.5726, time 5273.96ms 
iter 471: loss 3.5504, time 5273.88ms 
iter 472: loss 3.4674, time 5277.04ms 
iter 473: loss 3.5642, time 5280.41ms 
iter 474: loss 3.7692, time 5273.56ms 
iter 475: loss 3.3693, time 5254.18ms 
iter 476: loss 3.5467, time 5249.72ms 
iter 477: loss 3.3733, time 5248.39ms 
iter 478: loss 3.4974, time 5247.89ms 
iter 479: loss 3.5220, time 5246.85ms 
iter 480: loss 3.3603, time 5246.57ms 
iter 481: loss 3.4460, time 5245.01ms 
iter 482: loss 3.3951, time 5236.59ms 
iter 483: loss 3.6733, time 5223.36ms 
iter 484: loss 3.6869, time 5245.85ms 
iter 485: loss 3.5369, time 5246.31ms 
iter 486: loss 3.5339, time 5253.13ms 
iter 487: loss 3.4249, time 5256.96ms 
iter 488: loss 3.5933, time 5245.74ms 
iter 489: loss 3.3009, time 5251.43ms 
iter 490: loss 3.5004, time 5252.87ms 
iter 491: loss 3.6371, time 5246.11ms 
iter 492: loss 3.3646, time 5252.21ms 
iter 493: loss 3.4013, time 5254.55ms 
iter 494: loss 3.7169, time 5254.62ms 
iter 495: loss 3.5186, time 5248.42ms 
iter 496: loss 3.2644, time 5247.03ms 
iter 497: loss 3.6587, time 5231.42ms 
iter 498: loss 3.6390, time 5217.38ms 
iter 499: loss 3.4501, time 5244.82ms 
step 500: train loss 3.5231, val loss 3.4996
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 500: loss 3.3221, time 21697.30ms 
iter 501: loss 3.6474, time 5264.11ms 
iter 502: loss 3.5165, time 5313.04ms 
iter 503: loss 3.4912, time 5275.29ms 
iter 504: loss 3.4308, time 5260.88ms 
iter 505: loss 3.4558, time 5257.15ms 
iter 506: loss 3.8934, time 5251.87ms 
iter 507: loss 3.5389, time 5286.01ms 
iter 508: loss 3.4009, time 5265.15ms 
iter 509: loss 3.4819, time 5266.10ms 
iter 510: loss 3.4430, time 5252.50ms 
iter 511: loss 3.5307, time 5250.65ms 
iter 512: loss 3.5229, time 5251.66ms 
iter 513: loss 3.4350, time 5323.33ms 
iter 514: loss 3.3914, time 5326.48ms 
iter 515: loss 3.2763, time 5273.30ms 
iter 516: loss 3.6128, time 5264.93ms 
iter 517: loss 3.3922, time 5259.07ms 
iter 518: loss 3.6979, time 5259.01ms 
iter 519: loss 3.6038, time 5262.49ms 
iter 520: loss 3.6528, time 5260.48ms 
iter 521: loss 3.4756, time 5266.84ms 
iter 522: loss 3.3855, time 5263.65ms 
iter 523: loss 3.5979, time 5259.42ms 
iter 524: loss 3.5118, time 5264.07ms 
iter 525: loss 3.2066, time 5260.44ms 
iter 526: loss 3.4406, time 5262.03ms 
iter 527: loss 3.4762, time 5262.63ms 
iter 528: loss 3.5135, time 5254.30ms 
iter 529: loss 3.3890, time 5252.58ms 
iter 530: loss 3.4011, time 5249.92ms 
iter 531: loss 3.4513, time 5295.38ms 
iter 532: loss 3.7199, time 5252.60ms 
iter 533: loss 3.4847, time 5276.31ms 
iter 534: loss 3.5364, time 5259.24ms 
iter 535: loss 3.7226, time 5299.91ms 
iter 536: loss 3.3256, time 5253.32ms 
iter 537: loss 3.3545, time 5250.90ms 
iter 538: loss 3.3269, time 5250.15ms 
iter 539: loss 3.5404, time 5255.70ms 
iter 540: loss 3.5110, time 5250.94ms 
iter 541: loss 3.4079, time 5254.04ms 
iter 542: loss 3.5626, time 5184.35ms 
iter 543: loss 3.3639, time 5256.14ms 
iter 544: loss 3.5166, time 5246.99ms 
iter 545: loss 3.5113, time 5256.77ms 
iter 546: loss 3.5190, time 5250.93ms 
iter 547: loss 3.4847, time 5252.06ms 
iter 548: loss 3.5495, time 5247.27ms 
iter 549: loss 3.3684, time 5258.62ms 
step 550: train loss 3.4589, val loss 3.4385
iter 550: loss 3.4068, time 20037.30ms 
iter 551: loss 3.3939, time 5268.32ms 
iter 552: loss 3.3170, time 5263.97ms 
iter 553: loss 3.3432, time 5263.25ms 
iter 554: loss 3.4341, time 5273.84ms 
iter 555: loss 3.3816, time 5248.92ms 
iter 556: loss 3.5462, time 5245.70ms 
iter 557: loss 3.4958, time 5297.49ms 
iter 558: loss 3.5706, time 5282.45ms 
iter 559: loss 3.4554, time 5253.28ms 
iter 560: loss 3.2159, time 5247.03ms 
iter 561: loss 3.3393, time 5260.61ms 
iter 562: loss 3.5151, time 5260.50ms 
iter 563: loss 3.4711, time 5262.07ms 
iter 564: loss 3.3530, time 5297.79ms 
iter 565: loss 3.4558, time 5302.64ms 
iter 566: loss 3.4113, time 5317.90ms 
iter 567: loss 3.4345, time 5280.59ms 
iter 568: loss 3.4733, time 5258.31ms 
iter 569: loss 3.3109, time 5288.42ms 
iter 570: loss 3.6428, time 5268.13ms 
iter 571: loss 3.3209, time 5263.41ms 
iter 572: loss 3.2663, time 5271.92ms 
iter 573: loss 3.4498, time 5255.21ms 
iter 574: loss 3.5393, time 5246.95ms 
iter 575: loss 3.3413, time 5251.84ms 
iter 576: loss 3.3497, time 5256.24ms 
iter 577: loss 3.3358, time 5253.98ms 
iter 578: loss 3.3950, time 5251.70ms 
iter 579: loss 3.5043, time 5311.03ms 
iter 580: loss 3.3350, time 5246.40ms 
iter 581: loss 3.5897, time 5252.58ms 
iter 582: loss 3.4787, time 5259.22ms 
iter 583: loss 3.3746, time 5254.17ms 
iter 584: loss 3.2721, time 5267.12ms 
iter 585: loss 3.5853, time 5262.30ms 
iter 586: loss 3.3581, time 5259.01ms 
iter 587: loss 3.3559, time 5266.17ms 
iter 588: loss 3.5367, time 5246.17ms 
iter 589: loss 3.2025, time 5261.30ms 
iter 590: loss 3.4708, time 5301.55ms 
iter 591: loss 3.4777, time 5250.48ms 
iter 592: loss 3.6120, time 5232.80ms 
iter 593: loss 3.5895, time 5254.86ms 
iter 594: loss 3.3850, time 5252.89ms 
iter 595: loss 3.4715, time 5254.13ms 
iter 596: loss 3.8525, time 5249.08ms 
iter 597: loss 3.4154, time 5248.00ms 
iter 598: loss 3.4125, time 5248.09ms 
iter 599: loss 3.3841, time 5246.12ms 
step 600: train loss 3.4403, val loss 3.3973
iter 600: loss 3.4455, time 20019.60ms 
iter 601: loss 3.3765, time 5249.65ms 
iter 602: loss 3.2392, time 5256.33ms 
iter 603: loss 3.3058, time 5251.48ms 
iter 604: loss 3.3686, time 5248.10ms 
iter 605: loss 3.2273, time 5255.23ms 
iter 606: loss 3.3218, time 5252.75ms 
iter 607: loss 3.4005, time 5255.69ms 
iter 608: loss 3.6553, time 5256.90ms 
iter 609: loss 3.3142, time 5257.76ms 
iter 610: loss 3.6480, time 5254.14ms 
iter 611: loss 3.4678, time 5264.87ms 
iter 612: loss 3.7347, time 5256.74ms 
iter 613: loss 3.4072, time 5259.39ms 
iter 614: loss 3.2650, time 5294.40ms 
iter 615: loss 3.5633, time 5306.00ms 
iter 616: loss 3.3484, time 5256.29ms 
iter 617: loss 3.4275, time 5237.94ms 
iter 618: loss 3.4789, time 5263.93ms 
iter 619: loss 3.2935, time 5260.56ms 
iter 620: loss 3.6275, time 5262.06ms 
iter 621: loss 3.3414, time 5259.10ms 
iter 622: loss 3.2051, time 5260.01ms 
iter 623: loss 3.3082, time 5258.13ms 
iter 624: loss 3.3158, time 5260.84ms 
iter 625: loss 3.3326, time 5270.29ms 
iter 626: loss 3.6012, time 5273.07ms 
iter 627: loss 3.4413, time 5254.64ms 
iter 628: loss 3.3673, time 5255.25ms 
iter 629: loss 3.2304, time 5294.79ms 
iter 630: loss 3.5296, time 5266.77ms 
iter 631: loss 3.5319, time 5255.16ms 
iter 632: loss 3.2145, time 5253.40ms 
iter 633: loss 3.3844, time 5253.66ms 
iter 634: loss 3.4178, time 5249.75ms 
iter 635: loss 3.5094, time 5250.36ms 
iter 636: loss 3.2435, time 5194.39ms 
iter 637: loss 3.4124, time 5249.80ms 
iter 638: loss 3.2205, time 5250.45ms 
iter 639: loss 3.3519, time 5255.09ms 
iter 640: loss 3.3408, time 5252.49ms 
iter 641: loss 3.5245, time 5249.49ms 
iter 642: loss 3.3872, time 5246.91ms 
iter 643: loss 3.3156, time 5232.38ms 
iter 644: loss 3.3441, time 5228.26ms 
iter 645: loss 3.3999, time 5238.42ms 
iter 646: loss 3.2629, time 5250.44ms 
iter 647: loss 3.3947, time 5260.46ms 
iter 648: loss 3.4477, time 5256.03ms 
iter 649: loss 3.3768, time 5245.84ms 
step 650: train loss 3.3662, val loss 3.3552
iter 650: loss 3.4121, time 20023.61ms 
iter 651: loss 3.2391, time 5260.09ms 
iter 652: loss 3.6114, time 5245.59ms 
iter 653: loss 3.3761, time 5248.43ms 
iter 654: loss 3.2584, time 5246.65ms 
iter 655: loss 3.3388, time 5248.17ms 
iter 656: loss 3.2292, time 5252.56ms 
iter 657: loss 3.3110, time 5246.36ms 
iter 658: loss 3.4799, time 5253.78ms 
iter 659: loss 3.3654, time 5261.04ms 
iter 660: loss 3.2997, time 5267.75ms 
iter 661: loss 3.3712, time 5267.05ms 
iter 662: loss 3.3327, time 5264.49ms 
iter 663: loss 3.3471, time 5264.27ms 
iter 664: loss 3.4725, time 5262.79ms 
iter 665: loss 3.3270, time 5255.02ms 
iter 666: loss 3.2676, time 5256.47ms 
iter 667: loss 3.3515, time 5270.51ms 
iter 668: loss 3.1791, time 5266.29ms 
iter 669: loss 3.4880, time 5254.65ms 
iter 670: loss 3.2974, time 5246.48ms 
iter 671: loss 3.1308, time 5249.97ms 
iter 672: loss 3.4823, time 5246.82ms 
iter 673: loss 3.1278, time 5246.30ms 
iter 674: loss 3.4005, time 5248.95ms 
iter 675: loss 3.5130, time 5249.50ms 
iter 676: loss 3.4066, time 5240.33ms 
iter 677: loss 3.4762, time 5232.50ms 
iter 678: loss 3.4965, time 5258.18ms 
iter 679: loss 3.3634, time 5253.69ms 
iter 680: loss 3.4365, time 5244.43ms 
iter 681: loss 3.2968, time 5259.56ms 
iter 682: loss 3.2930, time 5225.82ms 
iter 683: loss 3.4672, time 5253.26ms 
iter 684: loss 3.3795, time 5248.39ms 
iter 685: loss 3.2389, time 5254.00ms 
iter 686: loss 3.3116, time 5244.71ms 
iter 687: loss 3.4402, time 5257.80ms 
iter 688: loss 3.1466, time 5260.91ms 
iter 689: loss 3.3251, time 5258.20ms 
iter 690: loss 3.2842, time 5249.48ms 
iter 691: loss 3.3416, time 5253.20ms 
iter 692: loss 3.4831, time 5262.07ms 
iter 693: loss 3.5091, time 5255.38ms 
iter 694: loss 3.3463, time 5256.65ms 
iter 695: loss 3.1555, time 5253.31ms 
iter 696: loss 3.4705, time 5223.18ms 
iter 697: loss 3.2046, time 5262.12ms 
iter 698: loss 3.2764, time 5249.52ms 
iter 699: loss 3.4874, time 5245.71ms 
step 700: train loss 3.3347, val loss 3.3174
iter 700: loss 3.2622, time 20024.65ms 
iter 701: loss 3.3056, time 5250.28ms 
iter 702: loss 3.2586, time 5266.35ms 
iter 703: loss 3.3780, time 5249.02ms 
iter 704: loss 3.3905, time 5251.51ms 
iter 705: loss 3.2793, time 5246.86ms 
iter 706: loss 3.5603, time 5235.64ms 
iter 707: loss 3.3993, time 5248.90ms 
iter 708: loss 3.2901, time 5243.40ms 
iter 709: loss 3.3609, time 5253.29ms 
iter 710: loss 3.2996, time 5244.04ms 
iter 711: loss 3.3150, time 5249.86ms 
iter 712: loss 3.5853, time 5260.95ms 
iter 713: loss 3.3529, time 5250.83ms 
iter 714: loss 3.2424, time 5243.18ms 
iter 715: loss 3.3213, time 5250.45ms 
iter 716: loss 3.2723, time 5249.35ms 
iter 717: loss 3.3986, time 5247.71ms 
iter 718: loss 3.2182, time 5250.11ms 
iter 719: loss 3.2233, time 5247.04ms 
iter 720: loss 3.6053, time 5246.57ms 
iter 721: loss 3.4053, time 5247.26ms 
iter 722: loss 3.1600, time 5256.94ms 
iter 723: loss 3.3360, time 5246.87ms 
iter 724: loss 3.2749, time 5245.83ms 
iter 725: loss 3.1075, time 5249.51ms 
iter 726: loss 3.2324, time 5245.38ms 
iter 727: loss 3.3087, time 5250.57ms 
iter 728: loss 3.3161, time 5252.13ms 
iter 729: loss 3.2084, time 5255.84ms 
iter 730: loss 3.2518, time 5255.15ms 
iter 731: loss 3.2613, time 5260.61ms 
iter 732: loss 3.0592, time 5252.92ms 
iter 733: loss 3.2700, time 5246.41ms 
iter 734: loss 3.1037, time 5250.98ms 
iter 735: loss 3.3152, time 5245.44ms 
iter 736: loss 3.3392, time 5245.56ms 
iter 737: loss 3.2745, time 5252.30ms 
iter 738: loss 3.5418, time 5260.14ms 
iter 739: loss 3.2933, time 5254.26ms 
iter 740: loss 3.2375, time 5253.12ms 
iter 741: loss 3.3576, time 5253.16ms 
iter 742: loss 3.4732, time 5244.76ms 
iter 743: loss 3.1964, time 5248.69ms 
iter 744: loss 3.2874, time 5248.65ms 
iter 745: loss 3.1027, time 5247.09ms 
iter 746: loss 3.3190, time 5250.29ms 
iter 747: loss 3.0369, time 5251.65ms 
iter 748: loss 3.2401, time 5246.81ms 
iter 749: loss 3.4166, time 5244.56ms 
step 750: train loss 3.3040, val loss 3.2884
iter 750: loss 3.2563, time 20007.72ms 
iter 751: loss 3.3989, time 5243.76ms 
iter 752: loss 3.2445, time 5265.85ms 
iter 753: loss 3.2713, time 5253.49ms 
iter 754: loss 3.6088, time 5256.50ms 
iter 755: loss 3.2693, time 5256.93ms 
iter 756: loss 3.2297, time 5219.40ms 
iter 757: loss 3.1402, time 5221.25ms 
iter 758: loss 3.4484, time 5197.25ms 
iter 759: loss 3.2730, time 5237.88ms 
iter 760: loss 3.3119, time 5257.75ms 
iter 761: loss 3.3556, time 5216.24ms 
iter 762: loss 3.2003, time 5230.75ms 
iter 763: loss 3.4316, time 5263.50ms 
iter 764: loss 3.3137, time 5272.07ms 
iter 765: loss 3.2596, time 5251.62ms 
iter 766: loss 3.2647, time 5256.85ms 
iter 767: loss 3.2347, time 5253.13ms 
iter 768: loss 3.1811, time 5254.16ms 
iter 769: loss 3.1105, time 5271.14ms 
iter 770: loss 3.2320, time 5243.96ms 
iter 771: loss 3.3761, time 5306.83ms 
iter 772: loss 3.2493, time 5249.58ms 
iter 773: loss 3.1606, time 5259.65ms 
iter 774: loss 3.3492, time 5319.79ms 
iter 775: loss 3.1344, time 5325.11ms 
iter 776: loss 3.3598, time 5278.64ms 
iter 777: loss 3.2232, time 5252.63ms 
iter 778: loss 3.2256, time 5283.60ms 
iter 779: loss 3.1516, time 5292.13ms 
iter 780: loss 3.2133, time 5303.50ms 
iter 781: loss 3.3634, time 5268.65ms 
iter 782: loss 3.1594, time 5255.27ms 
iter 783: loss 3.2437, time 5220.30ms 
iter 784: loss 3.0681, time 5217.05ms 
iter 785: loss 3.2747, time 5212.90ms 
iter 786: loss 3.3052, time 5227.61ms 
iter 787: loss 3.2726, time 5281.92ms 
iter 788: loss 3.1984, time 5273.78ms 
iter 789: loss 3.3593, time 5275.41ms 
iter 790: loss 3.3227, time 5282.03ms 
iter 791: loss 3.1542, time 5309.57ms 
iter 792: loss 3.2390, time 5246.82ms 
iter 793: loss 3.3300, time 5257.93ms 
iter 794: loss 3.4580, time 5257.99ms 
iter 795: loss 3.2074, time 5249.60ms 
iter 796: loss 3.3304, time 5314.03ms 
iter 797: loss 3.2004, time 5323.37ms 
iter 798: loss 3.1381, time 5317.16ms 
iter 799: loss 3.2338, time 5294.01ms 
step 800: train loss 3.2775, val loss 3.2535
iter 800: loss 3.1620, time 20089.08ms 
iter 801: loss 3.3419, time 5253.44ms 
iter 802: loss 3.2584, time 5294.09ms 
iter 803: loss 3.3169, time 5315.98ms 
iter 804: loss 3.1317, time 5332.84ms 
iter 805: loss 3.1480, time 5184.99ms 
iter 806: loss 3.1904, time 5256.31ms 
iter 807: loss 3.2625, time 5247.50ms 
iter 808: loss 3.1360, time 5254.45ms 
iter 809: loss 3.1715, time 5259.57ms 
iter 810: loss 3.1492, time 5259.50ms 
iter 811: loss 3.2318, time 5254.25ms 
iter 812: loss 3.1901, time 5265.50ms 
iter 813: loss 3.2407, time 5259.74ms 
iter 814: loss 3.0755, time 5265.01ms 
iter 815: loss 3.3604, time 5263.44ms 
iter 816: loss 3.2206, time 5247.58ms 
iter 817: loss 3.2332, time 5274.01ms 
iter 818: loss 3.0492, time 5268.87ms 
iter 819: loss 3.1382, time 5260.97ms 
iter 820: loss 3.0834, time 5258.64ms 
iter 821: loss 3.3054, time 5263.53ms 
iter 822: loss 3.4954, time 5255.81ms 
iter 823: loss 3.1785, time 5264.23ms 
iter 824: loss 3.1425, time 5261.57ms 
iter 825: loss 3.2568, time 5267.34ms 
iter 826: loss 3.1678, time 5269.07ms 
iter 827: loss 3.2171, time 5261.85ms 
iter 828: loss 3.4188, time 5259.15ms 
iter 829: loss 3.4250, time 5259.87ms 
iter 830: loss 3.1885, time 5263.69ms 
iter 831: loss 3.2455, time 5263.83ms 
iter 832: loss 3.2561, time 5299.34ms 
iter 833: loss 3.0892, time 5306.63ms 
iter 834: loss 3.4159, time 5271.39ms 
iter 835: loss 3.1365, time 5303.92ms 
iter 836: loss 3.1590, time 5252.15ms 
iter 837: loss 3.1127, time 5253.55ms 
iter 838: loss 3.2845, time 5316.18ms 
iter 839: loss 3.2667, time 5235.72ms 
iter 840: loss 3.3754, time 5240.03ms 
iter 841: loss 3.2596, time 5218.75ms 
iter 842: loss 3.1186, time 5225.49ms 
iter 843: loss 3.1990, time 5217.99ms 
iter 844: loss 3.2734, time 5226.69ms 
iter 845: loss 3.2209, time 5161.34ms 
iter 846: loss 3.0878, time 5192.06ms 
iter 847: loss 3.2844, time 5258.66ms 
iter 848: loss 3.2531, time 5216.54ms 
iter 849: loss 3.1958, time 5205.63ms 
step 850: train loss 3.2330, val loss 3.2243
iter 850: loss 3.1182, time 19965.32ms 
iter 851: loss 3.1134, time 5217.53ms 
iter 852: loss 3.4526, time 5281.48ms 
iter 853: loss 3.3465, time 5299.21ms 
iter 854: loss 3.1473, time 5253.56ms 
iter 855: loss 3.2182, time 5225.00ms 
iter 856: loss 3.2441, time 5172.88ms 
iter 857: loss 3.3803, time 5225.76ms 
iter 858: loss 3.3840, time 5235.44ms 
iter 859: loss 3.1526, time 5219.86ms 
iter 860: loss 3.2544, time 5216.04ms 
iter 861: loss 3.0656, time 5219.05ms 
iter 862: loss 3.5658, time 5218.42ms 
iter 863: loss 3.3142, time 5254.96ms 
iter 864: loss 3.1780, time 5257.59ms 
iter 865: loss 3.2207, time 5251.75ms 
iter 866: loss 3.0539, time 5249.37ms 
iter 867: loss 3.4116, time 5249.61ms 
iter 868: loss 3.4316, time 5252.40ms 
iter 869: loss 3.0825, time 5252.89ms 
iter 870: loss 3.2154, time 5246.91ms 
iter 871: loss 3.0876, time 5253.06ms 
iter 872: loss 3.2034, time 5274.85ms 
iter 873: loss 3.1848, time 5336.91ms 
iter 874: loss 3.3362, time 5315.48ms 
iter 875: loss 3.1845, time 5287.39ms 
iter 876: loss 3.2029, time 5266.83ms 
iter 877: loss 3.2094, time 5280.68ms 
iter 878: loss 3.4072, time 5289.56ms 
iter 879: loss 3.2186, time 5269.52ms 
iter 880: loss 3.0304, time 5258.85ms 
iter 881: loss 3.3483, time 5258.68ms 
iter 882: loss 3.2919, time 5269.64ms 
iter 883: loss 3.3519, time 5262.21ms 
iter 884: loss 3.3550, time 5280.01ms 
iter 885: loss 3.2914, time 5251.04ms 
iter 886: loss 3.2160, time 5284.14ms 
iter 887: loss 3.1928, time 5206.32ms 
iter 888: loss 3.2704, time 5250.59ms 
iter 889: loss 3.2769, time 5238.94ms 
iter 890: loss 3.0817, time 5231.15ms 
iter 891: loss 3.3162, time 5206.48ms 
iter 892: loss 3.5048, time 5247.89ms 
iter 893: loss 3.2667, time 5261.45ms 
iter 894: loss 3.1135, time 5266.42ms 
iter 895: loss 3.2305, time 5252.07ms 
iter 896: loss 3.2414, time 5245.89ms 
iter 897: loss 3.1195, time 5249.88ms 
iter 898: loss 3.2492, time 5252.96ms 
iter 899: loss 3.4403, time 5255.94ms 
step 900: train loss 3.2010, val loss 3.2174
iter 900: loss 3.1282, time 20043.68ms 
iter 901: loss 3.2150, time 5257.75ms 
iter 902: loss 3.1795, time 5261.11ms 
iter 903: loss 3.0160, time 5254.27ms 
iter 904: loss 3.4621, time 5249.93ms 
iter 905: loss 3.2979, time 5260.52ms 
iter 906: loss 3.1001, time 5262.78ms 
iter 907: loss 3.4895, time 5265.29ms 
iter 908: loss 3.1917, time 5254.76ms 
iter 909: loss 3.2183, time 5255.63ms 
iter 910: loss 3.1954, time 5246.41ms 
iter 911: loss 2.8864, time 5244.93ms 
iter 912: loss 3.0445, time 5223.53ms 
iter 913: loss 3.2411, time 5227.87ms 
iter 914: loss 3.0149, time 5232.68ms 
iter 915: loss 3.1921, time 5242.82ms 
iter 916: loss 3.1706, time 5255.04ms 
iter 917: loss 3.2362, time 5257.27ms 
iter 918: loss 3.0976, time 5261.01ms 
iter 919: loss 3.0235, time 5257.64ms 
iter 920: loss 3.2330, time 5257.48ms 
iter 921: loss 3.2532, time 5256.38ms 
iter 922: loss 3.2943, time 5252.14ms 
iter 923: loss 3.1632, time 5250.38ms 
iter 924: loss 3.1832, time 5255.39ms 
iter 925: loss 3.0308, time 5261.55ms 
iter 926: loss 3.1420, time 5300.67ms 
iter 927: loss 3.1845, time 5256.30ms 
iter 928: loss 2.9387, time 5241.59ms 
iter 929: loss 3.1196, time 5261.16ms 
iter 930: loss 3.2513, time 5084.75ms 
iter 931: loss 3.4550, time 5206.95ms 
iter 932: loss 3.1561, time 5277.59ms 
iter 933: loss 3.0660, time 5257.43ms 
iter 934: loss 3.2940, time 5258.40ms 
iter 935: loss 3.1382, time 5260.81ms 
iter 936: loss 3.1855, time 5239.91ms 
iter 937: loss 3.1979, time 5239.91ms 
iter 938: loss 2.9486, time 5252.00ms 
iter 939: loss 3.3304, time 5265.19ms 
iter 940: loss 3.2591, time 5260.49ms 
iter 941: loss 3.1518, time 5256.96ms 
iter 942: loss 3.1601, time 5251.85ms 
iter 943: loss 3.1729, time 5261.43ms 
iter 944: loss 3.2801, time 5256.99ms 
iter 945: loss 3.0586, time 5261.83ms 
iter 946: loss 3.1107, time 5251.07ms 
iter 947: loss 3.1097, time 5247.03ms 
iter 948: loss 3.0466, time 5256.28ms 
iter 949: loss 3.1464, time 5272.97ms 
step 950: train loss 3.1768, val loss 3.1850
iter 950: loss 2.9870, time 20018.38ms 
iter 951: loss 3.1389, time 5251.35ms 
iter 952: loss 3.2248, time 5245.36ms 
iter 953: loss 2.9368, time 5226.12ms 
iter 954: loss 3.1328, time 5247.75ms 
iter 955: loss 3.2741, time 5251.38ms 
iter 956: loss 3.1795, time 5264.37ms 
iter 957: loss 3.1540, time 5268.11ms 
iter 958: loss 3.3248, time 5267.48ms 
iter 959: loss 3.2791, time 5267.38ms 
iter 960: loss 3.3443, time 5236.23ms 
iter 961: loss 3.1857, time 5323.28ms 
iter 962: loss 3.3296, time 5314.27ms 
iter 963: loss 3.2103, time 5291.09ms 
iter 964: loss 3.2641, time 5262.39ms 
iter 965: loss 2.9951, time 5277.19ms 
iter 966: loss 3.1554, time 5263.63ms 
iter 967: loss 3.2253, time 5249.78ms 
iter 968: loss 3.0167, time 5132.34ms 
iter 969: loss 3.1670, time 5022.74ms 
iter 970: loss 3.0761, time 5059.35ms 
iter 971: loss 2.9455, time 5249.52ms 
iter 972: loss 3.2290, time 5251.69ms 
iter 973: loss 2.9468, time 5251.09ms 
iter 974: loss 3.1303, time 5246.82ms 
iter 975: loss 3.1864, time 5248.26ms 
iter 976: loss 3.2640, time 5245.33ms 
iter 977: loss 3.1525, time 5246.62ms 
iter 978: loss 3.0870, time 5249.88ms 
iter 979: loss 2.9360, time 5248.03ms 
iter 980: loss 3.2055, time 5247.24ms 
iter 981: loss 3.2557, time 5251.27ms 
iter 982: loss 3.1464, time 5087.86ms 
iter 983: loss 3.1567, time 5223.22ms 
iter 984: loss 3.2576, time 5308.74ms 
iter 985: loss 3.0875, time 5258.70ms 
iter 986: loss 3.1555, time 5263.07ms 
iter 987: loss 3.3185, time 5259.11ms 
iter 988: loss 3.1631, time 5249.78ms 
iter 989: loss 3.1573, time 5254.20ms 
iter 990: loss 3.2761, time 5247.73ms 
iter 991: loss 3.1830, time 5255.88ms 
iter 992: loss 2.9191, time 5257.90ms 
iter 993: loss 3.0610, time 5309.93ms 
iter 994: loss 3.0854, time 5269.24ms 
iter 995: loss 3.2940, time 5253.35ms 
iter 996: loss 3.1691, time 5262.00ms 
iter 997: loss 3.2217, time 5263.91ms 
iter 998: loss 3.2683, time 5249.69ms 
iter 999: loss 3.1641, time 5254.84ms 
step 1000: train loss 3.1513, val loss 3.1782
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1000: loss 3.3015, time 21153.53ms 
iter 1001: loss 2.9805, time 5260.15ms 
iter 1002: loss 3.3448, time 5255.62ms 
iter 1003: loss 3.3925, time 5244.49ms 
iter 1004: loss 3.1508, time 5187.43ms 
iter 1005: loss 3.2950, time 5251.48ms 
iter 1006: loss 3.2868, time 5247.91ms 
iter 1007: loss 3.1648, time 5248.70ms 
iter 1008: loss 3.2027, time 5244.52ms 
iter 1009: loss 3.4096, time 5253.55ms 
iter 1010: loss 3.0665, time 5252.22ms 
iter 1011: loss 3.2696, time 5296.75ms 
iter 1012: loss 3.1268, time 5257.62ms 
iter 1013: loss 3.1625, time 5259.48ms 
iter 1014: loss 3.2666, time 5277.03ms 
iter 1015: loss 2.9912, time 5298.12ms 
iter 1016: loss 3.2285, time 5266.04ms 
iter 1017: loss 2.9665, time 5255.69ms 
iter 1018: loss 3.1872, time 5270.38ms 
iter 1019: loss 3.0972, time 5108.30ms 
iter 1020: loss 3.3738, time 5117.20ms 
iter 1021: loss 3.0395, time 5168.89ms 
iter 1022: loss 3.1481, time 5144.35ms 
iter 1023: loss 3.2541, time 5242.08ms 
iter 1024: loss 3.1401, time 5243.94ms 
iter 1025: loss 3.0794, time 5249.47ms 
iter 1026: loss 3.1540, time 5251.14ms 
iter 1027: loss 3.0817, time 5198.34ms 
iter 1028: loss 2.9137, time 5242.56ms 
iter 1029: loss 2.9597, time 5247.96ms 
iter 1030: loss 3.0415, time 5238.46ms 
iter 1031: loss 3.2523, time 5220.50ms 
iter 1032: loss 3.0900, time 5185.01ms 
iter 1033: loss 3.1241, time 5102.90ms 
iter 1034: loss 2.9845, time 5243.02ms 
iter 1035: loss 3.2114, time 5206.83ms 
iter 1036: loss 3.1496, time 5234.46ms 
iter 1037: loss 3.0411, time 5158.27ms 
iter 1038: loss 3.1846, time 5119.78ms 
iter 1039: loss 3.0764, time 5097.15ms 
iter 1040: loss 3.1202, time 5203.90ms 
iter 1041: loss 3.2000, time 5234.45ms 
iter 1042: loss 2.8571, time 5244.70ms 
iter 1043: loss 3.0407, time 5235.68ms 
iter 1044: loss 3.2053, time 5198.39ms 
iter 1045: loss 3.3266, time 5233.70ms 
iter 1046: loss 3.0559, time 5231.16ms 
iter 1047: loss 3.2009, time 5240.75ms 
iter 1048: loss 3.2509, time 5200.62ms 
iter 1049: loss 2.9782, time 5234.68ms 
step 1050: train loss 3.1374, val loss 3.1507
iter 1050: loss 3.1273, time 19988.51ms 
iter 1051: loss 3.4204, time 5214.04ms 
iter 1052: loss 3.0619, time 5260.58ms 
iter 1053: loss 3.0282, time 5254.79ms 
iter 1054: loss 3.1860, time 5267.47ms 
iter 1055: loss 3.1145, time 5252.92ms 
iter 1056: loss 3.0734, time 5248.48ms 
iter 1057: loss 3.2579, time 5249.91ms 
iter 1058: loss 3.1337, time 5250.57ms 
iter 1059: loss 3.0558, time 5275.33ms 
iter 1060: loss 3.0940, time 5265.01ms 
iter 1061: loss 3.1338, time 5252.35ms 
iter 1062: loss 3.2750, time 5326.29ms 
iter 1063: loss 3.2073, time 5212.28ms 
iter 1064: loss 3.4375, time 5091.77ms 
iter 1065: loss 3.0774, time 5097.35ms 
iter 1066: loss 3.1204, time 5086.58ms 
iter 1067: loss 3.0511, time 5091.61ms 
iter 1068: loss 3.0840, time 5110.81ms 
iter 1069: loss 3.2030, time 5132.01ms 
iter 1070: loss 3.1189, time 5101.80ms 
iter 1071: loss 3.0316, time 5213.29ms 
iter 1072: loss 3.3025, time 5333.27ms 
iter 1073: loss 3.3468, time 5330.69ms 
iter 1074: loss 3.1044, time 5245.41ms 
iter 1075: loss 3.1690, time 5250.39ms 
iter 1076: loss 3.3612, time 5255.18ms 
iter 1077: loss 2.9699, time 5248.32ms 
iter 1078: loss 2.9796, time 5251.12ms 
iter 1079: loss 3.0245, time 5246.27ms 
iter 1080: loss 3.0696, time 5248.62ms 
iter 1081: loss 3.1858, time 5244.60ms 
iter 1082: loss 3.2306, time 5245.11ms 
iter 1083: loss 3.1204, time 5244.60ms 
iter 1084: loss 3.0653, time 5247.84ms 
iter 1085: loss 3.4548, time 5244.55ms 
iter 1086: loss 3.3047, time 5250.46ms 
iter 1087: loss 3.1261, time 5249.33ms 
iter 1088: loss 3.0200, time 5218.18ms 
iter 1089: loss 3.3045, time 5164.01ms 
iter 1090: loss 3.0592, time 5111.38ms 
iter 1091: loss 3.1037, time 5164.18ms 
iter 1092: loss 2.9766, time 5181.31ms 
iter 1093: loss 3.0703, time 5103.87ms 
iter 1094: loss 3.1722, time 5191.21ms 
iter 1095: loss 3.0434, time 5220.08ms 
iter 1096: loss 3.1316, time 5242.86ms 
iter 1097: loss 3.1441, time 5260.21ms 
iter 1098: loss 3.0707, time 5265.70ms 
iter 1099: loss 3.1317, time 5251.80ms 
step 1100: train loss 3.1126, val loss 3.1210
iter 1100: loss 2.9510, time 20006.69ms 
iter 1101: loss 3.0421, time 5249.48ms 
iter 1102: loss 3.0734, time 5196.34ms 
iter 1103: loss 2.8999, time 5004.87ms 
iter 1104: loss 3.1609, time 4972.34ms 
iter 1105: loss 3.3085, time 5031.75ms 
iter 1106: loss 3.2400, time 5230.16ms 
iter 1107: loss 3.0234, time 5232.03ms 
iter 1108: loss 3.0872, time 5271.09ms 
iter 1109: loss 3.0323, time 5288.12ms 
iter 1110: loss 3.1157, time 5240.63ms 
iter 1111: loss 3.0640, time 5242.37ms 
iter 1112: loss 3.2174, time 5127.81ms 
iter 1113: loss 3.1883, time 5228.22ms 
iter 1114: loss 3.2679, time 5312.56ms 
iter 1115: loss 3.0438, time 5258.30ms 
iter 1116: loss 3.0493, time 5257.05ms 
iter 1117: loss 3.1167, time 5260.36ms 
iter 1118: loss 3.0291, time 5252.60ms 
iter 1119: loss 3.1362, time 5262.53ms 
iter 1120: loss 3.1212, time 5295.36ms 
iter 1121: loss 2.9761, time 5260.44ms 
iter 1122: loss 3.1007, time 5262.15ms 
iter 1123: loss 3.0069, time 5249.55ms 
iter 1124: loss 3.0871, time 5272.52ms 
iter 1125: loss 3.2745, time 5326.63ms 
iter 1126: loss 3.1284, time 5325.26ms 
iter 1127: loss 3.1532, time 5323.98ms 
iter 1128: loss 3.3882, time 5214.57ms 
iter 1129: loss 3.1356, time 5219.19ms 
iter 1130: loss 3.2344, time 5236.85ms 
iter 1131: loss 3.0392, time 5226.48ms 
iter 1132: loss 3.1027, time 5227.97ms 
iter 1133: loss 3.1991, time 5249.78ms 
iter 1134: loss 2.9396, time 5257.27ms 
iter 1135: loss 3.2120, time 5256.51ms 
iter 1136: loss 2.8201, time 5258.78ms 
iter 1137: loss 3.0021, time 5258.81ms 
iter 1138: loss 3.0351, time 5216.28ms 
iter 1139: loss 3.1525, time 5254.99ms 
iter 1140: loss 3.0999, time 5248.61ms 
iter 1141: loss 3.3049, time 5250.88ms 
iter 1142: loss 3.1204, time 5243.09ms 
iter 1143: loss 3.0934, time 5249.14ms 
iter 1144: loss 3.3280, time 5249.42ms 
iter 1145: loss 3.1103, time 5262.03ms 
iter 1146: loss 2.9943, time 5280.75ms 
iter 1147: loss 2.9336, time 5260.32ms 
iter 1148: loss 2.9799, time 5163.05ms 
iter 1149: loss 3.1876, time 5312.14ms 
step 1150: train loss 3.0721, val loss 3.1108
iter 1150: loss 3.0330, time 19990.36ms 
iter 1151: loss 3.0762, time 5271.65ms 
iter 1152: loss 2.9274, time 5243.30ms 
iter 1153: loss 3.1812, time 5289.32ms 
iter 1154: loss 2.9717, time 5270.42ms 
iter 1155: loss 2.9954, time 5247.44ms 
iter 1156: loss 3.0210, time 5277.26ms 
iter 1157: loss 3.0707, time 5281.33ms 
iter 1158: loss 3.0833, time 5252.54ms 
iter 1159: loss 3.1004, time 5288.02ms 
iter 1160: loss 3.2051, time 5256.73ms 
iter 1161: loss 3.0397, time 5243.12ms 
iter 1162: loss 3.2105, time 5264.26ms 
iter 1163: loss 3.2062, time 5249.22ms 
iter 1164: loss 3.0657, time 5262.63ms 
iter 1165: loss 3.1504, time 5226.07ms 
iter 1166: loss 3.1015, time 5310.03ms 
iter 1167: loss 2.9996, time 5297.22ms 
iter 1168: loss 3.0229, time 5296.27ms 
iter 1169: loss 2.8940, time 5269.66ms 
iter 1170: loss 3.2287, time 5256.62ms 
iter 1171: loss 2.9421, time 5261.77ms 
iter 1172: loss 2.9218, time 5283.28ms 
iter 1173: loss 3.1691, time 5256.65ms 
iter 1174: loss 2.9109, time 5286.30ms 
iter 1175: loss 3.3855, time 5266.94ms 
iter 1176: loss 3.1554, time 5245.36ms 
iter 1177: loss 3.0961, time 5260.63ms 
iter 1178: loss 3.3929, time 5281.66ms 
iter 1179: loss 3.1873, time 5322.50ms 
iter 1180: loss 2.9991, time 5325.32ms 
iter 1181: loss 3.1084, time 5281.71ms 
iter 1182: loss 3.1795, time 5250.66ms 
iter 1183: loss 3.0708, time 5258.83ms 
iter 1184: loss 3.0938, time 5249.09ms 
iter 1185: loss 3.0181, time 5245.64ms 
iter 1186: loss 2.9874, time 5231.75ms 
iter 1187: loss 3.1639, time 5249.77ms 
iter 1188: loss 3.0100, time 5248.25ms 
iter 1189: loss 3.0534, time 5250.20ms 
iter 1190: loss 3.1190, time 5254.46ms 
iter 1191: loss 3.2791, time 5244.75ms 
iter 1192: loss 3.0403, time 5246.51ms 
iter 1193: loss 3.1062, time 5256.11ms 
iter 1194: loss 3.0644, time 5252.66ms 
iter 1195: loss 3.0958, time 5246.88ms 
iter 1196: loss 2.9189, time 5245.61ms 
iter 1197: loss 3.0573, time 5245.33ms 
iter 1198: loss 2.9907, time 5260.25ms 
iter 1199: loss 2.9597, time 5273.61ms 
step 1200: train loss 3.0705, val loss 3.1090
iter 1200: loss 3.0159, time 19925.79ms 
iter 1201: loss 3.1212, time 5239.51ms 
iter 1202: loss 2.9288, time 5296.79ms 
iter 1203: loss 3.0061, time 5319.03ms 
iter 1204: loss 3.1300, time 5317.52ms 
iter 1205: loss 3.1508, time 5313.20ms 
iter 1206: loss 3.0612, time 5271.47ms 
iter 1207: loss 3.1949, time 5191.39ms 
iter 1208: loss 2.9943, time 5101.63ms 
iter 1209: loss 3.0117, time 5205.57ms 
iter 1210: loss 3.0230, time 5261.43ms 
iter 1211: loss 3.2260, time 5254.61ms 
iter 1212: loss 3.0738, time 5267.29ms 
iter 1213: loss 3.2364, time 5332.82ms 
iter 1214: loss 2.9539, time 5327.18ms 
iter 1215: loss 3.0732, time 5260.15ms 
iter 1216: loss 2.9679, time 5256.59ms 
iter 1217: loss 3.0688, time 5248.46ms 
iter 1218: loss 3.1471, time 5232.59ms 
iter 1219: loss 3.0468, time 5251.03ms 
iter 1220: loss 2.9754, time 5250.81ms 
iter 1221: loss 3.0651, time 5296.01ms 
iter 1222: loss 3.1760, time 5254.27ms 
iter 1223: loss 2.9997, time 5255.38ms 
iter 1224: loss 2.9693, time 5260.80ms 
iter 1225: loss 2.9360, time 5259.37ms 
iter 1226: loss 2.9632, time 5253.53ms 
iter 1227: loss 3.1493, time 5259.18ms 
iter 1228: loss 3.2594, time 5264.91ms 
iter 1229: loss 3.1215, time 5299.16ms 
iter 1230: loss 3.0545, time 5260.50ms 
iter 1231: loss 3.1753, time 5286.51ms 
iter 1232: loss 3.2249, time 5246.42ms 
iter 1233: loss 3.2120, time 5321.43ms 
iter 1234: loss 3.0692, time 5329.28ms 
iter 1235: loss 3.0680, time 5320.67ms 
iter 1236: loss 3.0579, time 5320.96ms 
iter 1237: loss 2.9675, time 5324.46ms 
iter 1238: loss 3.0835, time 5341.67ms 
iter 1239: loss 3.0718, time 5257.71ms 
iter 1240: loss 2.8942, time 5226.29ms 
iter 1241: loss 3.1929, time 5249.20ms 
iter 1242: loss 3.3729, time 5252.09ms 
iter 1243: loss 3.1655, time 5251.69ms 
iter 1244: loss 3.0997, time 5243.11ms 
iter 1245: loss 3.0906, time 5245.62ms 
iter 1246: loss 3.0359, time 5249.50ms 
iter 1247: loss 3.1791, time 5243.41ms 
iter 1248: loss 3.1490, time 5248.54ms 
iter 1249: loss 2.9286, time 5261.47ms 
step 1250: train loss 3.0527, val loss 3.0947
iter 1250: loss 2.8988, time 19931.61ms 
iter 1251: loss 2.9100, time 5245.07ms 
iter 1252: loss 2.9205, time 5252.11ms 
iter 1253: loss 2.9101, time 5221.26ms 
iter 1254: loss 2.9967, time 5251.18ms 
iter 1255: loss 2.9534, time 5241.80ms 
iter 1256: loss 3.1834, time 5252.70ms 
iter 1257: loss 2.9554, time 5258.50ms 
iter 1258: loss 3.0801, time 5248.98ms 
iter 1259: loss 3.3472, time 5240.76ms 
iter 1260: loss 2.9535, time 5254.15ms 
iter 1261: loss 2.9244, time 5228.23ms 
iter 1262: loss 2.9185, time 5268.28ms 
iter 1263: loss 3.1168, time 5286.65ms 
iter 1264: loss 3.0542, time 5233.23ms 
iter 1265: loss 3.0062, time 5243.25ms 
iter 1266: loss 2.9391, time 5242.41ms 
iter 1267: loss 2.9621, time 5242.82ms 
iter 1268: loss 3.3980, time 5248.98ms 
iter 1269: loss 2.9599, time 5239.21ms 
iter 1270: loss 2.9969, time 5250.54ms 
iter 1271: loss 3.1282, time 5253.96ms 
iter 1272: loss 3.0395, time 5215.86ms 
iter 1273: loss 3.0281, time 5285.43ms 
iter 1274: loss 3.3363, time 5245.20ms 
iter 1275: loss 3.0379, time 5230.19ms 
iter 1276: loss 3.2023, time 5231.80ms 
iter 1277: loss 3.0901, time 5246.67ms 
iter 1278: loss 3.2163, time 5254.06ms 
iter 1279: loss 3.2218, time 5255.41ms 
iter 1280: loss 2.8123, time 5245.68ms 
iter 1281: loss 3.1181, time 5284.38ms 
iter 1282: loss 3.0269, time 5304.73ms 
iter 1283: loss 2.9428, time 5252.52ms 
iter 1284: loss 2.8572, time 5258.52ms 
iter 1285: loss 3.0903, time 5247.65ms 
iter 1286: loss 2.9331, time 5260.67ms 
iter 1287: loss 3.1842, time 5256.06ms 
iter 1288: loss 2.9869, time 5254.04ms 
iter 1289: loss 3.3092, time 5246.03ms 
iter 1290: loss 3.1129, time 5280.32ms 
iter 1291: loss 2.7667, time 5249.85ms 
iter 1292: loss 3.0680, time 5253.44ms 
iter 1293: loss 2.9216, time 5256.77ms 
iter 1294: loss 3.0611, time 5251.69ms 
iter 1295: loss 3.0359, time 5250.30ms 
iter 1296: loss 2.8679, time 5245.92ms 
iter 1297: loss 2.9665, time 5251.90ms 
iter 1298: loss 3.0906, time 5250.06ms 
iter 1299: loss 3.0907, time 5248.50ms 
step 1300: train loss 3.0364, val loss 3.0884
iter 1300: loss 3.2439, time 19979.38ms 
iter 1301: loss 3.2029, time 5250.32ms 
iter 1302: loss 2.8883, time 5255.80ms 
iter 1303: loss 3.0229, time 5245.82ms 
iter 1304: loss 2.8814, time 5257.08ms 
iter 1305: loss 3.1164, time 5255.78ms 
iter 1306: loss 3.0447, time 5239.43ms 
iter 1307: loss 2.9393, time 5264.68ms 
iter 1308: loss 2.8299, time 5257.48ms 
iter 1309: loss 3.0111, time 5250.84ms 
iter 1310: loss 3.1183, time 5246.64ms 
iter 1311: loss 3.1449, time 5255.78ms 
iter 1312: loss 3.0128, time 5246.93ms 
iter 1313: loss 2.9546, time 5243.15ms 
iter 1314: loss 2.9141, time 5252.80ms 
iter 1315: loss 2.9307, time 5254.60ms 
iter 1316: loss 3.1389, time 5245.07ms 
iter 1317: loss 3.0569, time 5254.25ms 
iter 1318: loss 3.0371, time 5262.78ms 
iter 1319: loss 2.9652, time 5251.97ms 
iter 1320: loss 3.0656, time 5248.32ms 
iter 1321: loss 3.0170, time 5250.08ms 
iter 1322: loss 3.0551, time 5245.70ms 
iter 1323: loss 3.1281, time 5248.61ms 
iter 1324: loss 2.9932, time 5246.37ms 
iter 1325: loss 2.9657, time 5243.36ms 
iter 1326: loss 2.9007, time 5247.90ms 
iter 1327: loss 2.9457, time 5250.54ms 
iter 1328: loss 2.9441, time 5248.97ms 
iter 1329: loss 2.8660, time 5257.48ms 
iter 1330: loss 2.9367, time 5267.94ms 
iter 1331: loss 3.1247, time 5255.67ms 
iter 1332: loss 3.0286, time 5251.12ms 
iter 1333: loss 2.7943, time 5258.22ms 
iter 1334: loss 3.0831, time 5246.82ms 
iter 1335: loss 3.0735, time 5259.95ms 
iter 1336: loss 3.2142, time 5268.77ms 
iter 1337: loss 3.1002, time 5255.32ms 
iter 1338: loss 3.1194, time 5263.12ms 
iter 1339: loss 3.0619, time 5251.58ms 
iter 1340: loss 3.0083, time 5250.89ms 
iter 1341: loss 3.1760, time 5253.33ms 
iter 1342: loss 2.8306, time 5260.69ms 
iter 1343: loss 3.1015, time 5244.19ms 
iter 1344: loss 3.0505, time 5256.38ms 
iter 1345: loss 3.0901, time 5264.44ms 
iter 1346: loss 3.0897, time 5258.14ms 
iter 1347: loss 2.9868, time 5262.16ms 
iter 1348: loss 3.0216, time 5275.47ms 
iter 1349: loss 2.9849, time 5258.92ms 
step 1350: train loss 3.0063, val loss 3.0659
iter 1350: loss 2.8501, time 20011.68ms 
iter 1351: loss 3.0529, time 5252.59ms 
iter 1352: loss 3.0340, time 5250.74ms 
iter 1353: loss 3.0597, time 5255.41ms 
iter 1354: loss 3.0614, time 5256.22ms 
iter 1355: loss 3.0409, time 5254.69ms 
iter 1356: loss 2.9062, time 5248.14ms 
iter 1357: loss 3.0449, time 5246.57ms 
iter 1358: loss 2.8218, time 5252.58ms 
iter 1359: loss 3.2555, time 5257.80ms 
iter 1360: loss 3.1422, time 5247.53ms 
iter 1361: loss 2.9265, time 5236.83ms 
iter 1362: loss 2.9550, time 5256.54ms 
iter 1363: loss 3.0874, time 5242.44ms 
iter 1364: loss 2.9864, time 5260.37ms 
iter 1365: loss 2.9967, time 5259.28ms 
iter 1366: loss 2.9507, time 5250.62ms 
iter 1367: loss 2.8241, time 5244.68ms 
iter 1368: loss 2.8630, time 5249.39ms 
iter 1369: loss 2.9611, time 5244.88ms 
iter 1370: loss 2.9804, time 5210.05ms 
iter 1371: loss 2.8964, time 5025.38ms 
iter 1372: loss 2.9746, time 5038.11ms 
iter 1373: loss 3.1352, time 5036.73ms 
iter 1374: loss 2.9506, time 5031.43ms 
iter 1375: loss 3.1393, time 5070.25ms 
iter 1376: loss 3.0434, time 5065.38ms 
iter 1377: loss 2.7694, time 5170.60ms 
iter 1378: loss 3.0797, time 5196.96ms 
iter 1379: loss 2.7985, time 5226.16ms 
iter 1380: loss 2.9634, time 5214.43ms 
iter 1381: loss 3.1304, time 5135.80ms 
iter 1382: loss 2.9798, time 5180.33ms 
iter 1383: loss 3.0669, time 5184.37ms 
iter 1384: loss 3.0091, time 5160.81ms 
iter 1385: loss 2.9450, time 5170.49ms 
iter 1386: loss 2.9513, time 5213.74ms 
iter 1387: loss 2.9540, time 5157.69ms 
iter 1388: loss 3.2525, time 5247.91ms 
iter 1389: loss 2.9101, time 5248.95ms 
iter 1390: loss 3.0060, time 5246.84ms 
iter 1391: loss 2.9995, time 5236.84ms 
iter 1392: loss 3.0332, time 5260.96ms 
iter 1393: loss 2.9439, time 5263.92ms 
iter 1394: loss 2.9979, time 5268.59ms 
iter 1395: loss 2.9067, time 5267.95ms 
iter 1396: loss 2.9708, time 5258.60ms 
iter 1397: loss 2.8514, time 5258.61ms 
iter 1398: loss 2.8390, time 5256.97ms 
iter 1399: loss 2.9472, time 5247.05ms 
step 1400: train loss 3.0219, val loss 3.0568
iter 1400: loss 3.0951, time 19989.33ms 
iter 1401: loss 3.0039, time 5260.55ms 
iter 1402: loss 3.1706, time 5256.93ms 
iter 1403: loss 3.1489, time 5255.08ms 
iter 1404: loss 3.0228, time 5253.19ms 
iter 1405: loss 2.9650, time 5250.24ms 
iter 1406: loss 2.9998, time 5256.31ms 
iter 1407: loss 2.9210, time 5245.53ms 
iter 1408: loss 3.0159, time 5252.08ms 
iter 1409: loss 2.9968, time 5252.61ms 
iter 1410: loss 2.9991, time 5259.76ms 
iter 1411: loss 2.8683, time 5254.17ms 
iter 1412: loss 2.8846, time 5169.03ms 
iter 1413: loss 3.2769, time 5144.92ms 
iter 1414: loss 2.8631, time 5096.12ms 
iter 1415: loss 2.9403, time 5137.83ms 
iter 1416: loss 2.9989, time 5084.76ms 
iter 1417: loss 3.1239, time 5076.64ms 
iter 1418: loss 3.1591, time 5194.82ms 
iter 1419: loss 2.8040, time 5116.63ms 
iter 1420: loss 2.9935, time 5191.85ms 
iter 1421: loss 3.0037, time 5118.85ms 
iter 1422: loss 3.1050, time 5257.89ms 
iter 1423: loss 3.2854, time 5270.05ms 
iter 1424: loss 2.8994, time 5208.54ms 
iter 1425: loss 2.9531, time 5253.09ms 
iter 1426: loss 2.9930, time 5205.25ms 
iter 1427: loss 3.0447, time 5259.91ms 
iter 1428: loss 3.0859, time 5247.89ms 
iter 1429: loss 3.0586, time 5248.17ms 
iter 1430: loss 2.9006, time 5163.89ms 
iter 1431: loss 2.8521, time 5097.70ms 
iter 1432: loss 2.9059, time 5099.38ms 
iter 1433: loss 2.8832, time 5098.30ms 
iter 1434: loss 3.0522, time 5181.83ms 
iter 1435: loss 3.1063, time 5264.44ms 
iter 1436: loss 2.8786, time 5284.50ms 
iter 1437: loss 2.9956, time 5249.04ms 
iter 1438: loss 2.9775, time 5246.35ms 
iter 1439: loss 3.3797, time 5306.66ms 
iter 1440: loss 2.9118, time 5269.79ms 
iter 1441: loss 3.1592, time 5284.51ms 
iter 1442: loss 3.0503, time 5244.95ms 
iter 1443: loss 2.9351, time 5260.86ms 
iter 1444: loss 3.1081, time 5281.20ms 
iter 1445: loss 2.9599, time 5261.94ms 
iter 1446: loss 2.9491, time 5263.50ms 
iter 1447: loss 3.0894, time 5255.71ms 
iter 1448: loss 2.9921, time 5257.51ms 
iter 1449: loss 2.9857, time 5258.61ms 
step 1450: train loss 2.9826, val loss 3.0494
iter 1450: loss 2.9637, time 20006.66ms 
iter 1451: loss 2.9059, time 5252.30ms 
iter 1452: loss 3.0442, time 5246.57ms 
iter 1453: loss 2.9227, time 5249.01ms 
iter 1454: loss 2.9461, time 5244.56ms 
iter 1455: loss 3.1435, time 5235.99ms 
iter 1456: loss 2.8083, time 5249.12ms 
iter 1457: loss 3.0556, time 5215.54ms 
iter 1458: loss 2.9641, time 5250.62ms 
iter 1459: loss 3.0159, time 5250.10ms 
iter 1460: loss 2.9312, time 5244.14ms 
iter 1461: loss 3.0925, time 5243.48ms 
iter 1462: loss 2.9344, time 5244.06ms 
iter 1463: loss 3.0061, time 5243.91ms 
iter 1464: loss 2.9631, time 5245.26ms 
iter 1465: loss 2.8234, time 5254.50ms 
iter 1466: loss 2.9905, time 5249.22ms 
iter 1467: loss 2.9806, time 5245.36ms 
iter 1468: loss 3.0764, time 5246.57ms 
iter 1469: loss 2.8404, time 5245.24ms 
iter 1470: loss 3.2309, time 5247.41ms 
iter 1471: loss 3.0078, time 5307.47ms 
iter 1472: loss 2.8487, time 5291.49ms 
iter 1473: loss 3.0424, time 5317.67ms 
iter 1474: loss 2.9689, time 5297.30ms 
iter 1475: loss 3.1019, time 5321.16ms 
iter 1476: loss 2.8483, time 5336.32ms 
iter 1477: loss 2.9726, time 5330.60ms 
iter 1478: loss 2.8767, time 5320.33ms 
iter 1479: loss 3.0679, time 5323.67ms 
iter 1480: loss 3.0150, time 5325.31ms 
iter 1481: loss 2.9443, time 5260.85ms 
iter 1482: loss 2.8180, time 5240.94ms 
iter 1483: loss 2.9565, time 5233.08ms 
iter 1484: loss 3.0061, time 5250.63ms 
iter 1485: loss 2.9547, time 5253.47ms 
iter 1486: loss 2.9522, time 5256.89ms 
iter 1487: loss 3.0313, time 5259.00ms 
iter 1488: loss 2.8478, time 5249.88ms 
iter 1489: loss 3.1511, time 5251.49ms 
iter 1490: loss 2.8946, time 5270.87ms 
iter 1491: loss 2.8957, time 5247.65ms 
iter 1492: loss 2.9479, time 5251.16ms 
iter 1493: loss 3.0492, time 5244.60ms 
iter 1494: loss 3.2135, time 5256.57ms 
iter 1495: loss 2.8970, time 5247.77ms 
iter 1496: loss 3.0916, time 5252.30ms 
iter 1497: loss 3.0444, time 5277.04ms 
iter 1498: loss 3.0723, time 5261.78ms 
iter 1499: loss 2.9055, time 5258.07ms 
step 1500: train loss 2.9706, val loss 3.0399
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1500: loss 3.2395, time 21141.38ms 
iter 1501: loss 2.9023, time 5257.92ms 
iter 1502: loss 3.0849, time 5255.98ms 
iter 1503: loss 3.0293, time 5265.68ms 
iter 1504: loss 2.8439, time 5263.75ms 
iter 1505: loss 2.9195, time 5271.16ms 
iter 1506: loss 3.0673, time 5260.85ms 
iter 1507: loss 2.9963, time 5256.82ms 
iter 1508: loss 3.0255, time 5257.52ms 
iter 1509: loss 2.8708, time 5260.39ms 
iter 1510: loss 2.8892, time 5255.90ms 
iter 1511: loss 3.0410, time 5256.25ms 
iter 1512: loss 2.9750, time 5258.52ms 
iter 1513: loss 3.1220, time 5244.76ms 
iter 1514: loss 2.8919, time 5238.77ms 
iter 1515: loss 3.0331, time 5239.59ms 
iter 1516: loss 2.9266, time 5239.04ms 
iter 1517: loss 2.9437, time 5245.40ms 
iter 1518: loss 2.8847, time 5255.78ms 
iter 1519: loss 3.0078, time 5254.91ms 
iter 1520: loss 2.9444, time 5258.32ms 
iter 1521: loss 2.9275, time 5256.51ms 
iter 1522: loss 2.9163, time 5252.35ms 
iter 1523: loss 2.9584, time 5254.29ms 
iter 1524: loss 3.0232, time 5252.94ms 
iter 1525: loss 2.8822, time 5251.21ms 
iter 1526: loss 2.9321, time 5252.95ms 
iter 1527: loss 2.9201, time 5246.68ms 
iter 1528: loss 3.2317, time 5254.03ms 
iter 1529: loss 2.8647, time 5250.37ms 
iter 1530: loss 3.0054, time 5246.96ms 
iter 1531: loss 2.8584, time 5248.70ms 
iter 1532: loss 2.9513, time 5246.19ms 
iter 1533: loss 2.9767, time 5314.56ms 
iter 1534: loss 3.0127, time 5316.98ms 
iter 1535: loss 3.0230, time 5254.71ms 
iter 1536: loss 2.9172, time 5251.21ms 
iter 1537: loss 2.9736, time 5253.88ms 
iter 1538: loss 3.0750, time 5180.10ms 
iter 1539: loss 3.0837, time 5224.77ms 
iter 1540: loss 2.7444, time 5229.41ms 
iter 1541: loss 2.9135, time 5228.74ms 
iter 1542: loss 3.0411, time 5241.78ms 
iter 1543: loss 3.1004, time 5246.10ms 
iter 1544: loss 3.1077, time 5239.80ms 
iter 1545: loss 2.8248, time 5236.37ms 
iter 1546: loss 2.9290, time 5237.28ms 
iter 1547: loss 2.8761, time 5236.73ms 
iter 1548: loss 2.9685, time 5226.96ms 
iter 1549: loss 3.1163, time 5250.00ms 
step 1550: train loss 2.9672, val loss 3.0448
iter 1550: loss 2.9182, time 19997.46ms 
iter 1551: loss 3.0098, time 5232.04ms 
iter 1552: loss 2.9756, time 5183.73ms 
iter 1553: loss 2.9696, time 5240.08ms 
iter 1554: loss 2.9504, time 5231.22ms 
iter 1555: loss 3.1218, time 5276.08ms 
iter 1556: loss 2.9029, time 5276.49ms 
iter 1557: loss 2.8270, time 5259.52ms 
iter 1558: loss 2.9303, time 5258.78ms 
iter 1559: loss 3.0401, time 5264.54ms 
iter 1560: loss 2.8958, time 5327.08ms 
iter 1561: loss 2.8587, time 5313.77ms 
iter 1562: loss 2.8674, time 5330.45ms 
iter 1563: loss 2.9268, time 5247.40ms 
iter 1564: loss 2.8323, time 5264.36ms 
iter 1565: loss 2.8643, time 5254.03ms 
iter 1566: loss 2.6242, time 5220.60ms 
iter 1567: loss 3.0933, time 5255.24ms 
iter 1568: loss 2.9400, time 5260.23ms 
iter 1569: loss 3.0223, time 5253.83ms 
iter 1570: loss 2.9659, time 5247.61ms 
iter 1571: loss 2.8763, time 5246.45ms 
iter 1572: loss 2.9007, time 5254.76ms 
iter 1573: loss 2.8675, time 5245.84ms 
iter 1574: loss 2.9254, time 5251.97ms 
iter 1575: loss 2.7964, time 5257.78ms 
iter 1576: loss 2.9063, time 5243.55ms 
iter 1577: loss 2.9909, time 5248.53ms 
iter 1578: loss 3.0494, time 5262.65ms 
iter 1579: loss 2.7675, time 5249.07ms 
iter 1580: loss 3.0418, time 5261.30ms 
iter 1581: loss 3.0271, time 5250.54ms 
iter 1582: loss 3.0436, time 5312.54ms 
iter 1583: loss 2.9767, time 5321.11ms 
iter 1584: loss 2.8478, time 5324.23ms 
iter 1585: loss 2.8672, time 5301.51ms 
iter 1586: loss 2.8591, time 5320.82ms 
iter 1587: loss 2.9429, time 5255.75ms 
iter 1588: loss 2.9492, time 5269.04ms 
iter 1589: loss 3.0016, time 5253.78ms 
iter 1590: loss 2.9514, time 5261.78ms 
iter 1591: loss 2.8922, time 5270.41ms 
iter 1592: loss 2.7927, time 5304.51ms 
iter 1593: loss 2.9026, time 5325.00ms 
iter 1594: loss 2.8279, time 5255.75ms 
iter 1595: loss 2.9772, time 5250.09ms 
iter 1596: loss 3.0385, time 5245.18ms 
iter 1597: loss 2.9097, time 5248.93ms 
iter 1598: loss 2.8886, time 5260.61ms 
iter 1599: loss 2.8751, time 5187.69ms 
step 1600: train loss 2.9445, val loss 3.0104
iter 1600: loss 3.2323, time 20023.35ms 
iter 1601: loss 3.0724, time 5311.12ms 
iter 1602: loss 2.7880, time 5320.45ms 
iter 1603: loss 2.9290, time 5330.81ms 
iter 1604: loss 3.0002, time 5322.42ms 
iter 1605: loss 2.7247, time 5328.56ms 
iter 1606: loss 3.1039, time 5255.22ms 
iter 1607: loss 2.9116, time 5254.25ms 
iter 1608: loss 3.0876, time 5267.64ms 
iter 1609: loss 2.9854, time 5280.74ms 
iter 1610: loss 2.7764, time 5243.32ms 
iter 1611: loss 2.8615, time 5245.44ms 
iter 1612: loss 3.0824, time 5246.07ms 
iter 1613: loss 2.8669, time 5241.71ms 
iter 1614: loss 2.7120, time 5242.07ms 
iter 1615: loss 3.0463, time 5246.26ms 
iter 1616: loss 2.8455, time 5249.66ms 
iter 1617: loss 2.7633, time 5246.56ms 
iter 1618: loss 3.0813, time 5263.06ms 
iter 1619: loss 3.0216, time 5256.12ms 
iter 1620: loss 2.9650, time 5245.51ms 
iter 1621: loss 2.7908, time 5252.07ms 
iter 1622: loss 3.1881, time 5267.76ms 
iter 1623: loss 2.6573, time 5253.49ms 
iter 1624: loss 2.8546, time 5261.23ms 
iter 1625: loss 2.9162, time 5249.12ms 
iter 1626: loss 3.2243, time 5263.65ms 
iter 1627: loss 2.9182, time 5257.73ms 
iter 1628: loss 2.9004, time 5260.32ms 
iter 1629: loss 2.8428, time 5261.89ms 
iter 1630: loss 3.0146, time 5251.53ms 
iter 1631: loss 2.8471, time 5314.11ms 
iter 1632: loss 2.9256, time 5249.47ms 
iter 1633: loss 2.8985, time 5233.63ms 
iter 1634: loss 3.2402, time 5245.54ms 
iter 1635: loss 3.1027, time 5233.16ms 
iter 1636: loss 3.0083, time 5253.21ms 
iter 1637: loss 2.9626, time 5246.04ms 
iter 1638: loss 2.9741, time 5246.55ms 
iter 1639: loss 3.1156, time 5250.63ms 
iter 1640: loss 2.7120, time 5265.24ms 
iter 1641: loss 2.9938, time 5250.55ms 
iter 1642: loss 2.7616, time 5256.30ms 
iter 1643: loss 2.9851, time 5227.08ms 
iter 1644: loss 2.9343, time 5201.95ms 
iter 1645: loss 3.0407, time 5265.36ms 
iter 1646: loss 2.9257, time 5256.39ms 
iter 1647: loss 2.9033, time 5257.31ms 
iter 1648: loss 2.9612, time 5256.42ms 
iter 1649: loss 2.8194, time 5242.60ms 
step 1650: train loss 2.9374, val loss 3.0147
iter 1650: loss 2.9250, time 19968.23ms 
iter 1651: loss 3.0226, time 5238.59ms 
iter 1652: loss 2.8099, time 5235.52ms 
iter 1653: loss 2.9213, time 5230.04ms 
iter 1654: loss 2.9385, time 5230.81ms 
iter 1655: loss 2.8653, time 5231.13ms 
iter 1656: loss 2.9865, time 5231.15ms 
iter 1657: loss 2.7579, time 5232.49ms 
iter 1658: loss 2.8141, time 5233.03ms 
iter 1659: loss 2.9072, time 5231.60ms 
iter 1660: loss 2.9709, time 5232.20ms 
iter 1661: loss 2.9270, time 5229.90ms 
iter 1662: loss 3.0997, time 5250.96ms 
iter 1663: loss 3.0022, time 5236.77ms 
iter 1664: loss 3.0350, time 5205.77ms 
iter 1665: loss 3.3255, time 5235.35ms 
iter 1666: loss 2.8847, time 5255.30ms 
iter 1667: loss 2.7006, time 5249.37ms 
iter 1668: loss 3.0046, time 5251.21ms 
iter 1669: loss 2.7890, time 5253.60ms 
iter 1670: loss 2.8238, time 5258.31ms 
iter 1671: loss 3.1338, time 5260.07ms 
iter 1672: loss 2.7904, time 5241.59ms 
iter 1673: loss 3.0703, time 5245.33ms 
iter 1674: loss 2.9025, time 5246.79ms 
iter 1675: loss 3.0231, time 5243.63ms 
iter 1676: loss 3.0129, time 5247.28ms 
iter 1677: loss 3.0007, time 5253.50ms 
iter 1678: loss 2.9542, time 5249.49ms 
iter 1679: loss 2.9371, time 5259.64ms 
iter 1680: loss 2.7821, time 5259.62ms 
iter 1681: loss 2.8083, time 5267.10ms 
iter 1682: loss 2.8338, time 5260.92ms 
iter 1683: loss 2.8777, time 5246.86ms 
iter 1684: loss 3.0303, time 5255.54ms 
iter 1685: loss 2.9767, time 5265.00ms 
iter 1686: loss 2.9499, time 5261.30ms 
iter 1687: loss 3.0159, time 5266.24ms 
iter 1688: loss 3.0612, time 5282.25ms 
iter 1689: loss 3.0330, time 5280.32ms 
iter 1690: loss 2.9041, time 5264.27ms 
iter 1691: loss 2.9019, time 5251.52ms 
iter 1692: loss 2.9499, time 5261.57ms 
iter 1693: loss 2.9301, time 5248.79ms 
iter 1694: loss 2.9280, time 5275.46ms 
iter 1695: loss 3.0132, time 5283.15ms 
iter 1696: loss 2.9065, time 5247.52ms 
iter 1697: loss 2.8790, time 5244.08ms 
iter 1698: loss 2.7929, time 5252.00ms 
iter 1699: loss 3.0087, time 5257.52ms 
step 1700: train loss 2.9338, val loss 2.9963
iter 1700: loss 2.9373, time 19998.41ms 
iter 1701: loss 3.0441, time 5301.11ms 
iter 1702: loss 2.8207, time 5328.42ms 
iter 1703: loss 2.9027, time 5267.04ms 
iter 1704: loss 3.0590, time 5248.52ms 
iter 1705: loss 3.0180, time 5250.30ms 
iter 1706: loss 2.8823, time 5246.76ms 
iter 1707: loss 2.7093, time 5312.30ms 
iter 1708: loss 2.9532, time 5278.94ms 
iter 1709: loss 2.9091, time 5269.33ms 
iter 1710: loss 2.9684, time 5326.39ms 
iter 1711: loss 2.9189, time 5320.15ms 
iter 1712: loss 3.0467, time 5249.78ms 
iter 1713: loss 3.0384, time 5245.70ms 
iter 1714: loss 2.9428, time 5251.25ms 
iter 1715: loss 3.0513, time 5280.68ms 
iter 1716: loss 2.9720, time 5252.28ms 
iter 1717: loss 2.8021, time 5244.26ms 
iter 1718: loss 2.9515, time 5245.87ms 
iter 1719: loss 2.9383, time 5302.07ms 
iter 1720: loss 2.9260, time 5320.83ms 
iter 1721: loss 2.9038, time 5265.93ms 
iter 1722: loss 2.9453, time 5269.04ms 
iter 1723: loss 2.7815, time 5265.21ms 
iter 1724: loss 2.9486, time 5322.02ms 
iter 1725: loss 2.8939, time 5280.81ms 
iter 1726: loss 2.8093, time 5259.26ms 
iter 1727: loss 2.8695, time 5294.88ms 
iter 1728: loss 2.9316, time 5256.56ms 
iter 1729: loss 3.0471, time 5254.87ms 
iter 1730: loss 2.9036, time 5259.31ms 
iter 1731: loss 2.9341, time 5257.93ms 
iter 1732: loss 2.6590, time 5275.41ms 
iter 1733: loss 2.9196, time 5274.17ms 
iter 1734: loss 3.0427, time 5248.29ms 
iter 1735: loss 2.9196, time 5242.73ms 
iter 1736: loss 3.1029, time 5248.95ms 
iter 1737: loss 2.8104, time 5256.55ms 
iter 1738: loss 2.8499, time 5255.08ms 
iter 1739: loss 2.8900, time 5252.69ms 
iter 1740: loss 2.8480, time 5250.98ms 
iter 1741: loss 2.6471, time 5259.57ms 
iter 1742: loss 2.7208, time 5249.14ms 
iter 1743: loss 2.7866, time 5252.34ms 
iter 1744: loss 3.0884, time 5252.33ms 
iter 1745: loss 3.0769, time 5243.99ms 
iter 1746: loss 2.9588, time 5246.35ms 
iter 1747: loss 2.8615, time 5247.73ms 
iter 1748: loss 2.7687, time 5247.17ms 
iter 1749: loss 3.1080, time 5255.27ms 
step 1750: train loss 2.9233, val loss 2.9812
iter 1750: loss 2.9862, time 20019.37ms 
iter 1751: loss 2.7542, time 5246.10ms 
iter 1752: loss 2.9737, time 5249.91ms 
iter 1753: loss 2.8069, time 5239.90ms 
iter 1754: loss 2.9472, time 5253.35ms 
iter 1755: loss 2.9472, time 5256.40ms 
iter 1756: loss 2.8290, time 5257.43ms 
iter 1757: loss 2.9577, time 5249.98ms 
iter 1758: loss 2.8598, time 5254.35ms 
iter 1759: loss 2.9123, time 5257.80ms 
iter 1760: loss 2.6758, time 5311.25ms 
iter 1761: loss 2.9122, time 5324.06ms 
iter 1762: loss 2.9546, time 5302.29ms 
iter 1763: loss 2.9091, time 5256.67ms 
iter 1764: loss 2.8906, time 5257.97ms 
iter 1765: loss 2.9236, time 5326.92ms 
iter 1766: loss 2.9655, time 5251.53ms 
iter 1767: loss 2.9033, time 5256.08ms 
iter 1768: loss 2.9705, time 5249.91ms 
iter 1769: loss 2.8219, time 5268.18ms 
iter 1770: loss 2.9821, time 5253.16ms 
iter 1771: loss 2.9521, time 5228.89ms 
iter 1772: loss 2.9528, time 5249.22ms 
iter 1773: loss 2.9687, time 5288.30ms 
iter 1774: loss 2.7937, time 5257.78ms 
iter 1775: loss 2.9509, time 5238.78ms 
iter 1776: loss 2.8257, time 5254.12ms 
iter 1777: loss 3.0161, time 5291.66ms 
iter 1778: loss 2.8314, time 5261.56ms 
iter 1779: loss 2.8813, time 5306.81ms 
iter 1780: loss 2.8965, time 5310.60ms 
iter 1781: loss 3.0210, time 5309.59ms 
iter 1782: loss 2.8726, time 5276.44ms 
iter 1783: loss 2.7014, time 5332.57ms 
iter 1784: loss 2.9343, time 5326.44ms 
iter 1785: loss 2.6512, time 5317.14ms 
iter 1786: loss 2.8067, time 5317.92ms 
iter 1787: loss 2.9175, time 5327.66ms 
iter 1788: loss 2.8351, time 5330.40ms 
iter 1789: loss 2.9061, time 5277.18ms 
iter 1790: loss 2.9077, time 5247.18ms 
iter 1791: loss 3.0108, time 5250.98ms 
iter 1792: loss 2.7708, time 5265.62ms 
iter 1793: loss 2.7903, time 5252.48ms 
iter 1794: loss 3.2649, time 5270.80ms 
iter 1795: loss 2.8878, time 5250.11ms 
iter 1796: loss 2.8448, time 5262.50ms 
iter 1797: loss 2.8489, time 5328.69ms 
iter 1798: loss 2.9391, time 5311.25ms 
iter 1799: loss 2.8167, time 5315.52ms 
step 1800: train loss 2.8967, val loss 2.9899
iter 1800: loss 2.8229, time 20040.67ms 
iter 1801: loss 3.0778, time 5274.60ms 
iter 1802: loss 2.8313, time 5260.93ms 
iter 1803: loss 2.9984, time 5289.38ms 
iter 1804: loss 2.9618, time 5309.59ms 
iter 1805: loss 2.9846, time 5301.65ms 
iter 1806: loss 2.8999, time 5282.57ms 
iter 1807: loss 2.8884, time 5255.19ms 
iter 1808: loss 2.9174, time 5258.53ms 
iter 1809: loss 2.8531, time 5256.21ms 
iter 1810: loss 2.8370, time 5260.45ms 
iter 1811: loss 2.9092, time 5258.11ms 
iter 1812: loss 3.0170, time 5258.81ms 
iter 1813: loss 2.8502, time 5265.91ms 
iter 1814: loss 2.8676, time 5255.13ms 
iter 1815: loss 2.7797, time 5252.09ms 
iter 1816: loss 2.9082, time 5253.85ms 
iter 1817: loss 2.7817, time 5256.64ms 
iter 1818: loss 3.0085, time 5256.46ms 
iter 1819: loss 2.7675, time 5258.35ms 
iter 1820: loss 2.9730, time 5259.40ms 
iter 1821: loss 2.8102, time 5258.85ms 
iter 1822: loss 2.9103, time 5257.08ms 
iter 1823: loss 2.6571, time 5254.24ms 
iter 1824: loss 2.9619, time 5253.43ms 
iter 1825: loss 3.0104, time 5255.20ms 
iter 1826: loss 2.8035, time 5254.80ms 
iter 1827: loss 2.9705, time 5267.32ms 
iter 1828: loss 3.0376, time 5262.05ms 
iter 1829: loss 3.0291, time 5273.79ms 
iter 1830: loss 2.8567, time 5256.02ms 
iter 1831: loss 2.7991, time 5253.65ms 
iter 1832: loss 2.8629, time 5248.22ms 
iter 1833: loss 2.7846, time 5268.01ms 
iter 1834: loss 2.9070, time 5262.06ms 
iter 1835: loss 2.9496, time 5266.23ms 
iter 1836: loss 2.8958, time 5252.09ms 
iter 1837: loss 2.9781, time 5256.43ms 
iter 1838: loss 3.0293, time 5263.86ms 
iter 1839: loss 3.0382, time 5254.02ms 
iter 1840: loss 2.8176, time 5251.37ms 
iter 1841: loss 2.9087, time 5255.35ms 
iter 1842: loss 2.7731, time 5253.91ms 
iter 1843: loss 2.7498, time 5247.63ms 
iter 1844: loss 2.8950, time 5237.43ms 
iter 1845: loss 2.7609, time 5250.37ms 
iter 1846: loss 2.7715, time 5259.51ms 
iter 1847: loss 2.7897, time 5221.69ms 
iter 1848: loss 2.8251, time 5260.04ms 
iter 1849: loss 2.9868, time 5236.69ms 
step 1850: train loss 2.9108, val loss 2.9823
iter 1850: loss 3.0207, time 20008.48ms 
iter 1851: loss 3.0427, time 5249.56ms 
iter 1852: loss 2.9360, time 5253.72ms 
iter 1853: loss 3.0580, time 5259.71ms 
iter 1854: loss 2.7773, time 5252.44ms 
iter 1855: loss 2.8584, time 5249.10ms 
iter 1856: loss 2.9164, time 5247.12ms 
iter 1857: loss 2.8449, time 5255.19ms 
iter 1858: loss 2.9358, time 5251.35ms 
iter 1859: loss 2.8575, time 5246.84ms 
iter 1860: loss 2.9060, time 5259.44ms 
iter 1861: loss 2.8297, time 5248.76ms 
iter 1862: loss 2.8027, time 5264.29ms 
iter 1863: loss 2.9024, time 5202.46ms 
iter 1864: loss 3.0883, time 5257.59ms 
iter 1865: loss 2.7572, time 5256.25ms 
iter 1866: loss 2.8786, time 5252.23ms 
iter 1867: loss 3.0011, time 5252.16ms 
iter 1868: loss 2.7241, time 5245.17ms 
iter 1869: loss 2.8910, time 5264.30ms 
iter 1870: loss 2.9519, time 5255.37ms 
iter 1871: loss 3.0069, time 5273.77ms 
iter 1872: loss 3.0119, time 5268.33ms 
iter 1873: loss 2.7886, time 5276.35ms 
iter 1874: loss 2.7425, time 5245.99ms 
iter 1875: loss 2.7326, time 5248.07ms 
iter 1876: loss 2.9528, time 5241.81ms 
iter 1877: loss 2.7875, time 5247.07ms 
iter 1878: loss 2.8824, time 5258.54ms 
iter 1879: loss 3.0151, time 5246.07ms 
iter 1880: loss 2.9205, time 5255.69ms 
iter 1881: loss 2.9025, time 5254.35ms 
iter 1882: loss 2.8009, time 5281.96ms 
iter 1883: loss 2.9684, time 5247.61ms 
iter 1884: loss 2.9877, time 5264.07ms 
iter 1885: loss 2.7662, time 5248.90ms 
iter 1886: loss 2.8861, time 5252.62ms 
iter 1887: loss 2.8954, time 5262.44ms 
iter 1888: loss 2.5906, time 5255.52ms 
iter 1889: loss 2.8206, time 5244.61ms 
iter 1890: loss 2.9009, time 5254.62ms 
iter 1891: loss 2.7561, time 5255.70ms 
iter 1892: loss 2.7050, time 5250.83ms 
iter 1893: loss 2.6386, time 5247.87ms 
iter 1894: loss 2.8430, time 5248.06ms 
iter 1895: loss 2.7576, time 5256.01ms 
iter 1896: loss 2.9255, time 5246.33ms 
iter 1897: loss 3.0054, time 5256.09ms 
iter 1898: loss 2.7434, time 5258.09ms 
iter 1899: loss 2.9547, time 5262.39ms 
step 1900: train loss 2.8947, val loss 2.9782
iter 1900: loss 3.0124, time 20167.75ms 
iter 1901: loss 2.9628, time 5235.56ms 
iter 1902: loss 2.8750, time 5239.24ms 
iter 1903: loss 2.7756, time 5253.91ms 
iter 1904: loss 2.9609, time 5292.19ms 
iter 1905: loss 2.9244, time 5243.84ms 
iter 1906: loss 2.8729, time 5251.81ms 
iter 1907: loss 2.7456, time 5300.61ms 
iter 1908: loss 2.7931, time 5278.41ms 
iter 1909: loss 2.6815, time 5252.57ms 
iter 1910: loss 2.9178, time 5250.98ms 
iter 1911: loss 2.9902, time 5251.62ms 
iter 1912: loss 2.8039, time 5262.98ms 
iter 1913: loss 2.7495, time 5263.68ms 
iter 1914: loss 2.9569, time 5271.52ms 
iter 1915: loss 2.8976, time 5297.45ms 
iter 1916: loss 2.8609, time 5267.19ms 
iter 1917: loss 2.8547, time 5255.45ms 
iter 1918: loss 2.8091, time 5291.47ms 
iter 1919: loss 2.9791, time 5278.00ms 
iter 1920: loss 2.7582, time 5249.55ms 
iter 1921: loss 2.9236, time 5276.96ms 
iter 1922: loss 3.1350, time 5264.78ms 
iter 1923: loss 3.0589, time 5257.59ms 
iter 1924: loss 2.8317, time 5257.30ms 
iter 1925: loss 2.9498, time 5247.58ms 
iter 1926: loss 2.8747, time 5248.51ms 
iter 1927: loss 2.8735, time 5251.81ms 
iter 1928: loss 2.7764, time 5248.59ms 
iter 1929: loss 3.0024, time 5303.34ms 
iter 1930: loss 2.8662, time 5273.29ms 
iter 1931: loss 2.8490, time 5329.90ms 
iter 1932: loss 2.9309, time 5271.37ms 
iter 1933: loss 2.7638, time 5235.16ms 
iter 1934: loss 2.9624, time 5234.92ms 
iter 1935: loss 2.7829, time 5236.49ms 
iter 1936: loss 2.9203, time 5240.55ms 
iter 1937: loss 2.8890, time 5247.56ms 
iter 1938: loss 2.9653, time 5247.52ms 
iter 1939: loss 2.9363, time 5248.91ms 
iter 1940: loss 2.7950, time 5256.85ms 
iter 1941: loss 2.6695, time 5270.74ms 
iter 1942: loss 2.8026, time 5252.99ms 
iter 1943: loss 2.8029, time 5249.61ms 
iter 1944: loss 2.8647, time 5248.79ms 
iter 1945: loss 3.0533, time 5255.95ms 
iter 1946: loss 2.9912, time 5248.82ms 
iter 1947: loss 2.8806, time 5255.39ms 
iter 1948: loss 2.8759, time 5267.47ms 
iter 1949: loss 2.7981, time 5248.93ms 
step 1950: train loss 2.8792, val loss 2.9790
iter 1950: loss 2.7368, time 20000.99ms 
iter 1951: loss 3.0555, time 5266.01ms 
iter 1952: loss 2.9556, time 5263.12ms 
iter 1953: loss 2.9127, time 5269.91ms 
iter 1954: loss 2.9162, time 5253.65ms 
iter 1955: loss 2.8969, time 5246.70ms 
iter 1956: loss 2.9989, time 5246.82ms 
iter 1957: loss 2.8310, time 5247.06ms 
iter 1958: loss 2.9751, time 5258.15ms 
iter 1959: loss 2.8712, time 5251.79ms 
iter 1960: loss 2.7525, time 5252.54ms 
iter 1961: loss 2.8666, time 5263.26ms 
iter 1962: loss 2.7097, time 5248.72ms 
iter 1963: loss 3.0906, time 5250.36ms 
iter 1964: loss 2.9898, time 5250.23ms 
iter 1965: loss 2.9637, time 5244.76ms 
iter 1966: loss 2.9642, time 5249.06ms 
iter 1967: loss 2.6871, time 5256.49ms 
iter 1968: loss 2.9530, time 5251.76ms 
iter 1969: loss 2.7511, time 5249.35ms 
iter 1970: loss 3.1237, time 5254.13ms 
iter 1971: loss 2.8490, time 5245.21ms 
iter 1972: loss 3.0658, time 5243.57ms 
iter 1973: loss 2.8723, time 5237.05ms 
iter 1974: loss 2.8064, time 5216.08ms 
iter 1975: loss 2.8912, time 5251.92ms 
iter 1976: loss 2.9689, time 5255.11ms 
iter 1977: loss 2.6532, time 5270.15ms 
iter 1978: loss 2.9094, time 5334.47ms 
iter 1979: loss 2.9578, time 5279.25ms 
iter 1980: loss 2.9520, time 5325.34ms 
iter 1981: loss 2.9404, time 5244.68ms 
iter 1982: loss 2.7295, time 5242.82ms 
iter 1983: loss 2.9479, time 5246.62ms 
iter 1984: loss 2.7686, time 5246.59ms 
iter 1985: loss 2.8781, time 5251.16ms 
iter 1986: loss 3.1594, time 5244.69ms 
iter 1987: loss 2.9863, time 5258.24ms 
iter 1988: loss 2.9961, time 5247.68ms 
iter 1989: loss 2.9496, time 5248.17ms 
iter 1990: loss 2.8865, time 5249.87ms 
iter 1991: loss 2.8943, time 5249.27ms 
iter 1992: loss 2.8506, time 5254.25ms 
iter 1993: loss 2.9187, time 5256.60ms 
iter 1994: loss 2.9015, time 5260.91ms 
iter 1995: loss 3.0302, time 5274.85ms 
iter 1996: loss 2.8957, time 5260.40ms 
iter 1997: loss 3.0309, time 5256.33ms 
iter 1998: loss 2.7226, time 5248.89ms 
iter 1999: loss 2.8610, time 5255.92ms 
step 2000: train loss 2.8764, val loss 2.9570
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2000: loss 2.9695, time 21107.54ms 
iter 2001: loss 2.9667, time 5258.47ms 
iter 2002: loss 2.9985, time 5255.39ms 
iter 2003: loss 2.6392, time 5262.23ms 
iter 2004: loss 2.7655, time 5254.20ms 
iter 2005: loss 2.8994, time 5259.72ms 
iter 2006: loss 2.8639, time 5261.77ms 
iter 2007: loss 2.8477, time 5258.09ms 
iter 2008: loss 2.8972, time 5257.29ms 
iter 2009: loss 2.8183, time 5256.16ms 
iter 2010: loss 2.8693, time 5302.54ms 
iter 2011: loss 2.9337, time 5332.97ms 
iter 2012: loss 2.9446, time 5333.88ms 
iter 2013: loss 2.6461, time 5284.29ms 
iter 2014: loss 2.8859, time 5298.51ms 
iter 2015: loss 2.8349, time 5276.27ms 
iter 2016: loss 3.1193, time 5296.83ms 
iter 2017: loss 2.8672, time 5293.30ms 
iter 2018: loss 2.6660, time 5246.45ms 
iter 2019: loss 2.8760, time 5283.43ms 
iter 2020: loss 2.9629, time 5289.25ms 
iter 2021: loss 2.6762, time 5332.27ms 
iter 2022: loss 2.8404, time 5199.47ms 
iter 2023: loss 2.7959, time 5212.72ms 
iter 2024: loss 2.7961, time 5236.95ms 
iter 2025: loss 2.7350, time 5142.50ms 
iter 2026: loss 2.8378, time 5192.22ms 
iter 2027: loss 2.8417, time 5226.67ms 
iter 2028: loss 2.9163, time 5232.88ms 
iter 2029: loss 2.9804, time 5247.80ms 
iter 2030: loss 2.7713, time 5230.50ms 
iter 2031: loss 2.8461, time 5240.85ms 
iter 2032: loss 2.9009, time 5228.83ms 
iter 2033: loss 2.7968, time 5268.71ms 
iter 2034: loss 2.8761, time 5233.48ms 
iter 2035: loss 2.8264, time 5226.93ms 
iter 2036: loss 2.9245, time 5246.69ms 
iter 2037: loss 2.7771, time 5220.87ms 
iter 2038: loss 2.9539, time 5221.77ms 
iter 2039: loss 3.0759, time 5242.20ms 
iter 2040: loss 3.0876, time 5234.67ms 
iter 2041: loss 2.9130, time 5248.31ms 
iter 2042: loss 2.6732, time 5239.26ms 
iter 2043: loss 2.9190, time 5231.77ms 
iter 2044: loss 2.8584, time 5236.02ms 
iter 2045: loss 2.9153, time 5268.18ms 
iter 2046: loss 2.8163, time 5310.53ms 
iter 2047: loss 2.8175, time 5326.48ms 
iter 2048: loss 2.7765, time 5287.38ms 
iter 2049: loss 3.0226, time 5273.28ms 
step 2050: train loss 2.8730, val loss 2.9668
iter 2050: loss 2.8148, time 19998.52ms 
iter 2051: loss 2.8172, time 5244.37ms 
iter 2052: loss 2.7295, time 5245.41ms 
iter 2053: loss 2.9320, time 5253.80ms 
iter 2054: loss 3.0510, time 5255.87ms 
iter 2055: loss 2.8872, time 5249.39ms 
iter 2056: loss 2.7987, time 5267.94ms 
iter 2057: loss 2.7326, time 5259.41ms 
iter 2058: loss 2.9815, time 5255.08ms 
iter 2059: loss 2.7329, time 5234.80ms 
iter 2060: loss 2.7657, time 5208.65ms 
iter 2061: loss 2.8129, time 5176.86ms 
iter 2062: loss 2.7892, time 5280.38ms 
iter 2063: loss 2.9228, time 5301.26ms 
iter 2064: loss 3.0395, time 5254.08ms 
iter 2065: loss 2.7369, time 5255.59ms 
iter 2066: loss 2.8202, time 5249.49ms 
iter 2067: loss 2.9287, time 5256.36ms 
iter 2068: loss 2.8009, time 5246.17ms 
iter 2069: loss 2.9150, time 5273.58ms 
iter 2070: loss 2.7601, time 5240.27ms 
iter 2071: loss 2.8222, time 5232.79ms 
iter 2072: loss 2.7427, time 5228.35ms 
iter 2073: loss 2.8070, time 5200.88ms 
iter 2074: loss 2.7814, time 5246.58ms 
iter 2075: loss 2.9088, time 5257.38ms 
iter 2076: loss 2.9438, time 5167.29ms 
iter 2077: loss 2.9125, time 5107.41ms 
iter 2078: loss 3.0002, time 5131.22ms 
iter 2079: loss 2.7862, time 5172.05ms 
iter 2080: loss 2.9142, time 5189.22ms 
iter 2081: loss 2.8798, time 5203.45ms 
iter 2082: loss 2.6912, time 5200.33ms 
iter 2083: loss 2.8439, time 5149.12ms 
iter 2084: loss 3.0023, time 5120.49ms 
iter 2085: loss 2.8317, time 5100.03ms 
iter 2086: loss 2.9789, time 5125.73ms 
iter 2087: loss 2.7532, time 5207.17ms 
iter 2088: loss 2.7755, time 5226.79ms 
iter 2089: loss 2.8303, time 5217.43ms 
iter 2090: loss 2.7225, time 5179.33ms 
iter 2091: loss 2.8151, time 5214.62ms 
iter 2092: loss 2.7472, time 5282.38ms 
iter 2093: loss 2.8292, time 5287.79ms 
iter 2094: loss 2.7580, time 5316.14ms 
iter 2095: loss 2.9255, time 5322.28ms 
iter 2096: loss 2.8467, time 5330.04ms 
iter 2097: loss 2.7193, time 5328.50ms 
iter 2098: loss 2.6663, time 5298.92ms 
iter 2099: loss 2.8060, time 5285.33ms 
step 2100: train loss 2.8486, val loss 2.9515
iter 2100: loss 2.8055, time 20052.87ms 
iter 2101: loss 2.9687, time 5276.35ms 
iter 2102: loss 2.8590, time 5282.54ms 
iter 2103: loss 2.8317, time 5269.32ms 
iter 2104: loss 2.7872, time 5265.88ms 
iter 2105: loss 2.6755, time 5265.52ms 
iter 2106: loss 2.8022, time 5270.36ms 
iter 2107: loss 2.7500, time 5262.61ms 
iter 2108: loss 2.6230, time 5265.86ms 
iter 2109: loss 2.7144, time 5259.83ms 
iter 2110: loss 2.7287, time 5262.76ms 
iter 2111: loss 2.8803, time 5267.59ms 
iter 2112: loss 2.8570, time 5257.65ms 
iter 2113: loss 2.7688, time 5252.15ms 
iter 2114: loss 2.8861, time 5247.13ms 
iter 2115: loss 3.0130, time 5256.21ms 
iter 2116: loss 2.8398, time 5250.77ms 
iter 2117: loss 2.8456, time 5249.21ms 
iter 2118: loss 3.0141, time 5256.57ms 
iter 2119: loss 2.8979, time 5254.87ms 
iter 2120: loss 2.9228, time 5261.83ms 
iter 2121: loss 2.9124, time 5248.78ms 
iter 2122: loss 2.9410, time 5255.11ms 
iter 2123: loss 3.0228, time 5247.39ms 
iter 2124: loss 2.7269, time 5244.20ms 
iter 2125: loss 3.1159, time 5250.49ms 
iter 2126: loss 2.8223, time 5244.65ms 
iter 2127: loss 2.7087, time 5246.96ms 
iter 2128: loss 2.8703, time 5248.17ms 
iter 2129: loss 2.9309, time 5267.90ms 
iter 2130: loss 2.9346, time 5247.68ms 
iter 2131: loss 2.8254, time 5250.58ms 
iter 2132: loss 2.9942, time 5258.95ms 
iter 2133: loss 2.5741, time 5249.02ms 
iter 2134: loss 2.8014, time 5247.74ms 
iter 2135: loss 2.8906, time 5254.06ms 
iter 2136: loss 2.7368, time 5244.17ms 
iter 2137: loss 2.7630, time 5243.97ms 
iter 2138: loss 2.8327, time 5249.79ms 
iter 2139: loss 2.8704, time 5270.35ms 
iter 2140: loss 2.7267, time 5250.75ms 
iter 2141: loss 2.6885, time 5249.11ms 
iter 2142: loss 2.7700, time 5256.26ms 
iter 2143: loss 2.9424, time 5245.92ms 
iter 2144: loss 2.8396, time 5253.61ms 
iter 2145: loss 2.8277, time 5261.12ms 
iter 2146: loss 2.9076, time 5257.19ms 
iter 2147: loss 2.6091, time 5273.70ms 
iter 2148: loss 2.7148, time 5251.39ms 
iter 2149: loss 2.7143, time 5259.59ms 
step 2150: train loss 2.8387, val loss 2.9421
iter 2150: loss 2.7402, time 19974.22ms 
iter 2151: loss 2.8050, time 5246.98ms 
iter 2152: loss 2.8708, time 5262.17ms 
iter 2153: loss 2.8526, time 5264.26ms 
iter 2154: loss 2.9846, time 5256.72ms 
iter 2155: loss 2.8767, time 5275.47ms 
iter 2156: loss 2.8367, time 5252.07ms 
iter 2157: loss 2.7293, time 5235.73ms 
iter 2158: loss 2.8500, time 5255.42ms 
iter 2159: loss 2.6105, time 5257.34ms 
iter 2160: loss 2.7774, time 5246.75ms 
iter 2161: loss 2.7210, time 5255.41ms 
iter 2162: loss 2.8488, time 5253.70ms 
iter 2163: loss 2.8533, time 5236.09ms 
iter 2164: loss 2.6909, time 5252.26ms 
iter 2165: loss 2.6128, time 5254.65ms 
iter 2166: loss 3.0452, time 5251.18ms 
iter 2167: loss 2.7552, time 5248.70ms 
iter 2168: loss 2.8518, time 5254.43ms 
iter 2169: loss 2.7541, time 5254.67ms 
iter 2170: loss 2.7096, time 5255.08ms 
iter 2171: loss 2.7987, time 5248.43ms 
iter 2172: loss 2.7884, time 5294.37ms 
iter 2173: loss 2.7888, time 5255.52ms 
iter 2174: loss 2.9177, time 5274.43ms 
iter 2175: loss 2.9961, time 5254.98ms 
iter 2176: loss 2.9617, time 5254.04ms 
iter 2177: loss 3.0278, time 5250.75ms 
iter 2178: loss 2.7152, time 5257.56ms 
iter 2179: loss 2.7620, time 5264.00ms 
iter 2180: loss 2.9382, time 5247.21ms 
iter 2181: loss 2.7708, time 5253.86ms 
iter 2182: loss 2.7994, time 5251.27ms 
iter 2183: loss 2.7216, time 5248.63ms 
iter 2184: loss 2.8136, time 5248.02ms 
iter 2185: loss 2.7695, time 5245.35ms 
iter 2186: loss 2.8373, time 5249.90ms 
iter 2187: loss 2.9654, time 5200.08ms 
iter 2188: loss 2.9513, time 5253.18ms 
iter 2189: loss 2.6208, time 5295.80ms 
iter 2190: loss 2.8880, time 5336.45ms 
iter 2191: loss 2.8267, time 5248.51ms 
iter 2192: loss 2.8335, time 5247.16ms 
iter 2193: loss 2.7914, time 5245.80ms 
iter 2194: loss 2.7568, time 5245.84ms 
iter 2195: loss 2.7791, time 5326.93ms 
iter 2196: loss 2.8558, time 5231.78ms 
iter 2197: loss 2.8612, time 5245.80ms 
iter 2198: loss 2.9504, time 5254.07ms 
iter 2199: loss 2.7848, time 5257.51ms 
step 2200: train loss 2.8430, val loss 2.9334
iter 2200: loss 2.8006, time 19985.97ms 
iter 2201: loss 2.9069, time 5243.74ms 
iter 2202: loss 2.8521, time 5271.01ms 
iter 2203: loss 2.9764, time 5278.93ms 
iter 2204: loss 2.7665, time 5290.66ms 
iter 2205: loss 2.9124, time 5279.91ms 
iter 2206: loss 2.7380, time 5317.81ms 
iter 2207: loss 2.7867, time 5326.60ms 
iter 2208: loss 2.7840, time 5126.27ms 
iter 2209: loss 2.8114, time 5091.17ms 
iter 2210: loss 2.7881, time 5079.21ms 
iter 2211: loss 2.5718, time 5131.52ms 
iter 2212: loss 2.6179, time 5259.42ms 
iter 2213: loss 2.6917, time 5250.34ms 
iter 2214: loss 2.8413, time 5258.62ms 
iter 2215: loss 2.8523, time 5257.22ms 
iter 2216: loss 2.8535, time 5254.65ms 
iter 2217: loss 2.8365, time 5264.23ms 
iter 2218: loss 2.7256, time 5260.21ms 
iter 2219: loss 2.8146, time 5262.42ms 
iter 2220: loss 2.6036, time 5261.10ms 
iter 2221: loss 2.7988, time 5252.89ms 
iter 2222: loss 2.9603, time 5247.82ms 
iter 2223: loss 2.9046, time 5253.69ms 
iter 2224: loss 2.8948, time 5258.74ms 
iter 2225: loss 2.8721, time 5273.34ms 
iter 2226: loss 2.8993, time 5254.69ms 
iter 2227: loss 2.6783, time 5245.88ms 
iter 2228: loss 2.7416, time 5266.22ms 
iter 2229: loss 2.8453, time 5265.37ms 
iter 2230: loss 2.7144, time 5265.65ms 
iter 2231: loss 2.6738, time 5246.79ms 
iter 2232: loss 2.8789, time 5243.24ms 
iter 2233: loss 2.8548, time 5242.40ms 
iter 2234: loss 2.7926, time 5251.45ms 
iter 2235: loss 2.6963, time 5177.16ms 
iter 2236: loss 2.8380, time 5253.97ms 
iter 2237: loss 2.6167, time 5281.04ms 
iter 2238: loss 2.9540, time 5293.65ms 
iter 2239: loss 2.8811, time 5262.12ms 
iter 2240: loss 2.8711, time 5248.42ms 
iter 2241: loss 2.7305, time 5257.64ms 
iter 2242: loss 2.8243, time 5253.62ms 
iter 2243: loss 2.7026, time 5260.37ms 
iter 2244: loss 2.7002, time 5253.20ms 
iter 2245: loss 2.9027, time 5258.12ms 
iter 2246: loss 2.9123, time 5250.73ms 
iter 2247: loss 3.0218, time 5267.15ms 
iter 2248: loss 2.8083, time 5255.01ms 
iter 2249: loss 3.0123, time 5308.03ms 
step 2250: train loss 2.8230, val loss 2.9238
iter 2250: loss 2.8594, time 20106.79ms 
iter 2251: loss 2.6934, time 5296.10ms 
iter 2252: loss 2.9745, time 5267.34ms 
iter 2253: loss 2.8588, time 5292.23ms 
iter 2254: loss 2.9155, time 5292.75ms 
iter 2255: loss 2.8257, time 5284.08ms 
iter 2256: loss 2.9224, time 5260.24ms 
iter 2257: loss 2.8003, time 5272.55ms 
iter 2258: loss 2.5493, time 5265.61ms 
iter 2259: loss 2.7779, time 5260.79ms 
iter 2260: loss 2.7226, time 5268.22ms 
iter 2261: loss 2.8613, time 5253.49ms 
iter 2262: loss 2.9374, time 5266.09ms 
iter 2263: loss 2.7939, time 5255.75ms 
iter 2264: loss 2.8305, time 5260.40ms 
iter 2265: loss 2.7661, time 5255.43ms 
iter 2266: loss 2.7576, time 5257.25ms 
iter 2267: loss 2.8657, time 5250.82ms 
iter 2268: loss 2.8169, time 5251.67ms 
iter 2269: loss 2.7644, time 5253.37ms 
iter 2270: loss 2.7010, time 5253.68ms 
iter 2271: loss 2.9125, time 5253.77ms 
iter 2272: loss 2.8234, time 5257.40ms 
iter 2273: loss 2.7034, time 5246.42ms 
iter 2274: loss 2.8364, time 5251.97ms 
iter 2275: loss 2.6747, time 5245.70ms 
iter 2276: loss 2.8938, time 5246.33ms 
iter 2277: loss 2.9276, time 5245.38ms 
iter 2278: loss 2.7301, time 5255.35ms 
iter 2279: loss 2.7760, time 5254.45ms 
iter 2280: loss 2.7199, time 5280.93ms 
iter 2281: loss 2.7871, time 5254.86ms 
iter 2282: loss 2.8126, time 5247.36ms 
iter 2283: loss 2.7605, time 5250.35ms 
iter 2284: loss 2.9221, time 5247.19ms 
iter 2285: loss 2.9274, time 5254.37ms 
iter 2286: loss 2.9193, time 5259.63ms 
iter 2287: loss 2.9801, time 5249.49ms 
iter 2288: loss 3.0983, time 5252.13ms 
iter 2289: loss 3.0204, time 5250.36ms 
iter 2290: loss 2.7678, time 5251.14ms 
iter 2291: loss 2.9489, time 5250.29ms 
iter 2292: loss 2.8155, time 5247.51ms 
iter 2293: loss 2.9919, time 5253.47ms 
iter 2294: loss 2.8230, time 5214.80ms 
iter 2295: loss 2.7721, time 5281.84ms 
iter 2296: loss 2.8203, time 5256.04ms 
iter 2297: loss 2.7642, time 5263.01ms 
iter 2298: loss 3.0556, time 5263.36ms 
iter 2299: loss 2.7560, time 5265.12ms 
step 2300: train loss 2.8350, val loss 2.9177
iter 2300: loss 2.7870, time 19920.73ms 
iter 2301: loss 2.8905, time 5252.81ms 
iter 2302: loss 2.7800, time 5265.83ms 
iter 2303: loss 2.7821, time 5251.20ms 
iter 2304: loss 2.9502, time 5258.79ms 
iter 2305: loss 2.8239, time 5254.42ms 
iter 2306: loss 2.9923, time 5326.88ms 
iter 2307: loss 2.8128, time 5333.00ms 
iter 2308: loss 2.6954, time 5256.14ms 
iter 2309: loss 2.8488, time 5233.02ms 
iter 2310: loss 2.7322, time 5225.94ms 
iter 2311: loss 3.0384, time 5319.96ms 
iter 2312: loss 2.8955, time 5288.91ms 
iter 2313: loss 2.6398, time 5267.32ms 
iter 2314: loss 2.8597, time 5281.77ms 
iter 2315: loss 2.7972, time 5271.36ms 
iter 2316: loss 2.7005, time 5261.96ms 
iter 2317: loss 2.7140, time 5316.22ms 
iter 2318: loss 2.8907, time 5266.67ms 
iter 2319: loss 3.0104, time 5248.26ms 
iter 2320: loss 2.7800, time 5247.06ms 
iter 2321: loss 2.8499, time 5260.35ms 
iter 2322: loss 2.8349, time 5258.74ms 
iter 2323: loss 2.7307, time 5259.25ms 
iter 2324: loss 2.8354, time 5260.55ms 
iter 2325: loss 2.7892, time 5260.75ms 
iter 2326: loss 2.7256, time 5260.75ms 
iter 2327: loss 2.8856, time 5258.36ms 
iter 2328: loss 3.0106, time 5261.88ms 
iter 2329: loss 2.8419, time 5265.62ms 
iter 2330: loss 2.7858, time 5264.37ms 
iter 2331: loss 2.6963, time 5242.49ms 
iter 2332: loss 2.6511, time 5251.49ms 
iter 2333: loss 2.6608, time 5253.86ms 
iter 2334: loss 2.7011, time 5305.07ms 
iter 2335: loss 2.8553, time 5284.13ms 
iter 2336: loss 2.7761, time 5242.06ms 
iter 2337: loss 2.7331, time 5180.83ms 
iter 2338: loss 2.7254, time 5256.02ms 
iter 2339: loss 2.6425, time 5254.34ms 
iter 2340: loss 2.8484, time 5253.64ms 
iter 2341: loss 2.7869, time 5250.02ms 
iter 2342: loss 2.9288, time 5264.42ms 
iter 2343: loss 2.7353, time 5257.78ms 
iter 2344: loss 2.7030, time 5249.35ms 
iter 2345: loss 2.9013, time 5248.16ms 
iter 2346: loss 2.7731, time 5261.88ms 
iter 2347: loss 2.6402, time 5266.90ms 
iter 2348: loss 2.8139, time 5264.29ms 
iter 2349: loss 3.0124, time 5255.66ms 
step 2350: train loss 2.7986, val loss 2.9085
iter 2350: loss 2.5015, time 20064.68ms 
iter 2351: loss 2.7235, time 5259.55ms 
iter 2352: loss 2.9071, time 5268.53ms 
iter 2353: loss 2.6472, time 5322.12ms 
iter 2354: loss 2.8601, time 5242.14ms 
iter 2355: loss 2.9048, time 5252.16ms 
iter 2356: loss 2.7878, time 5254.91ms 
iter 2357: loss 2.7406, time 5262.59ms 
iter 2358: loss 2.9520, time 5264.99ms 
iter 2359: loss 2.6632, time 5271.67ms 
iter 2360: loss 2.9456, time 5275.38ms 
iter 2361: loss 2.6039, time 5280.84ms 
iter 2362: loss 2.7407, time 5267.09ms 
iter 2363: loss 2.7160, time 5272.95ms 
iter 2364: loss 2.8168, time 5312.65ms 
iter 2365: loss 2.6833, time 5323.61ms 
iter 2366: loss 2.8299, time 5265.36ms 
iter 2367: loss 2.5946, time 5251.13ms 
iter 2368: loss 2.6026, time 5254.41ms 
iter 2369: loss 2.8406, time 5262.51ms 
iter 2370: loss 3.0059, time 5255.87ms 
iter 2371: loss 2.9773, time 5251.57ms 
iter 2372: loss 2.8292, time 5222.00ms 
iter 2373: loss 2.8057, time 5249.24ms 
iter 2374: loss 2.8219, time 5265.13ms 
iter 2375: loss 2.9183, time 5256.24ms 
iter 2376: loss 2.8398, time 5254.86ms 
iter 2377: loss 2.7929, time 5258.72ms 
iter 2378: loss 2.7110, time 5251.09ms 
iter 2379: loss 2.6755, time 5226.38ms 
iter 2380: loss 2.9256, time 5249.49ms 
iter 2381: loss 2.6366, time 5229.15ms 
iter 2382: loss 2.9230, time 5246.00ms 
iter 2383: loss 2.7952, time 5253.14ms 
iter 2384: loss 2.7412, time 5256.64ms 
iter 2385: loss 2.6859, time 5248.26ms 
iter 2386: loss 3.0130, time 5243.39ms 
iter 2387: loss 2.8751, time 5259.68ms 
iter 2388: loss 2.7832, time 5253.54ms 
iter 2389: loss 2.8034, time 5246.44ms 
iter 2390: loss 2.8744, time 5247.86ms 
iter 2391: loss 2.6829, time 5055.24ms 
iter 2392: loss 2.8821, time 5261.12ms 
iter 2393: loss 2.7135, time 5253.47ms 
iter 2394: loss 2.8283, time 5232.14ms 
iter 2395: loss 2.7907, time 5247.05ms 
iter 2396: loss 2.6908, time 5254.83ms 
iter 2397: loss 2.9341, time 5252.36ms 
iter 2398: loss 2.9946, time 5251.80ms 
iter 2399: loss 2.7421, time 5257.54ms 
step 2400: train loss 2.8025, val loss 2.9200
iter 2400: loss 2.7587, time 19924.10ms 
iter 2401: loss 2.8658, time 5244.53ms 
iter 2402: loss 2.9690, time 5247.29ms 
iter 2403: loss 2.8874, time 5252.62ms 
iter 2404: loss 2.9327, time 5252.37ms 
iter 2405: loss 2.7187, time 5254.91ms 
iter 2406: loss 2.8003, time 5250.93ms 
iter 2407: loss 2.6570, time 5254.61ms 
iter 2408: loss 2.7534, time 5247.64ms 
iter 2409: loss 2.9050, time 5251.27ms 
iter 2410: loss 2.8524, time 5245.72ms 
iter 2411: loss 2.7472, time 5264.63ms 
iter 2412: loss 2.8341, time 5247.87ms 
iter 2413: loss 2.8771, time 5125.59ms 
iter 2414: loss 2.8245, time 5036.69ms 
iter 2415: loss 2.8196, time 5191.31ms 
iter 2416: loss 2.8613, time 5208.27ms 
iter 2417: loss 2.7162, time 5155.04ms 
iter 2418: loss 2.7296, time 5146.83ms 
iter 2419: loss 2.9576, time 5235.41ms 
iter 2420: loss 3.0475, time 5167.20ms 
iter 2421: loss 2.8888, time 5259.64ms 
iter 2422: loss 2.8167, time 5162.37ms 
iter 2423: loss 2.7856, time 5247.90ms 
iter 2424: loss 2.7102, time 5127.65ms 
iter 2425: loss 2.8305, time 5242.92ms 
iter 2426: loss 2.6916, time 5135.14ms 
iter 2427: loss 2.7003, time 5229.63ms 
iter 2428: loss 2.9339, time 5065.26ms 
iter 2429: loss 2.7354, time 5209.54ms 
iter 2430: loss 2.7697, time 5186.20ms 
iter 2431: loss 2.8514, time 5108.34ms 
iter 2432: loss 2.7762, time 5205.15ms 
iter 2433: loss 2.8174, time 5209.86ms 
iter 2434: loss 2.6886, time 5234.28ms 
iter 2435: loss 2.7915, time 5265.38ms 
iter 2436: loss 2.7538, time 5258.70ms 
iter 2437: loss 2.9716, time 5256.40ms 
iter 2438: loss 2.6177, time 5258.19ms 
iter 2439: loss 2.6106, time 5257.05ms 
iter 2440: loss 2.8036, time 5264.59ms 
iter 2441: loss 2.7975, time 5263.21ms 
iter 2442: loss 2.9430, time 5261.28ms 
iter 2443: loss 2.7740, time 5254.71ms 
iter 2444: loss 2.8118, time 5271.56ms 
iter 2445: loss 2.8687, time 5233.32ms 
iter 2446: loss 2.9761, time 5274.22ms 
iter 2447: loss 2.9551, time 5253.17ms 
iter 2448: loss 2.9429, time 5254.76ms 
iter 2449: loss 2.7802, time 5302.78ms 
step 2450: train loss 2.7914, val loss 2.9095
iter 2450: loss 2.7634, time 20008.68ms 
iter 2451: loss 2.8545, time 5245.51ms 
iter 2452: loss 3.0111, time 5250.38ms 
iter 2453: loss 2.8202, time 5246.39ms 
iter 2454: loss 2.9893, time 5244.42ms 
iter 2455: loss 2.7756, time 5250.09ms 
iter 2456: loss 2.7083, time 5256.20ms 
iter 2457: loss 2.8192, time 5250.43ms 
iter 2458: loss 2.7800, time 5238.19ms 
iter 2459: loss 2.8155, time 5249.26ms 
iter 2460: loss 2.6928, time 5265.70ms 
iter 2461: loss 2.8469, time 5261.10ms 
iter 2462: loss 2.5459, time 5254.05ms 
iter 2463: loss 2.7818, time 5289.81ms 
iter 2464: loss 2.7246, time 5286.67ms 
iter 2465: loss 2.9644, time 5256.65ms 
iter 2466: loss 2.6406, time 5270.72ms 
iter 2467: loss 2.6116, time 5290.97ms 
iter 2468: loss 2.7331, time 5313.80ms 
iter 2469: loss 2.8210, time 5280.73ms 
iter 2470: loss 2.9109, time 5244.74ms 
iter 2471: loss 2.6916, time 5245.45ms 
iter 2472: loss 2.9304, time 5249.94ms 
iter 2473: loss 3.0054, time 5255.78ms 
iter 2474: loss 2.8571, time 5177.80ms 
iter 2475: loss 2.8088, time 5223.24ms 
iter 2476: loss 2.8209, time 5238.80ms 
iter 2477: loss 2.7899, time 5219.66ms 
iter 2478: loss 2.9207, time 5192.29ms 
iter 2479: loss 2.8178, time 5266.60ms 
iter 2480: loss 2.8386, time 5322.88ms 
iter 2481: loss 2.6719, time 5322.84ms 
iter 2482: loss 2.9342, time 5330.92ms 
iter 2483: loss 2.9816, time 5217.22ms 
iter 2484: loss 2.8261, time 5191.91ms 
iter 2485: loss 2.6462, time 5226.26ms 
iter 2486: loss 2.7561, time 5217.64ms 
iter 2487: loss 2.7346, time 5252.68ms 
iter 2488: loss 2.7988, time 5249.96ms 
iter 2489: loss 2.7062, time 5252.67ms 
iter 2490: loss 2.8392, time 5247.81ms 
iter 2491: loss 2.8886, time 5255.01ms 
iter 2492: loss 2.5640, time 5247.35ms 
iter 2493: loss 2.8807, time 5246.28ms 
iter 2494: loss 2.7887, time 5233.06ms 
iter 2495: loss 2.7397, time 5248.08ms 
iter 2496: loss 2.7590, time 5247.06ms 
iter 2497: loss 2.7577, time 5251.10ms 
iter 2498: loss 2.9801, time 5219.17ms 
iter 2499: loss 2.6193, time 5211.58ms 
step 2500: train loss 2.7844, val loss 2.9079
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2500: loss 2.7709, time 21183.35ms 
iter 2501: loss 3.0488, time 5258.94ms 
iter 2502: loss 2.8451, time 5253.04ms 
iter 2503: loss 2.8071, time 5258.67ms 
iter 2504: loss 2.7617, time 5255.78ms 
iter 2505: loss 2.6538, time 5252.95ms 
iter 2506: loss 2.8395, time 5253.70ms 
iter 2507: loss 2.8167, time 5283.68ms 
iter 2508: loss 2.8001, time 5321.18ms 
iter 2509: loss 2.8011, time 5326.83ms 
iter 2510: loss 2.7887, time 5328.88ms 
iter 2511: loss 2.8312, time 5328.01ms 
iter 2512: loss 2.7464, time 5324.26ms 
iter 2513: loss 2.9803, time 5325.84ms 
iter 2514: loss 2.6527, time 5333.16ms 
iter 2515: loss 2.9122, time 5235.66ms 
iter 2516: loss 2.9130, time 5258.65ms 
iter 2517: loss 2.8072, time 5261.09ms 
iter 2518: loss 2.7337, time 5223.99ms 
iter 2519: loss 2.9382, time 5258.19ms 
iter 2520: loss 2.9201, time 5269.93ms 
iter 2521: loss 2.7799, time 5260.17ms 
iter 2522: loss 2.7928, time 5167.03ms 
iter 2523: loss 2.5909, time 5214.75ms 
iter 2524: loss 2.4559, time 5248.66ms 
iter 2525: loss 2.6653, time 5218.33ms 
iter 2526: loss 2.8770, time 5222.43ms 
iter 2527: loss 2.9258, time 5208.82ms 
iter 2528: loss 2.5167, time 5253.10ms 
iter 2529: loss 3.0210, time 5218.89ms 
iter 2530: loss 2.8519, time 5256.09ms 
iter 2531: loss 2.7214, time 5314.65ms 
iter 2532: loss 2.9337, time 5121.73ms 
iter 2533: loss 2.7071, time 5005.38ms 
iter 2534: loss 2.6769, time 5246.68ms 
iter 2535: loss 2.7539, time 5189.36ms 
iter 2536: loss 2.8986, time 5248.10ms 
iter 2537: loss 2.7077, time 5213.20ms 
iter 2538: loss 3.0496, time 5236.27ms 
iter 2539: loss 2.6679, time 5214.40ms 
iter 2540: loss 2.7811, time 5256.34ms 
iter 2541: loss 2.7176, time 5245.37ms 
iter 2542: loss 2.6152, time 5254.03ms 
iter 2543: loss 2.7926, time 5264.69ms 
iter 2544: loss 2.7506, time 5249.93ms 
iter 2545: loss 2.7850, time 5248.67ms 
iter 2546: loss 2.6459, time 5248.32ms 
iter 2547: loss 2.7318, time 5251.19ms 
iter 2548: loss 2.9184, time 5259.99ms 
iter 2549: loss 2.8382, time 5225.71ms 
step 2550: train loss 2.7876, val loss 2.9000
iter 2550: loss 2.6152, time 19973.84ms 
iter 2551: loss 2.9521, time 5215.90ms 
iter 2552: loss 2.7239, time 5215.06ms 
iter 2553: loss 2.7355, time 5322.25ms 
iter 2554: loss 2.7680, time 5204.15ms 
iter 2555: loss 2.5050, time 5211.19ms 
iter 2556: loss 2.8714, time 5212.94ms 
iter 2557: loss 2.6315, time 5217.76ms 
iter 2558: loss 2.7283, time 5224.11ms 
iter 2559: loss 2.5437, time 5219.80ms 
iter 2560: loss 2.7450, time 5203.97ms 
iter 2561: loss 2.6748, time 5204.94ms 
iter 2562: loss 2.9750, time 5230.39ms 
iter 2563: loss 2.8958, time 5228.81ms 
iter 2564: loss 2.8191, time 5252.93ms 
iter 2565: loss 2.7372, time 5262.82ms 
iter 2566: loss 2.8340, time 5266.15ms 
iter 2567: loss 2.7591, time 5219.59ms 
iter 2568: loss 2.7915, time 5223.55ms 
iter 2569: loss 2.8777, time 5240.85ms 
iter 2570: loss 2.8160, time 5207.17ms 
iter 2571: loss 2.7215, time 5237.64ms 
iter 2572: loss 2.7348, time 5221.60ms 
iter 2573: loss 2.9345, time 5224.15ms 
iter 2574: loss 2.6851, time 5183.13ms 
iter 2575: loss 2.6861, time 5229.75ms 
iter 2576: loss 2.6139, time 5232.74ms 
iter 2577: loss 2.6199, time 5222.26ms 
iter 2578: loss 2.5940, time 5192.25ms 
iter 2579: loss 2.9320, time 5241.93ms 
iter 2580: loss 2.9000, time 5234.89ms 
iter 2581: loss 2.7026, time 5261.98ms 
iter 2582: loss 2.9159, time 5266.68ms 
iter 2583: loss 2.8089, time 5261.80ms 
iter 2584: loss 2.8473, time 5275.03ms 
iter 2585: loss 2.8543, time 5279.89ms 
iter 2586: loss 2.6464, time 5268.41ms 
iter 2587: loss 2.7383, time 5275.01ms 
iter 2588: loss 2.7201, time 5272.18ms 
iter 2589: loss 2.5236, time 5274.14ms 
iter 2590: loss 2.6786, time 5280.10ms 
iter 2591: loss 2.7849, time 5240.10ms 
iter 2592: loss 2.7385, time 5260.66ms 
iter 2593: loss 2.9014, time 5245.74ms 
iter 2594: loss 2.9017, time 5253.70ms 
iter 2595: loss 2.8110, time 5253.81ms 
iter 2596: loss 2.6926, time 5252.81ms 
iter 2597: loss 2.7477, time 5254.48ms 
iter 2598: loss 2.7621, time 5268.45ms 
iter 2599: loss 2.6949, time 5253.22ms 
step 2600: train loss 2.7733, val loss 2.8986
iter 2600: loss 2.6580, time 19987.70ms 
iter 2601: loss 2.7108, time 5246.67ms 
iter 2602: loss 2.9236, time 5257.52ms 
iter 2603: loss 2.9543, time 5269.74ms 
iter 2604: loss 2.8088, time 5259.34ms 
iter 2605: loss 2.8296, time 5267.62ms 
iter 2606: loss 2.7106, time 5258.41ms 
iter 2607: loss 2.6942, time 5269.99ms 
iter 2608: loss 2.9293, time 5279.20ms 
iter 2609: loss 2.7548, time 5254.25ms 
iter 2610: loss 2.7958, time 5254.05ms 
iter 2611: loss 2.8424, time 5256.04ms 
iter 2612: loss 2.9271, time 5252.33ms 
iter 2613: loss 2.6948, time 5283.25ms 
iter 2614: loss 2.6728, time 5254.05ms 
iter 2615: loss 2.6451, time 5283.33ms 
iter 2616: loss 2.6001, time 5249.51ms 
iter 2617: loss 2.9501, time 5245.34ms 
iter 2618: loss 2.8583, time 5250.53ms 
iter 2619: loss 2.7425, time 5257.42ms 
iter 2620: loss 2.7685, time 5258.79ms 
iter 2621: loss 2.9052, time 5258.32ms 
iter 2622: loss 2.8416, time 5266.50ms 
iter 2623: loss 2.8617, time 5280.61ms 
iter 2624: loss 2.7622, time 5267.72ms 
iter 2625: loss 2.8778, time 5232.11ms 
iter 2626: loss 2.4901, time 5251.68ms 
iter 2627: loss 2.8005, time 5253.38ms 
iter 2628: loss 2.7546, time 5254.88ms 
iter 2629: loss 2.9295, time 5251.90ms 
iter 2630: loss 2.8938, time 5283.73ms 
iter 2631: loss 2.7774, time 5277.90ms 
iter 2632: loss 2.7841, time 5253.32ms 
iter 2633: loss 2.9214, time 5249.50ms 
iter 2634: loss 2.7311, time 5277.17ms 
iter 2635: loss 2.7655, time 5265.06ms 
iter 2636: loss 2.8310, time 5274.20ms 
iter 2637: loss 2.7479, time 5278.26ms 
iter 2638: loss 2.7673, time 5266.33ms 
iter 2639: loss 2.6474, time 5244.17ms 
iter 2640: loss 2.8857, time 5265.47ms 
iter 2641: loss 2.6945, time 5204.97ms 
iter 2642: loss 2.6173, time 5247.96ms 
iter 2643: loss 2.7817, time 5215.16ms 
iter 2644: loss 2.7227, time 5317.68ms 
iter 2645: loss 2.6714, time 5327.54ms 
iter 2646: loss 2.7112, time 5246.75ms 
iter 2647: loss 2.7681, time 5253.96ms 
iter 2648: loss 2.6164, time 5261.71ms 
iter 2649: loss 2.7181, time 5256.30ms 
step 2650: train loss 2.7561, val loss 2.8984
iter 2650: loss 2.7082, time 19948.54ms 
iter 2651: loss 2.2043, time 5214.63ms 
iter 2652: loss 2.7725, time 5215.04ms 
iter 2653: loss 2.7564, time 5219.32ms 
iter 2654: loss 2.7785, time 5213.36ms 
iter 2655: loss 2.7599, time 5249.56ms 
iter 2656: loss 2.8900, time 5241.21ms 
iter 2657: loss 2.6460, time 5244.77ms 
iter 2658: loss 2.6852, time 5258.93ms 
iter 2659: loss 2.8264, time 5259.84ms 
iter 2660: loss 2.8819, time 5266.71ms 
iter 2661: loss 2.8467, time 5256.26ms 
iter 2662: loss 2.7296, time 5255.75ms 
iter 2663: loss 2.8818, time 5259.75ms 
iter 2664: loss 2.8346, time 5328.54ms 
iter 2665: loss 2.7250, time 5225.89ms 
iter 2666: loss 2.8352, time 5259.23ms 
iter 2667: loss 2.6825, time 5254.69ms 
iter 2668: loss 2.9503, time 5298.00ms 
iter 2669: loss 2.7132, time 5258.22ms 
iter 2670: loss 2.7043, time 5249.11ms 
iter 2671: loss 2.5430, time 5251.08ms 
iter 2672: loss 2.8389, time 5249.67ms 
iter 2673: loss 2.8813, time 5254.94ms 
iter 2674: loss 2.6427, time 5253.66ms 
iter 2675: loss 2.5898, time 5285.26ms 
iter 2676: loss 2.8486, time 5310.83ms 
iter 2677: loss 2.7968, time 5317.81ms 
iter 2678: loss 2.8914, time 5248.34ms 
iter 2679: loss 2.8301, time 5253.18ms 
iter 2680: loss 2.4662, time 5146.10ms 
iter 2681: loss 2.7001, time 5241.52ms 
iter 2682: loss 2.8260, time 5255.61ms 
iter 2683: loss 2.9747, time 5283.12ms 
iter 2684: loss 2.7834, time 5217.95ms 
iter 2685: loss 2.7887, time 5218.17ms 
iter 2686: loss 2.8986, time 5234.13ms 
iter 2687: loss 2.7147, time 5235.88ms 
iter 2688: loss 2.7427, time 5241.38ms 
iter 2689: loss 2.6838, time 5174.87ms 
iter 2690: loss 2.9026, time 5227.28ms 
iter 2691: loss 2.7782, time 5231.45ms 
iter 2692: loss 2.6980, time 5231.93ms 
iter 2693: loss 2.6749, time 5228.67ms 
iter 2694: loss 2.9388, time 5238.68ms 
iter 2695: loss 2.6731, time 5235.63ms 
iter 2696: loss 2.7065, time 5234.17ms 
iter 2697: loss 2.7290, time 5237.63ms 
iter 2698: loss 2.8331, time 5241.58ms 
iter 2699: loss 2.8494, time 5238.45ms 
step 2700: train loss 2.7643, val loss 2.8964
iter 2700: loss 2.8105, time 19970.69ms 
iter 2701: loss 2.6661, time 5247.20ms 
iter 2702: loss 2.6388, time 5241.50ms 
iter 2703: loss 2.8996, time 5239.05ms 
iter 2704: loss 2.7626, time 5177.98ms 
iter 2705: loss 2.7424, time 5225.03ms 
iter 2706: loss 2.8755, time 5255.24ms 
iter 2707: loss 2.8091, time 5260.13ms 
iter 2708: loss 2.9278, time 5264.06ms 
iter 2709: loss 2.7388, time 5268.14ms 
iter 2710: loss 2.8621, time 5309.02ms 
iter 2711: loss 2.6837, time 5325.55ms 
iter 2712: loss 2.8131, time 5307.47ms 
iter 2713: loss 2.6585, time 5294.29ms 
iter 2714: loss 2.6535, time 5304.04ms 
iter 2715: loss 2.7671, time 5234.18ms 
iter 2716: loss 2.7490, time 5252.36ms 
iter 2717: loss 2.6714, time 5256.76ms 
iter 2718: loss 2.9299, time 5239.58ms 
iter 2719: loss 2.7336, time 5238.28ms 
iter 2720: loss 2.9608, time 5240.09ms 
iter 2721: loss 2.7745, time 5242.89ms 
iter 2722: loss 2.7591, time 5239.31ms 
iter 2723: loss 2.7387, time 5244.69ms 
iter 2724: loss 2.7748, time 5251.81ms 
iter 2725: loss 2.8357, time 5238.78ms 
iter 2726: loss 2.5591, time 5252.55ms 
iter 2727: loss 2.6340, time 5282.03ms 
iter 2728: loss 2.9240, time 5225.42ms 
iter 2729: loss 2.7365, time 5220.65ms 
iter 2730: loss 2.9148, time 5251.55ms 
iter 2731: loss 2.7802, time 5200.47ms 
iter 2732: loss 2.7289, time 5160.25ms 
iter 2733: loss 2.7552, time 5164.36ms 
iter 2734: loss 2.6753, time 5169.38ms 
iter 2735: loss 2.7600, time 5227.53ms 
iter 2736: loss 2.7443, time 5242.57ms 
iter 2737: loss 2.8305, time 5200.35ms 
iter 2738: loss 2.7591, time 5245.11ms 
iter 2739: loss 2.9294, time 5237.44ms 
iter 2740: loss 2.8169, time 5145.51ms 
iter 2741: loss 2.8122, time 5138.06ms 
iter 2742: loss 2.8051, time 5197.12ms 
iter 2743: loss 2.7592, time 5251.04ms 
iter 2744: loss 2.8922, time 5242.03ms 
iter 2745: loss 2.6178, time 5202.25ms 
iter 2746: loss 2.7586, time 5099.44ms 
iter 2747: loss 2.7757, time 5216.26ms 
iter 2748: loss 2.8858, time 5234.45ms 
iter 2749: loss 2.6873, time 5212.08ms 
step 2750: train loss 2.7595, val loss 2.8909
iter 2750: loss 2.8062, time 19971.87ms 
iter 2751: loss 2.7533, time 5201.00ms 
iter 2752: loss 2.8754, time 5211.59ms 
iter 2753: loss 2.7908, time 5222.14ms 
iter 2754: loss 2.8281, time 5229.16ms 
iter 2755: loss 2.7798, time 5213.38ms 
iter 2756: loss 2.6009, time 5219.22ms 
iter 2757: loss 2.7015, time 5217.83ms 
iter 2758: loss 2.5619, time 5187.30ms 
iter 2759: loss 2.9480, time 5202.96ms 
iter 2760: loss 2.5940, time 5220.77ms 
iter 2761: loss 2.7241, time 5202.18ms 
iter 2762: loss 2.8657, time 5238.18ms 
iter 2763: loss 2.6797, time 5214.93ms 
iter 2764: loss 2.8056, time 5119.55ms 
iter 2765: loss 2.7770, time 5239.49ms 
iter 2766: loss 2.7665, time 5189.05ms 
iter 2767: loss 2.5287, time 5222.08ms 
iter 2768: loss 2.5565, time 5252.33ms 
iter 2769: loss 2.7502, time 5176.33ms 
iter 2770: loss 2.4374, time 5253.85ms 
iter 2771: loss 2.8983, time 5253.08ms 
iter 2772: loss 2.6646, time 5246.20ms 
iter 2773: loss 2.8161, time 5255.43ms 
iter 2774: loss 2.9154, time 5249.52ms 
iter 2775: loss 2.7161, time 5220.63ms 
iter 2776: loss 2.7328, time 5251.23ms 
iter 2777: loss 2.8262, time 5253.70ms 
iter 2778: loss 2.8535, time 5247.43ms 
iter 2779: loss 2.7364, time 5246.71ms 
iter 2780: loss 2.7667, time 5252.51ms 
iter 2781: loss 2.7479, time 5253.14ms 
iter 2782: loss 2.6949, time 5251.19ms 
iter 2783: loss 2.7515, time 5204.70ms 
iter 2784: loss 2.6178, time 5203.81ms 
iter 2785: loss 2.8636, time 5185.83ms 
iter 2786: loss 2.5433, time 5215.47ms 
iter 2787: loss 2.7527, time 5220.91ms 
iter 2788: loss 2.9116, time 5211.93ms 
iter 2789: loss 2.7895, time 5229.93ms 
iter 2790: loss 2.6613, time 5216.66ms 
iter 2791: loss 2.7347, time 5187.62ms 
iter 2792: loss 2.6503, time 5239.37ms 
iter 2793: loss 2.8318, time 5238.56ms 
iter 2794: loss 2.7435, time 5262.72ms 
iter 2795: loss 2.7713, time 5259.32ms 
iter 2796: loss 2.7625, time 5241.86ms 
iter 2797: loss 2.5063, time 5206.18ms 
iter 2798: loss 2.7417, time 5239.01ms 
iter 2799: loss 2.7522, time 5214.42ms 
step 2800: train loss 2.7412, val loss 2.8949
iter 2800: loss 2.8282, time 19935.34ms 
iter 2801: loss 2.9313, time 5255.07ms 
iter 2802: loss 2.9004, time 5252.18ms 
iter 2803: loss 2.7504, time 5251.28ms 
iter 2804: loss 2.8312, time 5267.28ms 
iter 2805: loss 2.8373, time 5249.17ms 
iter 2806: loss 2.7654, time 5252.57ms 
iter 2807: loss 2.8309, time 5245.37ms 
iter 2808: loss 2.5983, time 5245.78ms 
iter 2809: loss 2.8403, time 5243.91ms 
iter 2810: loss 2.6573, time 5245.07ms 
iter 2811: loss 2.6758, time 5251.14ms 
iter 2812: loss 2.6873, time 5173.18ms 
iter 2813: loss 2.7420, time 5256.89ms 
iter 2814: loss 2.8496, time 5250.38ms 
iter 2815: loss 2.8034, time 5252.97ms 
iter 2816: loss 2.6297, time 5248.37ms 
iter 2817: loss 2.7940, time 5270.85ms 
iter 2818: loss 2.8730, time 5259.10ms 
iter 2819: loss 2.7656, time 5267.15ms 
iter 2820: loss 2.7440, time 5240.29ms 
iter 2821: loss 2.8464, time 5250.62ms 
iter 2822: loss 2.6101, time 5255.26ms 
iter 2823: loss 2.7784, time 5251.39ms 
iter 2824: loss 2.7794, time 5165.23ms 
iter 2825: loss 2.9510, time 5247.72ms 
iter 2826: loss 2.7455, time 5217.61ms 
iter 2827: loss 2.7553, time 5250.23ms 
iter 2828: loss 2.7521, time 5245.34ms 
iter 2829: loss 2.6090, time 5249.04ms 
iter 2830: loss 2.6529, time 5249.94ms 
iter 2831: loss 2.7233, time 5244.12ms 
iter 2832: loss 2.8793, time 5251.61ms 
iter 2833: loss 2.8142, time 5249.47ms 
iter 2834: loss 2.8137, time 5203.72ms 
iter 2835: loss 2.9448, time 5106.12ms 
iter 2836: loss 2.6110, time 5209.56ms 
iter 2837: loss 2.8038, time 5111.40ms 
iter 2838: loss 2.6248, time 5245.84ms 
iter 2839: loss 2.7758, time 5248.24ms 
iter 2840: loss 2.7405, time 5249.14ms 
iter 2841: loss 2.7338, time 5198.37ms 
iter 2842: loss 2.6411, time 5233.65ms 
iter 2843: loss 2.8630, time 5234.98ms 
iter 2844: loss 2.8947, time 5237.05ms 
iter 2845: loss 2.7793, time 5224.17ms 
iter 2846: loss 2.7801, time 5241.10ms 
iter 2847: loss 2.7630, time 5237.90ms 
iter 2848: loss 2.6167, time 5234.56ms 
iter 2849: loss 2.7606, time 5236.06ms 
step 2850: train loss 2.7578, val loss 2.8908
iter 2850: loss 2.6457, time 19976.24ms 
iter 2851: loss 2.6698, time 5152.10ms 
iter 2852: loss 2.9149, time 5122.18ms 
iter 2853: loss 2.7860, time 5096.18ms 
iter 2854: loss 2.9268, time 5103.66ms 
iter 2855: loss 2.7021, time 5136.40ms 
iter 2856: loss 2.5371, time 5243.18ms 
iter 2857: loss 2.6901, time 5256.66ms 
iter 2858: loss 2.6112, time 5249.98ms 
iter 2859: loss 2.7264, time 5246.80ms 
iter 2860: loss 2.6672, time 5247.06ms 
iter 2861: loss 2.7246, time 5252.28ms 
iter 2862: loss 2.5825, time 5259.78ms 
iter 2863: loss 2.6907, time 5251.57ms 
iter 2864: loss 2.6905, time 5263.99ms 
iter 2865: loss 2.6404, time 5252.19ms 
iter 2866: loss 2.8990, time 5246.76ms 
iter 2867: loss 2.8353, time 5260.29ms 
iter 2868: loss 2.8287, time 5251.41ms 
iter 2869: loss 2.8405, time 5257.27ms 
iter 2870: loss 2.6771, time 5252.61ms 
iter 2871: loss 2.6446, time 5256.68ms 
iter 2872: loss 2.7830, time 5253.78ms 
iter 2873: loss 2.5731, time 5253.51ms 
iter 2874: loss 2.7071, time 5249.32ms 
iter 2875: loss 2.5961, time 5272.99ms 
iter 2876: loss 2.7252, time 5288.00ms 
iter 2877: loss 2.7054, time 5292.29ms 
iter 2878: loss 2.9383, time 5263.27ms 
iter 2879: loss 2.5876, time 5260.27ms 
iter 2880: loss 2.7187, time 5259.15ms 
iter 2881: loss 2.8616, time 5257.14ms 
iter 2882: loss 2.7226, time 5269.88ms 
iter 2883: loss 2.7347, time 5247.41ms 
iter 2884: loss 2.6504, time 5266.43ms 
iter 2885: loss 2.6221, time 5267.36ms 
iter 2886: loss 2.8862, time 5268.33ms 
iter 2887: loss 2.7023, time 5259.83ms 
iter 2888: loss 2.8673, time 5261.01ms 
iter 2889: loss 2.6346, time 5261.81ms 
iter 2890: loss 2.6689, time 5263.65ms 
iter 2891: loss 2.6714, time 5266.68ms 
iter 2892: loss 2.6456, time 5317.73ms 
iter 2893: loss 2.9154, time 5267.86ms 
iter 2894: loss 2.5253, time 5263.24ms 
iter 2895: loss 2.6749, time 5257.43ms 
iter 2896: loss 3.0045, time 5257.95ms 
iter 2897: loss 2.6189, time 5258.57ms 
iter 2898: loss 2.7362, time 5262.84ms 
iter 2899: loss 2.7976, time 5301.24ms 
step 2900: train loss 2.7254, val loss 2.8956
iter 2900: loss 2.7045, time 20055.95ms 
iter 2901: loss 2.7337, time 5261.47ms 
iter 2902: loss 2.8124, time 5252.94ms 
iter 2903: loss 2.8862, time 5253.56ms 
iter 2904: loss 2.7103, time 5256.82ms 
iter 2905: loss 2.7750, time 5254.04ms 
iter 2906: loss 2.6805, time 5255.92ms 
iter 2907: loss 2.7408, time 5213.85ms 
iter 2908: loss 2.5465, time 5109.50ms 
iter 2909: loss 2.6046, time 5134.15ms 
iter 2910: loss 2.7781, time 5139.09ms 
iter 2911: loss 2.5239, time 5144.01ms 
iter 2912: loss 2.7553, time 5167.52ms 
iter 2913: loss 2.9834, time 5213.56ms 
iter 2914: loss 2.8015, time 5221.03ms 
iter 2915: loss 2.5757, time 5227.33ms 
iter 2916: loss 2.7886, time 5207.56ms 
iter 2917: loss 2.7098, time 5245.20ms 
iter 2918: loss 2.7026, time 5270.84ms 
iter 2919: loss 2.5281, time 5262.95ms 
iter 2920: loss 2.8173, time 5265.36ms 
iter 2921: loss 2.6796, time 5265.34ms 
iter 2922: loss 2.8507, time 5253.69ms 
iter 2923: loss 2.8612, time 5259.54ms 
iter 2924: loss 2.6956, time 5263.57ms 
iter 2925: loss 2.7311, time 5260.55ms 
iter 2926: loss 2.7523, time 5261.04ms 
iter 2927: loss 2.7020, time 5261.27ms 
iter 2928: loss 2.7947, time 5238.77ms 
iter 2929: loss 2.7750, time 5236.43ms 
iter 2930: loss 2.7356, time 5239.34ms 
iter 2931: loss 2.8414, time 5247.61ms 
iter 2932: loss 3.0639, time 5241.12ms 
iter 2933: loss 2.5736, time 5241.29ms 
iter 2934: loss 2.7729, time 5236.71ms 
iter 2935: loss 2.7742, time 5254.38ms 
iter 2936: loss 2.8534, time 5237.60ms 
iter 2937: loss 2.8765, time 5245.71ms 
iter 2938: loss 2.7207, time 5248.34ms 
iter 2939: loss 2.7667, time 5246.23ms 
iter 2940: loss 2.8991, time 5191.48ms 
iter 2941: loss 2.6326, time 5220.72ms 
iter 2942: loss 2.8615, time 5224.14ms 
iter 2943: loss 2.7000, time 5208.04ms 
iter 2944: loss 2.7355, time 5148.18ms 
iter 2945: loss 2.6431, time 5162.22ms 
iter 2946: loss 2.7575, time 5247.17ms 
iter 2947: loss 2.5695, time 5246.62ms 
iter 2948: loss 2.7158, time 5248.55ms 
iter 2949: loss 2.6165, time 5256.86ms 
step 2950: train loss 2.7273, val loss 2.8763
iter 2950: loss 2.9122, time 19968.46ms 
iter 2951: loss 2.8544, time 5252.23ms 
iter 2952: loss 2.6738, time 5245.79ms 
iter 2953: loss 2.7662, time 5245.76ms 
iter 2954: loss 2.9108, time 5248.68ms 
iter 2955: loss 2.6227, time 5245.60ms 
iter 2956: loss 2.7756, time 5269.36ms 
iter 2957: loss 2.7091, time 5260.14ms 
iter 2958: loss 2.7940, time 5267.52ms 
iter 2959: loss 2.7005, time 5255.27ms 
iter 2960: loss 2.6939, time 5253.70ms 
iter 2961: loss 2.6315, time 5259.45ms 
iter 2962: loss 2.6618, time 5255.00ms 
iter 2963: loss 2.7034, time 5260.23ms 
iter 2964: loss 2.8126, time 5254.45ms 
iter 2965: loss 2.5866, time 5246.41ms 
iter 2966: loss 2.7622, time 5267.74ms 
iter 2967: loss 2.5988, time 5252.08ms 
iter 2968: loss 2.6904, time 5190.35ms 
iter 2969: loss 2.6363, time 5249.16ms 
iter 2970: loss 2.7725, time 5254.24ms 
iter 2971: loss 2.7288, time 5251.87ms 
iter 2972: loss 2.7832, time 5245.63ms 
iter 2973: loss 2.6244, time 5238.43ms 
iter 2974: loss 2.7414, time 5155.62ms 
iter 2975: loss 2.6359, time 5193.16ms 
iter 2976: loss 2.7442, time 5224.86ms 
iter 2977: loss 2.5741, time 5215.31ms 
iter 2978: loss 2.8445, time 5219.05ms 
iter 2979: loss 2.7900, time 5215.94ms 
iter 2980: loss 2.7118, time 5221.99ms 
iter 2981: loss 2.6932, time 5213.37ms 
iter 2982: loss 2.5931, time 5208.17ms 
iter 2983: loss 2.8570, time 5132.07ms 
iter 2984: loss 2.7139, time 5107.57ms 
iter 2985: loss 2.6438, time 5272.75ms 
iter 2986: loss 2.7071, time 5321.25ms 
iter 2987: loss 2.8828, time 5271.36ms 
iter 2988: loss 2.9310, time 5281.55ms 
iter 2989: loss 2.7632, time 5311.73ms 
iter 2990: loss 2.8296, time 5269.79ms 
iter 2991: loss 2.7753, time 5268.39ms 
iter 2992: loss 2.7544, time 5258.07ms 
iter 2993: loss 2.6796, time 5251.54ms 
iter 2994: loss 2.7781, time 5258.82ms 
iter 2995: loss 2.7128, time 5263.82ms 
iter 2996: loss 2.7262, time 5261.77ms 
iter 2997: loss 2.7770, time 5252.93ms 
iter 2998: loss 2.7502, time 5251.62ms 
iter 2999: loss 2.7820, time 5249.66ms 
step 3000: train loss 2.7306, val loss 2.8876
iter 3000: loss 2.6917, time 20036.97ms 
iter 3001: loss 2.8877, time 5241.91ms 
iter 3002: loss 2.7581, time 5250.95ms 
iter 3003: loss 2.8190, time 5253.43ms 
iter 3004: loss 2.8357, time 5269.53ms 
iter 3005: loss 2.5998, time 5264.57ms 
iter 3006: loss 2.9380, time 5257.12ms 
iter 3007: loss 2.7043, time 5282.36ms 
iter 3008: loss 2.6830, time 5271.74ms 
iter 3009: loss 2.6602, time 5254.43ms 
iter 3010: loss 2.8546, time 5300.74ms 
iter 3011: loss 2.5815, time 5280.12ms 
iter 3012: loss 2.7705, time 5287.39ms 
iter 3013: loss 2.6350, time 5286.42ms 
iter 3014: loss 2.5787, time 5312.15ms 
iter 3015: loss 2.8200, time 5320.14ms 
iter 3016: loss 2.6092, time 5321.93ms 
iter 3017: loss 2.5486, time 5285.65ms 
iter 3018: loss 2.6514, time 5267.61ms 
iter 3019: loss 2.7960, time 5258.10ms 
iter 3020: loss 2.6105, time 5149.12ms 
iter 3021: loss 2.7386, time 5168.57ms 
iter 3022: loss 2.7097, time 5159.12ms 
iter 3023: loss 2.8570, time 5164.75ms 
iter 3024: loss 2.7692, time 5158.11ms 
iter 3025: loss 2.6696, time 5223.21ms 
iter 3026: loss 2.8397, time 5246.14ms 
iter 3027: loss 2.7040, time 5271.74ms 
iter 3028: loss 2.6603, time 5257.60ms 
iter 3029: loss 2.8895, time 5207.95ms 
iter 3030: loss 2.6920, time 5238.11ms 
iter 3031: loss 2.6309, time 5245.46ms 
iter 3032: loss 2.6115, time 5226.20ms 
iter 3033: loss 2.4769, time 5247.16ms 
iter 3034: loss 2.6294, time 5224.57ms 
iter 3035: loss 2.8700, time 5226.60ms 
iter 3036: loss 2.7843, time 5169.63ms 
iter 3037: loss 2.7132, time 5145.72ms 
iter 3038: loss 2.6710, time 5136.87ms 
iter 3039: loss 2.7658, time 5247.51ms 
iter 3040: loss 2.6205, time 5263.61ms 
iter 3041: loss 2.7210, time 5255.80ms 
iter 3042: loss 2.5578, time 5272.92ms 
iter 3043: loss 2.6302, time 5262.09ms 
iter 3044: loss 2.7296, time 5312.19ms 
iter 3045: loss 2.7112, time 5318.08ms 
iter 3046: loss 2.8560, time 5297.23ms 
iter 3047: loss 2.6680, time 5300.49ms 
iter 3048: loss 2.5992, time 5328.68ms 
iter 3049: loss 2.8115, time 5322.06ms 
step 3050: train loss 2.7139, val loss 2.8730
iter 3050: loss 2.7152, time 20012.23ms 
iter 3051: loss 2.6458, time 5257.80ms 
iter 3052: loss 2.5295, time 5253.74ms 
iter 3053: loss 2.5616, time 5251.65ms 
iter 3054: loss 2.4858, time 5258.25ms 
iter 3055: loss 2.5362, time 5260.77ms 
iter 3056: loss 2.7086, time 5267.48ms 
iter 3057: loss 2.8165, time 5257.56ms 
iter 3058: loss 2.7904, time 5254.40ms 
iter 3059: loss 2.7832, time 5255.46ms 
iter 3060: loss 2.7268, time 5252.19ms 
iter 3061: loss 2.5001, time 5250.93ms 
iter 3062: loss 2.5526, time 5257.52ms 
iter 3063: loss 2.6898, time 5257.63ms 
iter 3064: loss 2.9356, time 5253.50ms 
iter 3065: loss 2.7886, time 5250.31ms 
iter 3066: loss 2.7712, time 5255.62ms 
iter 3067: loss 2.7000, time 5252.97ms 
iter 3068: loss 2.6249, time 5258.54ms 
iter 3069: loss 2.8967, time 5255.89ms 
iter 3070: loss 2.5894, time 5248.62ms 
iter 3071: loss 2.7486, time 5246.53ms 
iter 3072: loss 2.6187, time 5252.90ms 
iter 3073: loss 2.9638, time 5245.68ms 
iter 3074: loss 2.5678, time 5259.66ms 
iter 3075: loss 2.6427, time 5249.50ms 
iter 3076: loss 2.6962, time 5261.20ms 
iter 3077: loss 2.6973, time 5250.32ms 
iter 3078: loss 2.7318, time 5245.55ms 
iter 3079: loss 2.6450, time 5254.02ms 
iter 3080: loss 3.0388, time 5249.28ms 
iter 3081: loss 2.6411, time 5250.86ms 
iter 3082: loss 2.7221, time 5255.16ms 
iter 3083: loss 2.6653, time 5248.66ms 
iter 3084: loss 2.6823, time 5248.94ms 
iter 3085: loss 2.6565, time 5247.06ms 
iter 3086: loss 2.8989, time 5251.07ms 
iter 3087: loss 2.7125, time 5245.86ms 
iter 3088: loss 2.6200, time 5256.64ms 
iter 3089: loss 2.4697, time 5248.88ms 
iter 3090: loss 2.6858, time 5251.20ms 
iter 3091: loss 2.6215, time 5246.14ms 
iter 3092: loss 2.6235, time 5255.16ms 
iter 3093: loss 2.7723, time 5246.51ms 
iter 3094: loss 2.5352, time 5298.09ms 
iter 3095: loss 2.8246, time 5287.33ms 
iter 3096: loss 2.5816, time 5281.86ms 
iter 3097: loss 2.6775, time 5312.39ms 
iter 3098: loss 2.7058, time 5330.88ms 
iter 3099: loss 2.7372, time 5330.66ms 
step 3100: train loss 2.7198, val loss 2.8820
iter 3100: loss 2.9176, time 20068.36ms 
iter 3101: loss 2.6691, time 5266.50ms 
iter 3102: loss 2.6857, time 5236.28ms 
iter 3103: loss 2.6435, time 5250.80ms 
iter 3104: loss 2.6835, time 5265.33ms 
iter 3105: loss 2.6395, time 5263.52ms 
iter 3106: loss 2.7048, time 5254.15ms 
iter 3107: loss 2.7209, time 5285.59ms 
iter 3108: loss 2.7777, time 5287.14ms 
iter 3109: loss 2.8516, time 5276.59ms 
iter 3110: loss 2.7524, time 5269.80ms 
iter 3111: loss 2.5933, time 5265.14ms 
iter 3112: loss 2.5891, time 5284.82ms 
iter 3113: loss 2.6714, time 5227.51ms 
iter 3114: loss 2.7490, time 5258.47ms 
iter 3115: loss 2.6084, time 5253.41ms 
iter 3116: loss 2.8009, time 5275.59ms 
iter 3117: loss 2.8085, time 5273.97ms 
iter 3118: loss 2.6563, time 5250.77ms 
iter 3119: loss 2.8338, time 5275.10ms 
iter 3120: loss 2.8739, time 5263.89ms 
iter 3121: loss 2.6865, time 5266.26ms 
iter 3122: loss 2.7254, time 5272.54ms 
iter 3123: loss 2.7162, time 5268.70ms 
iter 3124: loss 2.7370, time 5265.40ms 
iter 3125: loss 2.7448, time 5264.23ms 
iter 3126: loss 2.9457, time 5265.07ms 
iter 3127: loss 2.8650, time 5267.10ms 
iter 3128: loss 2.8253, time 5278.45ms 
iter 3129: loss 2.5723, time 5264.73ms 
iter 3130: loss 2.6354, time 5264.91ms 
iter 3131: loss 2.7835, time 5263.20ms 
iter 3132: loss 2.6591, time 5267.83ms 
iter 3133: loss 2.6164, time 5266.86ms 
iter 3134: loss 2.6742, time 5273.28ms 
iter 3135: loss 2.7611, time 5291.56ms 
iter 3136: loss 2.6566, time 5298.95ms 
iter 3137: loss 2.8660, time 5296.66ms 
iter 3138: loss 2.8414, time 5283.99ms 
iter 3139: loss 2.5431, time 5295.17ms 
iter 3140: loss 2.6844, time 5294.33ms 
iter 3141: loss 2.6567, time 5285.10ms 
iter 3142: loss 2.7134, time 5277.14ms 
iter 3143: loss 2.8139, time 5289.18ms 
iter 3144: loss 2.5859, time 5280.85ms 
iter 3145: loss 2.6426, time 5300.73ms 
iter 3146: loss 2.6909, time 5270.58ms 
iter 3147: loss 2.8799, time 5275.78ms 
iter 3148: loss 2.7961, time 5264.23ms 
iter 3149: loss 2.7936, time 5282.29ms 
step 3150: train loss 2.7223, val loss 2.8683
iter 3150: loss 2.8524, time 20035.07ms 
iter 3151: loss 2.7277, time 5307.51ms 
iter 3152: loss 2.8536, time 5300.83ms 
iter 3153: loss 2.7932, time 5285.59ms 
iter 3154: loss 2.6004, time 5265.15ms 
iter 3155: loss 2.7157, time 5271.04ms 
iter 3156: loss 2.9513, time 5267.41ms 
iter 3157: loss 2.6076, time 5271.07ms 
iter 3158: loss 2.8269, time 5266.22ms 
iter 3159: loss 2.7627, time 5262.85ms 
iter 3160: loss 2.5406, time 5308.07ms 
iter 3161: loss 2.8155, time 5291.24ms 
iter 3162: loss 2.7485, time 5252.58ms 
iter 3163: loss 2.6756, time 5256.29ms 
iter 3164: loss 2.5714, time 5254.60ms 
iter 3165: loss 2.8010, time 5257.34ms 
iter 3166: loss 2.6083, time 5263.65ms 
iter 3167: loss 2.7375, time 5254.64ms 
iter 3168: loss 2.5756, time 5255.53ms 
iter 3169: loss 2.6096, time 5241.95ms 
iter 3170: loss 2.7861, time 5256.92ms 
iter 3171: loss 2.6050, time 5256.45ms 
iter 3172: loss 2.8139, time 5260.32ms 
iter 3173: loss 2.8789, time 5260.83ms 
iter 3174: loss 2.7150, time 5264.56ms 
iter 3175: loss 2.8947, time 5268.06ms 
iter 3176: loss 2.5695, time 5341.94ms 
iter 3177: loss 2.5743, time 5340.10ms 
iter 3178: loss 2.5605, time 5335.57ms 
iter 3179: loss 2.6953, time 5284.06ms 
iter 3180: loss 2.5407, time 5243.71ms 
iter 3181: loss 2.7151, time 5258.15ms 
iter 3182: loss 2.7197, time 5252.44ms 
iter 3183: loss 2.5772, time 5233.86ms 
iter 3184: loss 2.6979, time 5257.19ms 
iter 3185: loss 2.7551, time 5272.20ms 
iter 3186: loss 2.9102, time 5251.06ms 
iter 3187: loss 2.6399, time 5253.04ms 
iter 3188: loss 2.6106, time 5257.77ms 
iter 3189: loss 2.8115, time 5260.42ms 
iter 3190: loss 2.9166, time 5251.07ms 
iter 3191: loss 2.7806, time 5254.47ms 
iter 3192: loss 2.6531, time 5260.45ms 
iter 3193: loss 2.6627, time 5240.46ms 
iter 3194: loss 2.8337, time 5250.08ms 
iter 3195: loss 2.8615, time 5246.89ms 
iter 3196: loss 2.8411, time 5260.64ms 
iter 3197: loss 2.7662, time 5251.37ms 
iter 3198: loss 2.5044, time 5247.22ms 
iter 3199: loss 2.5434, time 5222.94ms 
step 3200: train loss 2.7002, val loss 2.8540
iter 3200: loss 2.5198, time 20006.46ms 
iter 3201: loss 2.6574, time 5196.25ms 
iter 3202: loss 3.0333, time 5254.40ms 
iter 3203: loss 2.8772, time 5249.23ms 
iter 3204: loss 2.6783, time 5258.57ms 
iter 3205: loss 2.6941, time 5268.19ms 
iter 3206: loss 2.6256, time 5254.90ms 
iter 3207: loss 2.6497, time 5251.79ms 
iter 3208: loss 2.6808, time 5261.62ms 
iter 3209: loss 2.6229, time 5248.35ms 
iter 3210: loss 2.7526, time 5253.25ms 
iter 3211: loss 2.5358, time 5253.53ms 
iter 3212: loss 2.7396, time 5258.62ms 
iter 3213: loss 2.7596, time 5250.16ms 
iter 3214: loss 2.5431, time 5249.47ms 
iter 3215: loss 2.6045, time 5224.01ms 
iter 3216: loss 2.7200, time 5175.76ms 
iter 3217: loss 2.5859, time 5230.12ms 
iter 3218: loss 2.6870, time 5258.60ms 
iter 3219: loss 2.7527, time 5247.75ms 
iter 3220: loss 2.7609, time 5233.47ms 
iter 3221: loss 2.5340, time 5252.15ms 
iter 3222: loss 2.8579, time 5255.52ms 
iter 3223: loss 2.5554, time 5264.07ms 
iter 3224: loss 2.6310, time 5260.84ms 
iter 3225: loss 2.7015, time 5259.36ms 
iter 3226: loss 2.8430, time 5257.23ms 
iter 3227: loss 2.8999, time 5243.01ms 
iter 3228: loss 2.7221, time 5248.12ms 
iter 3229: loss 2.4687, time 5248.37ms 
iter 3230: loss 2.5922, time 5251.74ms 
iter 3231: loss 2.6863, time 5266.44ms 
iter 3232: loss 2.8396, time 5263.70ms 
iter 3233: loss 2.7320, time 5247.44ms 
iter 3234: loss 2.7870, time 5256.65ms 
iter 3235: loss 2.7224, time 5266.00ms 
iter 3236: loss 2.6311, time 5266.30ms 
iter 3237: loss 2.6102, time 5259.69ms 
iter 3238: loss 2.7699, time 5270.59ms 
iter 3239: loss 2.6450, time 5272.11ms 
iter 3240: loss 2.8466, time 5259.54ms 
iter 3241: loss 2.9190, time 5261.28ms 
iter 3242: loss 2.6912, time 5260.26ms 
iter 3243: loss 2.6997, time 5248.55ms 
iter 3244: loss 2.6619, time 5253.00ms 
iter 3245: loss 3.0634, time 5263.75ms 
iter 3246: loss 2.6426, time 5260.20ms 
iter 3247: loss 2.6779, time 5251.86ms 
iter 3248: loss 2.7686, time 5255.68ms 
iter 3249: loss 2.7153, time 5249.21ms 
step 3250: train loss 2.6916, val loss 2.8663
iter 3250: loss 2.5150, time 19995.01ms 
iter 3251: loss 2.7569, time 5251.72ms 
iter 3252: loss 2.5473, time 5253.49ms 
iter 3253: loss 2.6435, time 5250.34ms 
iter 3254: loss 2.9353, time 5258.01ms 
iter 3255: loss 3.0085, time 5257.00ms 
iter 3256: loss 2.6529, time 5248.78ms 
iter 3257: loss 2.7978, time 5293.75ms 
iter 3258: loss 2.2785, time 5256.12ms 
iter 3259: loss 2.5668, time 5250.90ms 
iter 3260: loss 2.6250, time 5256.81ms 
iter 3261: loss 2.7352, time 5265.01ms 
iter 3262: loss 2.6890, time 5251.43ms 
iter 3263: loss 2.6110, time 5249.91ms 
iter 3264: loss 2.6552, time 5255.58ms 
iter 3265: loss 2.5824, time 5228.45ms 
iter 3266: loss 2.6228, time 5233.47ms 
iter 3267: loss 2.8887, time 5245.34ms 
iter 3268: loss 2.7167, time 5259.54ms 
iter 3269: loss 2.4171, time 5247.53ms 
iter 3270: loss 2.6511, time 5254.62ms 
iter 3271: loss 2.5482, time 5254.60ms 
iter 3272: loss 2.6787, time 5247.13ms 
iter 3273: loss 2.5100, time 5261.55ms 
iter 3274: loss 2.7259, time 5242.43ms 
iter 3275: loss 2.5059, time 5257.57ms 
iter 3276: loss 2.8115, time 5248.99ms 
iter 3277: loss 2.7200, time 5245.83ms 
iter 3278: loss 2.8206, time 5249.26ms 
iter 3279: loss 2.5667, time 5255.50ms 
iter 3280: loss 2.7692, time 5265.01ms 
iter 3281: loss 2.7611, time 5254.47ms 
iter 3282: loss 2.7154, time 5262.91ms 
iter 3283: loss 2.6233, time 5255.74ms 
iter 3284: loss 2.7073, time 5257.66ms 
iter 3285: loss 2.7719, time 5258.42ms 
iter 3286: loss 2.5646, time 5261.21ms 
iter 3287: loss 2.8574, time 5257.55ms 
iter 3288: loss 2.7463, time 5256.44ms 
iter 3289: loss 2.4680, time 5255.15ms 
iter 3290: loss 2.7846, time 5254.23ms 
iter 3291: loss 2.8286, time 5256.53ms 
iter 3292: loss 2.6654, time 5257.05ms 
iter 3293: loss 2.5060, time 5232.78ms 
iter 3294: loss 2.5513, time 5230.13ms 
iter 3295: loss 2.6929, time 5265.84ms 
iter 3296: loss 2.7191, time 5253.26ms 
iter 3297: loss 2.8246, time 5250.21ms 
iter 3298: loss 2.6302, time 5259.38ms 
iter 3299: loss 2.8100, time 5248.94ms 
step 3300: train loss 2.7109, val loss 2.8655
iter 3300: loss 2.4849, time 20013.84ms 
iter 3301: loss 2.6108, time 5278.08ms 
iter 3302: loss 2.6457, time 5306.07ms 
iter 3303: loss 2.6394, time 5298.34ms 
iter 3304: loss 2.6742, time 5311.30ms 
iter 3305: loss 2.6643, time 5190.79ms 
iter 3306: loss 2.7915, time 5273.81ms 
iter 3307: loss 2.7153, time 5289.13ms 
iter 3308: loss 2.5634, time 5279.12ms 
iter 3309: loss 2.6648, time 5255.41ms 
iter 3310: loss 2.7384, time 5255.54ms 
iter 3311: loss 2.4404, time 5249.33ms 
iter 3312: loss 2.7489, time 5177.21ms 
iter 3313: loss 2.7319, time 5192.29ms 
iter 3314: loss 2.4721, time 5252.16ms 
iter 3315: loss 2.6649, time 5262.70ms 
iter 3316: loss 2.7130, time 5252.57ms 
iter 3317: loss 2.7758, time 5255.87ms 
iter 3318: loss 2.7680, time 5253.76ms 
iter 3319: loss 2.6228, time 5212.21ms 
iter 3320: loss 2.6724, time 5258.20ms 
iter 3321: loss 2.6015, time 5266.33ms 
iter 3322: loss 2.8862, time 5262.55ms 
iter 3323: loss 2.7619, time 5261.03ms 
iter 3324: loss 2.7356, time 5256.30ms 
iter 3325: loss 2.6370, time 5255.52ms 
iter 3326: loss 2.7566, time 5233.24ms 
iter 3327: loss 2.6070, time 5247.87ms 
iter 3328: loss 2.7455, time 5253.61ms 
iter 3329: loss 2.5931, time 5255.75ms 
iter 3330: loss 2.7284, time 5265.08ms 
iter 3331: loss 2.6582, time 5263.79ms 
iter 3332: loss 2.8151, time 5257.99ms 
iter 3333: loss 2.6757, time 5201.29ms 
iter 3334: loss 2.6936, time 5105.45ms 
iter 3335: loss 2.7689, time 5196.76ms 
iter 3336: loss 2.8155, time 5225.44ms 
iter 3337: loss 2.7641, time 5219.16ms 
iter 3338: loss 2.6943, time 5210.55ms 
iter 3339: loss 2.5130, time 5213.68ms 
iter 3340: loss 2.6897, time 5233.31ms 
iter 3341: loss 2.6085, time 5216.87ms 
iter 3342: loss 2.8942, time 5276.50ms 
iter 3343: loss 2.6554, time 5267.71ms 
iter 3344: loss 2.6973, time 5272.01ms 
iter 3345: loss 2.6557, time 5195.29ms 
iter 3346: loss 2.6450, time 5259.60ms 
iter 3347: loss 2.5719, time 5260.55ms 
iter 3348: loss 2.6937, time 5262.57ms 
iter 3349: loss 2.7855, time 5230.84ms 
step 3350: train loss 2.6955, val loss 2.8586
iter 3350: loss 2.6740, time 19984.07ms 
iter 3351: loss 2.8001, time 5259.60ms 
iter 3352: loss 2.5927, time 5259.93ms 
iter 3353: loss 2.6692, time 5257.85ms 
iter 3354: loss 2.6155, time 5251.33ms 
iter 3355: loss 2.6055, time 5249.51ms 
iter 3356: loss 2.7066, time 5252.90ms 
iter 3357: loss 2.6512, time 5263.48ms 
iter 3358: loss 2.6820, time 5273.10ms 
iter 3359: loss 2.6427, time 5274.72ms 
iter 3360: loss 2.7626, time 5261.37ms 
iter 3361: loss 2.8228, time 5266.17ms 
iter 3362: loss 2.7776, time 5263.30ms 
iter 3363: loss 2.6629, time 5268.76ms 
iter 3364: loss 2.6284, time 5260.91ms 
iter 3365: loss 2.6450, time 5253.68ms 
iter 3366: loss 2.8018, time 5256.54ms 
iter 3367: loss 2.4832, time 5237.21ms 
iter 3368: loss 2.8164, time 5235.14ms 
iter 3369: loss 2.5131, time 5243.89ms 
iter 3370: loss 2.5329, time 5225.57ms 
iter 3371: loss 2.6224, time 5239.10ms 
iter 3372: loss 2.7054, time 5241.95ms 
iter 3373: loss 2.5391, time 5239.09ms 
iter 3374: loss 2.8478, time 5239.95ms 
iter 3375: loss 2.7693, time 5204.48ms 
iter 3376: loss 2.6403, time 5253.73ms 
iter 3377: loss 2.8813, time 5251.54ms 
iter 3378: loss 2.6519, time 5246.42ms 
iter 3379: loss 2.7941, time 5259.43ms 
iter 3380: loss 2.9315, time 5237.56ms 
iter 3381: loss 2.8628, time 5237.64ms 
iter 3382: loss 2.7819, time 5259.67ms 
iter 3383: loss 2.8484, time 5285.89ms 
iter 3384: loss 2.6781, time 5279.35ms 
iter 3385: loss 2.6372, time 5272.45ms 
iter 3386: loss 2.7711, time 5278.89ms 
iter 3387: loss 2.7365, time 5259.77ms 
iter 3388: loss 2.6688, time 5259.72ms 
iter 3389: loss 2.6954, time 5241.29ms 
iter 3390: loss 2.8833, time 5226.03ms 
iter 3391: loss 2.6390, time 5243.55ms 
iter 3392: loss 2.4376, time 5239.32ms 
iter 3393: loss 2.5507, time 5232.65ms 
iter 3394: loss 2.6562, time 5233.33ms 
iter 3395: loss 2.5588, time 5246.02ms 
iter 3396: loss 2.9450, time 5235.32ms 
iter 3397: loss 2.6628, time 5252.41ms 
iter 3398: loss 2.7119, time 5242.08ms 
iter 3399: loss 2.9770, time 5233.50ms 
step 3400: train loss 2.6752, val loss 2.8575
iter 3400: loss 2.5613, time 20087.38ms 
iter 3401: loss 2.5303, time 5275.39ms 
iter 3402: loss 2.8492, time 5257.42ms 
iter 3403: loss 2.8558, time 5262.88ms 
iter 3404: loss 2.6246, time 5257.28ms 
iter 3405: loss 2.6497, time 5250.73ms 
iter 3406: loss 2.6927, time 5273.92ms 
iter 3407: loss 2.6028, time 5269.70ms 
iter 3408: loss 2.8257, time 5257.82ms 
iter 3409: loss 2.7152, time 5258.89ms 
iter 3410: loss 2.6304, time 5259.69ms 
iter 3411: loss 2.8854, time 5256.17ms 
iter 3412: loss 2.6223, time 5263.49ms 
iter 3413: loss 2.6031, time 5262.26ms 
iter 3414: loss 2.6772, time 5245.02ms 
iter 3415: loss 2.7239, time 5244.54ms 
iter 3416: loss 2.6756, time 5254.14ms 
iter 3417: loss 2.6289, time 5259.90ms 
iter 3418: loss 2.8070, time 5257.18ms 
iter 3419: loss 2.2427, time 5269.09ms 
iter 3420: loss 2.3818, time 5258.67ms 
iter 3421: loss 2.7144, time 5254.90ms 
iter 3422: loss 2.7231, time 5255.77ms 
iter 3423: loss 2.5864, time 5261.30ms 
iter 3424: loss 2.6933, time 5254.83ms 
iter 3425: loss 2.6848, time 5256.39ms 
iter 3426: loss 2.7384, time 5259.96ms 
iter 3427: loss 2.7038, time 5258.89ms 
iter 3428: loss 2.8046, time 5269.16ms 
iter 3429: loss 2.8066, time 5258.80ms 
iter 3430: loss 2.7462, time 5251.88ms 
iter 3431: loss 2.6422, time 5254.20ms 
iter 3432: loss 2.6325, time 5264.52ms 
iter 3433: loss 2.6752, time 5255.31ms 
iter 3434: loss 2.6742, time 5255.93ms 
iter 3435: loss 2.5994, time 5260.74ms 
iter 3436: loss 2.8455, time 5250.18ms 
iter 3437: loss 2.7803, time 5252.42ms 
iter 3438: loss 2.5454, time 5257.77ms 
iter 3439: loss 2.7605, time 5256.17ms 
iter 3440: loss 2.7015, time 5258.50ms 
iter 3441: loss 2.6303, time 5258.26ms 
iter 3442: loss 2.8896, time 5252.82ms 
iter 3443: loss 2.6387, time 5254.14ms 
iter 3444: loss 2.8196, time 5265.47ms 
iter 3445: loss 2.5522, time 5262.25ms 
iter 3446: loss 2.6326, time 5254.26ms 
iter 3447: loss 2.6666, time 5273.40ms 
iter 3448: loss 2.6492, time 5256.21ms 
iter 3449: loss 2.7508, time 5239.07ms 
step 3450: train loss 2.6888, val loss 2.8503
iter 3450: loss 2.6376, time 20110.69ms 
iter 3451: loss 3.0031, time 5271.09ms 
iter 3452: loss 2.8146, time 5254.76ms 
iter 3453: loss 2.9185, time 5256.93ms 
iter 3454: loss 2.7204, time 5259.04ms 
iter 3455: loss 2.5500, time 5255.76ms 
iter 3456: loss 2.8426, time 5266.84ms 
iter 3457: loss 2.6607, time 5262.21ms 
iter 3458: loss 2.8253, time 5258.61ms 
iter 3459: loss 2.8239, time 5257.80ms 
iter 3460: loss 2.6891, time 5256.38ms 
iter 3461: loss 2.5604, time 5252.62ms 
iter 3462: loss 2.4903, time 5253.90ms 
iter 3463: loss 2.5819, time 5263.30ms 
iter 3464: loss 2.5761, time 5252.58ms 
iter 3465: loss 2.6707, time 5259.28ms 
iter 3466: loss 2.8560, time 5256.76ms 
iter 3467: loss 2.7335, time 5267.57ms 
iter 3468: loss 2.6668, time 5253.15ms 
iter 3469: loss 2.6840, time 5258.84ms 
iter 3470: loss 2.5272, time 5251.35ms 
iter 3471: loss 2.9288, time 5270.32ms 
iter 3472: loss 2.5991, time 5264.25ms 
iter 3473: loss 2.7704, time 5260.06ms 
iter 3474: loss 2.8304, time 5258.84ms 
iter 3475: loss 2.7047, time 5272.44ms 
iter 3476: loss 2.6424, time 5258.09ms 
iter 3477: loss 2.7880, time 5260.59ms 
iter 3478: loss 2.6185, time 5262.60ms 
iter 3479: loss 2.7241, time 5260.72ms 
iter 3480: loss 2.6526, time 5257.92ms 
iter 3481: loss 2.6269, time 5264.97ms 
iter 3482: loss 2.4936, time 5255.92ms 
iter 3483: loss 2.6770, time 5267.46ms 
iter 3484: loss 2.8995, time 5257.57ms 
iter 3485: loss 2.6508, time 5259.24ms 
iter 3486: loss 2.6265, time 5258.07ms 
iter 3487: loss 2.5285, time 5261.29ms 
iter 3488: loss 2.4778, time 5266.11ms 
iter 3489: loss 2.7295, time 5258.31ms 
iter 3490: loss 2.6916, time 5257.56ms 
iter 3491: loss 2.7754, time 5258.62ms 
iter 3492: loss 2.6518, time 5254.94ms 
iter 3493: loss 2.6972, time 5266.47ms 
iter 3494: loss 2.6755, time 5257.11ms 
iter 3495: loss 2.6950, time 5259.09ms 
iter 3496: loss 2.5849, time 5258.94ms 
iter 3497: loss 2.6742, time 5258.59ms 
iter 3498: loss 2.4865, time 5257.79ms 
iter 3499: loss 2.6619, time 5256.99ms 
step 3500: train loss 2.6813, val loss 2.8565
iter 3500: loss 2.7995, time 20028.10ms 
iter 3501: loss 2.7540, time 5262.70ms 
iter 3502: loss 2.5431, time 5263.21ms 
iter 3503: loss 2.6573, time 5260.65ms 
iter 3504: loss 2.7520, time 5218.12ms 
iter 3505: loss 2.7732, time 5255.15ms 
iter 3506: loss 2.8000, time 5265.93ms 
iter 3507: loss 2.6529, time 5252.84ms 
iter 3508: loss 2.6041, time 5249.23ms 
iter 3509: loss 2.9282, time 5249.39ms 
iter 3510: loss 2.6845, time 5251.10ms 
iter 3511: loss 2.4887, time 5249.34ms 
iter 3512: loss 2.4870, time 5257.81ms 
iter 3513: loss 2.7320, time 5257.62ms 
iter 3514: loss 2.8488, time 5248.57ms 
iter 3515: loss 2.6128, time 5250.32ms 
iter 3516: loss 2.5814, time 5249.31ms 
iter 3517: loss 2.6308, time 5253.08ms 
iter 3518: loss 2.6175, time 5247.68ms 
iter 3519: loss 2.4374, time 5261.27ms 
iter 3520: loss 2.7389, time 5254.28ms 
iter 3521: loss 2.6170, time 5253.52ms 
iter 3522: loss 2.5869, time 5251.45ms 
iter 3523: loss 2.6172, time 5246.88ms 
iter 3524: loss 2.6513, time 5248.79ms 
iter 3525: loss 2.6649, time 5252.12ms 
iter 3526: loss 2.5047, time 5260.97ms 
iter 3527: loss 2.6989, time 5250.98ms 
iter 3528: loss 2.4977, time 5248.15ms 
iter 3529: loss 2.6045, time 5249.42ms 
iter 3530: loss 2.8346, time 5253.64ms 
iter 3531: loss 2.6085, time 5251.47ms 
iter 3532: loss 2.7253, time 5260.87ms 
iter 3533: loss 2.8263, time 5255.30ms 
iter 3534: loss 2.6877, time 5252.18ms 
iter 3535: loss 2.5979, time 5252.01ms 
iter 3536: loss 2.6120, time 5247.52ms 
iter 3537: loss 2.5404, time 5247.94ms 
iter 3538: loss 2.8606, time 5257.48ms 
iter 3539: loss 2.8535, time 5255.89ms 
iter 3540: loss 2.7545, time 5251.02ms 
iter 3541: loss 2.5328, time 5253.41ms 
iter 3542: loss 2.3722, time 5260.18ms 
iter 3543: loss 2.9419, time 5257.69ms 
iter 3544: loss 2.5891, time 5257.93ms 
iter 3545: loss 2.7526, time 5259.33ms 
iter 3546: loss 2.6898, time 5260.98ms 
iter 3547: loss 2.5324, time 5251.15ms 
iter 3548: loss 2.7442, time 5260.20ms 
iter 3549: loss 2.7349, time 5259.38ms 
step 3550: train loss 2.6779, val loss 2.8462
iter 3550: loss 2.7037, time 20017.57ms 
iter 3551: loss 2.7918, time 5252.88ms 
iter 3552: loss 2.6167, time 5263.27ms 
iter 3553: loss 2.6695, time 5255.92ms 
iter 3554: loss 2.7194, time 5254.22ms 
iter 3555: loss 2.6387, time 5251.50ms 
iter 3556: loss 2.5721, time 5253.52ms 
iter 3557: loss 2.7897, time 5254.83ms 
iter 3558: loss 2.7288, time 5249.69ms 
iter 3559: loss 2.6180, time 5268.80ms 
iter 3560: loss 2.5686, time 5278.53ms 
iter 3561: loss 2.9685, time 5304.20ms 
iter 3562: loss 2.4727, time 5263.15ms 
iter 3563: loss 2.6447, time 5263.78ms 
iter 3564: loss 2.8231, time 5262.44ms 
iter 3565: loss 2.5311, time 5264.38ms 
iter 3566: loss 2.6799, time 5264.14ms 
iter 3567: loss 2.5043, time 5260.37ms 
iter 3568: loss 2.5276, time 5256.60ms 
iter 3569: loss 2.7154, time 5253.22ms 
iter 3570: loss 2.8896, time 5256.71ms 
iter 3571: loss 2.6606, time 5251.99ms 
iter 3572: loss 2.5223, time 5260.99ms 
iter 3573: loss 2.7882, time 5252.14ms 
iter 3574: loss 2.6058, time 5260.08ms 
iter 3575: loss 2.7042, time 5272.01ms 
iter 3576: loss 2.5436, time 5250.01ms 
iter 3577: loss 2.7528, time 5240.75ms 
iter 3578: loss 2.4016, time 5210.62ms 
iter 3579: loss 2.6351, time 5266.95ms 
iter 3580: loss 2.7127, time 5234.95ms 
iter 3581: loss 2.5669, time 5240.75ms 
iter 3582: loss 2.6020, time 5233.71ms 
iter 3583: loss 2.5189, time 5250.00ms 
iter 3584: loss 2.5876, time 5240.52ms 
iter 3585: loss 2.5385, time 5236.85ms 
iter 3586: loss 2.6934, time 5250.21ms 
iter 3587: loss 2.7699, time 5216.95ms 
iter 3588: loss 2.8062, time 5260.68ms 
iter 3589: loss 2.5169, time 5242.25ms 
iter 3590: loss 2.5011, time 5244.95ms 
iter 3591: loss 2.6698, time 5248.82ms 
iter 3592: loss 2.6230, time 5250.41ms 
iter 3593: loss 2.6364, time 5257.69ms 
iter 3594: loss 2.6514, time 5251.81ms 
iter 3595: loss 2.6991, time 5257.58ms 
iter 3596: loss 2.7308, time 5248.98ms 
iter 3597: loss 2.8643, time 5222.32ms 
iter 3598: loss 2.5426, time 5246.79ms 
iter 3599: loss 2.6567, time 5253.02ms 
step 3600: train loss 2.6497, val loss 2.8588
iter 3600: loss 2.7049, time 19871.49ms 
iter 3601: loss 2.7381, time 5260.67ms 
iter 3602: loss 2.7242, time 5267.12ms 
iter 3603: loss 2.9797, time 5282.30ms 
iter 3604: loss 2.8369, time 5267.82ms 
iter 3605: loss 2.7791, time 5269.10ms 
iter 3606: loss 2.4693, time 5256.33ms 
iter 3607: loss 2.7918, time 5250.05ms 
iter 3608: loss 2.4381, time 5263.48ms 
iter 3609: loss 2.9414, time 5264.82ms 
iter 3610: loss 2.5986, time 5162.57ms 
iter 3611: loss 2.5292, time 5122.89ms 
iter 3612: loss 2.6352, time 5115.23ms 
iter 3613: loss 2.7861, time 5216.59ms 
iter 3614: loss 2.5642, time 5257.08ms 
iter 3615: loss 2.7025, time 5273.13ms 
iter 3616: loss 2.7793, time 5262.17ms 
iter 3617: loss 2.7497, time 5268.08ms 
iter 3618: loss 2.6895, time 5249.49ms 
iter 3619: loss 2.7424, time 5260.84ms 
iter 3620: loss 2.6617, time 5259.46ms 
iter 3621: loss 2.6948, time 5259.41ms 
iter 3622: loss 2.5590, time 5272.94ms 
iter 3623: loss 2.6265, time 5267.36ms 
iter 3624: loss 2.6936, time 5258.69ms 
iter 3625: loss 2.6555, time 5259.64ms 
iter 3626: loss 2.7362, time 5255.85ms 
iter 3627: loss 2.7335, time 5265.40ms 
iter 3628: loss 2.6205, time 5254.30ms 
iter 3629: loss 2.6172, time 5257.15ms 
iter 3630: loss 2.5343, time 5258.85ms 
iter 3631: loss 2.6140, time 5278.62ms 
iter 3632: loss 2.7093, time 5263.58ms 
iter 3633: loss 2.6704, time 5259.70ms 
iter 3634: loss 2.6078, time 5264.50ms 
iter 3635: loss 2.7490, time 5264.97ms 
iter 3636: loss 2.6478, time 5297.77ms 
iter 3637: loss 2.7338, time 5263.95ms 
iter 3638: loss 2.6037, time 5340.34ms 
iter 3639: loss 2.6971, time 5341.33ms 
iter 3640: loss 2.7418, time 5358.24ms 
iter 3641: loss 2.9738, time 5288.97ms 
iter 3642: loss 2.8715, time 5306.18ms 
iter 3643: loss 2.6171, time 5255.60ms 
iter 3644: loss 2.7327, time 5276.24ms 
iter 3645: loss 2.4666, time 5260.12ms 
iter 3646: loss 2.5738, time 5262.19ms 
iter 3647: loss 2.7999, time 5270.22ms 
iter 3648: loss 2.5478, time 5262.63ms 
iter 3649: loss 2.7303, time 5231.93ms 
step 3650: train loss 2.6681, val loss 2.8534
iter 3650: loss 2.6648, time 20083.29ms 
iter 3651: loss 2.6365, time 5332.72ms 
iter 3652: loss 2.6034, time 5269.75ms 
iter 3653: loss 2.7301, time 5264.41ms 
iter 3654: loss 2.6623, time 5284.63ms 
iter 3655: loss 2.7679, time 5282.03ms 
iter 3656: loss 2.7163, time 5258.32ms 
iter 3657: loss 2.7309, time 5252.89ms 
iter 3658: loss 2.6221, time 5257.30ms 
iter 3659: loss 2.7234, time 5262.79ms 
iter 3660: loss 2.5923, time 5258.20ms 
iter 3661: loss 2.7156, time 5271.91ms 
iter 3662: loss 2.5465, time 5264.57ms 
iter 3663: loss 2.7287, time 5262.33ms 
iter 3664: loss 2.4462, time 5281.53ms 
iter 3665: loss 2.7068, time 5314.09ms 
iter 3666: loss 2.5238, time 5307.86ms 
iter 3667: loss 2.5163, time 5326.91ms 
iter 3668: loss 2.5761, time 5257.59ms 
iter 3669: loss 2.6407, time 5261.36ms 
iter 3670: loss 2.5919, time 5252.56ms 
iter 3671: loss 2.5750, time 5267.51ms 
iter 3672: loss 2.6366, time 5268.85ms 
iter 3673: loss 2.3881, time 5255.58ms 
iter 3674: loss 2.6943, time 5247.33ms 
iter 3675: loss 2.5788, time 5247.72ms 
iter 3676: loss 2.6855, time 5251.34ms 
iter 3677: loss 2.6869, time 5255.63ms 
iter 3678: loss 2.7413, time 5265.02ms 
iter 3679: loss 2.8286, time 5257.86ms 
iter 3680: loss 2.7313, time 5252.11ms 
iter 3681: loss 2.4138, time 5251.14ms 
iter 3682: loss 2.6451, time 5254.83ms 
iter 3683: loss 2.6161, time 5249.44ms 
iter 3684: loss 2.5748, time 5259.55ms 
iter 3685: loss 2.8518, time 5259.43ms 
iter 3686: loss 2.4759, time 5249.52ms 
iter 3687: loss 2.6133, time 5253.28ms 
iter 3688: loss 2.6124, time 5226.78ms 
iter 3689: loss 2.7418, time 5252.81ms 
iter 3690: loss 2.7055, time 5260.73ms 
iter 3691: loss 2.7161, time 5309.13ms 
iter 3692: loss 2.6488, time 5262.66ms 
iter 3693: loss 2.6119, time 5299.56ms 
iter 3694: loss 2.6401, time 5322.98ms 
iter 3695: loss 2.5817, time 5274.75ms 
iter 3696: loss 2.6943, time 5298.15ms 
iter 3697: loss 2.7104, time 5208.08ms 
iter 3698: loss 2.4175, time 5172.21ms 
iter 3699: loss 2.6946, time 5170.45ms 
step 3700: train loss 2.6607, val loss 2.8512
iter 3700: loss 2.6323, time 19779.78ms 
iter 3701: loss 2.8338, time 5209.39ms 
iter 3702: loss 2.6989, time 5217.43ms 
iter 3703: loss 2.6067, time 5243.11ms 
iter 3704: loss 2.4814, time 5190.84ms 
iter 3705: loss 2.4964, time 5189.59ms 
iter 3706: loss 2.4763, time 5274.33ms 
iter 3707: loss 2.6653, time 5263.05ms 
iter 3708: loss 2.8438, time 5252.67ms 
iter 3709: loss 2.6487, time 5257.10ms 
iter 3710: loss 2.5644, time 5258.32ms 
iter 3711: loss 2.8045, time 5258.02ms 
iter 3712: loss 2.6380, time 5276.47ms 
iter 3713: loss 3.0113, time 5267.24ms 
iter 3714: loss 2.6394, time 5263.27ms 
iter 3715: loss 2.5342, time 5255.36ms 
iter 3716: loss 2.5912, time 5255.68ms 
iter 3717: loss 2.6834, time 5268.19ms 
iter 3718: loss 2.7057, time 5265.95ms 
iter 3719: loss 2.6712, time 5275.19ms 
iter 3720: loss 2.7860, time 5266.75ms 
iter 3721: loss 2.6785, time 5260.67ms 
iter 3722: loss 2.7128, time 5235.62ms 
iter 3723: loss 2.6416, time 5239.72ms 
iter 3724: loss 2.8943, time 5237.37ms 
iter 3725: loss 2.6548, time 5248.83ms 
iter 3726: loss 2.6945, time 5266.80ms 
iter 3727: loss 2.6932, time 5264.60ms 
iter 3728: loss 2.6621, time 5261.46ms 
iter 3729: loss 2.6748, time 5253.70ms 
iter 3730: loss 2.7890, time 5255.52ms 
iter 3731: loss 2.6903, time 5258.46ms 
iter 3732: loss 2.6655, time 5253.03ms 
iter 3733: loss 2.7050, time 5263.60ms 
iter 3734: loss 2.8091, time 5261.35ms 
iter 3735: loss 2.5034, time 5245.87ms 
iter 3736: loss 2.9179, time 5257.82ms 
iter 3737: loss 2.7203, time 5255.49ms 
iter 3738: loss 2.5186, time 5253.50ms 
iter 3739: loss 2.5806, time 5254.14ms 
iter 3740: loss 2.7540, time 5257.06ms 
iter 3741: loss 2.5794, time 5265.37ms 
iter 3742: loss 2.4963, time 5251.30ms 
iter 3743: loss 2.7116, time 5253.09ms 
iter 3744: loss 2.7957, time 5253.74ms 
iter 3745: loss 2.7101, time 5253.01ms 
iter 3746: loss 2.5591, time 5261.52ms 
iter 3747: loss 2.4821, time 5261.72ms 
iter 3748: loss 2.6566, time 5251.47ms 
iter 3749: loss 2.5710, time 5243.68ms 
step 3750: train loss 2.6553, val loss 2.8453
iter 3750: loss 2.6156, time 19999.41ms 
iter 3751: loss 2.6533, time 5242.27ms 
iter 3752: loss 2.7305, time 5236.60ms 
iter 3753: loss 2.6426, time 5234.71ms 
iter 3754: loss 2.6388, time 5214.51ms 
iter 3755: loss 2.5769, time 5243.16ms 
iter 3756: loss 2.5320, time 5239.98ms 
iter 3757: loss 2.5490, time 5230.24ms 
iter 3758: loss 2.5150, time 5247.31ms 
iter 3759: loss 2.6730, time 5236.35ms 
iter 3760: loss 2.6725, time 5239.11ms 
iter 3761: loss 2.7525, time 5250.51ms 
iter 3762: loss 2.9316, time 5245.42ms 
iter 3763: loss 2.6684, time 5238.45ms 
iter 3764: loss 2.4820, time 5241.27ms 
iter 3765: loss 2.5515, time 5234.57ms 
iter 3766: loss 2.7637, time 5244.54ms 
iter 3767: loss 2.4691, time 5243.56ms 
iter 3768: loss 2.7704, time 5262.32ms 
iter 3769: loss 2.4947, time 5247.10ms 
iter 3770: loss 2.4489, time 5248.78ms 
iter 3771: loss 2.6907, time 5234.20ms 
iter 3772: loss 2.6755, time 5252.78ms 
iter 3773: loss 2.8690, time 5249.85ms 
iter 3774: loss 2.3413, time 5249.78ms 
iter 3775: loss 2.8055, time 5233.41ms 
iter 3776: loss 2.6936, time 5235.46ms 
iter 3777: loss 2.7981, time 5243.95ms 
iter 3778: loss 2.5894, time 5240.34ms 
iter 3779: loss 2.7001, time 5237.50ms 
iter 3780: loss 2.6442, time 5233.22ms 
iter 3781: loss 2.5374, time 5242.56ms 
iter 3782: loss 2.6152, time 5238.54ms 
iter 3783: loss 2.7764, time 5239.71ms 
iter 3784: loss 2.5834, time 5250.31ms 
iter 3785: loss 2.7516, time 5236.05ms 
iter 3786: loss 2.6270, time 5237.54ms 
iter 3787: loss 2.7521, time 5247.77ms 
iter 3788: loss 2.8754, time 5238.86ms 
iter 3789: loss 2.4113, time 5246.31ms 
iter 3790: loss 2.9148, time 5246.16ms 
iter 3791: loss 2.7527, time 5192.39ms 
iter 3792: loss 2.6992, time 5235.45ms 
iter 3793: loss 2.7957, time 5232.39ms 
iter 3794: loss 2.8663, time 5243.49ms 
iter 3795: loss 2.4390, time 5249.37ms 
iter 3796: loss 2.6367, time 5248.32ms 
iter 3797: loss 2.7940, time 5242.04ms 
iter 3798: loss 2.4857, time 5206.78ms 
iter 3799: loss 2.6072, time 5234.00ms 
step 3800: train loss 2.6441, val loss 2.8385
iter 3800: loss 2.5081, time 19949.62ms 
iter 3801: loss 2.7661, time 5059.20ms 
iter 3802: loss 2.9148, time 5065.00ms 
iter 3803: loss 2.6765, time 5233.70ms 
iter 3804: loss 2.7039, time 5259.37ms 
iter 3805: loss 2.6011, time 5291.37ms 
iter 3806: loss 2.7366, time 5284.70ms 
iter 3807: loss 2.6723, time 5260.24ms 
iter 3808: loss 2.7652, time 5252.27ms 
iter 3809: loss 2.6087, time 5258.01ms 
iter 3810: loss 2.6529, time 5265.60ms 
iter 3811: loss 2.7483, time 5275.31ms 
iter 3812: loss 2.7736, time 5269.30ms 
iter 3813: loss 2.7204, time 5263.29ms 
iter 3814: loss 2.7344, time 5268.02ms 
iter 3815: loss 2.7362, time 5263.47ms 
iter 3816: loss 2.6941, time 5274.41ms 
iter 3817: loss 2.5439, time 5277.98ms 
iter 3818: loss 2.5601, time 5266.47ms 
iter 3819: loss 2.7257, time 5263.81ms 
iter 3820: loss 2.6174, time 5267.39ms 
iter 3821: loss 2.7601, time 5284.36ms 
iter 3822: loss 2.5284, time 5273.86ms 
iter 3823: loss 2.9101, time 5259.29ms 
iter 3824: loss 2.6347, time 5265.73ms 
iter 3825: loss 2.6826, time 5257.45ms 
iter 3826: loss 2.6687, time 5264.34ms 
iter 3827: loss 2.9171, time 5247.68ms 
iter 3828: loss 2.9317, time 5253.33ms 
iter 3829: loss 2.6057, time 5245.54ms 
iter 3830: loss 2.6758, time 5254.17ms 
iter 3831: loss 2.6908, time 5246.60ms 
iter 3832: loss 2.5627, time 5248.37ms 
iter 3833: loss 2.5510, time 5262.16ms 
iter 3834: loss 2.6500, time 5245.55ms 
iter 3835: loss 2.4974, time 5246.65ms 
iter 3836: loss 2.9119, time 5244.10ms 
iter 3837: loss 2.5999, time 5244.55ms 
iter 3838: loss 2.6758, time 5252.27ms 
iter 3839: loss 2.5684, time 5249.15ms 
iter 3840: loss 2.6714, time 5239.79ms 
iter 3841: loss 2.6148, time 5241.11ms 
iter 3842: loss 2.7413, time 5242.45ms 
iter 3843: loss 2.5691, time 5246.95ms 
iter 3844: loss 2.4601, time 5250.10ms 
iter 3845: loss 2.6728, time 5258.78ms 
iter 3846: loss 2.8634, time 5245.37ms 
iter 3847: loss 2.9040, time 5266.11ms 
iter 3848: loss 2.4858, time 5241.92ms 
iter 3849: loss 2.5535, time 5261.26ms 
step 3850: train loss 2.6537, val loss 2.8531
iter 3850: loss 2.5846, time 20051.77ms 
iter 3851: loss 2.7111, time 5253.96ms 
iter 3852: loss 2.6815, time 5256.96ms 
iter 3853: loss 2.5964, time 5256.30ms 
iter 3854: loss 2.5361, time 5254.72ms 
iter 3855: loss 2.5535, time 5250.79ms 
iter 3856: loss 2.5826, time 5248.43ms 
iter 3857: loss 2.9516, time 5265.23ms 
iter 3858: loss 2.7011, time 5255.90ms 
iter 3859: loss 2.8044, time 5255.24ms 
iter 3860: loss 2.6697, time 5250.49ms 
iter 3861: loss 2.6180, time 5256.60ms 
iter 3862: loss 2.6510, time 5233.22ms 
iter 3863: loss 2.8477, time 5249.89ms 
iter 3864: loss 2.5985, time 5263.51ms 
iter 3865: loss 2.7414, time 5257.46ms 
iter 3866: loss 2.7334, time 5250.60ms 
iter 3867: loss 2.5650, time 5254.90ms 
iter 3868: loss 2.7098, time 5256.73ms 
iter 3869: loss 2.8106, time 5258.60ms 
iter 3870: loss 2.8473, time 5261.48ms 
iter 3871: loss 2.5231, time 5269.05ms 
iter 3872: loss 2.5675, time 5255.17ms 
iter 3873: loss 2.6198, time 5252.06ms 
iter 3874: loss 2.5565, time 5252.72ms 
iter 3875: loss 2.6609, time 5260.60ms 
iter 3876: loss 2.6602, time 5260.14ms 
iter 3877: loss 2.4587, time 5271.83ms 
iter 3878: loss 2.7810, time 5273.48ms 
iter 3879: loss 2.6348, time 5260.63ms 
iter 3880: loss 2.5916, time 5266.23ms 
iter 3881: loss 2.6336, time 5260.11ms 
iter 3882: loss 2.7773, time 5255.53ms 
iter 3883: loss 2.8067, time 5263.30ms 
iter 3884: loss 2.5740, time 5261.13ms 
iter 3885: loss 2.7642, time 5258.28ms 
iter 3886: loss 2.5988, time 5257.39ms 
iter 3887: loss 2.7514, time 5241.76ms 
iter 3888: loss 2.7371, time 5243.48ms 
iter 3889: loss 2.8157, time 5228.12ms 
iter 3890: loss 2.4485, time 5261.43ms 
iter 3891: loss 2.6798, time 5241.44ms 
iter 3892: loss 2.5774, time 5247.12ms 
iter 3893: loss 2.5591, time 5240.83ms 
iter 3894: loss 2.6556, time 5246.78ms 
iter 3895: loss 2.7845, time 5252.55ms 
iter 3896: loss 2.7641, time 5253.20ms 
iter 3897: loss 2.7498, time 5255.40ms 
iter 3898: loss 2.4137, time 5243.65ms 
iter 3899: loss 2.5077, time 5243.75ms 
step 3900: train loss 2.6377, val loss 2.8494
iter 3900: loss 2.5072, time 20020.50ms 
iter 3901: loss 2.5625, time 5262.33ms 
iter 3902: loss 2.5945, time 5258.67ms 
iter 3903: loss 2.5645, time 5286.31ms 
iter 3904: loss 2.7827, time 5308.24ms 
iter 3905: loss 2.6294, time 5238.82ms 
iter 3906: loss 2.6152, time 5253.75ms 
iter 3907: loss 2.6655, time 5270.29ms 
iter 3908: loss 2.6664, time 5267.53ms 
iter 3909: loss 2.6896, time 5262.13ms 
iter 3910: loss 2.7927, time 5265.36ms 
iter 3911: loss 2.8244, time 5286.31ms 
iter 3912: loss 2.4991, time 5258.49ms 
iter 3913: loss 2.8115, time 5325.15ms 
iter 3914: loss 2.6046, time 5298.51ms 
iter 3915: loss 2.6395, time 5334.76ms 
iter 3916: loss 2.8189, time 5287.89ms 
iter 3917: loss 2.4567, time 5334.02ms 
iter 3918: loss 2.4695, time 5344.87ms 
iter 3919: loss 2.8079, time 5337.02ms 
iter 3920: loss 2.5541, time 5343.63ms 
iter 3921: loss 2.5261, time 5335.10ms 
iter 3922: loss 2.6806, time 5326.27ms 
iter 3923: loss 2.7415, time 5302.82ms 
iter 3924: loss 2.6491, time 5269.87ms 
iter 3925: loss 2.5612, time 5291.97ms 
iter 3926: loss 2.5835, time 5254.72ms 
iter 3927: loss 2.6090, time 5259.99ms 
iter 3928: loss 2.6937, time 5255.98ms 
iter 3929: loss 2.6440, time 5283.32ms 
iter 3930: loss 2.7202, time 5253.42ms 
iter 3931: loss 2.5219, time 5264.61ms 
iter 3932: loss 2.5303, time 5257.73ms 
iter 3933: loss 2.6384, time 5256.16ms 
iter 3934: loss 2.5825, time 5254.05ms 
iter 3935: loss 2.4914, time 5259.03ms 
iter 3936: loss 2.6696, time 5258.68ms 
iter 3937: loss 2.7012, time 5259.90ms 
iter 3938: loss 2.7772, time 5298.95ms 
iter 3939: loss 2.6690, time 5271.93ms 
iter 3940: loss 2.7994, time 5321.58ms 
iter 3941: loss 2.5635, time 5298.94ms 
iter 3942: loss 2.5163, time 5272.40ms 
iter 3943: loss 2.7364, time 5265.17ms 
iter 3944: loss 2.7006, time 5259.48ms 
iter 3945: loss 2.6010, time 5251.82ms 
iter 3946: loss 2.6053, time 5251.77ms 
iter 3947: loss 2.5466, time 5255.45ms 
iter 3948: loss 2.5099, time 5259.16ms 
iter 3949: loss 2.5986, time 5255.57ms 
step 3950: train loss 2.6454, val loss 2.8475
iter 3950: loss 2.6207, time 20018.01ms 
iter 3951: loss 2.4533, time 5253.62ms 
iter 3952: loss 2.5086, time 5255.99ms 
iter 3953: loss 2.7264, time 5258.88ms 
iter 3954: loss 2.7423, time 5257.58ms 
iter 3955: loss 2.7037, time 5260.80ms 
iter 3956: loss 2.7736, time 5259.90ms 
iter 3957: loss 2.5208, time 5227.23ms 
iter 3958: loss 2.8077, time 5258.54ms 
iter 3959: loss 2.7092, time 5251.58ms 
iter 3960: loss 2.6514, time 5181.44ms 
iter 3961: loss 2.7786, time 5262.06ms 
iter 3962: loss 2.7201, time 5256.89ms 
iter 3963: loss 2.5256, time 5259.04ms 
iter 3964: loss 2.7821, time 5259.85ms 
iter 3965: loss 2.7917, time 5254.46ms 
iter 3966: loss 2.7026, time 5274.37ms 
iter 3967: loss 2.7371, time 5269.83ms 
iter 3968: loss 2.7816, time 5253.18ms 
iter 3969: loss 2.7172, time 5258.04ms 
iter 3970: loss 2.5641, time 5255.59ms 
iter 3971: loss 2.9340, time 5262.00ms 
iter 3972: loss 2.7786, time 5263.76ms 
iter 3973: loss 2.5956, time 5260.09ms 
iter 3974: loss 2.5829, time 5260.44ms 
iter 3975: loss 2.7327, time 5259.24ms 
iter 3976: loss 2.3870, time 5241.03ms 
iter 3977: loss 2.7968, time 5258.58ms 
iter 3978: loss 2.6043, time 5265.13ms 
iter 3979: loss 2.6380, time 5260.59ms 
iter 3980: loss 2.8053, time 5253.80ms 
iter 3981: loss 2.7653, time 5259.19ms 
iter 3982: loss 2.6847, time 5255.18ms 
iter 3983: loss 2.7917, time 5268.58ms 
iter 3984: loss 2.7226, time 5253.20ms 
iter 3985: loss 2.6213, time 5201.69ms 
iter 3986: loss 2.6680, time 5097.97ms 
iter 3987: loss 2.7129, time 5255.68ms 
iter 3988: loss 2.4734, time 5262.28ms 
iter 3989: loss 2.7174, time 5292.40ms 
iter 3990: loss 2.7494, time 5264.74ms 
iter 3991: loss 2.5431, time 5252.19ms 
iter 3992: loss 2.5617, time 5228.83ms 
iter 3993: loss 2.5070, time 5259.09ms 
iter 3994: loss 2.7004, time 5256.02ms 
iter 3995: loss 2.4673, time 5263.83ms 
iter 3996: loss 2.5863, time 5259.05ms 
iter 3997: loss 2.5540, time 5263.38ms 
iter 3998: loss 2.8356, time 5253.85ms 
iter 3999: loss 2.6552, time 5262.63ms 
step 4000: train loss 2.6421, val loss 2.8460
iter 4000: loss 2.6204, time 20032.99ms 
iter 4001: loss 2.7947, time 5221.45ms 
iter 4002: loss 2.5860, time 5200.86ms 
iter 4003: loss 2.7332, time 5234.83ms 
iter 4004: loss 2.6021, time 5216.08ms 
iter 4005: loss 2.6810, time 5219.09ms 
iter 4006: loss 2.3957, time 5248.06ms 
iter 4007: loss 2.7926, time 5186.15ms 
iter 4008: loss 2.6450, time 5167.48ms 
iter 4009: loss 2.3608, time 5241.71ms 
iter 4010: loss 2.7218, time 5174.99ms 
iter 4011: loss 2.6040, time 5248.58ms 
iter 4012: loss 2.5498, time 5200.21ms 
iter 4013: loss 2.6238, time 5165.15ms 
iter 4014: loss 2.7856, time 5257.68ms 
iter 4015: loss 2.7253, time 5262.85ms 
iter 4016: loss 2.9205, time 5261.60ms 
iter 4017: loss 2.6940, time 5265.60ms 
iter 4018: loss 2.5470, time 5253.38ms 
iter 4019: loss 2.8225, time 5265.54ms 
iter 4020: loss 2.5216, time 5260.85ms 
iter 4021: loss 2.6452, time 5261.14ms 
iter 4022: loss 2.7343, time 5283.71ms 
iter 4023: loss 2.7063, time 5261.14ms 
iter 4024: loss 2.5683, time 5258.07ms 
iter 4025: loss 2.8110, time 5266.48ms 
iter 4026: loss 2.6551, time 5259.32ms 
iter 4027: loss 2.6217, time 5259.48ms 
iter 4028: loss 2.7294, time 5271.84ms 
iter 4029: loss 2.6084, time 5273.74ms 
iter 4030: loss 2.7396, time 5266.30ms 
iter 4031: loss 2.8414, time 5262.66ms 
iter 4032: loss 2.6172, time 5263.51ms 
iter 4033: loss 2.4309, time 5265.91ms 
iter 4034: loss 2.7039, time 5248.25ms 
iter 4035: loss 2.7953, time 5219.56ms 
iter 4036: loss 2.5062, time 5279.82ms 
iter 4037: loss 2.5632, time 5246.24ms 
iter 4038: loss 2.6944, time 5078.05ms 
iter 4039: loss 2.6398, time 5087.40ms 
iter 4040: loss 2.6332, time 5072.34ms 
iter 4041: loss 2.6178, time 5215.55ms 
iter 4042: loss 2.6167, time 5237.46ms 
iter 4043: loss 2.6174, time 5240.99ms 
iter 4044: loss 2.3933, time 5216.69ms 
iter 4045: loss 2.5329, time 5240.98ms 
iter 4046: loss 2.7011, time 5246.88ms 
iter 4047: loss 2.7968, time 5241.05ms 
iter 4048: loss 2.5460, time 5234.43ms 
iter 4049: loss 2.5739, time 5248.60ms 
step 4050: train loss 2.6308, val loss 2.8501
iter 4050: loss 2.7072, time 20051.78ms 
iter 4051: loss 2.5751, time 5275.22ms 
iter 4052: loss 2.5550, time 5270.42ms 
iter 4053: loss 2.6160, time 5261.12ms 
iter 4054: loss 2.6457, time 5270.28ms 
iter 4055: loss 2.6623, time 5266.28ms 
iter 4056: loss 2.6069, time 5270.85ms 
iter 4057: loss 2.8597, time 5205.32ms 
iter 4058: loss 2.4981, time 5265.35ms 
iter 4059: loss 2.6776, time 5268.00ms 
iter 4060: loss 2.6051, time 5265.04ms 
iter 4061: loss 2.6560, time 5276.74ms 
iter 4062: loss 2.7213, time 5260.52ms 
iter 4063: loss 2.6582, time 5266.89ms 
iter 4064: loss 2.8568, time 5266.34ms 
iter 4065: loss 2.7566, time 5264.93ms 
iter 4066: loss 2.6894, time 5266.60ms 
iter 4067: loss 2.5752, time 5264.44ms 
iter 4068: loss 2.6687, time 5238.65ms 
iter 4069: loss 2.6701, time 5254.19ms 
iter 4070: loss 2.6155, time 5247.43ms 
iter 4071: loss 3.0138, time 5241.86ms 
iter 4072: loss 2.7227, time 5247.05ms 
iter 4073: loss 2.7186, time 5248.72ms 
iter 4074: loss 2.6561, time 5239.38ms 
iter 4075: loss 2.5374, time 5244.87ms 
iter 4076: loss 2.6486, time 5241.85ms 
iter 4077: loss 2.6056, time 5242.10ms 
iter 4078: loss 2.7655, time 5247.90ms 
iter 4079: loss 2.6461, time 5254.68ms 
iter 4080: loss 2.7289, time 5242.78ms 
iter 4081: loss 2.6438, time 5240.13ms 
iter 4082: loss 2.4204, time 5244.30ms 
iter 4083: loss 2.7175, time 5243.60ms 
iter 4084: loss 2.5582, time 5259.92ms 
iter 4085: loss 2.5357, time 5252.82ms 
iter 4086: loss 2.8145, time 5243.92ms 
iter 4087: loss 2.4889, time 5244.98ms 
iter 4088: loss 2.7559, time 5243.38ms 
iter 4089: loss 2.4463, time 5240.59ms 
iter 4090: loss 2.7334, time 5250.32ms 
iter 4091: loss 2.4941, time 5258.79ms 
iter 4092: loss 2.6211, time 5245.94ms 
iter 4093: loss 2.5690, time 5244.55ms 
iter 4094: loss 2.4143, time 5256.32ms 
iter 4095: loss 2.4945, time 5246.07ms 
iter 4096: loss 2.6797, time 5254.60ms 
iter 4097: loss 2.6477, time 5251.14ms 
iter 4098: loss 2.7606, time 5251.07ms 
iter 4099: loss 2.4514, time 5252.44ms 
step 4100: train loss 2.6314, val loss 2.8409
iter 4100: loss 2.7646, time 20018.59ms 
iter 4101: loss 2.6963, time 5241.17ms 
iter 4102: loss 2.9789, time 5213.94ms 
iter 4103: loss 2.4986, time 5227.16ms 
iter 4104: loss 2.5745, time 5241.16ms 
iter 4105: loss 2.5442, time 5265.30ms 
iter 4106: loss 2.4079, time 5265.77ms 
iter 4107: loss 2.4872, time 5249.92ms 
iter 4108: loss 2.6889, time 5243.20ms 
iter 4109: loss 2.4937, time 5265.83ms 
iter 4110: loss 2.7260, time 5271.46ms 
iter 4111: loss 2.8967, time 5259.54ms 
iter 4112: loss 2.5876, time 5260.97ms 
iter 4113: loss 2.7582, time 5253.99ms 
iter 4114: loss 2.5752, time 5272.46ms 
iter 4115: loss 2.6763, time 5275.86ms 
iter 4116: loss 2.7505, time 5270.70ms 
iter 4117: loss 2.5753, time 5261.61ms 
iter 4118: loss 2.5972, time 5261.28ms 
iter 4119: loss 2.6046, time 5220.78ms 
iter 4120: loss 2.7252, time 5123.96ms 
iter 4121: loss 2.8304, time 5141.25ms 
iter 4122: loss 2.7653, time 5189.11ms 
iter 4123: loss 2.6365, time 5267.54ms 
iter 4124: loss 2.5614, time 5264.49ms 
iter 4125: loss 2.5482, time 5265.82ms 
iter 4126: loss 2.7144, time 5260.79ms 
iter 4127: loss 2.5914, time 5274.50ms 
iter 4128: loss 2.7111, time 5303.84ms 
iter 4129: loss 2.5120, time 5254.72ms 
iter 4130: loss 2.5853, time 5267.62ms 
iter 4131: loss 2.5111, time 5277.11ms 
iter 4132: loss 2.7355, time 5266.46ms 
iter 4133: loss 2.6259, time 5267.16ms 
iter 4134: loss 2.5279, time 5254.98ms 
iter 4135: loss 2.4760, time 5260.05ms 
iter 4136: loss 2.6644, time 5250.82ms 
iter 4137: loss 2.7750, time 5254.52ms 
iter 4138: loss 2.5269, time 5254.16ms 
iter 4139: loss 2.6633, time 5252.36ms 
iter 4140: loss 2.7390, time 5262.20ms 
iter 4141: loss 2.7418, time 5258.44ms 
iter 4142: loss 2.7268, time 5143.88ms 
iter 4143: loss 2.6517, time 5099.56ms 
iter 4144: loss 2.5282, time 5094.85ms 
iter 4145: loss 2.4908, time 5102.91ms 
iter 4146: loss 2.6277, time 5102.71ms 
iter 4147: loss 2.7395, time 5111.58ms 
iter 4148: loss 2.3675, time 5245.65ms 
iter 4149: loss 2.6196, time 5254.98ms 
step 4150: train loss 2.6324, val loss 2.8558
iter 4150: loss 2.7170, time 20058.50ms 
iter 4151: loss 2.8039, time 5263.13ms 
iter 4152: loss 2.7027, time 5140.18ms 
iter 4153: loss 2.5933, time 5159.33ms 
iter 4154: loss 2.5122, time 5248.92ms 
iter 4155: loss 2.6016, time 5091.02ms 
iter 4156: loss 2.7791, time 5093.43ms 
iter 4157: loss 2.6569, time 5148.73ms 
iter 4158: loss 2.6049, time 5210.75ms 
iter 4159: loss 2.6346, time 5249.00ms 
iter 4160: loss 2.8116, time 5253.83ms 
iter 4161: loss 2.5929, time 5251.67ms 
iter 4162: loss 2.5124, time 5255.00ms 
iter 4163: loss 2.7959, time 5252.05ms 
iter 4164: loss 2.7237, time 5247.95ms 
iter 4165: loss 2.6466, time 5251.41ms 
iter 4166: loss 2.7045, time 5259.40ms 
iter 4167: loss 2.5994, time 5250.52ms 
iter 4168: loss 2.7238, time 5261.50ms 
iter 4169: loss 2.7032, time 5256.20ms 
iter 4170: loss 2.5168, time 5259.75ms 
iter 4171: loss 2.6091, time 5261.54ms 
iter 4172: loss 2.6765, time 5294.29ms 
iter 4173: loss 2.7327, time 5272.88ms 
iter 4174: loss 2.6790, time 5259.33ms 
iter 4175: loss 2.5928, time 5280.04ms 
iter 4176: loss 2.6475, time 5285.00ms 
iter 4177: loss 2.6396, time 5294.56ms 
iter 4178: loss 2.7259, time 5272.07ms 
iter 4179: loss 2.7687, time 5264.99ms 
iter 4180: loss 2.7062, time 5256.09ms 
iter 4181: loss 2.6809, time 5264.22ms 
iter 4182: loss 2.6958, time 5258.33ms 
iter 4183: loss 2.5594, time 5261.95ms 
iter 4184: loss 2.4415, time 5269.21ms 
iter 4185: loss 2.6657, time 5268.54ms 
iter 4186: loss 2.5511, time 5261.40ms 
iter 4187: loss 2.7221, time 5271.43ms 
iter 4188: loss 2.6190, time 5262.08ms 
iter 4189: loss 2.6182, time 5272.27ms 
iter 4190: loss 2.6680, time 5262.02ms 
iter 4191: loss 2.4288, time 5262.88ms 
iter 4192: loss 2.5059, time 5261.80ms 
iter 4193: loss 2.5157, time 5269.62ms 
iter 4194: loss 2.5716, time 5214.55ms 
iter 4195: loss 2.5583, time 5247.10ms 
iter 4196: loss 2.5517, time 5232.67ms 
iter 4197: loss 2.3008, time 5263.43ms 
iter 4198: loss 2.6126, time 5274.08ms 
iter 4199: loss 2.6160, time 5268.24ms 
step 4200: train loss 2.6367, val loss 2.8488
iter 4200: loss 2.3737, time 20077.27ms 
iter 4201: loss 2.4546, time 5264.48ms 
iter 4202: loss 2.6233, time 5259.73ms 
iter 4203: loss 2.8542, time 5262.12ms 
iter 4204: loss 2.6470, time 5250.18ms 
iter 4205: loss 2.5811, time 5264.04ms 
iter 4206: loss 2.6856, time 5260.78ms 
iter 4207: loss 2.6472, time 5254.91ms 
iter 4208: loss 2.4687, time 5247.28ms 
iter 4209: loss 2.6847, time 5250.22ms 
iter 4210: loss 2.4779, time 5260.35ms 
iter 4211: loss 2.5934, time 5259.67ms 
iter 4212: loss 2.5122, time 5254.30ms 
iter 4213: loss 2.6036, time 5251.68ms 
iter 4214: loss 2.5844, time 5255.26ms 
iter 4215: loss 2.6469, time 5260.23ms 
iter 4216: loss 2.7243, time 5265.85ms 
iter 4217: loss 2.6604, time 5261.96ms 
iter 4218: loss 2.5915, time 5259.13ms 
iter 4219: loss 2.7306, time 5252.67ms 
iter 4220: loss 2.6426, time 5254.77ms 
iter 4221: loss 2.5389, time 5262.65ms 
iter 4222: loss 2.7215, time 5276.22ms 
iter 4223: loss 2.5283, time 5276.49ms 
iter 4224: loss 2.7123, time 5267.94ms 
iter 4225: loss 2.5193, time 5258.09ms 
iter 4226: loss 2.5961, time 5257.77ms 
iter 4227: loss 2.5803, time 5263.46ms 
iter 4228: loss 2.4869, time 5268.80ms 
iter 4229: loss 2.6194, time 5257.14ms 
iter 4230: loss 2.4645, time 5255.15ms 
iter 4231: loss 2.6740, time 5253.69ms 
iter 4232: loss 2.5756, time 5262.29ms 
iter 4233: loss 2.5338, time 5236.58ms 
iter 4234: loss 2.8504, time 5259.05ms 
iter 4235: loss 2.6011, time 5262.11ms 
iter 4236: loss 2.5163, time 5273.85ms 
iter 4237: loss 2.5315, time 5271.65ms 
iter 4238: loss 2.6585, time 5269.25ms 
iter 4239: loss 2.8021, time 5262.00ms 
iter 4240: loss 2.6966, time 5264.78ms 
iter 4241: loss 2.5295, time 5258.18ms 
iter 4242: loss 2.7416, time 5267.46ms 
iter 4243: loss 2.7479, time 5266.65ms 
iter 4244: loss 2.4819, time 5257.01ms 
iter 4245: loss 2.6619, time 5255.69ms 
iter 4246: loss 2.7201, time 5259.04ms 
iter 4247: loss 2.5508, time 5227.96ms 
iter 4248: loss 2.5628, time 5270.77ms 
iter 4249: loss 2.5629, time 5259.78ms 
step 4250: train loss 2.6157, val loss 2.8465
iter 4250: loss 2.6320, time 20010.51ms 
iter 4251: loss 2.5400, time 5259.58ms 
iter 4252: loss 2.6615, time 5254.85ms 
iter 4253: loss 2.5160, time 5261.49ms 
iter 4254: loss 2.7135, time 5257.85ms 
iter 4255: loss 2.5363, time 5268.43ms 
iter 4256: loss 2.4293, time 5262.02ms 
iter 4257: loss 2.7907, time 5263.35ms 
iter 4258: loss 2.5968, time 5256.23ms 
iter 4259: loss 2.6219, time 5264.07ms 
iter 4260: loss 2.2024, time 5258.17ms 
iter 4261: loss 2.5879, time 5266.48ms 
iter 4262: loss 2.5763, time 5257.21ms 
iter 4263: loss 2.6645, time 5258.21ms 
iter 4264: loss 2.4738, time 5267.03ms 
iter 4265: loss 2.5955, time 5270.95ms 
iter 4266: loss 2.5713, time 5275.21ms 
iter 4267: loss 2.7070, time 5252.38ms 
iter 4268: loss 2.4847, time 5258.20ms 
iter 4269: loss 2.8744, time 5251.44ms 
iter 4270: loss 2.6531, time 5260.61ms 
iter 4271: loss 2.7778, time 5270.70ms 
iter 4272: loss 2.5555, time 5265.57ms 
iter 4273: loss 2.8367, time 5257.29ms 
iter 4274: loss 2.7185, time 5246.69ms 
iter 4275: loss 2.6478, time 5254.67ms 
iter 4276: loss 2.6950, time 5260.55ms 
iter 4277: loss 2.6342, time 5261.67ms 
iter 4278: loss 2.6851, time 5256.88ms 
iter 4279: loss 2.5255, time 5226.91ms 
iter 4280: loss 2.5999, time 5250.60ms 
iter 4281: loss 2.6161, time 5258.95ms 
iter 4282: loss 2.5802, time 5256.01ms 
iter 4283: loss 2.6873, time 5257.38ms 
iter 4284: loss 2.7176, time 5256.74ms 
iter 4285: loss 2.5907, time 5265.23ms 
iter 4286: loss 2.6948, time 5263.68ms 
iter 4287: loss 2.5899, time 5236.64ms 
iter 4288: loss 2.6325, time 5264.26ms 
iter 4289: loss 2.5164, time 5259.64ms 
iter 4290: loss 2.6172, time 5249.41ms 
iter 4291: loss 2.6004, time 5267.64ms 
iter 4292: loss 2.5611, time 5261.32ms 
iter 4293: loss 2.8315, time 5287.94ms 
iter 4294: loss 2.6473, time 5328.58ms 
iter 4295: loss 2.6685, time 5324.93ms 
iter 4296: loss 2.6974, time 5286.34ms 
iter 4297: loss 2.7671, time 5265.43ms 
iter 4298: loss 2.7672, time 5261.09ms 
iter 4299: loss 2.7111, time 5239.66ms 
step 4300: train loss 2.6050, val loss 2.8351
iter 4300: loss 2.6649, time 20051.82ms 
iter 4301: loss 2.4321, time 5253.43ms 
iter 4302: loss 2.5670, time 5254.62ms 
iter 4303: loss 2.6190, time 5260.48ms 
iter 4304: loss 2.5662, time 5270.77ms 
iter 4305: loss 2.5708, time 5251.61ms 
iter 4306: loss 2.5582, time 5252.60ms 
iter 4307: loss 2.7928, time 5261.88ms 
iter 4308: loss 2.4865, time 5260.24ms 
iter 4309: loss 2.6102, time 5259.34ms 
iter 4310: loss 2.7993, time 5267.78ms 
iter 4311: loss 2.7609, time 5263.16ms 
iter 4312: loss 2.6951, time 5255.80ms 
iter 4313: loss 2.6409, time 5260.29ms 
iter 4314: loss 2.4082, time 5255.27ms 
iter 4315: loss 2.4812, time 5263.69ms 
iter 4316: loss 2.7512, time 5279.56ms 
iter 4317: loss 2.6367, time 5252.11ms 
iter 4318: loss 2.5737, time 5268.76ms 
iter 4319: loss 2.7854, time 5267.62ms 
iter 4320: loss 2.7400, time 5265.26ms 
iter 4321: loss 2.6774, time 5270.29ms 
iter 4322: loss 2.7390, time 5272.48ms 
iter 4323: loss 2.6351, time 5266.55ms 
iter 4324: loss 2.6873, time 5256.76ms 
iter 4325: loss 2.5695, time 5251.29ms 
iter 4326: loss 2.5809, time 5260.22ms 
iter 4327: loss 2.6002, time 5264.94ms 
iter 4328: loss 2.4990, time 5267.11ms 
iter 4329: loss 2.7686, time 5262.22ms 
iter 4330: loss 2.5071, time 5255.68ms 
iter 4331: loss 2.5850, time 5257.83ms 
iter 4332: loss 2.6216, time 5254.84ms 
iter 4333: loss 2.5166, time 5257.45ms 
iter 4334: loss 2.7597, time 5258.49ms 
iter 4335: loss 2.5131, time 5261.48ms 
iter 4336: loss 2.8194, time 5258.08ms 
iter 4337: loss 2.4798, time 5253.04ms 
iter 4338: loss 2.6971, time 5256.20ms 
iter 4339: loss 2.6085, time 5267.09ms 
iter 4340: loss 2.6926, time 5271.55ms 
iter 4341: loss 2.7030, time 5258.04ms 
iter 4342: loss 2.6072, time 5255.08ms 
iter 4343: loss 2.6445, time 5259.12ms 
iter 4344: loss 2.8388, time 5270.82ms 
iter 4345: loss 2.6908, time 5264.42ms 
iter 4346: loss 2.6107, time 5257.24ms 
iter 4347: loss 2.6848, time 5259.26ms 
iter 4348: loss 2.7376, time 5262.31ms 
iter 4349: loss 2.4634, time 5279.68ms 
step 4350: train loss 2.6257, val loss 2.8641
iter 4350: loss 2.4227, time 20070.92ms 
iter 4351: loss 2.7144, time 5259.43ms 
iter 4352: loss 2.4033, time 5278.44ms 
iter 4353: loss 2.6178, time 5267.49ms 
iter 4354: loss 2.5876, time 5265.59ms 
iter 4355: loss 2.4451, time 5257.54ms 
iter 4356: loss 2.6464, time 5269.23ms 
iter 4357: loss 2.7751, time 5260.58ms 
iter 4358: loss 2.5876, time 5262.86ms 
iter 4359: loss 2.6399, time 5262.36ms 
iter 4360: loss 2.6224, time 5257.41ms 
iter 4361: loss 2.5952, time 5260.78ms 
iter 4362: loss 2.6788, time 5264.49ms 
iter 4363: loss 2.5759, time 5257.02ms 
iter 4364: loss 2.6177, time 5270.52ms 
iter 4365: loss 2.6590, time 5273.61ms 
iter 4366: loss 2.6793, time 5260.76ms 
iter 4367: loss 2.5635, time 5261.01ms 
iter 4368: loss 2.4511, time 5257.21ms 
iter 4369: loss 2.5295, time 5261.32ms 
iter 4370: loss 2.5632, time 5271.14ms 
iter 4371: loss 2.6111, time 5261.83ms 
iter 4372: loss 2.5804, time 5252.31ms 
iter 4373: loss 2.4744, time 5256.01ms 
iter 4374: loss 2.4214, time 5255.77ms 
iter 4375: loss 2.7751, time 5266.77ms 
iter 4376: loss 2.7029, time 5260.95ms 
iter 4377: loss 2.4908, time 5263.90ms 
iter 4378: loss 2.5279, time 5248.29ms 
iter 4379: loss 2.6848, time 5256.21ms 
iter 4380: loss 2.6138, time 5250.80ms 
iter 4381: loss 2.8180, time 5261.45ms 
iter 4382: loss 2.5374, time 5251.70ms 
iter 4383: loss 2.7248, time 5253.87ms 
iter 4384: loss 2.6449, time 5263.09ms 
iter 4385: loss 2.6221, time 5246.69ms 
iter 4386: loss 2.5345, time 5258.64ms 
iter 4387: loss 2.6417, time 5257.01ms 
iter 4388: loss 2.6006, time 5251.09ms 
iter 4389: loss 2.5172, time 5249.65ms 
iter 4390: loss 2.6375, time 5250.73ms 
iter 4391: loss 2.5195, time 5255.38ms 
iter 4392: loss 2.8081, time 5266.44ms 
iter 4393: loss 2.6539, time 5256.61ms 
iter 4394: loss 2.7930, time 5237.70ms 
iter 4395: loss 2.5639, time 5261.11ms 
iter 4396: loss 2.7417, time 5227.69ms 
iter 4397: loss 2.6304, time 5257.89ms 
iter 4398: loss 2.6951, time 5261.16ms 
iter 4399: loss 2.6489, time 5258.58ms 
step 4400: train loss 2.6321, val loss 2.8463
iter 4400: loss 2.7704, time 19934.36ms 
iter 4401: loss 2.6554, time 5251.88ms 
iter 4402: loss 2.5042, time 5254.93ms 
iter 4403: loss 2.4836, time 5258.53ms 
iter 4404: loss 2.6086, time 5262.04ms 
iter 4405: loss 2.7752, time 5272.23ms 
iter 4406: loss 2.4555, time 5257.43ms 
iter 4407: loss 2.4801, time 5253.44ms 
iter 4408: loss 2.9466, time 5254.29ms 
iter 4409: loss 2.4165, time 5252.29ms 
iter 4410: loss 2.7523, time 5264.03ms 
iter 4411: loss 2.5412, time 5263.36ms 
iter 4412: loss 2.4847, time 5254.44ms 
iter 4413: loss 2.5252, time 5256.61ms 
iter 4414: loss 2.7742, time 5253.95ms 
iter 4415: loss 2.5256, time 5251.49ms 
iter 4416: loss 2.7202, time 5257.92ms 
iter 4417: loss 2.5237, time 5266.42ms 
iter 4418: loss 2.5250, time 5252.05ms 
iter 4419: loss 2.6150, time 5251.63ms 
iter 4420: loss 2.6574, time 5253.93ms 
iter 4421: loss 2.6075, time 5259.04ms 
iter 4422: loss 2.7610, time 5260.48ms 
iter 4423: loss 2.5977, time 5264.04ms 
iter 4424: loss 2.6637, time 5266.59ms 
iter 4425: loss 2.8359, time 5251.55ms 
iter 4426: loss 2.5696, time 5127.52ms 
iter 4427: loss 2.5834, time 5101.89ms 
iter 4428: loss 2.5949, time 5135.44ms 
iter 4429: loss 2.7086, time 5114.39ms 
iter 4430: loss 2.5326, time 5141.33ms 
iter 4431: loss 2.7071, time 5244.31ms 
iter 4432: loss 2.5049, time 5267.41ms 
iter 4433: loss 2.7394, time 5269.55ms 
iter 4434: loss 2.8495, time 5268.65ms 
iter 4435: loss 2.5591, time 5264.61ms 
iter 4436: loss 2.5684, time 5263.74ms 
iter 4437: loss 2.2758, time 5270.14ms 
iter 4438: loss 2.8045, time 5267.02ms 
iter 4439: loss 2.7848, time 5271.23ms 
iter 4440: loss 2.6017, time 5268.72ms 
iter 4441: loss 2.4054, time 5276.80ms 
iter 4442: loss 2.5402, time 5268.80ms 
iter 4443: loss 2.7866, time 5266.94ms 
iter 4444: loss 2.6903, time 5267.57ms 
iter 4445: loss 2.5058, time 5255.23ms 
iter 4446: loss 2.6631, time 5252.41ms 
iter 4447: loss 2.8807, time 5266.05ms 
iter 4448: loss 2.6059, time 5260.94ms 
iter 4449: loss 2.7380, time 5260.67ms 
step 4450: train loss 2.6334, val loss 2.8402
iter 4450: loss 2.6119, time 20028.89ms 
iter 4451: loss 2.7359, time 5255.83ms 
iter 4452: loss 2.5401, time 5255.91ms 
iter 4453: loss 2.5625, time 5251.08ms 
iter 4454: loss 2.7477, time 5264.46ms 
iter 4455: loss 2.5361, time 5260.30ms 
iter 4456: loss 2.4665, time 5258.14ms 
iter 4457: loss 2.5875, time 5254.42ms 
iter 4458: loss 2.6078, time 5233.54ms 
iter 4459: loss 2.5691, time 5251.79ms 
iter 4460: loss 2.5783, time 5253.63ms 
iter 4461: loss 2.6341, time 5314.80ms 
iter 4462: loss 2.7950, time 5279.93ms 
iter 4463: loss 2.5416, time 5302.17ms 
iter 4464: loss 2.5606, time 5314.75ms 
iter 4465: loss 2.5971, time 5258.94ms 
iter 4466: loss 2.6728, time 5275.47ms 
iter 4467: loss 2.7061, time 5266.53ms 
iter 4468: loss 2.6894, time 5311.73ms 
iter 4469: loss 2.5989, time 5143.75ms 
iter 4470: loss 2.6625, time 5240.11ms 
iter 4471: loss 2.5130, time 5260.53ms 
iter 4472: loss 2.5720, time 5259.76ms 
iter 4473: loss 2.6537, time 5279.34ms 
iter 4474: loss 2.9037, time 5263.24ms 
iter 4475: loss 2.7190, time 5263.81ms 
iter 4476: loss 2.6799, time 5277.87ms 
iter 4477: loss 2.5471, time 5271.46ms 
iter 4478: loss 2.6914, time 5265.82ms 
iter 4479: loss 2.5224, time 5269.97ms 
iter 4480: loss 2.6027, time 5261.54ms 
iter 4481: loss 2.3409, time 5263.20ms 
iter 4482: loss 2.5927, time 5272.89ms 
iter 4483: loss 2.6405, time 5269.08ms 
iter 4484: loss 2.7786, time 5269.62ms 
iter 4485: loss 2.6550, time 5261.46ms 
iter 4486: loss 2.6417, time 5267.86ms 
iter 4487: loss 2.3706, time 5260.89ms 
iter 4488: loss 2.8253, time 5252.70ms 
iter 4489: loss 2.4771, time 5248.19ms 
iter 4490: loss 2.6359, time 5254.33ms 
iter 4491: loss 2.5919, time 5263.31ms 
iter 4492: loss 2.8250, time 5264.73ms 
iter 4493: loss 2.7798, time 5271.49ms 
iter 4494: loss 2.6314, time 5275.88ms 
iter 4495: loss 2.4470, time 5256.21ms 
iter 4496: loss 2.7817, time 5260.49ms 
iter 4497: loss 2.5877, time 5262.19ms 
iter 4498: loss 2.6920, time 5268.79ms 
iter 4499: loss 2.6418, time 5263.62ms 
step 4500: train loss 2.6159, val loss 2.8509
iter 4500: loss 2.6065, time 20023.97ms 
iter 4501: loss 2.5784, time 5257.79ms 
iter 4502: loss 2.8233, time 5264.81ms 
iter 4503: loss 2.5708, time 5265.13ms 
iter 4504: loss 2.8839, time 5260.63ms 
iter 4505: loss 2.5032, time 5254.70ms 
iter 4506: loss 2.5717, time 5255.11ms 
iter 4507: loss 2.6903, time 5253.05ms 
iter 4508: loss 2.5773, time 5262.74ms 
iter 4509: loss 2.5314, time 5281.39ms 
iter 4510: loss 2.4261, time 5253.30ms 
iter 4511: loss 2.6377, time 5254.65ms 
iter 4512: loss 2.5627, time 5255.87ms 
iter 4513: loss 2.5349, time 5259.34ms 
iter 4514: loss 2.8326, time 5264.63ms 
iter 4515: loss 2.4893, time 5260.51ms 
iter 4516: loss 2.6027, time 5253.57ms 
iter 4517: loss 2.5068, time 5147.39ms 
iter 4518: loss 2.8052, time 5259.28ms 
iter 4519: loss 2.5674, time 5259.22ms 
iter 4520: loss 2.7210, time 5274.57ms 
iter 4521: loss 2.5455, time 5262.57ms 
iter 4522: loss 2.7394, time 5264.01ms 
iter 4523: loss 2.6886, time 5260.72ms 
iter 4524: loss 2.5534, time 5260.24ms 
iter 4525: loss 2.7247, time 5261.97ms 
iter 4526: loss 2.7489, time 5269.23ms 
iter 4527: loss 2.4105, time 5263.86ms 
iter 4528: loss 2.6845, time 5261.19ms 
iter 4529: loss 2.4547, time 5267.00ms 
iter 4530: loss 2.6657, time 5263.07ms 
iter 4531: loss 2.5216, time 5266.01ms 
iter 4532: loss 2.7841, time 5270.36ms 
iter 4533: loss 2.3595, time 5261.68ms 
iter 4534: loss 2.5098, time 5253.35ms 
iter 4535: loss 2.4164, time 5266.52ms 
iter 4536: loss 2.7458, time 5273.72ms 
iter 4537: loss 2.6663, time 5267.68ms 
iter 4538: loss 2.5652, time 5266.86ms 
iter 4539: loss 2.4790, time 5263.14ms 
iter 4540: loss 2.7099, time 5256.52ms 
iter 4541: loss 2.6663, time 5263.30ms 
iter 4542: loss 2.4412, time 5268.73ms 
iter 4543: loss 2.8017, time 5258.21ms 
iter 4544: loss 2.6544, time 5264.68ms 
iter 4545: loss 2.8507, time 5251.80ms 
iter 4546: loss 2.7443, time 5255.98ms 
iter 4547: loss 2.4762, time 5270.55ms 
iter 4548: loss 2.6896, time 5259.45ms 
iter 4549: loss 2.6675, time 5260.75ms 
step 4550: train loss 2.6044, val loss 2.8486
iter 4550: loss 2.4771, time 20137.22ms 
iter 4551: loss 2.6192, time 5343.53ms 
iter 4552: loss 2.5701, time 5348.02ms 
iter 4553: loss 2.5914, time 5340.20ms 
iter 4554: loss 2.8639, time 5332.94ms 
iter 4555: loss 2.5923, time 5302.73ms 
iter 4556: loss 2.7572, time 5332.96ms 
iter 4557: loss 2.4485, time 5302.93ms 
iter 4558: loss 2.6459, time 5268.60ms 
iter 4559: loss 2.6054, time 5262.33ms 
iter 4560: loss 2.8138, time 5258.63ms 
iter 4561: loss 2.4911, time 5265.54ms 
iter 4562: loss 2.4833, time 5262.08ms 
iter 4563: loss 2.6683, time 5259.14ms 
iter 4564: loss 2.5964, time 5253.17ms 
iter 4565: loss 2.6088, time 5258.26ms 
iter 4566: loss 2.4298, time 5259.49ms 
iter 4567: loss 2.5264, time 5259.44ms 
iter 4568: loss 2.6132, time 5267.89ms 
iter 4569: loss 2.5153, time 5252.61ms 
iter 4570: loss 2.6991, time 5262.77ms 
iter 4571: loss 2.6674, time 5262.07ms 
iter 4572: loss 2.6096, time 5259.67ms 
iter 4573: loss 2.7675, time 5266.08ms 
iter 4574: loss 2.5762, time 5263.95ms 
iter 4575: loss 2.6484, time 5271.69ms 
iter 4576: loss 2.4710, time 5287.77ms 
iter 4577: loss 2.9033, time 5258.73ms 
iter 4578: loss 2.6899, time 5261.07ms 
iter 4579: loss 2.5814, time 5275.96ms 
iter 4580: loss 2.4965, time 5285.18ms 
iter 4581: loss 2.5241, time 5272.58ms 
iter 4582: loss 2.6359, time 5256.38ms 
iter 4583: loss 2.6175, time 5258.74ms 
iter 4584: loss 2.5680, time 5259.63ms 
iter 4585: loss 2.7152, time 5267.62ms 
iter 4586: loss 2.5709, time 5260.52ms 
iter 4587: loss 2.4756, time 5260.53ms 
iter 4588: loss 2.5731, time 5289.22ms 
iter 4589: loss 2.5311, time 5255.85ms 
iter 4590: loss 2.5329, time 5254.07ms 
iter 4591: loss 2.5399, time 5281.25ms 
iter 4592: loss 2.5838, time 5321.80ms 
iter 4593: loss 2.5823, time 5327.21ms 
iter 4594: loss 2.7270, time 5325.30ms 
iter 4595: loss 2.6818, time 5228.60ms 
iter 4596: loss 2.4333, time 5323.57ms 
iter 4597: loss 2.7176, time 5286.96ms 
iter 4598: loss 2.8752, time 5265.70ms 
iter 4599: loss 2.5669, time 5254.61ms 
step 4600: train loss 2.6092, val loss 2.8507
iter 4600: loss 2.3831, time 20051.83ms 
iter 4601: loss 2.4734, time 5260.67ms 
iter 4602: loss 2.3888, time 5261.01ms 
iter 4603: loss 2.5094, time 5261.68ms 
iter 4604: loss 2.7111, time 5263.68ms 
iter 4605: loss 2.6091, time 5274.44ms 
iter 4606: loss 2.8007, time 5264.72ms 
iter 4607: loss 2.7927, time 5258.99ms 
iter 4608: loss 2.6811, time 5266.40ms 
iter 4609: loss 2.7633, time 5273.35ms 
iter 4610: loss 2.6170, time 5273.88ms 
iter 4611: loss 2.7305, time 5277.51ms 
iter 4612: loss 2.6052, time 5270.69ms 
iter 4613: loss 2.6913, time 5287.28ms 
iter 4614: loss 2.6730, time 5263.68ms 
iter 4615: loss 2.7021, time 5267.05ms 
iter 4616: loss 2.5728, time 5297.29ms 
iter 4617: loss 2.4888, time 5326.30ms 
iter 4618: loss 2.3305, time 5296.46ms 
iter 4619: loss 2.6030, time 5270.46ms 
iter 4620: loss 2.6124, time 5269.57ms 
iter 4621: loss 2.5187, time 5262.95ms 
iter 4622: loss 2.6331, time 5256.68ms 
iter 4623: loss 2.5824, time 5254.49ms 
iter 4624: loss 2.6045, time 5258.31ms 
iter 4625: loss 2.6025, time 5257.86ms 
iter 4626: loss 2.7710, time 5258.24ms 
iter 4627: loss 2.6494, time 5284.44ms 
iter 4628: loss 2.6676, time 5251.71ms 
iter 4629: loss 2.6751, time 5251.40ms 
iter 4630: loss 2.5732, time 5258.15ms 
iter 4631: loss 2.5539, time 5293.22ms 
iter 4632: loss 2.4711, time 5256.65ms 
iter 4633: loss 2.6101, time 5250.50ms 
iter 4634: loss 2.8319, time 5258.38ms 
iter 4635: loss 2.4496, time 5269.40ms 
iter 4636: loss 2.5505, time 5261.39ms 
iter 4637: loss 2.7941, time 5259.00ms 
iter 4638: loss 2.6048, time 5255.09ms 
iter 4639: loss 2.4840, time 5263.55ms 
iter 4640: loss 2.4543, time 5265.88ms 
iter 4641: loss 2.7568, time 5264.51ms 
iter 4642: loss 2.6449, time 5273.14ms 
iter 4643: loss 2.5935, time 5271.78ms 
iter 4644: loss 2.6181, time 5306.51ms 
iter 4645: loss 2.8748, time 5283.01ms 
iter 4646: loss 2.4689, time 5268.54ms 
iter 4647: loss 2.6534, time 5268.08ms 
iter 4648: loss 2.4662, time 5268.39ms 
iter 4649: loss 2.3331, time 5262.15ms 
step 4650: train loss 2.5999, val loss 2.8466
iter 4650: loss 2.6224, time 20055.68ms 
iter 4651: loss 2.5233, time 5263.46ms 
iter 4652: loss 2.4709, time 5258.63ms 
iter 4653: loss 2.6436, time 5262.91ms 
iter 4654: loss 2.5400, time 5276.68ms 
iter 4655: loss 2.6820, time 5264.78ms 
iter 4656: loss 2.5309, time 5260.61ms 
iter 4657: loss 2.5821, time 5261.17ms 
iter 4658: loss 2.6612, time 5264.62ms 
iter 4659: loss 2.4107, time 5265.21ms 
iter 4660: loss 2.9125, time 5270.08ms 
iter 4661: loss 2.5876, time 5258.09ms 
iter 4662: loss 2.5015, time 5260.82ms 
iter 4663: loss 2.4118, time 5260.22ms 
iter 4664: loss 2.5570, time 5276.82ms 
iter 4665: loss 2.6954, time 5266.06ms 
iter 4666: loss 2.5184, time 5261.32ms 
iter 4667: loss 2.6151, time 5268.30ms 
iter 4668: loss 2.7151, time 5262.78ms 
iter 4669: loss 2.6595, time 5273.41ms 
iter 4670: loss 2.7115, time 5255.23ms 
iter 4671: loss 2.6868, time 5253.31ms 
iter 4672: loss 2.5338, time 5255.70ms 
iter 4673: loss 2.6135, time 5257.67ms 
iter 4674: loss 2.4598, time 5256.88ms 
iter 4675: loss 2.7973, time 5262.94ms 
iter 4676: loss 2.4689, time 5249.12ms 
iter 4677: loss 2.6847, time 5258.98ms 
iter 4678: loss 2.6382, time 5261.57ms 
iter 4679: loss 2.6154, time 5260.48ms 
iter 4680: loss 2.5082, time 5272.45ms 
iter 4681: loss 2.8654, time 5268.75ms 
iter 4682: loss 2.7100, time 5258.57ms 
iter 4683: loss 2.4951, time 5260.07ms 
iter 4684: loss 2.4086, time 5153.78ms 
iter 4685: loss 2.6833, time 5266.02ms 
iter 4686: loss 2.6210, time 5252.55ms 
iter 4687: loss 2.6894, time 5318.60ms 
iter 4688: loss 2.4881, time 5289.07ms 
iter 4689: loss 2.4693, time 5263.77ms 
iter 4690: loss 2.4805, time 5271.07ms 
iter 4691: loss 2.5104, time 5269.79ms 
iter 4692: loss 2.5739, time 5266.88ms 
iter 4693: loss 2.5933, time 5260.07ms 
iter 4694: loss 2.6454, time 5253.47ms 
iter 4695: loss 2.7865, time 5255.60ms 
iter 4696: loss 2.6756, time 5261.67ms 
iter 4697: loss 2.6981, time 5252.65ms 
iter 4698: loss 2.6028, time 5283.13ms 
iter 4699: loss 2.8242, time 5251.64ms 
step 4700: train loss 2.5888, val loss 2.8349
iter 4700: loss 2.4804, time 20025.50ms 
iter 4701: loss 2.5649, time 5253.45ms 
iter 4702: loss 2.6311, time 5245.14ms 
iter 4703: loss 2.6703, time 5251.71ms 
iter 4704: loss 2.7756, time 5265.36ms 
iter 4705: loss 2.5979, time 5267.97ms 
iter 4706: loss 2.6276, time 5267.32ms 
iter 4707: loss 2.5985, time 5267.21ms 
iter 4708: loss 2.4398, time 5253.10ms 
iter 4709: loss 2.5202, time 5254.97ms 
iter 4710: loss 2.4576, time 5251.60ms 
iter 4711: loss 2.6896, time 5265.73ms 
iter 4712: loss 2.9179, time 5264.82ms 
iter 4713: loss 2.7799, time 5272.79ms 
iter 4714: loss 2.4726, time 5319.89ms 
iter 4715: loss 2.5475, time 5331.19ms 
iter 4716: loss 2.6752, time 5312.12ms 
iter 4717: loss 2.4685, time 5262.87ms 
iter 4718: loss 2.8032, time 5271.59ms 
iter 4719: loss 2.6349, time 5265.27ms 
iter 4720: loss 2.6485, time 5265.56ms 
iter 4721: loss 2.7028, time 5277.40ms 
iter 4722: loss 2.7740, time 5323.06ms 
iter 4723: loss 2.4193, time 5290.73ms 
iter 4724: loss 2.6845, time 5298.96ms 
iter 4725: loss 2.4751, time 5276.94ms 
iter 4726: loss 2.6034, time 5262.81ms 
iter 4727: loss 2.8082, time 5257.17ms 
iter 4728: loss 2.3935, time 5294.65ms 
iter 4729: loss 2.4575, time 5337.05ms 
iter 4730: loss 2.6349, time 5275.21ms 
iter 4731: loss 2.6520, time 5335.70ms 
iter 4732: loss 2.7364, time 5255.22ms 
iter 4733: loss 2.4438, time 5255.40ms 
iter 4734: loss 2.5410, time 5251.02ms 
iter 4735: loss 2.7497, time 5253.71ms 
iter 4736: loss 2.5610, time 5259.93ms 
iter 4737: loss 2.4804, time 5260.07ms 
iter 4738: loss 2.7346, time 5299.54ms 
iter 4739: loss 2.5833, time 5342.26ms 
iter 4740: loss 2.5651, time 5335.03ms 
iter 4741: loss 2.6000, time 5148.16ms 
iter 4742: loss 2.4692, time 5252.66ms 
iter 4743: loss 2.3602, time 5267.24ms 
iter 4744: loss 2.5767, time 5259.46ms 
iter 4745: loss 2.2872, time 5255.13ms 
iter 4746: loss 2.3721, time 5275.71ms 
iter 4747: loss 2.6783, time 5207.72ms 
iter 4748: loss 2.4643, time 5264.12ms 
iter 4749: loss 2.6923, time 5287.96ms 
step 4750: train loss 2.5994, val loss 2.8268
iter 4750: loss 2.7086, time 19939.38ms 
iter 4751: loss 2.5423, time 5086.54ms 
iter 4752: loss 2.8350, time 5187.84ms 
iter 4753: loss 2.5029, time 5265.03ms 
iter 4754: loss 2.7027, time 5274.24ms 
iter 4755: loss 2.5818, time 5261.14ms 
iter 4756: loss 2.4508, time 5255.90ms 
iter 4757: loss 2.5945, time 5255.35ms 
iter 4758: loss 2.6841, time 5267.36ms 
iter 4759: loss 2.5558, time 5264.81ms 
iter 4760: loss 2.3969, time 5253.37ms 
iter 4761: loss 2.6101, time 5257.06ms 
iter 4762: loss 2.3856, time 5259.71ms 
iter 4763: loss 2.4994, time 5260.83ms 
iter 4764: loss 2.7106, time 5268.49ms 
iter 4765: loss 2.6441, time 5235.00ms 
iter 4766: loss 2.5209, time 5258.60ms 
iter 4767: loss 2.6962, time 5320.99ms 
iter 4768: loss 2.7346, time 5248.66ms 
iter 4769: loss 2.5817, time 5275.59ms 
iter 4770: loss 2.5217, time 5266.86ms 
iter 4771: loss 2.4706, time 5266.87ms 
iter 4772: loss 2.5804, time 5261.86ms 
iter 4773: loss 2.6600, time 5263.46ms 
iter 4774: loss 2.6036, time 5262.73ms 
iter 4775: loss 2.7346, time 5274.12ms 
iter 4776: loss 2.5288, time 5275.43ms 
iter 4777: loss 2.5132, time 5140.25ms 
iter 4778: loss 2.5460, time 5251.42ms 
iter 4779: loss 2.5887, time 5159.41ms 
iter 4780: loss 2.6800, time 5252.44ms 
iter 4781: loss 2.5586, time 5255.49ms 
iter 4782: loss 2.5606, time 5270.31ms 
iter 4783: loss 2.6463, time 5261.40ms 
iter 4784: loss 2.8316, time 5272.82ms 
iter 4785: loss 2.6435, time 5264.86ms 
iter 4786: loss 2.6442, time 5265.09ms 
iter 4787: loss 2.5154, time 5262.72ms 
iter 4788: loss 2.6182, time 5261.36ms 
iter 4789: loss 2.7358, time 5260.96ms 
iter 4790: loss 2.5207, time 5263.70ms 
iter 4791: loss 2.5117, time 5267.02ms 
iter 4792: loss 2.8051, time 5252.60ms 
iter 4793: loss 2.5186, time 5250.08ms 
iter 4794: loss 2.7347, time 5251.10ms 
iter 4795: loss 2.5092, time 5261.43ms 
iter 4796: loss 2.5335, time 5251.83ms 
iter 4797: loss 2.6819, time 5248.38ms 
iter 4798: loss 2.4836, time 5222.11ms 
iter 4799: loss 2.7150, time 5216.66ms 
step 4800: train loss 2.5852, val loss 2.8203
iter 4800: loss 2.5080, time 20097.42ms 
iter 4801: loss 2.3838, time 5258.34ms 
iter 4802: loss 2.5833, time 5273.63ms 
iter 4803: loss 2.4682, time 5292.10ms 
iter 4804: loss 2.5968, time 5331.09ms 
iter 4805: loss 2.6635, time 5312.27ms 
iter 4806: loss 2.5521, time 5267.95ms 
iter 4807: loss 2.5468, time 5275.99ms 
iter 4808: loss 2.3280, time 5258.13ms 
iter 4809: loss 2.6760, time 5248.68ms 
iter 4810: loss 2.6464, time 5261.21ms 
iter 4811: loss 2.5835, time 5264.61ms 
iter 4812: loss 2.5841, time 5265.26ms 
iter 4813: loss 2.7574, time 5252.77ms 
iter 4814: loss 2.6394, time 5257.70ms 
iter 4815: loss 2.5976, time 5107.14ms 
iter 4816: loss 2.8278, time 5097.44ms 
iter 4817: loss 2.6706, time 5111.57ms 
iter 4818: loss 2.7991, time 5202.28ms 
iter 4819: loss 2.6462, time 5253.82ms 
iter 4820: loss 2.5023, time 5229.58ms 
iter 4821: loss 2.7971, time 5234.96ms 
iter 4822: loss 2.8406, time 5263.76ms 
iter 4823: loss 2.4149, time 5267.74ms 
iter 4824: loss 2.5381, time 5261.32ms 
iter 4825: loss 2.5980, time 5258.57ms 
iter 4826: loss 2.5993, time 5258.03ms 
iter 4827: loss 2.3816, time 5278.14ms 
iter 4828: loss 2.7896, time 5304.17ms 
iter 4829: loss 2.6132, time 5304.10ms 
iter 4830: loss 2.6458, time 5266.94ms 
iter 4831: loss 2.7648, time 5258.39ms 
iter 4832: loss 2.5011, time 5263.15ms 
iter 4833: loss 2.5624, time 5217.55ms 
iter 4834: loss 2.5172, time 5113.92ms 
iter 4835: loss 2.6607, time 5111.08ms 
iter 4836: loss 2.6443, time 5062.96ms 
iter 4837: loss 2.4007, time 5287.50ms 
iter 4838: loss 2.6570, time 5255.97ms 
iter 4839: loss 2.5983, time 5279.26ms 
iter 4840: loss 2.6519, time 5256.52ms 
iter 4841: loss 2.4328, time 5306.99ms 
iter 4842: loss 2.7051, time 5251.87ms 
iter 4843: loss 2.5419, time 5258.65ms 
iter 4844: loss 2.7209, time 5258.39ms 
iter 4845: loss 2.5920, time 5251.94ms 
iter 4846: loss 2.5256, time 5257.03ms 
iter 4847: loss 2.4943, time 5231.62ms 
iter 4848: loss 2.6254, time 5106.94ms 
iter 4849: loss 2.4114, time 5181.53ms 
step 4850: train loss 2.6081, val loss 2.8303
iter 4850: loss 2.3119, time 20065.69ms 
iter 4851: loss 2.6249, time 5262.88ms 
iter 4852: loss 2.6846, time 5301.48ms 
iter 4853: loss 2.5905, time 5317.15ms 
iter 4854: loss 2.4535, time 5329.44ms 
iter 4855: loss 2.7330, time 5336.11ms 
iter 4856: loss 2.6118, time 5344.45ms 
iter 4857: loss 2.6943, time 5321.36ms 
iter 4858: loss 2.5043, time 5339.22ms 
iter 4859: loss 2.6159, time 5311.22ms 
iter 4860: loss 2.6392, time 5332.72ms 
iter 4861: loss 2.4917, time 5264.00ms 
iter 4862: loss 2.3880, time 5272.35ms 
iter 4863: loss 2.5999, time 5270.74ms 
iter 4864: loss 2.6338, time 5265.34ms 
iter 4865: loss 2.6137, time 5264.04ms 
iter 4866: loss 2.7879, time 5264.58ms 
iter 4867: loss 2.5480, time 5268.06ms 
iter 4868: loss 2.7458, time 5268.27ms 
iter 4869: loss 2.4934, time 5264.20ms 
iter 4870: loss 2.6814, time 5255.71ms 
iter 4871: loss 2.5492, time 5245.08ms 
iter 4872: loss 2.7129, time 5265.92ms 
iter 4873: loss 2.6656, time 5271.56ms 
iter 4874: loss 2.6366, time 5276.26ms 
iter 4875: loss 2.6580, time 5257.57ms 
iter 4876: loss 2.5856, time 5268.41ms 
iter 4877: loss 2.6298, time 5264.33ms 
iter 4878: loss 2.6986, time 5341.18ms 
iter 4879: loss 2.5502, time 5273.55ms 
iter 4880: loss 2.7099, time 5224.78ms 
iter 4881: loss 2.5724, time 5255.60ms 
iter 4882: loss 2.5146, time 5297.02ms 
iter 4883: loss 2.5871, time 5342.95ms 
iter 4884: loss 2.5515, time 5282.40ms 
iter 4885: loss 2.3794, time 5272.08ms 
iter 4886: loss 2.5866, time 5266.74ms 
iter 4887: loss 2.5566, time 5264.85ms 
iter 4888: loss 2.4796, time 5268.47ms 
iter 4889: loss 2.4804, time 5263.87ms 
iter 4890: loss 2.6959, time 5266.10ms 
iter 4891: loss 2.7287, time 5264.20ms 
iter 4892: loss 2.5617, time 5269.81ms 
iter 4893: loss 2.6190, time 5272.37ms 
iter 4894: loss 2.3803, time 5270.66ms 
iter 4895: loss 2.4680, time 5270.30ms 
iter 4896: loss 2.6289, time 5265.18ms 
iter 4897: loss 2.5384, time 5282.88ms 
iter 4898: loss 2.6660, time 5266.94ms 
iter 4899: loss 2.5950, time 5283.30ms 
step 4900: train loss 2.5986, val loss 2.8521
iter 4900: loss 2.5909, time 20022.20ms 
iter 4901: loss 2.5410, time 5276.18ms 
iter 4902: loss 2.6385, time 5320.11ms 
iter 4903: loss 2.6030, time 5290.03ms 
iter 4904: loss 2.7303, time 5284.58ms 
iter 4905: loss 2.5151, time 5321.28ms 
iter 4906: loss 2.6911, time 5318.94ms 
iter 4907: loss 2.4858, time 5331.92ms 
iter 4908: loss 2.6691, time 5338.56ms 
iter 4909: loss 2.3731, time 5336.00ms 
iter 4910: loss 2.5555, time 5318.40ms 
iter 4911: loss 2.8076, time 5271.58ms 
iter 4912: loss 2.6523, time 5273.04ms 
iter 4913: loss 2.2526, time 5276.41ms 
iter 4914: loss 2.4441, time 5263.43ms 
iter 4915: loss 2.8292, time 5246.59ms 
iter 4916: loss 2.5724, time 5264.70ms 
iter 4917: loss 2.8217, time 5266.45ms 
iter 4918: loss 2.5351, time 5257.84ms 
iter 4919: loss 2.3085, time 5257.27ms 
iter 4920: loss 2.6014, time 5270.82ms 
iter 4921: loss 2.5761, time 5263.17ms 
iter 4922: loss 2.7622, time 5268.60ms 
iter 4923: loss 2.6747, time 5265.12ms 
iter 4924: loss 2.4531, time 5256.51ms 
iter 4925: loss 2.4519, time 5256.67ms 
iter 4926: loss 2.4618, time 5261.71ms 
iter 4927: loss 2.5009, time 5264.16ms 
iter 4928: loss 2.3643, time 5261.07ms 
iter 4929: loss 2.3746, time 5275.27ms 
iter 4930: loss 2.7286, time 5288.34ms 
iter 4931: loss 2.3972, time 5276.81ms 
iter 4932: loss 2.5001, time 5324.61ms 
iter 4933: loss 2.6275, time 5273.14ms 
iter 4934: loss 2.5988, time 5267.98ms 
iter 4935: loss 2.6871, time 5265.46ms 
iter 4936: loss 2.5486, time 5270.12ms 
iter 4937: loss 2.4302, time 5261.13ms 
iter 4938: loss 2.6004, time 5272.72ms 
iter 4939: loss 2.5989, time 5212.26ms 
iter 4940: loss 2.6401, time 5256.44ms 
iter 4941: loss 2.8941, time 5258.62ms 
iter 4942: loss 2.6008, time 5260.61ms 
iter 4943: loss 2.5472, time 5283.17ms 
iter 4944: loss 2.5397, time 5279.88ms 
iter 4945: loss 2.5803, time 5270.95ms 
iter 4946: loss 2.6053, time 5279.31ms 
iter 4947: loss 2.8070, time 5287.13ms 
iter 4948: loss 2.4124, time 5258.91ms 
iter 4949: loss 2.9271, time 5228.57ms 
step 4950: train loss 2.5822, val loss 2.8329
iter 4950: loss 2.5977, time 20123.22ms 
iter 4951: loss 2.8299, time 5269.08ms 
iter 4952: loss 2.4645, time 5264.06ms 
iter 4953: loss 2.7681, time 5185.85ms 
iter 4954: loss 2.8026, time 5272.97ms 
iter 4955: loss 2.6196, time 5268.96ms 
iter 4956: loss 2.6158, time 5265.72ms 
iter 4957: loss 2.4313, time 5239.84ms 
iter 4958: loss 2.8806, time 5223.68ms 
iter 4959: loss 2.4137, time 5301.68ms 
iter 4960: loss 2.5737, time 5227.43ms 
iter 4961: loss 2.4848, time 5253.44ms 
iter 4962: loss 2.6449, time 5269.84ms 
iter 4963: loss 2.5114, time 5231.15ms 
iter 4964: loss 2.4418, time 5284.43ms 
iter 4965: loss 2.8243, time 5237.95ms 
iter 4966: loss 2.6669, time 5266.81ms 
iter 4967: loss 2.5934, time 5256.46ms 
iter 4968: loss 2.6539, time 5277.68ms 
iter 4969: loss 2.4643, time 5272.84ms 
iter 4970: loss 2.4391, time 5260.25ms 
iter 4971: loss 2.6559, time 5115.63ms 
iter 4972: loss 2.5652, time 5217.75ms 
iter 4973: loss 2.5209, time 5256.18ms 
iter 4974: loss 2.5901, time 5258.55ms 
iter 4975: loss 2.5481, time 5260.03ms 
iter 4976: loss 2.6660, time 5260.17ms 
iter 4977: loss 2.6606, time 5250.80ms 
iter 4978: loss 2.4731, time 5253.50ms 
iter 4979: loss 2.5354, time 5258.27ms 
iter 4980: loss 2.6286, time 5253.48ms 
iter 4981: loss 2.7424, time 5249.79ms 
iter 4982: loss 2.6868, time 5269.67ms 
iter 4983: loss 2.6621, time 5266.28ms 
iter 4984: loss 2.6642, time 5261.42ms 
iter 4985: loss 2.7441, time 5273.18ms 
iter 4986: loss 2.5095, time 5235.74ms 
iter 4987: loss 2.4137, time 5271.58ms 
iter 4988: loss 2.4307, time 5270.53ms 
iter 4989: loss 2.4895, time 5190.58ms 
iter 4990: loss 2.4506, time 5222.95ms 
iter 4991: loss 2.7581, time 5319.29ms 
iter 4992: loss 2.6011, time 5254.41ms 
iter 4993: loss 2.7057, time 5297.15ms 
iter 4994: loss 2.5013, time 5270.01ms 
iter 4995: loss 2.5815, time 5266.20ms 
iter 4996: loss 2.7085, time 5238.44ms 
iter 4997: loss 2.5363, time 5266.08ms 
iter 4998: loss 2.5552, time 5258.33ms 
iter 4999: loss 2.6363, time 5243.88ms 
step 5000: train loss 2.5853, val loss 2.8231
iter 5000: loss 2.6411, time 20036.07ms 
iter 5001: loss 2.6827, time 5313.88ms 
iter 5002: loss 2.7457, time 5323.90ms 
iter 5003: loss 2.7903, time 5333.65ms 
iter 5004: loss 2.6661, time 5306.39ms 
iter 5005: loss 2.5833, time 5292.68ms 
iter 5006: loss 2.3452, time 5208.50ms 
iter 5007: loss 2.6305, time 5118.65ms 
iter 5008: loss 2.5696, time 5149.21ms 
iter 5009: loss 2.6008, time 5259.43ms 
iter 5010: loss 2.6528, time 5263.09ms 
iter 5011: loss 2.4982, time 5252.67ms 
iter 5012: loss 2.5713, time 5254.34ms 
iter 5013: loss 2.5971, time 5254.39ms 
iter 5014: loss 2.5062, time 5296.96ms 
iter 5015: loss 2.6996, time 5332.62ms 
iter 5016: loss 2.5584, time 5275.90ms 
iter 5017: loss 2.5316, time 5262.71ms 
iter 5018: loss 2.6924, time 5264.25ms 
iter 5019: loss 2.6088, time 5272.25ms 
iter 5020: loss 2.5165, time 5304.34ms 
iter 5021: loss 2.5443, time 5299.29ms 
iter 5022: loss 2.5428, time 5335.65ms 
iter 5023: loss 2.2598, time 5267.51ms 
iter 5024: loss 2.4353, time 5265.23ms 
iter 5025: loss 2.4742, time 5270.13ms 
iter 5026: loss 2.6803, time 5261.59ms 
iter 5027: loss 2.5076, time 5252.16ms 
iter 5028: loss 2.5008, time 5262.91ms 
iter 5029: loss 2.6448, time 5265.27ms 
iter 5030: loss 2.8119, time 5271.24ms 
iter 5031: loss 2.5623, time 5266.86ms 
iter 5032: loss 2.3160, time 5269.48ms 
iter 5033: loss 2.6606, time 5262.61ms 
iter 5034: loss 2.6543, time 5265.67ms 
iter 5035: loss 2.4939, time 5281.92ms 
iter 5036: loss 2.6331, time 5272.42ms 
iter 5037: loss 2.4723, time 5274.83ms 
iter 5038: loss 2.5512, time 5270.66ms 
iter 5039: loss 2.7387, time 5287.31ms 
iter 5040: loss 2.6256, time 5272.01ms 
iter 5041: loss 2.3509, time 5261.19ms 
iter 5042: loss 2.6744, time 5284.32ms 
iter 5043: loss 2.7209, time 5324.58ms 
iter 5044: loss 2.5503, time 5303.12ms 
iter 5045: loss 2.5076, time 5285.65ms 
iter 5046: loss 2.7721, time 5295.12ms 
iter 5047: loss 2.5942, time 5273.11ms 
iter 5048: loss 2.2585, time 5279.87ms 
iter 5049: loss 2.6212, time 5236.88ms 
step 5050: train loss 2.5901, val loss 2.8359
iter 5050: loss 2.4664, time 20066.97ms 
iter 5051: loss 2.7868, time 5264.17ms 
iter 5052: loss 2.9844, time 5266.37ms 
iter 5053: loss 2.1831, time 5266.19ms 
iter 5054: loss 2.3658, time 5278.67ms 
iter 5055: loss 2.4660, time 5271.03ms 
iter 5056: loss 2.6026, time 5279.84ms 
iter 5057: loss 2.8143, time 5255.65ms 
iter 5058: loss 2.5713, time 5268.02ms 
iter 5059: loss 2.5694, time 5283.55ms 
iter 5060: loss 2.6797, time 5285.20ms 
iter 5061: loss 2.5182, time 5281.99ms 
iter 5062: loss 2.5519, time 5281.97ms 
iter 5063: loss 2.5098, time 5282.01ms 
iter 5064: loss 2.6975, time 5277.80ms 
iter 5065: loss 2.3766, time 5292.32ms 
iter 5066: loss 2.6832, time 5300.07ms 
iter 5067: loss 2.5415, time 5264.50ms 
iter 5068: loss 2.5525, time 5270.56ms 
iter 5069: loss 2.6049, time 5314.72ms 
iter 5070: loss 2.3335, time 5303.58ms 
iter 5071: loss 2.6808, time 5261.62ms 
iter 5072: loss 2.4651, time 5314.96ms 
iter 5073: loss 2.6399, time 5286.00ms 
iter 5074: loss 2.7315, time 5314.08ms 
iter 5075: loss 2.6697, time 5312.60ms 
iter 5076: loss 2.6426, time 5350.28ms 
iter 5077: loss 2.4286, time 5302.42ms 
iter 5078: loss 2.4732, time 5267.93ms 
iter 5079: loss 2.6649, time 5271.53ms 
iter 5080: loss 2.4658, time 5280.46ms 
iter 5081: loss 2.5303, time 5276.14ms 
iter 5082: loss 2.4579, time 5274.26ms 
iter 5083: loss 2.5485, time 5264.26ms 
iter 5084: loss 2.7173, time 5248.61ms 
iter 5085: loss 2.5254, time 5302.02ms 
iter 5086: loss 2.6342, time 5270.97ms 
iter 5087: loss 2.5405, time 5265.28ms 
iter 5088: loss 2.6294, time 5269.69ms 
iter 5089: loss 2.6067, time 5272.17ms 
iter 5090: loss 2.6539, time 5288.23ms 
iter 5091: loss 2.5200, time 5302.97ms 
iter 5092: loss 2.6051, time 5273.44ms 
iter 5093: loss 2.5267, time 5265.34ms 
iter 5094: loss 2.3683, time 5277.00ms 
iter 5095: loss 2.6407, time 5271.61ms 
iter 5096: loss 2.6393, time 5266.56ms 
iter 5097: loss 2.5383, time 5271.17ms 
iter 5098: loss 2.7684, time 5226.91ms 
iter 5099: loss 2.6621, time 5253.74ms 
step 5100: train loss 2.5820, val loss 2.8480
iter 5100: loss 2.4448, time 20051.32ms 
iter 5101: loss 2.4359, time 5258.88ms 
iter 5102: loss 2.7335, time 5263.71ms 
iter 5103: loss 2.3936, time 5263.04ms 
iter 5104: loss 2.5326, time 5255.32ms 
iter 5105: loss 2.6689, time 5261.21ms 
iter 5106: loss 2.7161, time 5282.01ms 
iter 5107: loss 2.7638, time 5291.93ms 
iter 5108: loss 2.4533, time 5296.56ms 
iter 5109: loss 2.4729, time 5251.70ms 
iter 5110: loss 2.4822, time 5254.35ms 
iter 5111: loss 2.5121, time 5257.28ms 
iter 5112: loss 2.5664, time 5264.70ms 
iter 5113: loss 2.5393, time 5269.28ms 
iter 5114: loss 2.5573, time 5268.72ms 
iter 5115: loss 2.7476, time 5257.91ms 
iter 5116: loss 2.6675, time 5344.48ms 
iter 5117: loss 2.5189, time 5066.52ms 
iter 5118: loss 2.6120, time 5163.60ms 
iter 5119: loss 2.3545, time 5267.41ms 
iter 5120: loss 2.5370, time 5261.90ms 
iter 5121: loss 2.5334, time 5272.13ms 
iter 5122: loss 2.6878, time 5267.28ms 
iter 5123: loss 2.5149, time 5280.78ms 
iter 5124: loss 2.6633, time 5267.66ms 
iter 5125: loss 2.3316, time 5263.16ms 
iter 5126: loss 2.3796, time 5241.29ms 
iter 5127: loss 2.7227, time 5246.51ms 
iter 5128: loss 2.5175, time 5248.65ms 
iter 5129: loss 2.5352, time 5256.00ms 
iter 5130: loss 2.5879, time 5304.56ms 
iter 5131: loss 2.6150, time 5326.75ms 
iter 5132: loss 2.5480, time 5266.68ms 
iter 5133: loss 2.6547, time 5271.42ms 
iter 5134: loss 2.5677, time 5270.13ms 
iter 5135: loss 2.5815, time 5267.55ms 
iter 5136: loss 2.5277, time 5271.09ms 
iter 5137: loss 2.6511, time 5271.45ms 
iter 5138: loss 2.5384, time 5265.42ms 
iter 5139: loss 2.6267, time 5251.70ms 
iter 5140: loss 2.6764, time 5267.05ms 
iter 5141: loss 2.6352, time 5272.33ms 
iter 5142: loss 2.7288, time 5253.64ms 
iter 5143: loss 2.4994, time 5258.51ms 
iter 5144: loss 2.7635, time 5263.98ms 
iter 5145: loss 2.3691, time 5295.04ms 
iter 5146: loss 2.6384, time 5303.51ms 
iter 5147: loss 2.6815, time 5227.88ms 
iter 5148: loss 2.3985, time 5271.07ms 
iter 5149: loss 2.5078, time 5266.25ms 
step 5150: train loss 2.5738, val loss 2.8402
iter 5150: loss 2.6886, time 20019.10ms 
iter 5151: loss 2.4668, time 5254.45ms 
iter 5152: loss 2.7262, time 5269.64ms 
iter 5153: loss 2.4189, time 5255.04ms 
iter 5154: loss 2.5288, time 5254.71ms 
iter 5155: loss 2.7040, time 5250.71ms 
iter 5156: loss 2.5063, time 5251.37ms 
iter 5157: loss 2.5490, time 5254.79ms 
iter 5158: loss 2.5515, time 5263.89ms 
iter 5159: loss 2.7405, time 5263.90ms 
iter 5160: loss 2.7469, time 5230.63ms 
iter 5161: loss 2.5423, time 5253.58ms 
iter 5162: loss 2.4243, time 5250.54ms 
iter 5163: loss 2.4484, time 5279.63ms 
iter 5164: loss 2.5923, time 5268.31ms 
iter 5165: loss 2.6987, time 5255.52ms 
iter 5166: loss 2.7814, time 5258.03ms 
iter 5167: loss 2.5982, time 5257.03ms 
iter 5168: loss 2.5786, time 5268.72ms 
iter 5169: loss 2.6025, time 5324.67ms 
iter 5170: loss 2.6780, time 5280.87ms 
iter 5171: loss 2.6888, time 5271.47ms 
iter 5172: loss 2.6419, time 5258.89ms 
iter 5173: loss 2.3646, time 5254.24ms 
iter 5174: loss 2.5602, time 5204.93ms 
iter 5175: loss 2.5760, time 5284.64ms 
iter 5176: loss 2.5782, time 5253.74ms 
iter 5177: loss 2.5642, time 5330.94ms 
iter 5178: loss 2.7768, time 5251.56ms 
iter 5179: loss 2.8755, time 5246.71ms 
iter 5180: loss 2.6242, time 5327.68ms 
iter 5181: loss 2.7167, time 5273.59ms 
iter 5182: loss 2.6246, time 5264.93ms 
iter 5183: loss 2.6801, time 5256.17ms 
iter 5184: loss 2.5119, time 5265.84ms 
iter 5185: loss 2.5225, time 5273.04ms 
iter 5186: loss 2.4395, time 5262.68ms 
iter 5187: loss 2.3279, time 5253.34ms 
iter 5188: loss 2.3243, time 5255.77ms 
iter 5189: loss 2.5696, time 5249.99ms 
iter 5190: loss 2.6360, time 5256.05ms 
iter 5191: loss 2.5109, time 5262.73ms 
iter 5192: loss 2.5633, time 5255.46ms 
iter 5193: loss 2.4283, time 5260.99ms 
iter 5194: loss 2.6390, time 5256.12ms 
iter 5195: loss 2.6884, time 5261.75ms 
iter 5196: loss 2.6207, time 5252.09ms 
iter 5197: loss 2.2137, time 5254.21ms 
iter 5198: loss 2.8233, time 5254.41ms 
iter 5199: loss 2.4967, time 5266.29ms 
step 5200: train loss 2.5821, val loss 2.8447
iter 5200: loss 2.4283, time 19883.33ms 
iter 5201: loss 2.9621, time 5252.32ms 
iter 5202: loss 2.6826, time 5271.80ms 
iter 5203: loss 2.4504, time 5258.37ms 
iter 5204: loss 2.6536, time 5295.01ms 
iter 5205: loss 2.5973, time 5271.16ms 
iter 5206: loss 2.7502, time 5265.20ms 
iter 5207: loss 2.4947, time 5296.53ms 
iter 5208: loss 2.5593, time 5266.98ms 
iter 5209: loss 2.7157, time 5255.93ms 
iter 5210: loss 2.6265, time 5264.46ms 
iter 5211: loss 2.2469, time 5276.31ms 
iter 5212: loss 2.5753, time 5269.98ms 
iter 5213: loss 2.6849, time 5278.29ms 
iter 5214: loss 2.5875, time 5267.90ms 
iter 5215: loss 2.6679, time 5243.06ms 
iter 5216: loss 2.6560, time 5269.96ms 
iter 5217: loss 2.5396, time 5278.51ms 
iter 5218: loss 2.5114, time 5271.12ms 
iter 5219: loss 2.5814, time 5301.37ms 
iter 5220: loss 2.5754, time 5292.64ms 
iter 5221: loss 2.5459, time 5268.64ms 
iter 5222: loss 2.2466, time 5263.78ms 
iter 5223: loss 2.5769, time 5259.02ms 
iter 5224: loss 2.5847, time 5317.22ms 
iter 5225: loss 2.6050, time 5334.63ms 
iter 5226: loss 2.3208, time 5336.38ms 
iter 5227: loss 2.5216, time 5338.29ms 
iter 5228: loss 2.4428, time 5324.79ms 
iter 5229: loss 2.6114, time 5318.40ms 
iter 5230: loss 2.4837, time 5281.92ms 
iter 5231: loss 2.5366, time 5261.54ms 
iter 5232: loss 2.7455, time 5256.72ms 
iter 5233: loss 2.5154, time 5257.13ms 
iter 5234: loss 2.6633, time 5251.67ms 
iter 5235: loss 2.5369, time 5279.23ms 
iter 5236: loss 2.7407, time 5269.23ms 
iter 5237: loss 2.5503, time 5265.86ms 
iter 5238: loss 2.4825, time 5256.24ms 
iter 5239: loss 2.6892, time 5261.58ms 
iter 5240: loss 2.5028, time 5270.87ms 
iter 5241: loss 2.5384, time 5259.94ms 
iter 5242: loss 2.4431, time 5262.03ms 
iter 5243: loss 2.5056, time 5225.33ms 
iter 5244: loss 2.6375, time 5266.39ms 
iter 5245: loss 2.5695, time 5265.76ms 
iter 5246: loss 2.6625, time 5262.78ms 
iter 5247: loss 2.7404, time 5246.78ms 
iter 5248: loss 2.4884, time 5257.89ms 
iter 5249: loss 2.7121, time 5258.84ms 
step 5250: train loss 2.5802, val loss 2.8291
iter 5250: loss 2.6339, time 20006.19ms 
iter 5251: loss 2.6340, time 5307.87ms 
iter 5252: loss 2.7040, time 5253.97ms 
iter 5253: loss 2.4855, time 5272.16ms 
iter 5254: loss 2.6430, time 5263.81ms 
iter 5255: loss 2.4687, time 5256.54ms 
iter 5256: loss 2.6254, time 5268.71ms 
iter 5257: loss 2.6652, time 5289.11ms 
iter 5258: loss 2.8500, time 5304.41ms 
iter 5259: loss 2.5256, time 5256.84ms 
iter 5260: loss 2.1854, time 5258.98ms 
iter 5261: loss 2.7116, time 5339.26ms 
iter 5262: loss 2.5690, time 5303.37ms 
iter 5263: loss 2.6713, time 5260.95ms 
iter 5264: loss 2.5608, time 5278.98ms 
iter 5265: loss 2.5304, time 5304.12ms 
iter 5266: loss 2.4386, time 5327.28ms 
iter 5267: loss 2.5231, time 5291.27ms 
iter 5268: loss 2.7677, time 5280.74ms 
iter 5269: loss 2.5755, time 5322.39ms 
iter 5270: loss 2.4465, time 5272.79ms 
iter 5271: loss 2.5319, time 5263.76ms 
iter 5272: loss 2.4131, time 5320.49ms 
iter 5273: loss 2.7101, time 5336.99ms 
iter 5274: loss 2.5461, time 5251.88ms 
iter 5275: loss 2.8595, time 5109.94ms 
iter 5276: loss 2.7457, time 5341.42ms 
iter 5277: loss 2.5644, time 5337.04ms 
iter 5278: loss 2.6586, time 5326.92ms 
iter 5279: loss 2.5781, time 5322.01ms 
iter 5280: loss 2.7307, time 5333.94ms 
iter 5281: loss 2.6469, time 5334.48ms 
iter 5282: loss 2.4372, time 5296.07ms 
iter 5283: loss 2.5373, time 5269.79ms 
iter 5284: loss 2.3569, time 5269.54ms 
iter 5285: loss 2.5813, time 5276.01ms 
iter 5286: loss 2.6912, time 5273.71ms 
iter 5287: loss 2.4283, time 5269.61ms 
iter 5288: loss 2.6508, time 5263.72ms 
iter 5289: loss 2.4328, time 5264.56ms 
iter 5290: loss 2.8888, time 5258.99ms 
iter 5291: loss 2.5917, time 5257.67ms 
iter 5292: loss 2.7643, time 5274.30ms 
iter 5293: loss 2.4745, time 5246.91ms 
iter 5294: loss 2.5233, time 5243.73ms 
iter 5295: loss 2.5563, time 5256.68ms 
iter 5296: loss 2.3781, time 5231.44ms 
iter 5297: loss 2.6409, time 5256.45ms 
iter 5298: loss 2.5974, time 5261.89ms 
iter 5299: loss 2.6515, time 5266.15ms 
step 5300: train loss 2.5667, val loss 2.8444
iter 5300: loss 2.5412, time 19999.09ms 
iter 5301: loss 2.3233, time 5257.33ms 
iter 5302: loss 2.6309, time 5244.69ms 
iter 5303: loss 2.7239, time 5261.39ms 
iter 5304: loss 2.4065, time 5259.33ms 
iter 5305: loss 2.5063, time 5275.49ms 
iter 5306: loss 2.7068, time 5239.09ms 
iter 5307: loss 2.8009, time 5256.47ms 
iter 5308: loss 2.5378, time 5262.55ms 
iter 5309: loss 2.4931, time 5264.46ms 
iter 5310: loss 2.6402, time 5269.63ms 
iter 5311: loss 2.4536, time 5257.40ms 
iter 5312: loss 2.4395, time 5269.80ms 
iter 5313: loss 2.8163, time 5257.43ms 
iter 5314: loss 2.5025, time 5270.63ms 
iter 5315: loss 2.6393, time 5282.56ms 
iter 5316: loss 2.4278, time 5284.08ms 
iter 5317: loss 2.4033, time 5263.53ms 
iter 5318: loss 2.5978, time 5266.04ms 
iter 5319: loss 2.4181, time 5261.28ms 
iter 5320: loss 2.7080, time 5286.54ms 
iter 5321: loss 2.4920, time 5263.57ms 
iter 5322: loss 2.8104, time 5270.03ms 
iter 5323: loss 2.7774, time 5256.21ms 
iter 5324: loss 2.4822, time 5255.63ms 
iter 5325: loss 2.7610, time 5269.44ms 
iter 5326: loss 2.4685, time 5218.82ms 
iter 5327: loss 2.6107, time 5252.34ms 
iter 5328: loss 2.7145, time 5259.47ms 
iter 5329: loss 2.3733, time 5338.21ms 
iter 5330: loss 2.5092, time 5343.36ms 
iter 5331: loss 2.6485, time 5319.29ms 
iter 5332: loss 2.6609, time 5293.00ms 
iter 5333: loss 2.3244, time 5336.00ms 
iter 5334: loss 2.6309, time 5335.10ms 
iter 5335: loss 2.4979, time 5329.09ms 
iter 5336: loss 2.6780, time 5291.84ms 
iter 5337: loss 2.6641, time 5295.59ms 
iter 5338: loss 2.4102, time 5322.89ms 
iter 5339: loss 2.5106, time 5265.52ms 
iter 5340: loss 2.5689, time 5236.04ms 
iter 5341: loss 2.5832, time 5258.01ms 
iter 5342: loss 2.3150, time 5222.72ms 
iter 5343: loss 2.5687, time 5252.69ms 
iter 5344: loss 2.5453, time 5265.18ms 
iter 5345: loss 2.5684, time 5226.12ms 
iter 5346: loss 2.5874, time 5260.72ms 
iter 5347: loss 2.2747, time 5262.70ms 
iter 5348: loss 2.6022, time 5271.79ms 
iter 5349: loss 2.7067, time 5266.17ms 
step 5350: train loss 2.5782, val loss 2.8280
iter 5350: loss 2.6695, time 19993.58ms 
iter 5351: loss 2.5575, time 5264.47ms 
iter 5352: loss 2.6329, time 5272.54ms 
iter 5353: loss 2.7609, time 5275.09ms 
iter 5354: loss 2.7332, time 5267.40ms 
iter 5355: loss 2.6849, time 5264.46ms 
iter 5356: loss 2.4873, time 5258.74ms 
iter 5357: loss 2.5981, time 5276.70ms 
iter 5358: loss 2.2408, time 5267.80ms 
iter 5359: loss 2.6680, time 5288.35ms 
iter 5360: loss 2.7490, time 5267.17ms 
iter 5361: loss 2.5256, time 5262.89ms 
iter 5362: loss 2.5427, time 5274.62ms 
iter 5363: loss 2.6197, time 5262.95ms 
iter 5364: loss 2.4500, time 5273.24ms 
iter 5365: loss 2.9098, time 5264.09ms 
iter 5366: loss 2.7787, time 5264.68ms 
iter 5367: loss 2.5734, time 5256.59ms 
iter 5368: loss 2.6346, time 5265.14ms 
iter 5369: loss 2.5656, time 5267.08ms 
iter 5370: loss 2.5875, time 5266.66ms 
iter 5371: loss 2.8147, time 5226.56ms 
iter 5372: loss 2.5464, time 5268.39ms 
iter 5373: loss 2.4308, time 5273.53ms 
iter 5374: loss 2.5890, time 5270.49ms 
iter 5375: loss 2.4283, time 5254.70ms 
iter 5376: loss 2.6329, time 5259.70ms 
iter 5377: loss 2.4713, time 5258.42ms 
iter 5378: loss 2.2629, time 5279.16ms 
iter 5379: loss 2.6919, time 5255.12ms 
iter 5380: loss 2.6703, time 5250.94ms 
iter 5381: loss 2.6286, time 5257.96ms 
iter 5382: loss 2.6098, time 5265.00ms 
iter 5383: loss 2.6502, time 5255.80ms 
iter 5384: loss 2.7314, time 5257.45ms 
iter 5385: loss 2.4964, time 5259.40ms 
iter 5386: loss 2.5797, time 5256.10ms 
iter 5387: loss 2.4404, time 5257.96ms 
iter 5388: loss 2.5322, time 5258.32ms 
iter 5389: loss 2.7638, time 5260.96ms 
iter 5390: loss 2.5275, time 5256.78ms 
iter 5391: loss 2.4087, time 5268.84ms 
iter 5392: loss 2.3581, time 5310.12ms 
iter 5393: loss 2.6092, time 5273.93ms 
iter 5394: loss 2.6355, time 5318.19ms 
iter 5395: loss 2.4551, time 5259.55ms 
iter 5396: loss 2.5830, time 5267.44ms 
iter 5397: loss 2.6177, time 5269.20ms 
iter 5398: loss 2.5501, time 5248.31ms 
iter 5399: loss 2.6408, time 5239.47ms 
step 5400: train loss 2.5724, val loss 2.8382
iter 5400: loss 2.5232, time 20036.23ms 
iter 5401: loss 2.6477, time 5273.99ms 
iter 5402: loss 2.6255, time 5262.57ms 
iter 5403: loss 2.5245, time 5263.14ms 
iter 5404: loss 2.3237, time 5248.06ms 
iter 5405: loss 2.5373, time 5251.41ms 
iter 5406: loss 2.4099, time 5255.02ms 
iter 5407: loss 2.5294, time 5253.20ms 
iter 5408: loss 2.6485, time 5255.85ms 
iter 5409: loss 2.4019, time 5248.52ms 
iter 5410: loss 2.6833, time 5247.29ms 
iter 5411: loss 2.3608, time 5251.48ms 
iter 5412: loss 2.6631, time 5258.53ms 
iter 5413: loss 2.3640, time 5259.98ms 
iter 5414: loss 2.5129, time 5255.85ms 
iter 5415: loss 2.3263, time 5252.75ms 
iter 5416: loss 2.4073, time 5259.46ms 
iter 5417: loss 2.6079, time 5258.72ms 
iter 5418: loss 2.6910, time 5260.52ms 
iter 5419: loss 2.5189, time 5261.85ms 
iter 5420: loss 2.5245, time 5257.82ms 
iter 5421: loss 2.5241, time 5259.76ms 
iter 5422: loss 2.5372, time 5269.39ms 
iter 5423: loss 2.5269, time 5258.03ms 
iter 5424: loss 2.5157, time 5259.61ms 
iter 5425: loss 2.4905, time 5261.16ms 
iter 5426: loss 2.4533, time 5265.45ms 
iter 5427: loss 2.5676, time 5271.78ms 
iter 5428: loss 2.6103, time 5268.32ms 
iter 5429: loss 2.8308, time 5252.08ms 
iter 5430: loss 2.7141, time 5250.94ms 
iter 5431: loss 2.6529, time 5248.78ms 
iter 5432: loss 2.3685, time 5262.86ms 
iter 5433: loss 2.3839, time 5266.15ms 
iter 5434: loss 2.3423, time 5258.33ms 
iter 5435: loss 2.5030, time 5258.22ms 
iter 5436: loss 2.6322, time 5335.29ms 
iter 5437: loss 2.7134, time 5332.59ms 
iter 5438: loss 2.7237, time 5326.32ms 
iter 5439: loss 2.7455, time 5339.21ms 
iter 5440: loss 2.8033, time 5289.50ms 
iter 5441: loss 2.5724, time 5254.34ms 
iter 5442: loss 2.5830, time 5251.20ms 
iter 5443: loss 2.6669, time 5255.86ms 
iter 5444: loss 2.5128, time 5268.96ms 
iter 5445: loss 2.4475, time 5251.11ms 
iter 5446: loss 2.5010, time 5252.86ms 
iter 5447: loss 2.5709, time 5249.33ms 
iter 5448: loss 2.5886, time 5259.97ms 
iter 5449: loss 2.3913, time 5260.73ms 
step 5450: train loss 2.5600, val loss 2.8427
iter 5450: loss 2.5808, time 19997.17ms 
iter 5451: loss 2.5347, time 5275.59ms 
iter 5452: loss 2.4495, time 5333.10ms 
iter 5453: loss 2.3959, time 5302.64ms 
iter 5454: loss 2.5727, time 5250.33ms 
iter 5455: loss 2.5716, time 5263.80ms 
iter 5456: loss 2.8035, time 5253.16ms 
iter 5457: loss 2.7097, time 5257.32ms 
iter 5458: loss 2.4083, time 5250.91ms 
iter 5459: loss 2.7468, time 5259.99ms 
iter 5460: loss 2.4976, time 5261.30ms 
iter 5461: loss 2.5403, time 5254.04ms 
iter 5462: loss 2.6112, time 5256.63ms 
iter 5463: loss 2.4895, time 5248.93ms 
iter 5464: loss 2.7534, time 5262.72ms 
iter 5465: loss 2.5877, time 5261.46ms 
iter 5466: loss 2.4764, time 5260.35ms 
iter 5467: loss 2.5431, time 5255.30ms 
iter 5468: loss 2.4324, time 5251.25ms 
iter 5469: loss 2.6223, time 5273.72ms 
iter 5470: loss 2.4827, time 5259.66ms 
iter 5471: loss 2.5655, time 5228.08ms 
iter 5472: loss 2.6705, time 5251.11ms 
iter 5473: loss 2.5894, time 5246.84ms 
iter 5474: loss 2.4422, time 5265.89ms 
iter 5475: loss 2.4919, time 5254.61ms 
iter 5476: loss 2.5527, time 5259.72ms 
iter 5477: loss 2.5416, time 5262.28ms 
iter 5478: loss 2.5629, time 5254.23ms 
iter 5479: loss 2.4613, time 5269.80ms 
iter 5480: loss 2.8596, time 5272.24ms 
iter 5481: loss 2.5749, time 5242.54ms 
iter 5482: loss 2.5873, time 5267.03ms 
iter 5483: loss 2.7189, time 5271.68ms 
iter 5484: loss 2.5929, time 5296.20ms 
iter 5485: loss 2.6903, time 5263.91ms 
iter 5486: loss 2.3862, time 5268.74ms 
iter 5487: loss 2.4129, time 5269.42ms 
iter 5488: loss 2.8149, time 5252.71ms 
iter 5489: loss 2.5783, time 5263.50ms 
iter 5490: loss 2.5519, time 5266.58ms 
iter 5491: loss 2.3866, time 5298.49ms 
iter 5492: loss 2.6584, time 5316.42ms 
iter 5493: loss 2.4277, time 5253.46ms 
iter 5494: loss 2.5535, time 5247.21ms 
iter 5495: loss 2.6189, time 5246.42ms 
iter 5496: loss 2.4425, time 5264.07ms 
iter 5497: loss 2.6121, time 5257.03ms 
iter 5498: loss 2.5288, time 5252.79ms 
iter 5499: loss 2.5056, time 5252.95ms 
step 5500: train loss 2.5618, val loss 2.8351
iter 5500: loss 2.6583, time 19992.71ms 
iter 5501: loss 2.4951, time 5344.04ms 
iter 5502: loss 2.5988, time 5345.24ms 
iter 5503: loss 2.3243, time 5331.31ms 
iter 5504: loss 2.5826, time 5348.05ms 
iter 5505: loss 2.5611, time 5315.56ms 
iter 5506: loss 2.6536, time 5337.75ms 
iter 5507: loss 2.7469, time 5320.52ms 
iter 5508: loss 2.4153, time 5258.54ms 
iter 5509: loss 2.4506, time 5256.71ms 
iter 5510: loss 2.5770, time 5254.21ms 
iter 5511: loss 2.7391, time 5253.17ms 
iter 5512: loss 2.3694, time 5262.02ms 
iter 5513: loss 2.2996, time 5254.29ms 
iter 5514: loss 2.6502, time 5253.78ms 
iter 5515: loss 2.5525, time 5256.06ms 
iter 5516: loss 2.5404, time 5251.82ms 
iter 5517: loss 2.6150, time 5262.23ms 
iter 5518: loss 2.8623, time 5255.60ms 
iter 5519: loss 2.6603, time 5255.45ms 
iter 5520: loss 2.7844, time 5261.54ms 
iter 5521: loss 2.5590, time 5255.78ms 
iter 5522: loss 2.5162, time 5266.91ms 
iter 5523: loss 2.4478, time 5266.00ms 
iter 5524: loss 2.6777, time 5270.54ms 
iter 5525: loss 2.5759, time 5274.99ms 
iter 5526: loss 2.5158, time 5273.73ms 
iter 5527: loss 2.3922, time 5268.10ms 
iter 5528: loss 2.5963, time 5274.42ms 
iter 5529: loss 2.3764, time 5274.61ms 
iter 5530: loss 2.7077, time 5254.60ms 
iter 5531: loss 2.4900, time 5252.24ms 
iter 5532: loss 2.7008, time 5287.62ms 
iter 5533: loss 2.4806, time 5333.18ms 
iter 5534: loss 2.6406, time 5257.45ms 
iter 5535: loss 2.5348, time 5255.43ms 
iter 5536: loss 2.6472, time 5267.62ms 
iter 5537: loss 2.4321, time 5257.00ms 
iter 5538: loss 2.6589, time 5268.90ms 
iter 5539: loss 2.5344, time 5258.02ms 
iter 5540: loss 2.6980, time 5261.95ms 
iter 5541: loss 2.4760, time 5252.69ms 
iter 5542: loss 2.4922, time 5252.35ms 
iter 5543: loss 2.4663, time 5263.27ms 
iter 5544: loss 2.4655, time 5298.28ms 
iter 5545: loss 2.5092, time 5281.66ms 
iter 5546: loss 2.4606, time 5253.81ms 
iter 5547: loss 2.7126, time 5274.29ms 
iter 5548: loss 2.4944, time 5259.50ms 
iter 5549: loss 2.6204, time 5263.30ms 
step 5550: train loss 2.5507, val loss 2.8382
iter 5550: loss 2.5860, time 19964.73ms 
iter 5551: loss 2.6654, time 5270.74ms 
iter 5552: loss 2.5961, time 5270.12ms 
iter 5553: loss 2.5398, time 5265.09ms 
iter 5554: loss 2.5963, time 5258.23ms 
iter 5555: loss 2.6984, time 5253.85ms 
iter 5556: loss 2.7771, time 5264.71ms 
iter 5557: loss 2.8913, time 5250.64ms 
iter 5558: loss 2.4254, time 5262.19ms 
iter 5559: loss 2.5450, time 5253.14ms 
iter 5560: loss 2.5311, time 5274.10ms 
iter 5561: loss 2.5894, time 5267.46ms 
iter 5562: loss 2.6263, time 5271.11ms 
iter 5563: loss 2.7747, time 5272.63ms 
iter 5564: loss 2.4238, time 5257.38ms 
iter 5565: loss 2.6902, time 5256.42ms 
iter 5566: loss 2.3317, time 5254.65ms 
iter 5567: loss 2.5536, time 5257.83ms 
iter 5568: loss 2.6053, time 5259.51ms 
iter 5569: loss 2.5408, time 5254.07ms 
iter 5570: loss 2.6279, time 5251.20ms 
iter 5571: loss 2.4256, time 5252.13ms 
iter 5572: loss 2.5982, time 5250.37ms 
iter 5573: loss 2.4619, time 5274.70ms 
iter 5574: loss 2.7079, time 5274.12ms 
iter 5575: loss 2.6998, time 5270.12ms 
iter 5576: loss 2.4672, time 5261.17ms 
iter 5577: loss 2.3926, time 5157.33ms 
iter 5578: loss 2.4406, time 5269.20ms 
iter 5579: loss 2.5477, time 5308.66ms 
iter 5580: loss 2.5716, time 5269.30ms 
iter 5581: loss 2.5203, time 5268.22ms 
iter 5582: loss 2.5060, time 5268.53ms 
iter 5583: loss 2.7646, time 5321.88ms 
iter 5584: loss 2.2138, time 5346.56ms 
iter 5585: loss 2.7651, time 5322.66ms 
iter 5586: loss 2.5955, time 5273.92ms 
iter 5587: loss 2.6286, time 5337.40ms 
iter 5588: loss 2.5623, time 5303.99ms 
iter 5589: loss 2.6135, time 5286.16ms 
iter 5590: loss 2.3749, time 5345.19ms 
iter 5591: loss 2.6406, time 5256.39ms 
iter 5592: loss 2.5373, time 5260.23ms 
iter 5593: loss 2.6270, time 5261.51ms 
iter 5594: loss 2.3365, time 5261.14ms 
iter 5595: loss 2.7206, time 5253.29ms 
iter 5596: loss 2.6054, time 5283.21ms 
iter 5597: loss 2.4571, time 5283.23ms 
iter 5598: loss 2.5187, time 5263.90ms 
iter 5599: loss 2.7195, time 5252.02ms 
step 5600: train loss 2.5685, val loss 2.8345
iter 5600: loss 2.5034, time 19994.05ms 
iter 5601: loss 2.4364, time 5287.52ms 
iter 5602: loss 2.4493, time 5259.61ms 
iter 5603: loss 2.5590, time 5253.96ms 
iter 5604: loss 2.4298, time 5269.90ms 
iter 5605: loss 2.5202, time 5261.77ms 
iter 5606: loss 2.5785, time 5258.14ms 
iter 5607: loss 2.6030, time 5258.71ms 
iter 5608: loss 2.5399, time 5266.54ms 
iter 5609: loss 2.6655, time 5264.42ms 
iter 5610: loss 2.7900, time 5263.29ms 
iter 5611: loss 2.5930, time 5257.30ms 
iter 5612: loss 2.6232, time 5261.47ms 
iter 5613: loss 2.5747, time 5276.71ms 
iter 5614: loss 2.4780, time 5293.96ms 
iter 5615: loss 2.6699, time 5298.94ms 
iter 5616: loss 2.6104, time 5257.16ms 
iter 5617: loss 2.5520, time 5257.61ms 
iter 5618: loss 2.5406, time 5272.42ms 
iter 5619: loss 2.6118, time 5271.82ms 
iter 5620: loss 2.8245, time 5267.84ms 
iter 5621: loss 2.6430, time 5268.20ms 
iter 5622: loss 2.7616, time 5261.80ms 
iter 5623: loss 2.7636, time 5271.36ms 
iter 5624: loss 2.6448, time 5283.05ms 
iter 5625: loss 2.5548, time 5264.91ms 
iter 5626: loss 2.4851, time 5272.23ms 
iter 5627: loss 2.5950, time 5250.11ms 
iter 5628: loss 2.5644, time 5266.35ms 
iter 5629: loss 2.6080, time 5261.89ms 
iter 5630: loss 2.5059, time 5254.35ms 
iter 5631: loss 2.5966, time 5260.23ms 
iter 5632: loss 2.3575, time 5299.13ms 
iter 5633: loss 2.1474, time 5283.58ms 
iter 5634: loss 2.4379, time 5267.05ms 
iter 5635: loss 2.5440, time 5261.68ms 
iter 5636: loss 2.7562, time 5257.72ms 
iter 5637: loss 2.5721, time 5275.82ms 
iter 5638: loss 2.6539, time 5280.75ms 
iter 5639: loss 2.3624, time 5260.00ms 
iter 5640: loss 2.6664, time 5330.79ms 
iter 5641: loss 2.4439, time 5280.61ms 
iter 5642: loss 2.5528, time 5258.47ms 
iter 5643: loss 2.4221, time 5275.24ms 
iter 5644: loss 2.5341, time 5262.11ms 
iter 5645: loss 2.2673, time 5267.50ms 
iter 5646: loss 2.5294, time 5270.99ms 
iter 5647: loss 2.6793, time 5242.79ms 
iter 5648: loss 2.5298, time 5297.99ms 
iter 5649: loss 2.6899, time 5255.82ms 
step 5650: train loss 2.5725, val loss 2.8246
iter 5650: loss 2.7146, time 20045.86ms 
iter 5651: loss 2.5635, time 5260.78ms 
iter 5652: loss 2.3894, time 5275.11ms 
iter 5653: loss 2.4959, time 5278.77ms 
iter 5654: loss 2.6001, time 5259.75ms 
iter 5655: loss 2.7602, time 5260.70ms 
iter 5656: loss 2.7229, time 5259.28ms 
iter 5657: loss 2.5685, time 5281.86ms 
iter 5658: loss 2.6009, time 5270.80ms 
iter 5659: loss 2.6836, time 5266.80ms 
iter 5660: loss 2.5809, time 5335.93ms 
iter 5661: loss 2.5687, time 5339.85ms 
iter 5662: loss 2.4693, time 5339.48ms 
iter 5663: loss 2.5699, time 5335.77ms 
iter 5664: loss 2.6599, time 5345.99ms 
iter 5665: loss 2.6442, time 5253.37ms 
iter 5666: loss 2.3577, time 5312.00ms 
iter 5667: loss 2.6248, time 5289.85ms 
iter 5668: loss 2.4384, time 5289.46ms 
iter 5669: loss 2.7631, time 5284.50ms 
iter 5670: loss 2.4794, time 5252.81ms 
iter 5671: loss 2.5494, time 5262.69ms 
iter 5672: loss 2.6610, time 5248.72ms 
iter 5673: loss 2.5580, time 5272.32ms 
iter 5674: loss 2.4259, time 5273.50ms 
iter 5675: loss 2.5663, time 5237.67ms 
iter 5676: loss 2.5201, time 5280.47ms 
iter 5677: loss 2.4458, time 5280.45ms 
iter 5678: loss 2.6013, time 5267.07ms 
iter 5679: loss 2.4951, time 5273.36ms 
iter 5680: loss 2.4997, time 5274.14ms 
iter 5681: loss 2.6562, time 5278.73ms 
iter 5682: loss 2.6222, time 5280.11ms 
iter 5683: loss 2.5934, time 5279.22ms 
iter 5684: loss 2.2927, time 5272.28ms 
iter 5685: loss 2.5419, time 5252.75ms 
iter 5686: loss 2.4157, time 5269.97ms 
iter 5687: loss 2.4720, time 5297.79ms 
iter 5688: loss 2.6869, time 5292.06ms 
iter 5689: loss 2.4529, time 5342.69ms 
iter 5690: loss 2.5545, time 5273.88ms 
iter 5691: loss 2.4312, time 5260.27ms 
iter 5692: loss 2.4929, time 5266.52ms 
iter 5693: loss 2.4557, time 5269.43ms 
iter 5694: loss 2.6356, time 5259.21ms 
iter 5695: loss 2.6499, time 5257.49ms 
iter 5696: loss 2.5788, time 5262.72ms 
iter 5697: loss 2.3327, time 5263.32ms 
iter 5698: loss 2.5226, time 5342.82ms 
iter 5699: loss 2.3339, time 5309.93ms 
step 5700: train loss 2.5555, val loss 2.8408
iter 5700: loss 2.6197, time 20060.95ms 
iter 5701: loss 2.5662, time 5326.63ms 
iter 5702: loss 2.5332, time 5260.67ms 
iter 5703: loss 2.4577, time 5264.11ms 
iter 5704: loss 2.7033, time 5258.78ms 
iter 5705: loss 2.7205, time 5271.89ms 
iter 5706: loss 2.5570, time 5256.58ms 
iter 5707: loss 2.5621, time 5245.72ms 
iter 5708: loss 2.4623, time 5253.04ms 
iter 5709: loss 2.6284, time 5213.68ms 
iter 5710: loss 2.5146, time 5317.29ms 
iter 5711: loss 2.5385, time 5342.90ms 
iter 5712: loss 2.4830, time 5333.98ms 
iter 5713: loss 2.5503, time 5317.37ms 
iter 5714: loss 2.5529, time 5274.47ms 
iter 5715: loss 2.5752, time 5298.42ms 
iter 5716: loss 2.5604, time 5270.82ms 
iter 5717: loss 2.5022, time 5263.84ms 
iter 5718: loss 2.5740, time 5268.26ms 
iter 5719: loss 2.6948, time 5251.43ms 
iter 5720: loss 2.3571, time 5254.63ms 
iter 5721: loss 2.5746, time 5252.88ms 
iter 5722: loss 2.5330, time 5318.38ms 
iter 5723: loss 2.4671, time 5272.90ms 
iter 5724: loss 2.4879, time 5261.96ms 
iter 5725: loss 2.6753, time 5257.37ms 
iter 5726: loss 2.6503, time 5264.93ms 
iter 5727: loss 2.5596, time 5271.62ms 
iter 5728: loss 2.6423, time 5257.49ms 
iter 5729: loss 2.5132, time 5225.83ms 
iter 5730: loss 2.5750, time 5248.72ms 
iter 5731: loss 2.6861, time 5265.54ms 
iter 5732: loss 2.4272, time 5252.75ms 
iter 5733: loss 2.5291, time 5256.57ms 
iter 5734: loss 2.6852, time 5264.20ms 
iter 5735: loss 2.6280, time 5257.85ms 
iter 5736: loss 2.9309, time 5276.48ms 
iter 5737: loss 2.7413, time 5276.24ms 
iter 5738: loss 2.7765, time 5261.78ms 
iter 5739: loss 2.7847, time 5268.05ms 
iter 5740: loss 2.6574, time 5261.58ms 
iter 5741: loss 2.6726, time 5302.46ms 
iter 5742: loss 2.5132, time 5320.16ms 
iter 5743: loss 2.5754, time 5264.27ms 
iter 5744: loss 2.5735, time 5254.10ms 
iter 5745: loss 2.6188, time 5256.29ms 
iter 5746: loss 2.6507, time 5262.57ms 
iter 5747: loss 2.4818, time 5265.02ms 
iter 5748: loss 2.4651, time 5295.61ms 
iter 5749: loss 2.5080, time 5257.99ms 
step 5750: train loss 2.5657, val loss 2.8182
iter 5750: loss 2.4929, time 19959.02ms 
iter 5751: loss 2.3672, time 5265.39ms 
iter 5752: loss 2.8044, time 5261.86ms 
iter 5753: loss 2.4328, time 5285.62ms 
iter 5754: loss 2.4545, time 5259.55ms 
iter 5755: loss 2.5782, time 5260.01ms 
iter 5756: loss 2.6492, time 5289.48ms 
iter 5757: loss 2.3656, time 5252.54ms 
iter 5758: loss 2.5924, time 5261.44ms 
iter 5759: loss 2.7751, time 5282.26ms 
iter 5760: loss 2.6791, time 5260.94ms 
iter 5761: loss 2.4420, time 5266.52ms 
iter 5762: loss 2.5669, time 5254.94ms 
iter 5763: loss 2.6118, time 5262.47ms 
iter 5764: loss 2.5569, time 5256.03ms 
iter 5765: loss 2.5976, time 5281.13ms 
iter 5766: loss 2.5377, time 5320.88ms 
iter 5767: loss 2.4632, time 5245.66ms 
iter 5768: loss 2.3030, time 5020.21ms 
iter 5769: loss 2.5936, time 4998.49ms 
iter 5770: loss 2.4535, time 5216.69ms 
iter 5771: loss 2.3508, time 5249.17ms 
iter 5772: loss 2.6805, time 5201.21ms 
iter 5773: loss 2.5430, time 5251.53ms 
iter 5774: loss 2.4753, time 5252.88ms 
iter 5775: loss 2.5923, time 5269.84ms 
iter 5776: loss 2.5359, time 5256.20ms 
iter 5777: loss 2.6116, time 5251.34ms 
iter 5778: loss 2.3489, time 5248.05ms 
iter 5779: loss 2.1408, time 5255.74ms 
iter 5780: loss 2.4715, time 5267.44ms 
iter 5781: loss 2.5812, time 5256.16ms 
iter 5782: loss 2.7815, time 5254.26ms 
iter 5783: loss 2.4925, time 5275.40ms 
iter 5784: loss 2.6889, time 5265.21ms 
iter 5785: loss 2.6270, time 5259.03ms 
iter 5786: loss 2.4655, time 5264.51ms 
iter 5787: loss 2.4477, time 5252.50ms 
iter 5788: loss 2.5830, time 5256.11ms 
iter 5789: loss 2.4250, time 5267.26ms 
iter 5790: loss 2.5056, time 5267.50ms 
iter 5791: loss 2.5896, time 5266.56ms 
iter 5792: loss 2.7068, time 5257.15ms 
iter 5793: loss 2.6841, time 5276.25ms 
iter 5794: loss 2.5723, time 5262.78ms 
iter 5795: loss 2.4090, time 5264.00ms 
iter 5796: loss 2.5491, time 5258.63ms 
iter 5797: loss 2.5916, time 5252.24ms 
iter 5798: loss 2.5362, time 5267.73ms 
iter 5799: loss 2.7264, time 5258.65ms 
step 5800: train loss 2.5290, val loss 2.8422
iter 5800: loss 2.7803, time 20013.00ms 
iter 5801: loss 2.6554, time 5258.59ms 
iter 5802: loss 2.5516, time 5256.57ms 
iter 5803: loss 2.6964, time 5252.39ms 
iter 5804: loss 2.5624, time 5266.66ms 
iter 5805: loss 2.3187, time 5252.26ms 
iter 5806: loss 2.4888, time 5256.96ms 
iter 5807: loss 2.5242, time 5257.79ms 
iter 5808: loss 2.5988, time 5269.66ms 
iter 5809: loss 2.3243, time 5265.41ms 
iter 5810: loss 2.4792, time 5280.28ms 
iter 5811: loss 2.6662, time 5279.95ms 
iter 5812: loss 2.7362, time 5258.44ms 
iter 5813: loss 2.6406, time 5263.08ms 
iter 5814: loss 2.4323, time 5253.49ms 
iter 5815: loss 2.6094, time 5258.10ms 
iter 5816: loss 2.7569, time 5247.78ms 
iter 5817: loss 2.6377, time 5265.23ms 
iter 5818: loss 2.6054, time 5274.93ms 
iter 5819: loss 2.5159, time 5282.13ms 
iter 5820: loss 2.4549, time 5255.73ms 
iter 5821: loss 2.5898, time 5257.29ms 
iter 5822: loss 2.7466, time 5258.41ms 
iter 5823: loss 2.5728, time 5260.46ms 
iter 5824: loss 2.4354, time 5261.19ms 
iter 5825: loss 2.5358, time 5251.88ms 
iter 5826: loss 2.5182, time 5236.80ms 
iter 5827: loss 2.5418, time 5267.93ms 
iter 5828: loss 2.4426, time 5256.78ms 
iter 5829: loss 2.6086, time 5304.82ms 
iter 5830: loss 2.6532, time 5284.91ms 
iter 5831: loss 2.6868, time 5261.09ms 
iter 5832: loss 2.5248, time 5258.59ms 
iter 5833: loss 2.5745, time 5268.39ms 
iter 5834: loss 2.7665, time 5266.00ms 
iter 5835: loss 2.6862, time 5256.75ms 
iter 5836: loss 2.3704, time 5259.35ms 
iter 5837: loss 2.5497, time 5251.36ms 
iter 5838: loss 2.3084, time 5258.36ms 
iter 5839: loss 2.5780, time 5253.14ms 
iter 5840: loss 2.6536, time 5258.32ms 
iter 5841: loss 2.5123, time 5249.71ms 
iter 5842: loss 2.5422, time 5263.36ms 
iter 5843: loss 2.7523, time 5254.92ms 
iter 5844: loss 2.4785, time 5256.15ms 
iter 5845: loss 2.5559, time 5263.66ms 
iter 5846: loss 2.5116, time 5275.51ms 
iter 5847: loss 2.5882, time 5258.05ms 
iter 5848: loss 2.7720, time 5263.68ms 
iter 5849: loss 2.6160, time 5258.93ms 
step 5850: train loss 2.5498, val loss 2.8283
iter 5850: loss 2.5628, time 19995.92ms 
iter 5851: loss 2.4170, time 5230.49ms 
iter 5852: loss 2.4434, time 5254.00ms 
iter 5853: loss 2.7760, time 5256.07ms 
iter 5854: loss 2.6327, time 5261.17ms 
iter 5855: loss 2.4682, time 5258.74ms 
iter 5856: loss 2.6481, time 5252.45ms 
iter 5857: loss 2.3201, time 5249.32ms 
iter 5858: loss 2.6770, time 5254.92ms 
iter 5859: loss 2.6334, time 5257.43ms 
iter 5860: loss 2.2056, time 5273.51ms 
iter 5861: loss 2.6549, time 5238.82ms 
iter 5862: loss 2.4703, time 5256.03ms 
iter 5863: loss 2.5678, time 5257.80ms 
iter 5864: loss 2.7094, time 5263.00ms 
iter 5865: loss 2.6280, time 5256.85ms 
iter 5866: loss 2.3307, time 5253.52ms 
iter 5867: loss 2.5737, time 5253.35ms 
iter 5868: loss 2.5154, time 5249.94ms 
iter 5869: loss 2.5003, time 5263.53ms 
iter 5870: loss 2.7209, time 5257.20ms 
iter 5871: loss 2.5882, time 5248.45ms 
iter 5872: loss 2.5006, time 5262.51ms 
iter 5873: loss 2.2331, time 5270.52ms 
iter 5874: loss 2.7154, time 5268.27ms 
iter 5875: loss 2.5542, time 5250.30ms 
iter 5876: loss 2.6057, time 5253.28ms 
iter 5877: loss 2.6969, time 5251.56ms 
iter 5878: loss 2.6871, time 5261.46ms 
iter 5879: loss 2.3778, time 5255.22ms 
iter 5880: loss 2.4859, time 5262.24ms 
iter 5881: loss 2.6475, time 5253.53ms 
iter 5882: loss 2.4913, time 5259.92ms 
iter 5883: loss 2.4775, time 5262.79ms 
iter 5884: loss 2.4488, time 5256.75ms 
iter 5885: loss 2.6014, time 5258.50ms 
iter 5886: loss 2.4731, time 5237.44ms 
iter 5887: loss 2.6456, time 5257.54ms 
iter 5888: loss 2.3147, time 5260.17ms 
iter 5889: loss 2.7241, time 5253.81ms 
iter 5890: loss 2.5384, time 5253.35ms 
iter 5891: loss 2.4517, time 5260.75ms 
iter 5892: loss 2.4475, time 5261.22ms 
iter 5893: loss 2.7442, time 5254.05ms 
iter 5894: loss 2.5658, time 5248.92ms 
iter 5895: loss 2.3926, time 5215.63ms 
iter 5896: loss 2.6114, time 5250.06ms 
iter 5897: loss 2.5643, time 5257.46ms 
iter 5898: loss 2.7093, time 5265.10ms 
iter 5899: loss 2.3226, time 5262.56ms 
step 5900: train loss 2.5498, val loss 2.8296
iter 5900: loss 2.4185, time 19908.15ms 
iter 5901: loss 2.4166, time 5336.43ms 
iter 5902: loss 2.5998, time 5313.17ms 
iter 5903: loss 2.2925, time 5321.85ms 
iter 5904: loss 2.6681, time 5289.47ms 
iter 5905: loss 2.4270, time 5294.21ms 
iter 5906: loss 2.4792, time 5250.25ms 
iter 5907: loss 2.5661, time 5253.72ms 
iter 5908: loss 2.4876, time 5257.70ms 
iter 5909: loss 2.3517, time 5256.19ms 
iter 5910: loss 2.5533, time 5277.39ms 
iter 5911: loss 2.4824, time 5277.99ms 
iter 5912: loss 2.7243, time 5248.68ms 
iter 5913: loss 2.5464, time 5259.33ms 
iter 5914: loss 2.5564, time 5255.29ms 
iter 5915: loss 2.5198, time 5251.95ms 
iter 5916: loss 2.5500, time 5251.38ms 
iter 5917: loss 2.4053, time 5255.53ms 
iter 5918: loss 2.4879, time 5240.02ms 
iter 5919: loss 2.4388, time 5219.14ms 
iter 5920: loss 2.8293, time 5230.58ms 
iter 5921: loss 2.6149, time 5253.21ms 
iter 5922: loss 2.4891, time 5258.98ms 
iter 5923: loss 2.6211, time 5274.30ms 
iter 5924: loss 2.5072, time 5250.37ms 
iter 5925: loss 2.5069, time 5259.86ms 
iter 5926: loss 2.6310, time 5254.71ms 
iter 5927: loss 2.5574, time 5251.29ms 
iter 5928: loss 2.2553, time 5276.24ms 
iter 5929: loss 2.5124, time 5257.01ms 
iter 5930: loss 2.6860, time 5258.30ms 
iter 5931: loss 2.8328, time 5253.91ms 
iter 5932: loss 2.5968, time 5256.50ms 
iter 5933: loss 2.5550, time 5275.73ms 
iter 5934: loss 2.4985, time 5256.24ms 
iter 5935: loss 2.6165, time 5269.68ms 
iter 5936: loss 2.4917, time 5245.88ms 
iter 5937: loss 2.5696, time 5282.22ms 
iter 5938: loss 2.5473, time 5256.85ms 
iter 5939: loss 2.4999, time 5250.81ms 
iter 5940: loss 2.4971, time 5238.36ms 
iter 5941: loss 2.5417, time 5236.26ms 
iter 5942: loss 2.3355, time 5264.35ms 
iter 5943: loss 2.3762, time 5248.75ms 
iter 5944: loss 2.4190, time 5253.06ms 
iter 5945: loss 2.4046, time 5249.09ms 
iter 5946: loss 2.5488, time 5264.24ms 
iter 5947: loss 2.8615, time 5255.16ms 
iter 5948: loss 2.5310, time 5246.89ms 
iter 5949: loss 2.7423, time 5254.51ms 
step 5950: train loss 2.5399, val loss 2.8341
iter 5950: loss 2.5425, time 19999.70ms 
iter 5951: loss 2.3867, time 5256.94ms 
iter 5952: loss 2.6715, time 5259.70ms 
iter 5953: loss 2.4732, time 5250.86ms 
iter 5954: loss 2.3755, time 5260.05ms 
iter 5955: loss 2.5557, time 5255.24ms 
iter 5956: loss 2.5614, time 5244.94ms 
iter 5957: loss 2.5739, time 5257.87ms 
iter 5958: loss 2.4962, time 5248.86ms 
iter 5959: loss 2.5586, time 5259.59ms 
iter 5960: loss 2.5163, time 5226.73ms 
iter 5961: loss 2.4650, time 5291.80ms 
iter 5962: loss 2.4979, time 5346.51ms 
iter 5963: loss 2.5261, time 5288.71ms 
iter 5964: loss 2.6995, time 5330.80ms 
iter 5965: loss 2.5893, time 5327.75ms 
iter 5966: loss 2.3212, time 5301.89ms 
iter 5967: loss 2.4549, time 5263.89ms 
iter 5968: loss 2.5580, time 5254.63ms 
iter 5969: loss 2.6949, time 5246.14ms 
iter 5970: loss 2.7150, time 5253.94ms 
iter 5971: loss 2.5426, time 5290.94ms 
iter 5972: loss 2.6968, time 5262.04ms 
iter 5973: loss 2.5365, time 5251.41ms 
iter 5974: loss 2.5799, time 5266.11ms 
iter 5975: loss 2.7034, time 5331.23ms 
iter 5976: loss 2.5480, time 5313.67ms 
iter 5977: loss 2.3794, time 5265.92ms 
iter 5978: loss 2.4718, time 5254.16ms 
iter 5979: loss 2.6165, time 5253.68ms 
iter 5980: loss 2.2616, time 5264.84ms 
iter 5981: loss 2.7348, time 5267.68ms 
iter 5982: loss 2.6056, time 5260.26ms 
iter 5983: loss 2.4494, time 5253.98ms 
iter 5984: loss 2.4299, time 5290.91ms 
iter 5985: loss 2.6173, time 5248.15ms 
iter 5986: loss 2.2899, time 5254.61ms 
iter 5987: loss 2.7613, time 5260.23ms 
iter 5988: loss 2.4915, time 5268.48ms 
iter 5989: loss 2.4491, time 5250.37ms 
iter 5990: loss 2.6140, time 5266.95ms 
iter 5991: loss 2.4862, time 5259.26ms 
iter 5992: loss 2.6041, time 5269.51ms 
iter 5993: loss 2.5490, time 5264.48ms 
iter 5994: loss 2.5881, time 5272.87ms 
iter 5995: loss 2.4659, time 5239.34ms 
iter 5996: loss 2.4159, time 5257.18ms 
iter 5997: loss 2.4605, time 5271.97ms 
iter 5998: loss 2.4875, time 5258.49ms 
iter 5999: loss 2.3641, time 5250.57ms 
step 6000: train loss 2.5350, val loss 2.8317
iter 6000: loss 2.4460, time 20002.26ms 
iter 6001: loss 2.6341, time 5249.03ms 
iter 6002: loss 2.4845, time 5253.92ms 
iter 6003: loss 2.4941, time 5258.03ms 
iter 6004: loss 2.5725, time 5238.36ms 
iter 6005: loss 2.4820, time 5262.94ms 
iter 6006: loss 2.7084, time 5259.84ms 
iter 6007: loss 2.5787, time 5257.67ms 
iter 6008: loss 2.6417, time 5257.71ms 
iter 6009: loss 2.5243, time 5254.22ms 
iter 6010: loss 2.6571, time 5265.28ms 
iter 6011: loss 2.5054, time 5225.13ms 
iter 6012: loss 2.4245, time 5257.91ms 
iter 6013: loss 2.7912, time 5255.26ms 
iter 6014: loss 2.6081, time 5250.50ms 
iter 6015: loss 2.4906, time 5259.66ms 
iter 6016: loss 2.4944, time 5260.44ms 
iter 6017: loss 2.4249, time 5260.03ms 
iter 6018: loss 2.5383, time 5251.30ms 
iter 6019: loss 2.5948, time 5256.80ms 
iter 6020: loss 2.6083, time 5256.52ms 
iter 6021: loss 2.4244, time 5261.41ms 
iter 6022: loss 2.7167, time 5255.83ms 
iter 6023: loss 2.6043, time 5250.91ms 
iter 6024: loss 2.6956, time 5259.24ms 
iter 6025: loss 2.6857, time 5259.71ms 
iter 6026: loss 2.5005, time 5269.35ms 
iter 6027: loss 2.6583, time 5265.64ms 
iter 6028: loss 2.3682, time 5257.59ms 
iter 6029: loss 2.6951, time 5255.55ms 
iter 6030: loss 2.5545, time 5269.57ms 
iter 6031: loss 2.4846, time 5261.36ms 
iter 6032: loss 2.6647, time 5252.38ms 
iter 6033: loss 2.4610, time 5254.78ms 
iter 6034: loss 2.5167, time 5264.47ms 
iter 6035: loss 2.3815, time 5267.84ms 
iter 6036: loss 2.4725, time 5254.58ms 
iter 6037: loss 2.5616, time 5254.29ms 
iter 6038: loss 2.6249, time 5262.64ms 
iter 6039: loss 2.6295, time 5258.39ms 
iter 6040: loss 2.6001, time 5261.00ms 
iter 6041: loss 2.6070, time 5258.46ms 
iter 6042: loss 2.6159, time 5218.66ms 
iter 6043: loss 2.4942, time 5264.57ms 
iter 6044: loss 2.4587, time 5263.36ms 
iter 6045: loss 2.7085, time 5287.10ms 
iter 6046: loss 2.5812, time 5256.17ms 
iter 6047: loss 2.3484, time 5254.72ms 
iter 6048: loss 2.5059, time 5262.98ms 
iter 6049: loss 2.7008, time 5253.54ms 
step 6050: train loss 2.5399, val loss 2.8300
iter 6050: loss 2.4825, time 19987.16ms 
iter 6051: loss 2.6583, time 5246.47ms 
iter 6052: loss 2.6211, time 5261.41ms 
iter 6053: loss 2.4284, time 5257.22ms 
iter 6054: loss 2.6663, time 5260.61ms 
iter 6055: loss 2.5722, time 5256.29ms 
iter 6056: loss 2.5869, time 5254.29ms 
iter 6057: loss 2.5944, time 5252.92ms 
iter 6058: loss 2.6265, time 5266.04ms 
iter 6059: loss 2.4377, time 5257.49ms 
iter 6060: loss 2.4516, time 5332.85ms 
iter 6061: loss 2.4264, time 5319.57ms 
iter 6062: loss 2.5860, time 5306.10ms 
iter 6063: loss 2.4909, time 5319.26ms 
iter 6064: loss 2.4694, time 5283.36ms 
iter 6065: loss 2.5679, time 5248.53ms 
iter 6066: loss 2.3273, time 5253.83ms 
iter 6067: loss 2.5824, time 5264.34ms 
iter 6068: loss 2.6106, time 5250.37ms 
iter 6069: loss 2.5652, time 5252.44ms 
iter 6070: loss 2.5907, time 5256.85ms 
iter 6071: loss 2.6453, time 5256.62ms 
iter 6072: loss 2.5204, time 5256.88ms 
iter 6073: loss 2.5222, time 5255.17ms 
iter 6074: loss 2.4484, time 5250.09ms 
iter 6075: loss 2.4552, time 5259.09ms 
iter 6076: loss 2.6852, time 5259.18ms 
iter 6077: loss 2.3462, time 5220.31ms 
iter 6078: loss 2.7163, time 5249.05ms 
iter 6079: loss 2.4567, time 5252.60ms 
iter 6080: loss 2.4149, time 5264.31ms 
iter 6081: loss 2.7389, time 5255.85ms 
iter 6082: loss 2.3775, time 5249.17ms 
iter 6083: loss 2.6607, time 5266.26ms 
iter 6084: loss 2.3171, time 5340.06ms 
iter 6085: loss 2.5440, time 5309.77ms 
iter 6086: loss 2.4639, time 5251.61ms 
iter 6087: loss 2.4169, time 5252.12ms 
iter 6088: loss 2.3864, time 5250.15ms 
iter 6089: loss 2.7068, time 5266.90ms 
iter 6090: loss 2.4078, time 5259.26ms 
iter 6091: loss 2.6316, time 5254.86ms 
iter 6092: loss 2.5751, time 5260.40ms 
iter 6093: loss 2.5700, time 5251.71ms 
iter 6094: loss 2.5249, time 5268.74ms 
iter 6095: loss 2.3155, time 5250.88ms 
iter 6096: loss 2.5341, time 5252.35ms 
iter 6097: loss 2.6292, time 5251.45ms 
iter 6098: loss 2.6550, time 5230.13ms 
iter 6099: loss 2.3070, time 5258.77ms 
step 6100: train loss 2.5516, val loss 2.8376
iter 6100: loss 2.5855, time 20014.74ms 
iter 6101: loss 2.4967, time 5252.08ms 
iter 6102: loss 2.5098, time 5253.25ms 
iter 6103: loss 2.4948, time 5252.48ms 
iter 6104: loss 2.5909, time 5252.16ms 
iter 6105: loss 2.4113, time 5260.68ms 
iter 6106: loss 2.5803, time 5256.76ms 
iter 6107: loss 2.3770, time 5252.58ms 
iter 6108: loss 2.3812, time 5253.02ms 
iter 6109: loss 2.6604, time 5252.83ms 
iter 6110: loss 2.3369, time 5261.02ms 
iter 6111: loss 2.7897, time 5255.60ms 
iter 6112: loss 2.4767, time 5221.06ms 
iter 6113: loss 2.5211, time 5246.55ms 
iter 6114: loss 2.4625, time 5244.90ms 
iter 6115: loss 2.4513, time 5260.22ms 
iter 6116: loss 2.6833, time 5264.30ms 
iter 6117: loss 2.6124, time 5297.22ms 
iter 6118: loss 2.5383, time 5275.06ms 
iter 6119: loss 2.3861, time 5263.44ms 
iter 6120: loss 2.5364, time 5326.95ms 
iter 6121: loss 2.4937, time 5296.66ms 
iter 6122: loss 2.5752, time 5332.49ms 
iter 6123: loss 2.3711, time 5336.12ms 
iter 6124: loss 2.6248, time 5290.18ms 
iter 6125: loss 2.4584, time 5314.40ms 
iter 6126: loss 2.4228, time 5267.70ms 
iter 6127: loss 2.5411, time 5259.48ms 
iter 6128: loss 2.6058, time 5251.80ms 
iter 6129: loss 2.5130, time 5268.88ms 
iter 6130: loss 2.6449, time 5256.28ms 
iter 6131: loss 2.5423, time 5310.55ms 
iter 6132: loss 2.5539, time 5273.51ms 
iter 6133: loss 2.4900, time 5252.30ms 
iter 6134: loss 2.5688, time 5249.27ms 
iter 6135: loss 2.5236, time 5257.07ms 
iter 6136: loss 2.4008, time 5261.20ms 
iter 6137: loss 2.6097, time 5256.29ms 
iter 6138: loss 2.4316, time 5269.41ms 
iter 6139: loss 2.6924, time 5318.51ms 
iter 6140: loss 2.5069, time 5298.66ms 
iter 6141: loss 2.5860, time 5271.50ms 
iter 6142: loss 2.4911, time 5256.02ms 
iter 6143: loss 2.6283, time 5256.42ms 
iter 6144: loss 2.4579, time 5274.87ms 
iter 6145: loss 2.3851, time 5267.54ms 
iter 6146: loss 2.5009, time 5263.15ms 
iter 6147: loss 2.4357, time 5256.28ms 
iter 6148: loss 2.5210, time 5259.84ms 
iter 6149: loss 2.4384, time 5259.10ms 
step 6150: train loss 2.5506, val loss 2.8381
iter 6150: loss 2.5322, time 20009.48ms 
iter 6151: loss 2.7074, time 5273.93ms 
iter 6152: loss 2.4932, time 5266.89ms 
iter 6153: loss 2.4333, time 5256.02ms 
iter 6154: loss 2.6672, time 5263.81ms 
iter 6155: loss 2.5114, time 5270.27ms 
iter 6156: loss 2.3316, time 5272.50ms 
iter 6157: loss 2.4264, time 5256.94ms 
iter 6158: loss 2.4497, time 5251.95ms 
iter 6159: loss 2.5744, time 5264.43ms 
iter 6160: loss 2.5369, time 5266.64ms 
iter 6161: loss 2.4158, time 5258.11ms 
iter 6162: loss 2.4471, time 5252.81ms 
iter 6163: loss 2.6017, time 5253.88ms 
iter 6164: loss 2.6286, time 5265.83ms 
iter 6165: loss 2.7229, time 5266.77ms 
iter 6166: loss 2.4726, time 5263.35ms 
iter 6167: loss 2.5982, time 5262.00ms 
iter 6168: loss 2.6106, time 5271.79ms 
iter 6169: loss 2.3516, time 5271.92ms 
iter 6170: loss 2.5270, time 5232.19ms 
iter 6171: loss 2.3740, time 5266.82ms 
iter 6172: loss 2.4471, time 5270.06ms 
iter 6173: loss 2.4612, time 5263.82ms 
iter 6174: loss 2.5097, time 5247.89ms 
iter 6175: loss 2.6880, time 5266.05ms 
iter 6176: loss 2.4633, time 5296.13ms 
iter 6177: loss 2.5231, time 5315.56ms 
iter 6178: loss 2.4043, time 5305.61ms 
iter 6179: loss 2.6014, time 5252.94ms 
iter 6180: loss 2.4893, time 5264.66ms 
iter 6181: loss 2.5190, time 5261.27ms 
iter 6182: loss 2.6313, time 5261.37ms 
iter 6183: loss 2.7715, time 5253.88ms 
iter 6184: loss 2.4706, time 5253.21ms 
iter 6185: loss 2.5968, time 5262.25ms 
iter 6186: loss 2.5244, time 5254.24ms 
iter 6187: loss 2.6076, time 5254.75ms 
iter 6188: loss 2.6323, time 5250.26ms 
iter 6189: loss 2.5169, time 5255.88ms 
iter 6190: loss 2.4467, time 5258.26ms 
iter 6191: loss 2.5171, time 5252.82ms 
iter 6192: loss 2.5463, time 5246.81ms 
iter 6193: loss 2.6835, time 5232.95ms 
iter 6194: loss 2.4126, time 5257.99ms 
iter 6195: loss 2.5749, time 5255.66ms 
iter 6196: loss 2.7674, time 5251.63ms 
iter 6197: loss 2.6468, time 5257.46ms 
iter 6198: loss 2.4495, time 5248.44ms 
iter 6199: loss 2.4466, time 5258.59ms 
step 6200: train loss 2.5530, val loss 2.8414
iter 6200: loss 2.8030, time 19981.99ms 
iter 6201: loss 2.2911, time 5251.13ms 
iter 6202: loss 2.5900, time 5251.12ms 
iter 6203: loss 2.5977, time 5256.92ms 
iter 6204: loss 2.6131, time 5265.16ms 
iter 6205: loss 2.3445, time 5262.00ms 
iter 6206: loss 2.5761, time 5261.18ms 
iter 6207: loss 2.3722, time 5280.03ms 
iter 6208: loss 2.4188, time 5274.64ms 
iter 6209: loss 2.4164, time 5285.23ms 
iter 6210: loss 2.5678, time 5253.07ms 
iter 6211: loss 2.5424, time 5251.08ms 
iter 6212: loss 2.3235, time 5250.03ms 
iter 6213: loss 2.3957, time 5261.75ms 
iter 6214: loss 2.3912, time 5250.08ms 
iter 6215: loss 2.4697, time 5249.28ms 
iter 6216: loss 2.4664, time 5248.89ms 
iter 6217: loss 2.8077, time 5260.81ms 
iter 6218: loss 2.4087, time 5259.51ms 
iter 6219: loss 2.4439, time 5251.15ms 
iter 6220: loss 2.7213, time 5255.00ms 
iter 6221: loss 2.5076, time 5263.02ms 
iter 6222: loss 2.3535, time 5278.05ms 
iter 6223: loss 2.5142, time 5249.88ms 
iter 6224: loss 2.5036, time 5264.33ms 
iter 6225: loss 2.4233, time 5270.18ms 
iter 6226: loss 2.3562, time 5273.70ms 
iter 6227: loss 2.7682, time 5274.32ms 
iter 6228: loss 2.3750, time 5266.20ms 
iter 6229: loss 2.7630, time 5249.90ms 
iter 6230: loss 2.5600, time 5271.37ms 
iter 6231: loss 2.6832, time 5269.04ms 
iter 6232: loss 2.4529, time 5263.86ms 
iter 6233: loss 2.2762, time 5261.12ms 
iter 6234: loss 2.3979, time 5261.29ms 
iter 6235: loss 2.5527, time 5276.35ms 
iter 6236: loss 2.5426, time 5266.10ms 
iter 6237: loss 2.5725, time 5264.28ms 
iter 6238: loss 2.2998, time 5263.40ms 
iter 6239: loss 2.5599, time 5266.75ms 
iter 6240: loss 2.5563, time 5279.09ms 
iter 6241: loss 2.4726, time 5270.99ms 
iter 6242: loss 2.5140, time 5263.08ms 
iter 6243: loss 2.4941, time 5247.77ms 
iter 6244: loss 2.7261, time 5262.19ms 
iter 6245: loss 2.4399, time 5258.15ms 
iter 6246: loss 2.4479, time 5232.27ms 
iter 6247: loss 2.3830, time 5250.53ms 
iter 6248: loss 2.5951, time 5247.68ms 
iter 6249: loss 2.6269, time 5218.01ms 
step 6250: train loss 2.5401, val loss 2.8231
iter 6250: loss 2.4005, time 19801.12ms 
iter 6251: loss 2.6238, time 5211.85ms 
iter 6252: loss 2.6044, time 5214.98ms 
iter 6253: loss 2.7546, time 5209.33ms 
iter 6254: loss 2.7100, time 5225.18ms 
iter 6255: loss 2.4433, time 5214.22ms 
iter 6256: loss 2.2838, time 5212.59ms 
iter 6257: loss 2.4706, time 5210.71ms 
iter 6258: loss 2.4522, time 5218.31ms 
iter 6259: loss 2.4760, time 5218.51ms 
iter 6260: loss 2.8241, time 5210.18ms 
iter 6261: loss 2.5323, time 5212.53ms 
iter 6262: loss 2.5066, time 5211.54ms 
iter 6263: loss 2.5658, time 5208.73ms 
iter 6264: loss 2.4257, time 5202.76ms 
iter 6265: loss 2.6365, time 5211.71ms 
iter 6266: loss 2.3372, time 5203.42ms 
iter 6267: loss 2.4918, time 5216.16ms 
iter 6268: loss 2.4788, time 5209.58ms 
iter 6269: loss 2.6337, time 5211.46ms 
iter 6270: loss 2.5305, time 5211.76ms 
iter 6271: loss 2.6388, time 5211.36ms 
iter 6272: loss 2.4737, time 5217.84ms 
iter 6273: loss 2.5480, time 5214.88ms 
iter 6274: loss 2.5573, time 5210.23ms 
iter 6275: loss 2.4985, time 5207.75ms 
iter 6276: loss 2.4810, time 5211.91ms 
iter 6277: loss 2.5212, time 5219.52ms 
iter 6278: loss 2.7147, time 5219.03ms 
iter 6279: loss 2.4009, time 5210.68ms 
iter 6280: loss 2.6402, time 5208.26ms 
iter 6281: loss 2.4048, time 5209.93ms 
iter 6282: loss 2.3826, time 5219.13ms 
iter 6283: loss 2.6172, time 5221.27ms 
iter 6284: loss 2.3022, time 5212.56ms 
iter 6285: loss 2.6626, time 5211.20ms 
iter 6286: loss 2.5584, time 5211.29ms 
iter 6287: loss 2.6976, time 5217.03ms 
iter 6288: loss 2.6202, time 5220.52ms 
iter 6289: loss 2.5069, time 5208.89ms 
iter 6290: loss 2.3270, time 5212.58ms 
iter 6291: loss 2.5781, time 5210.97ms 
iter 6292: loss 2.4209, time 5260.88ms 
iter 6293: loss 2.4733, time 5260.44ms 
iter 6294: loss 2.4100, time 5252.28ms 
iter 6295: loss 2.6906, time 5252.61ms 
iter 6296: loss 2.4563, time 5266.34ms 
iter 6297: loss 2.7353, time 5262.13ms 
iter 6298: loss 2.5831, time 5257.53ms 
iter 6299: loss 2.8075, time 5253.96ms 
step 6300: train loss 2.5460, val loss 2.8430
iter 6300: loss 2.5498, time 19998.07ms 
iter 6301: loss 2.3349, time 5297.01ms 
iter 6302: loss 2.6203, time 5322.45ms 
iter 6303: loss 2.5501, time 5336.98ms 
iter 6304: loss 2.4930, time 5283.24ms 
iter 6305: loss 2.7301, time 5249.32ms 
iter 6306: loss 2.6914, time 5293.17ms 
iter 6307: loss 2.5912, time 5282.15ms 
iter 6308: loss 2.4311, time 5260.56ms 
iter 6309: loss 2.5016, time 5269.98ms 
iter 6310: loss 2.5170, time 5299.97ms 
iter 6311: loss 2.6436, time 5259.23ms 
iter 6312: loss 2.5761, time 5267.61ms 
iter 6313: loss 2.3714, time 5260.05ms 
iter 6314: loss 2.4784, time 5264.16ms 
iter 6315: loss 2.4458, time 5259.57ms 
iter 6316: loss 2.3838, time 5261.60ms 
iter 6317: loss 2.4027, time 5250.65ms 
iter 6318: loss 2.6435, time 5250.17ms 
iter 6319: loss 2.6845, time 5250.84ms 
iter 6320: loss 2.1492, time 5255.11ms 
iter 6321: loss 2.6076, time 5261.27ms 
iter 6322: loss 2.5696, time 5264.07ms 
iter 6323: loss 2.7183, time 5262.53ms 
iter 6324: loss 2.6669, time 5253.25ms 
iter 6325: loss 2.3816, time 5262.32ms 
iter 6326: loss 2.5262, time 5253.29ms 
iter 6327: loss 2.4493, time 5250.85ms 
iter 6328: loss 2.7518, time 5248.77ms 
iter 6329: loss 2.4475, time 5239.55ms 
iter 6330: loss 2.4155, time 5263.26ms 
iter 6331: loss 2.5697, time 5262.87ms 
iter 6332: loss 2.4185, time 5250.63ms 
iter 6333: loss 2.4904, time 5256.97ms 
iter 6334: loss 2.6960, time 5262.48ms 
iter 6335: loss 2.4410, time 5255.76ms 
iter 6336: loss 2.3491, time 5292.10ms 
iter 6337: loss 2.5610, time 5317.80ms 
iter 6338: loss 2.6907, time 5341.31ms 
iter 6339: loss 2.5854, time 5294.26ms 
iter 6340: loss 2.5994, time 5262.51ms 
iter 6341: loss 2.6537, time 5262.17ms 
iter 6342: loss 2.3987, time 5263.10ms 
iter 6343: loss 2.5304, time 5264.06ms 
iter 6344: loss 2.7073, time 5274.64ms 
iter 6345: loss 2.5951, time 5263.66ms 
iter 6346: loss 2.6235, time 5262.75ms 
iter 6347: loss 2.5205, time 5254.90ms 
iter 6348: loss 2.5263, time 5321.69ms 
iter 6349: loss 2.6676, time 5250.82ms 
step 6350: train loss 2.5332, val loss 2.8398
iter 6350: loss 2.7847, time 20010.11ms 
iter 6351: loss 2.3456, time 5234.32ms 
iter 6352: loss 2.5994, time 5293.21ms 
iter 6353: loss 2.4483, time 5297.08ms 
iter 6354: loss 2.5400, time 5256.74ms 
iter 6355: loss 2.6018, time 5259.42ms 
iter 6356: loss 2.5752, time 5266.65ms 
iter 6357: loss 2.5546, time 5276.15ms 
iter 6358: loss 2.4907, time 5248.19ms 
iter 6359: loss 2.5230, time 5250.18ms 
iter 6360: loss 2.5600, time 5257.10ms 
iter 6361: loss 2.4097, time 5262.66ms 
iter 6362: loss 2.7540, time 5250.71ms 
iter 6363: loss 2.4150, time 5326.07ms 
iter 6364: loss 2.5757, time 5248.46ms 
iter 6365: loss 2.6987, time 5257.93ms 
iter 6366: loss 2.5420, time 5295.55ms 
iter 6367: loss 2.4496, time 5248.67ms 
iter 6368: loss 2.3133, time 5295.68ms 
iter 6369: loss 2.6283, time 5262.64ms 
iter 6370: loss 2.4178, time 5248.05ms 
iter 6371: loss 2.3509, time 5267.37ms 
iter 6372: loss 2.5091, time 5275.70ms 
iter 6373: loss 2.4973, time 5263.86ms 
iter 6374: loss 2.4370, time 5270.14ms 
iter 6375: loss 2.7369, time 5257.15ms 
iter 6376: loss 2.4987, time 5262.65ms 
iter 6377: loss 2.4792, time 5251.66ms 
iter 6378: loss 2.3979, time 5256.94ms 
iter 6379: loss 2.5585, time 5258.78ms 
iter 6380: loss 2.6797, time 5262.58ms 
iter 6381: loss 2.5748, time 5246.86ms 
iter 6382: loss 2.6564, time 5252.56ms 
iter 6383: loss 2.4284, time 5267.25ms 
iter 6384: loss 2.4990, time 5264.02ms 
iter 6385: loss 2.5529, time 5257.56ms 
iter 6386: loss 2.4471, time 5259.45ms 
iter 6387: loss 2.3695, time 5266.84ms 
iter 6388: loss 2.4623, time 5269.05ms 
iter 6389: loss 2.5731, time 5262.90ms 
iter 6390: loss 2.3396, time 5261.68ms 
iter 6391: loss 2.5607, time 5260.79ms 
iter 6392: loss 2.4682, time 5266.75ms 
iter 6393: loss 2.5505, time 5254.79ms 
iter 6394: loss 2.4862, time 5252.90ms 
iter 6395: loss 2.3882, time 5326.51ms 
iter 6396: loss 2.6310, time 5284.08ms 
iter 6397: loss 2.3723, time 5279.45ms 
iter 6398: loss 2.4943, time 5251.25ms 
iter 6399: loss 2.7813, time 5290.43ms 
step 6400: train loss 2.5311, val loss 2.8400
iter 6400: loss 2.4002, time 20047.75ms 
iter 6401: loss 2.4357, time 5281.67ms 
iter 6402: loss 2.4923, time 5323.63ms 
iter 6403: loss 2.5061, time 5297.85ms 
iter 6404: loss 2.5371, time 5254.42ms 
iter 6405: loss 2.6621, time 5249.66ms 
iter 6406: loss 2.4331, time 5261.11ms 
iter 6407: loss 2.9074, time 5330.22ms 
iter 6408: loss 2.4717, time 5258.89ms 
iter 6409: loss 2.6632, time 5247.61ms 
iter 6410: loss 2.4243, time 5252.55ms 
iter 6411: loss 2.4763, time 5256.00ms 
iter 6412: loss 2.4545, time 5242.04ms 
iter 6413: loss 2.7026, time 5247.32ms 
iter 6414: loss 2.6477, time 5251.51ms 
iter 6415: loss 2.7258, time 5257.91ms 
iter 6416: loss 2.7055, time 5270.26ms 
iter 6417: loss 2.4295, time 5248.35ms 
iter 6418: loss 2.3460, time 5248.13ms 
iter 6419: loss 2.3962, time 5249.94ms 
iter 6420: loss 2.6146, time 5260.21ms 
iter 6421: loss 2.6352, time 5263.73ms 
iter 6422: loss 2.5411, time 5254.35ms 
iter 6423: loss 2.5359, time 5261.61ms 
iter 6424: loss 2.6804, time 5254.09ms 
iter 6425: loss 2.5289, time 5261.40ms 
iter 6426: loss 2.5612, time 5254.65ms 
iter 6427: loss 2.4604, time 5258.77ms 
iter 6428: loss 2.4297, time 5252.05ms 
iter 6429: loss 2.5219, time 5266.22ms 
iter 6430: loss 2.4804, time 5269.71ms 
iter 6431: loss 2.6794, time 5260.57ms 
iter 6432: loss 2.3257, time 5254.17ms 
iter 6433: loss 2.4608, time 5252.42ms 
iter 6434: loss 2.5126, time 5265.17ms 
iter 6435: loss 2.5542, time 5261.86ms 
iter 6436: loss 2.2376, time 5253.41ms 
iter 6437: loss 2.4420, time 5252.65ms 
iter 6438: loss 2.3298, time 5233.36ms 
iter 6439: loss 2.7078, time 5249.65ms 
iter 6440: loss 2.4659, time 5255.47ms 
iter 6441: loss 2.3701, time 5253.05ms 
iter 6442: loss 2.5260, time 5263.86ms 
iter 6443: loss 2.6342, time 5257.86ms 
iter 6444: loss 2.5482, time 5252.37ms 
iter 6445: loss 2.4772, time 5227.63ms 
iter 6446: loss 2.5090, time 5260.29ms 
iter 6447: loss 2.4887, time 5242.53ms 
iter 6448: loss 2.5514, time 5258.34ms 
iter 6449: loss 2.2913, time 5256.30ms 
step 6450: train loss 2.5141, val loss 2.8171
iter 6450: loss 2.7785, time 20075.54ms 
iter 6451: loss 2.5588, time 5265.66ms 
iter 6452: loss 2.6185, time 5259.08ms 
iter 6453: loss 2.5189, time 5253.53ms 
iter 6454: loss 2.5770, time 5219.56ms 
iter 6455: loss 2.5023, time 5330.04ms 
iter 6456: loss 2.4887, time 5323.16ms 
iter 6457: loss 2.4598, time 5267.23ms 
iter 6458: loss 2.5775, time 5318.58ms 
iter 6459: loss 2.3791, time 5260.37ms 
iter 6460: loss 2.3504, time 5254.37ms 
iter 6461: loss 2.4761, time 5255.25ms 
iter 6462: loss 2.5209, time 5250.19ms 
iter 6463: loss 2.5612, time 5263.65ms 
iter 6464: loss 2.4771, time 5260.62ms 
iter 6465: loss 2.6676, time 5255.70ms 
iter 6466: loss 2.6066, time 5252.29ms 
iter 6467: loss 2.5245, time 5287.70ms 
iter 6468: loss 2.6382, time 5269.83ms 
iter 6469: loss 2.6753, time 5258.65ms 
iter 6470: loss 2.4584, time 5249.49ms 
iter 6471: loss 2.6678, time 5321.66ms 
iter 6472: loss 2.4429, time 5345.01ms 
iter 6473: loss 2.5348, time 5260.99ms 
iter 6474: loss 2.2942, time 5250.19ms 
iter 6475: loss 2.4712, time 5248.03ms 
iter 6476: loss 2.4042, time 5265.07ms 
iter 6477: loss 2.4979, time 5253.72ms 
iter 6478: loss 2.4735, time 5248.17ms 
iter 6479: loss 2.5042, time 5259.96ms 
iter 6480: loss 2.5350, time 5245.01ms 
iter 6481: loss 2.3431, time 5264.06ms 
iter 6482: loss 2.5789, time 5237.08ms 
iter 6483: loss 2.5773, time 5250.14ms 
iter 6484: loss 2.6006, time 5244.63ms 
iter 6485: loss 2.6331, time 5263.67ms 
iter 6486: loss 2.5478, time 5254.94ms 
iter 6487: loss 2.4576, time 5222.09ms 
iter 6488: loss 2.4580, time 5237.86ms 
iter 6489: loss 2.6937, time 5267.45ms 
iter 6490: loss 2.7742, time 5258.60ms 
iter 6491: loss 2.6095, time 5251.31ms 
iter 6492: loss 2.4136, time 5220.97ms 
iter 6493: loss 2.4263, time 5253.64ms 
iter 6494: loss 2.3500, time 5264.97ms 
iter 6495: loss 2.4780, time 5250.66ms 
iter 6496: loss 2.5346, time 5249.60ms 
iter 6497: loss 2.6148, time 5269.20ms 
iter 6498: loss 2.6372, time 5335.05ms 
iter 6499: loss 2.3601, time 5291.02ms 
step 6500: train loss 2.5177, val loss 2.8341
iter 6500: loss 2.3945, time 20085.03ms 
iter 6501: loss 2.5432, time 5266.81ms 
iter 6502: loss 2.5132, time 5255.31ms 
iter 6503: loss 2.4273, time 5267.25ms 
iter 6504: loss 2.4203, time 5285.12ms 
iter 6505: loss 2.2743, time 5254.93ms 
iter 6506: loss 2.6627, time 5258.67ms 
iter 6507: loss 2.5534, time 5266.15ms 
iter 6508: loss 2.4842, time 5180.21ms 
iter 6509: loss 2.3629, time 5255.55ms 
iter 6510: loss 2.6252, time 5255.73ms 
iter 6511: loss 2.4770, time 5251.73ms 
iter 6512: loss 2.4754, time 5262.09ms 
iter 6513: loss 2.5316, time 5267.05ms 
iter 6514: loss 2.5905, time 5257.76ms 
iter 6515: loss 2.6497, time 5249.62ms 
iter 6516: loss 2.3012, time 5257.56ms 
iter 6517: loss 2.4037, time 5261.79ms 
iter 6518: loss 2.5009, time 5299.33ms 
iter 6519: loss 2.5953, time 5292.01ms 
iter 6520: loss 2.6297, time 5337.52ms 
iter 6521: loss 2.6219, time 5323.45ms 
iter 6522: loss 2.5414, time 5344.83ms 
iter 6523: loss 2.4803, time 5323.54ms 
iter 6524: loss 2.5543, time 5332.29ms 
iter 6525: loss 2.5434, time 5310.58ms 
iter 6526: loss 2.3759, time 5269.12ms 
iter 6527: loss 2.4793, time 5255.46ms 
iter 6528: loss 2.6604, time 5320.13ms 
iter 6529: loss 2.7341, time 5280.52ms 
iter 6530: loss 2.4581, time 5253.21ms 
iter 6531: loss 2.5303, time 5237.55ms 
iter 6532: loss 2.8211, time 5304.12ms 
iter 6533: loss 2.5376, time 5334.41ms 
iter 6534: loss 2.5555, time 5314.20ms 
iter 6535: loss 2.5189, time 5259.53ms 
iter 6536: loss 2.5244, time 5268.80ms 
iter 6537: loss 2.4732, time 5258.46ms 
iter 6538: loss 2.5711, time 5240.84ms 
iter 6539: loss 2.5255, time 5260.50ms 
iter 6540: loss 2.3819, time 5251.55ms 
iter 6541: loss 2.3829, time 5259.96ms 
iter 6542: loss 2.4827, time 5232.53ms 
iter 6543: loss 2.3947, time 5232.10ms 
iter 6544: loss 2.5922, time 5243.10ms 
iter 6545: loss 2.4055, time 5241.35ms 
iter 6546: loss 2.5712, time 5241.39ms 
iter 6547: loss 2.5629, time 5245.52ms 
iter 6548: loss 2.6294, time 5151.91ms 
iter 6549: loss 2.5061, time 5229.20ms 
step 6550: train loss 2.5240, val loss 2.8271
iter 6550: loss 2.3003, time 20005.92ms 
iter 6551: loss 2.4040, time 5170.02ms 
iter 6552: loss 2.3492, time 5208.00ms 
iter 6553: loss 2.4159, time 5132.66ms 
iter 6554: loss 2.5794, time 5221.34ms 
iter 6555: loss 2.2792, time 5158.42ms 
iter 6556: loss 2.3499, time 5259.78ms 
iter 6557: loss 2.5109, time 5319.81ms 
iter 6558: loss 2.7019, time 5318.54ms 
iter 6559: loss 2.3186, time 5275.92ms 
iter 6560: loss 2.7571, time 5219.88ms 
iter 6561: loss 2.5317, time 5255.53ms 
iter 6562: loss 2.5719, time 5248.38ms 
iter 6563: loss 2.4383, time 5253.89ms 
iter 6564: loss 2.6517, time 5272.20ms 
iter 6565: loss 2.5590, time 5247.88ms 
iter 6566: loss 2.2043, time 5259.71ms 
iter 6567: loss 2.4856, time 5229.03ms 
iter 6568: loss 2.2498, time 5249.52ms 
iter 6569: loss 2.5508, time 5255.93ms 
iter 6570: loss 2.3531, time 5250.78ms 
iter 6571: loss 2.4675, time 5254.15ms 
iter 6572: loss 2.6576, time 5249.18ms 
iter 6573: loss 2.5236, time 5221.10ms 
iter 6574: loss 2.4308, time 5243.81ms 
iter 6575: loss 2.4386, time 5258.19ms 
iter 6576: loss 2.3944, time 5247.41ms 
iter 6577: loss 2.4957, time 5251.49ms 
iter 6578: loss 2.3215, time 5249.66ms 
iter 6579: loss 2.3598, time 5257.18ms 
iter 6580: loss 2.5209, time 5251.11ms 
iter 6581: loss 2.6066, time 5250.66ms 
iter 6582: loss 2.7933, time 5247.40ms 
iter 6583: loss 2.4742, time 5257.59ms 
iter 6584: loss 2.5015, time 5231.18ms 
iter 6585: loss 2.5893, time 5254.53ms 
iter 6586: loss 2.5333, time 5250.13ms 
iter 6587: loss 2.6361, time 5247.56ms 
iter 6588: loss 2.5196, time 5256.02ms 
iter 6589: loss 2.6929, time 5235.57ms 
iter 6590: loss 2.5810, time 5247.45ms 
iter 6591: loss 2.4704, time 5249.82ms 
iter 6592: loss 2.4770, time 5263.97ms 
iter 6593: loss 2.3276, time 5212.28ms 
iter 6594: loss 2.3929, time 5244.19ms 
iter 6595: loss 2.7084, time 5251.85ms 
iter 6596: loss 2.7802, time 5253.26ms 
iter 6597: loss 2.2982, time 5255.41ms 
iter 6598: loss 2.4653, time 5236.48ms 
iter 6599: loss 2.2896, time 5244.66ms 
step 6600: train loss 2.5198, val loss 2.8286
iter 6600: loss 2.3891, time 19993.34ms 
iter 6601: loss 2.5476, time 5260.97ms 
iter 6602: loss 2.4217, time 5256.80ms 
iter 6603: loss 2.7239, time 5245.39ms 
iter 6604: loss 2.7083, time 5249.66ms 
iter 6605: loss 2.4749, time 5253.60ms 
iter 6606: loss 2.6686, time 5254.02ms 
iter 6607: loss 2.4200, time 5246.58ms 
iter 6608: loss 2.4599, time 5241.44ms 
iter 6609: loss 2.5360, time 5274.77ms 
iter 6610: loss 2.4045, time 5103.41ms 
iter 6611: loss 2.2114, time 5055.79ms 
iter 6612: loss 2.3470, time 5284.59ms 
iter 6613: loss 2.5092, time 5249.12ms 
iter 6614: loss 2.5550, time 5248.50ms 
iter 6615: loss 2.4584, time 5264.82ms 
iter 6616: loss 2.3151, time 5258.66ms 
iter 6617: loss 2.3438, time 5244.10ms 
iter 6618: loss 2.7124, time 5140.46ms 
iter 6619: loss 2.6059, time 5145.67ms 
iter 6620: loss 2.4842, time 5274.48ms 
iter 6621: loss 2.2225, time 5243.18ms 
iter 6622: loss 2.4896, time 5217.20ms 
iter 6623: loss 2.5564, time 5235.18ms 
iter 6624: loss 2.5108, time 5188.79ms 
iter 6625: loss 2.3918, time 5149.67ms 
iter 6626: loss 2.5487, time 5182.39ms 
iter 6627: loss 2.4899, time 5195.22ms 
iter 6628: loss 2.3802, time 5038.49ms 
iter 6629: loss 2.4715, time 5046.34ms 
iter 6630: loss 2.6466, time 5288.06ms 
iter 6631: loss 2.5146, time 5152.33ms 
iter 6632: loss 2.5700, time 5015.97ms 
iter 6633: loss 2.4965, time 5163.39ms 
iter 6634: loss 2.4631, time 5253.97ms 
iter 6635: loss 2.4450, time 5083.02ms 
iter 6636: loss 2.2761, time 5084.48ms 
iter 6637: loss 2.5837, time 5221.21ms 
iter 6638: loss 2.3868, time 5219.35ms 
iter 6639: loss 2.4623, time 5294.64ms 
iter 6640: loss 2.6237, time 5256.71ms 
iter 6641: loss 2.5782, time 5066.28ms 
iter 6642: loss 2.5580, time 5051.28ms 
iter 6643: loss 2.4716, time 5043.97ms 
iter 6644: loss 2.7440, time 5039.81ms 
iter 6645: loss 2.4574, time 5047.16ms 
iter 6646: loss 2.2582, time 5131.84ms 
iter 6647: loss 2.6783, time 5046.40ms 
iter 6648: loss 2.4894, time 5037.10ms 
iter 6649: loss 2.4695, time 5129.57ms 
step 6650: train loss 2.5142, val loss 2.8408
iter 6650: loss 2.5853, time 19961.16ms 
iter 6651: loss 2.5589, time 5250.18ms 
iter 6652: loss 2.7595, time 5224.03ms 
iter 6653: loss 2.6286, time 5064.98ms 
iter 6654: loss 2.2920, time 5185.07ms 
iter 6655: loss 2.6135, time 5261.86ms 
iter 6656: loss 2.7657, time 5253.19ms 
iter 6657: loss 2.4868, time 5272.54ms 
iter 6658: loss 2.4942, time 5052.22ms 
iter 6659: loss 2.6009, time 5047.83ms 
iter 6660: loss 2.7163, time 5249.90ms 
iter 6661: loss 2.4343, time 5243.01ms 
iter 6662: loss 2.6238, time 5230.06ms 
iter 6663: loss 2.5225, time 5176.64ms 
iter 6664: loss 2.5541, time 5215.21ms 
iter 6665: loss 2.4158, time 5157.75ms 
iter 6666: loss 2.5465, time 5229.90ms 
iter 6667: loss 2.6152, time 5205.77ms 
iter 6668: loss 2.5794, time 5164.10ms 
iter 6669: loss 2.4295, time 5199.09ms 
iter 6670: loss 2.4933, time 5216.69ms 
iter 6671: loss 2.5272, time 5252.95ms 
iter 6672: loss 2.2912, time 5252.00ms 
iter 6673: loss 2.4443, time 5257.54ms 
iter 6674: loss 2.4841, time 5255.89ms 
iter 6675: loss 2.3776, time 5312.03ms 
iter 6676: loss 2.6132, time 5161.06ms 
iter 6677: loss 2.5903, time 5140.46ms 
iter 6678: loss 2.5058, time 5141.98ms 
iter 6679: loss 2.6660, time 5222.63ms 
iter 6680: loss 2.5414, time 5252.68ms 
iter 6681: loss 2.3689, time 5255.73ms 
iter 6682: loss 2.2073, time 5300.61ms 
iter 6683: loss 2.3228, time 5245.39ms 
iter 6684: loss 2.6721, time 5264.37ms 
iter 6685: loss 2.2752, time 5058.47ms 
iter 6686: loss 2.4715, time 5085.63ms 
iter 6687: loss 2.6600, time 5035.38ms 
iter 6688: loss 2.4857, time 5094.79ms 
iter 6689: loss 2.6525, time 5073.14ms 
iter 6690: loss 2.8526, time 5105.23ms 
iter 6691: loss 2.7146, time 5058.29ms 
iter 6692: loss 2.6963, time 5060.72ms 
iter 6693: loss 2.3596, time 5054.33ms 
iter 6694: loss 2.2024, time 5159.44ms 
iter 6695: loss 2.4330, time 5072.51ms 
iter 6696: loss 2.3498, time 5043.48ms 
iter 6697: loss 2.5295, time 5131.58ms 
iter 6698: loss 2.3034, time 5271.78ms 
iter 6699: loss 2.4763, time 5246.54ms 
step 6700: train loss 2.5120, val loss 2.8416
iter 6700: loss 2.5694, time 20056.88ms 
iter 6701: loss 2.6523, time 5210.40ms 
iter 6702: loss 2.1586, time 5229.46ms 
iter 6703: loss 2.3718, time 5205.94ms 
iter 6704: loss 2.6094, time 5220.37ms 
iter 6705: loss 2.4776, time 5219.03ms 
iter 6706: loss 2.5176, time 5171.72ms 
iter 6707: loss 2.5053, time 5252.01ms 
iter 6708: loss 2.9331, time 5109.23ms 
iter 6709: loss 2.8009, time 5052.38ms 
iter 6710: loss 2.5721, time 5060.83ms 
iter 6711: loss 2.4851, time 5189.32ms 
iter 6712: loss 2.4977, time 5259.42ms 
iter 6713: loss 2.4912, time 5263.42ms 
iter 6714: loss 2.7207, time 5256.50ms 
iter 6715: loss 2.4780, time 5072.57ms 
iter 6716: loss 2.3069, time 5172.53ms 
iter 6717: loss 2.3389, time 5259.75ms 
iter 6718: loss 2.5668, time 5063.22ms 
iter 6719: loss 2.5966, time 5052.26ms 
iter 6720: loss 2.5958, time 5040.46ms 
iter 6721: loss 2.6877, time 5049.63ms 
iter 6722: loss 2.4777, time 5052.69ms 
iter 6723: loss 2.5844, time 5097.67ms 
iter 6724: loss 2.4848, time 5258.94ms 
iter 6725: loss 2.3247, time 5255.79ms 
iter 6726: loss 2.4880, time 5256.06ms 
iter 6727: loss 2.5840, time 5275.67ms 
iter 6728: loss 2.4558, time 5265.75ms 
iter 6729: loss 2.4959, time 5259.57ms 
iter 6730: loss 2.4082, time 5269.32ms 
iter 6731: loss 2.4361, time 5254.92ms 
iter 6732: loss 2.2846, time 5261.11ms 
iter 6733: loss 2.4052, time 5263.06ms 
iter 6734: loss 2.5159, time 5247.56ms 
iter 6735: loss 2.5021, time 5143.98ms 
iter 6736: loss 2.5283, time 5218.95ms 
iter 6737: loss 2.5563, time 5217.66ms 
iter 6738: loss 2.4138, time 5216.19ms 
iter 6739: loss 2.3416, time 5211.47ms 
iter 6740: loss 2.5244, time 5197.91ms 
iter 6741: loss 2.4804, time 5162.95ms 
iter 6742: loss 2.6348, time 5186.47ms 
iter 6743: loss 2.5201, time 5251.16ms 
iter 6744: loss 2.4273, time 5236.11ms 
iter 6745: loss 2.4113, time 5233.99ms 
iter 6746: loss 2.1896, time 5248.12ms 
iter 6747: loss 2.3623, time 5245.59ms 
iter 6748: loss 2.7184, time 5175.95ms 
iter 6749: loss 2.4304, time 5050.61ms 
step 6750: train loss 2.5198, val loss 2.8341
iter 6750: loss 2.6062, time 19808.99ms 
iter 6751: loss 2.3564, time 5052.07ms 
iter 6752: loss 2.4698, time 5209.73ms 
iter 6753: loss 2.5003, time 5275.49ms 
iter 6754: loss 2.5652, time 5256.09ms 
iter 6755: loss 2.5408, time 5227.99ms 
iter 6756: loss 2.5969, time 5258.70ms 
iter 6757: loss 2.5180, time 5205.48ms 
iter 6758: loss 2.5832, time 5227.99ms 
iter 6759: loss 2.4632, time 5263.14ms 
iter 6760: loss 2.5577, time 5245.19ms 
iter 6761: loss 2.4165, time 5236.15ms 
iter 6762: loss 2.4491, time 5260.44ms 
iter 6763: loss 2.5391, time 5256.92ms 
iter 6764: loss 2.7145, time 5277.34ms 
iter 6765: loss 2.7226, time 5252.97ms 
iter 6766: loss 2.6139, time 5249.50ms 
iter 6767: loss 2.4944, time 5255.59ms 
iter 6768: loss 2.5438, time 5240.98ms 
iter 6769: loss 2.5149, time 5254.66ms 
iter 6770: loss 2.4534, time 5265.28ms 
iter 6771: loss 2.3767, time 5266.32ms 
iter 6772: loss 2.7009, time 5324.33ms 
iter 6773: loss 2.3923, time 5324.72ms 
iter 6774: loss 2.3935, time 5279.11ms 
iter 6775: loss 2.4619, time 5289.87ms 
iter 6776: loss 2.4051, time 5255.29ms 
iter 6777: loss 2.6208, time 5238.06ms 
iter 6778: loss 2.5037, time 5247.45ms 
iter 6779: loss 2.6406, time 5246.14ms 
iter 6780: loss 2.4733, time 5251.00ms 
iter 6781: loss 2.7055, time 5265.59ms 
iter 6782: loss 2.6278, time 5247.96ms 
iter 6783: loss 2.6065, time 5254.11ms 
iter 6784: loss 2.6218, time 5249.49ms 
iter 6785: loss 2.4499, time 5270.50ms 
iter 6786: loss 2.7021, time 5227.40ms 
iter 6787: loss 2.3980, time 5245.54ms 
iter 6788: loss 2.5057, time 5247.27ms 
iter 6789: loss 2.4595, time 5265.28ms 
iter 6790: loss 2.2740, time 5263.38ms 
iter 6791: loss 2.6991, time 5259.28ms 
iter 6792: loss 2.5824, time 5259.64ms 
iter 6793: loss 2.6271, time 5254.77ms 
iter 6794: loss 2.5005, time 5256.54ms 
iter 6795: loss 2.4000, time 5308.75ms 
iter 6796: loss 2.6461, time 5179.59ms 
iter 6797: loss 2.2652, time 5263.76ms 
iter 6798: loss 2.4554, time 5286.91ms 
iter 6799: loss 2.5585, time 5318.14ms 
step 6800: train loss 2.5074, val loss 2.8286
iter 6800: loss 2.5761, time 19998.14ms 
iter 6801: loss 2.4897, time 5289.90ms 
iter 6802: loss 2.3813, time 5215.83ms 
iter 6803: loss 2.3132, time 5305.12ms 
iter 6804: loss 2.5795, time 5326.54ms 
iter 6805: loss 2.5819, time 5281.75ms 
iter 6806: loss 2.5319, time 5051.62ms 
iter 6807: loss 2.6002, time 5066.45ms 
iter 6808: loss 2.4198, time 5053.51ms 
iter 6809: loss 2.4957, time 5048.75ms 
iter 6810: loss 2.2443, time 5055.10ms 
iter 6811: loss 2.5231, time 5043.03ms 
iter 6812: loss 2.5909, time 5250.07ms 
iter 6813: loss 2.6385, time 5225.11ms 
iter 6814: loss 2.5315, time 5051.08ms 
iter 6815: loss 2.5727, time 5062.70ms 
iter 6816: loss 2.5386, time 5264.30ms 
iter 6817: loss 2.6652, time 5269.38ms 
iter 6818: loss 2.6052, time 5250.87ms 
iter 6819: loss 2.2989, time 5266.36ms 
iter 6820: loss 2.5223, time 5041.39ms 
iter 6821: loss 2.6254, time 5051.89ms 
iter 6822: loss 2.5824, time 5070.18ms 
iter 6823: loss 2.2614, time 5285.71ms 
iter 6824: loss 2.8067, time 5325.77ms 
iter 6825: loss 2.6497, time 5305.73ms 
iter 6826: loss 2.5168, time 5167.54ms 
iter 6827: loss 2.5673, time 5256.99ms 
iter 6828: loss 2.6544, time 5039.40ms 
iter 6829: loss 2.5360, time 5045.52ms 
iter 6830: loss 2.5740, time 5124.46ms 
iter 6831: loss 2.4376, time 5181.98ms 
iter 6832: loss 2.5617, time 5061.33ms 
iter 6833: loss 2.6724, time 5078.46ms 
iter 6834: loss 2.4354, time 5052.21ms 
iter 6835: loss 2.4826, time 5204.36ms 
iter 6836: loss 2.6707, time 5317.13ms 
iter 6837: loss 2.4469, time 5310.50ms 
iter 6838: loss 2.4375, time 5303.54ms 
iter 6839: loss 2.6499, time 5304.70ms 
iter 6840: loss 2.7539, time 5271.46ms 
iter 6841: loss 2.5991, time 5238.35ms 
iter 6842: loss 2.6063, time 5253.03ms 
iter 6843: loss 2.4267, time 5256.48ms 
iter 6844: loss 2.5830, time 5271.07ms 
iter 6845: loss 2.6659, time 5258.53ms 
iter 6846: loss 2.3027, time 5251.96ms 
iter 6847: loss 2.4232, time 5254.89ms 
iter 6848: loss 2.4856, time 5258.48ms 
iter 6849: loss 2.5162, time 5257.46ms 
step 6850: train loss 2.4891, val loss 2.8525
iter 6850: loss 2.5803, time 19989.66ms 
iter 6851: loss 2.4294, time 5259.71ms 
iter 6852: loss 2.5391, time 5250.39ms 
iter 6853: loss 2.3508, time 5232.60ms 
iter 6854: loss 2.5548, time 5231.19ms 
iter 6855: loss 2.6736, time 5222.75ms 
iter 6856: loss 2.4598, time 5210.87ms 
iter 6857: loss 2.6827, time 5205.27ms 
iter 6858: loss 2.3306, time 5207.93ms 
iter 6859: loss 2.3486, time 5217.48ms 
iter 6860: loss 2.5155, time 5200.34ms 
iter 6861: loss 2.5179, time 5225.95ms 
iter 6862: loss 2.5138, time 5260.84ms 
iter 6863: loss 2.5901, time 5264.15ms 
iter 6864: loss 2.4314, time 5259.92ms 
iter 6865: loss 2.7258, time 5260.70ms 
iter 6866: loss 2.5887, time 5070.54ms 
iter 6867: loss 2.3830, time 5074.14ms 
iter 6868: loss 2.5996, time 5046.95ms 
iter 6869: loss 2.7349, time 5167.44ms 
iter 6870: loss 2.3784, time 5259.67ms 
iter 6871: loss 2.6868, time 5275.27ms 
iter 6872: loss 2.5279, time 5189.08ms 
iter 6873: loss 2.3633, time 5209.78ms 
iter 6874: loss 2.5576, time 5205.97ms 
iter 6875: loss 2.4080, time 5156.20ms 
iter 6876: loss 2.6712, time 5192.65ms 
iter 6877: loss 2.5763, time 5208.19ms 
iter 6878: loss 2.6744, time 5161.67ms 
iter 6879: loss 2.7133, time 5092.37ms 
iter 6880: loss 2.3751, time 5249.43ms 
iter 6881: loss 2.3149, time 5237.83ms 
iter 6882: loss 2.5702, time 5268.17ms 
iter 6883: loss 2.4750, time 5134.77ms 
iter 6884: loss 2.5372, time 5059.63ms 
iter 6885: loss 2.5450, time 5100.24ms 
iter 6886: loss 2.4739, time 5264.29ms 
iter 6887: loss 2.4712, time 5254.60ms 
iter 6888: loss 2.5764, time 5239.91ms 
iter 6889: loss 2.5661, time 5047.76ms 
iter 6890: loss 2.3894, time 5036.25ms 
iter 6891: loss 2.4401, time 5188.67ms 
iter 6892: loss 2.4060, time 5228.36ms 
iter 6893: loss 2.4387, time 5269.90ms 
iter 6894: loss 2.5124, time 5288.66ms 
iter 6895: loss 2.6694, time 5278.86ms 
iter 6896: loss 2.4741, time 5295.13ms 
iter 6897: loss 2.4941, time 5280.47ms 
iter 6898: loss 2.3298, time 5130.33ms 
iter 6899: loss 2.6524, time 5124.41ms 
step 6900: train loss 2.5060, val loss 2.8588
iter 6900: loss 2.5826, time 19916.65ms 
iter 6901: loss 2.3917, time 5229.31ms 
iter 6902: loss 2.7059, time 5251.82ms 
iter 6903: loss 2.6658, time 5273.22ms 
iter 6904: loss 2.6977, time 5276.76ms 
iter 6905: loss 2.5730, time 5281.19ms 
iter 6906: loss 2.5104, time 5266.60ms 
iter 6907: loss 2.5071, time 5266.72ms 
iter 6908: loss 2.5569, time 5221.28ms 
iter 6909: loss 2.6782, time 5103.90ms 
iter 6910: loss 2.3782, time 5219.20ms 
iter 6911: loss 2.6537, time 5256.48ms 
iter 6912: loss 2.4054, time 5232.01ms 
iter 6913: loss 2.5769, time 5274.79ms 
iter 6914: loss 2.6275, time 5246.54ms 
iter 6915: loss 2.6203, time 5161.67ms 
iter 6916: loss 2.7987, time 5208.94ms 
iter 6917: loss 2.3525, time 5228.48ms 
iter 6918: loss 2.6261, time 5205.88ms 
iter 6919: loss 2.4195, time 5249.07ms 
iter 6920: loss 2.2122, time 5204.41ms 
iter 6921: loss 2.4895, time 5240.18ms 
iter 6922: loss 2.5526, time 5222.76ms 
iter 6923: loss 2.4457, time 5216.14ms 
iter 6924: loss 2.5307, time 5219.46ms 
iter 6925: loss 2.5629, time 5215.45ms 
iter 6926: loss 2.6953, time 5230.80ms 
iter 6927: loss 2.6539, time 5215.28ms 
iter 6928: loss 2.6066, time 5247.41ms 
iter 6929: loss 2.4664, time 5176.51ms 
iter 6930: loss 2.6525, time 5260.49ms 
iter 6931: loss 2.5583, time 5257.82ms 
iter 6932: loss 2.2376, time 5248.68ms 
iter 6933: loss 2.6479, time 5253.97ms 
iter 6934: loss 2.6036, time 5260.53ms 
iter 6935: loss 2.5910, time 5257.48ms 
iter 6936: loss 2.7302, time 5246.22ms 
iter 6937: loss 2.5173, time 5259.85ms 
iter 6938: loss 2.7051, time 5252.55ms 
iter 6939: loss 2.5603, time 5255.29ms 
iter 6940: loss 2.4703, time 5244.51ms 
iter 6941: loss 2.4346, time 5254.01ms 
iter 6942: loss 2.6786, time 5242.66ms 
iter 6943: loss 2.7324, time 5231.75ms 
iter 6944: loss 2.3980, time 5242.13ms 
iter 6945: loss 2.3061, time 5232.50ms 
iter 6946: loss 2.4551, time 5246.58ms 
iter 6947: loss 2.4597, time 5242.21ms 
iter 6948: loss 2.6454, time 5257.80ms 
iter 6949: loss 2.5774, time 5197.68ms 
step 6950: train loss 2.5046, val loss 2.8343
iter 6950: loss 2.4707, time 19978.97ms 
iter 6951: loss 2.3461, time 5246.77ms 
iter 6952: loss 2.5144, time 5211.84ms 
iter 6953: loss 2.4293, time 5045.17ms 
iter 6954: loss 2.4891, time 5042.67ms 
iter 6955: loss 2.4417, time 5025.27ms 
iter 6956: loss 2.6346, time 5059.15ms 
iter 6957: loss 2.6034, time 5253.16ms 
iter 6958: loss 2.4697, time 5267.79ms 
iter 6959: loss 2.4033, time 5177.16ms 
iter 6960: loss 2.4010, time 5154.97ms 
iter 6961: loss 2.3960, time 5194.66ms 
iter 6962: loss 2.5842, time 5148.43ms 
iter 6963: loss 2.6736, time 5148.59ms 
iter 6964: loss 2.5821, time 5050.31ms 
iter 6965: loss 2.5978, time 5189.28ms 
iter 6966: loss 2.4214, time 5173.99ms 
iter 6967: loss 2.5648, time 5140.98ms 
iter 6968: loss 2.5239, time 5101.67ms 
iter 6969: loss 2.5847, time 5041.79ms 
iter 6970: loss 2.4687, time 5149.89ms 
iter 6971: loss 2.5778, time 5233.72ms 
iter 6972: loss 2.5227, time 5281.38ms 
iter 6973: loss 2.2538, time 5252.64ms 
iter 6974: loss 2.2025, time 5253.44ms 
iter 6975: loss 2.5298, time 5232.57ms 
iter 6976: loss 2.5835, time 5286.55ms 
iter 6977: loss 2.5862, time 5267.14ms 
iter 6978: loss 2.5843, time 5207.32ms 
iter 6979: loss 2.4964, time 5143.50ms 
iter 6980: loss 2.6146, time 5061.48ms 
iter 6981: loss 2.5980, time 5193.97ms 
iter 6982: loss 2.4656, time 5251.43ms 
iter 6983: loss 2.6880, time 5260.71ms 
iter 6984: loss 2.4740, time 5294.33ms 
iter 6985: loss 2.4346, time 5067.66ms 
iter 6986: loss 2.5236, time 5045.06ms 
iter 6987: loss 2.6234, time 5046.05ms 
iter 6988: loss 2.5693, time 5072.71ms 
iter 6989: loss 2.4627, time 5210.88ms 
iter 6990: loss 2.6824, time 5269.85ms 
iter 6991: loss 2.3869, time 5065.06ms 
iter 6992: loss 2.4080, time 5033.50ms 
iter 6993: loss 2.6482, time 5068.67ms 
iter 6994: loss 2.5795, time 5265.33ms 
iter 6995: loss 2.6187, time 5297.61ms 
iter 6996: loss 2.4748, time 5281.64ms 
iter 6997: loss 2.4889, time 5054.91ms 
iter 6998: loss 2.6583, time 5052.17ms 
iter 6999: loss 2.3629, time 5047.66ms 
step 7000: train loss 2.5114, val loss 2.8355
iter 7000: loss 2.5927, time 19887.15ms 
iter 7001: loss 2.5836, time 5241.66ms 
iter 7002: loss 2.3991, time 5188.53ms 
iter 7003: loss 2.5893, time 5043.04ms 
iter 7004: loss 2.6033, time 5060.56ms 
iter 7005: loss 2.4769, time 5043.62ms 
iter 7006: loss 2.6978, time 5132.69ms 
iter 7007: loss 2.4749, time 5186.13ms 
iter 7008: loss 2.5204, time 5200.20ms 
iter 7009: loss 2.2360, time 5232.81ms 
iter 7010: loss 2.7478, time 5089.26ms 
iter 7011: loss 2.4880, time 5055.15ms 
iter 7012: loss 2.4518, time 5064.10ms 
iter 7013: loss 2.7234, time 5049.91ms 
iter 7014: loss 2.4007, time 5098.39ms 
iter 7015: loss 2.7010, time 5210.97ms 
iter 7016: loss 2.5106, time 5219.56ms 
iter 7017: loss 2.3944, time 5216.48ms 
iter 7018: loss 2.3595, time 5209.20ms 
iter 7019: loss 2.6284, time 5211.45ms 
iter 7020: loss 2.5242, time 5222.18ms 
iter 7021: loss 2.2246, time 5221.39ms 
iter 7022: loss 2.4298, time 5167.94ms 
iter 7023: loss 2.4972, time 5058.31ms 
iter 7024: loss 2.5495, time 5053.50ms 
iter 7025: loss 2.2657, time 5066.55ms 
iter 7026: loss 2.3473, time 5197.00ms 
iter 7027: loss 2.5767, time 5270.25ms 
iter 7028: loss 2.8342, time 5278.96ms 
iter 7029: loss 2.6417, time 5200.44ms 
iter 7030: loss 2.5030, time 5046.29ms 
iter 7031: loss 2.4076, time 5053.57ms 
iter 7032: loss 2.3485, time 5051.12ms 
iter 7033: loss 2.4821, time 5049.63ms 
iter 7034: loss 2.6042, time 5042.72ms 
iter 7035: loss 2.4857, time 5042.19ms 
iter 7036: loss 2.4036, time 5047.90ms 
iter 7037: loss 2.8139, time 5036.61ms 
iter 7038: loss 2.5362, time 5054.02ms 
iter 7039: loss 2.5282, time 5048.64ms 
iter 7040: loss 2.4725, time 5196.19ms 
iter 7041: loss 2.7709, time 5290.83ms 
iter 7042: loss 2.5922, time 5243.16ms 
iter 7043: loss 2.4245, time 5076.59ms 
iter 7044: loss 2.6183, time 5074.73ms 
iter 7045: loss 2.4435, time 5050.96ms 
iter 7046: loss 2.7678, time 5052.46ms 
iter 7047: loss 2.6595, time 5047.55ms 
iter 7048: loss 2.5244, time 5049.11ms 
iter 7049: loss 2.6585, time 5058.53ms 
step 7050: train loss 2.5084, val loss 2.8582
iter 7050: loss 2.8041, time 19983.58ms 
iter 7051: loss 2.4825, time 5029.48ms 
iter 7052: loss 2.8055, time 5309.07ms 
iter 7053: loss 2.4563, time 5283.12ms 
iter 7054: loss 2.4804, time 5278.73ms 
iter 7055: loss 2.3457, time 5038.30ms 
iter 7056: loss 2.5638, time 5064.00ms 
iter 7057: loss 2.4588, time 5226.50ms 
iter 7058: loss 2.6901, time 5292.42ms 
iter 7059: loss 2.5025, time 5286.75ms 
iter 7060: loss 2.6045, time 5292.28ms 
iter 7061: loss 2.4424, time 5259.94ms 
iter 7062: loss 2.5340, time 5276.03ms 
iter 7063: loss 2.6302, time 5271.96ms 
iter 7064: loss 2.6983, time 5252.63ms 
iter 7065: loss 2.6374, time 5259.58ms 
iter 7066: loss 2.6674, time 5153.96ms 
iter 7067: loss 2.5295, time 5024.25ms 
iter 7068: loss 2.3947, time 5028.04ms 
iter 7069: loss 2.5243, time 5030.45ms 
iter 7070: loss 2.5151, time 5023.68ms 
iter 7071: loss 2.6426, time 5137.05ms 
iter 7072: loss 2.3992, time 5269.50ms 
iter 7073: loss 2.4692, time 5191.58ms 
iter 7074: loss 2.5977, time 5041.22ms 
iter 7075: loss 2.6188, time 5077.63ms 
iter 7076: loss 2.7841, time 5051.40ms 
iter 7077: loss 2.3894, time 5025.24ms 
iter 7078: loss 2.5448, time 5168.55ms 
iter 7079: loss 2.4128, time 5258.16ms 
iter 7080: loss 2.5368, time 5278.68ms 
iter 7081: loss 2.5368, time 5265.13ms 
iter 7082: loss 2.4733, time 5274.24ms 
iter 7083: loss 2.5496, time 5275.92ms 
iter 7084: loss 2.3986, time 5234.45ms 
iter 7085: loss 2.5595, time 5239.46ms 
iter 7086: loss 2.4003, time 5257.74ms 
iter 7087: loss 2.4744, time 5131.45ms 
iter 7088: loss 2.3229, time 5042.86ms 
iter 7089: loss 2.4287, time 5046.52ms 
iter 7090: loss 2.5294, time 5052.26ms 
iter 7091: loss 2.6454, time 5048.73ms 
iter 7092: loss 2.6050, time 5047.39ms 
iter 7093: loss 2.3825, time 5047.62ms 
iter 7094: loss 2.4484, time 5249.97ms 
iter 7095: loss 2.6968, time 5260.07ms 
iter 7096: loss 2.5383, time 5261.58ms 
iter 7097: loss 2.3058, time 5272.58ms 
iter 7098: loss 2.3804, time 5252.87ms 
iter 7099: loss 2.5828, time 5249.31ms 
step 7100: train loss 2.5027, val loss 2.8454
iter 7100: loss 2.5910, time 20020.33ms 
iter 7101: loss 2.4346, time 5224.87ms 
iter 7102: loss 2.5432, time 5260.29ms 
iter 7103: loss 2.4299, time 5256.37ms 
iter 7104: loss 2.4985, time 5252.41ms 
iter 7105: loss 2.4881, time 5251.90ms 
iter 7106: loss 2.5330, time 5264.93ms 
iter 7107: loss 2.4065, time 5292.65ms 
iter 7108: loss 2.4903, time 5252.53ms 
iter 7109: loss 2.3771, time 5255.24ms 
iter 7110: loss 2.4509, time 5246.25ms 
iter 7111: loss 2.6366, time 5264.39ms 
iter 7112: loss 2.2591, time 5257.20ms 
iter 7113: loss 2.4633, time 5249.41ms 
iter 7114: loss 2.3028, time 5258.16ms 
iter 7115: loss 2.5671, time 5254.72ms 
iter 7116: loss 2.6406, time 5273.29ms 
iter 7117: loss 2.5590, time 5270.91ms 
iter 7118: loss 2.5184, time 5256.40ms 
iter 7119: loss 2.5215, time 5253.11ms 
iter 7120: loss 2.2914, time 5256.53ms 
iter 7121: loss 2.6142, time 5320.42ms 
iter 7122: loss 2.4842, time 5334.50ms 
iter 7123: loss 2.4755, time 5332.15ms 
iter 7124: loss 2.6229, time 5340.64ms 
iter 7125: loss 2.3811, time 5262.23ms 
iter 7126: loss 2.4757, time 5292.10ms 
iter 7127: loss 2.4491, time 5261.97ms 
iter 7128: loss 2.5264, time 5317.67ms 
iter 7129: loss 2.5070, time 5326.78ms 
iter 7130: loss 2.5285, time 5267.53ms 
iter 7131: loss 2.6342, time 5274.41ms 
iter 7132: loss 2.5678, time 5254.00ms 
iter 7133: loss 2.5270, time 5256.19ms 
iter 7134: loss 2.4472, time 5252.92ms 
iter 7135: loss 2.6753, time 5256.85ms 
iter 7136: loss 2.5637, time 5254.90ms 
iter 7137: loss 2.5656, time 5243.89ms 
iter 7138: loss 2.4975, time 5248.59ms 
iter 7139: loss 2.4422, time 5264.92ms 
iter 7140: loss 2.3124, time 5258.93ms 
iter 7141: loss 2.3247, time 5253.35ms 
iter 7142: loss 2.8310, time 5259.00ms 
iter 7143: loss 2.3599, time 5258.82ms 
iter 7144: loss 2.6404, time 5263.65ms 
iter 7145: loss 2.5024, time 5255.93ms 
iter 7146: loss 2.4402, time 5263.81ms 
iter 7147: loss 2.5215, time 5264.79ms 
iter 7148: loss 2.5606, time 5252.61ms 
iter 7149: loss 2.5048, time 5253.62ms 
step 7150: train loss 2.5124, val loss 2.8212
iter 7150: loss 2.6085, time 20041.95ms 
iter 7151: loss 2.5797, time 5264.76ms 
iter 7152: loss 2.5880, time 5265.33ms 
iter 7153: loss 2.4573, time 5279.02ms 
iter 7154: loss 2.6544, time 5289.97ms 
iter 7155: loss 2.5402, time 5329.71ms 
iter 7156: loss 2.4738, time 5274.66ms 
iter 7157: loss 2.3868, time 5259.10ms 
iter 7158: loss 2.7207, time 5249.11ms 
iter 7159: loss 2.4068, time 5248.79ms 
iter 7160: loss 2.5250, time 5251.45ms 
iter 7161: loss 2.4383, time 5292.45ms 
iter 7162: loss 2.3918, time 5265.87ms 
iter 7163: loss 2.7426, time 5262.78ms 
iter 7164: loss 2.3470, time 5259.76ms 
iter 7165: loss 2.4939, time 5261.48ms 
iter 7166: loss 2.4906, time 5261.65ms 
iter 7167: loss 2.4336, time 5247.66ms 
iter 7168: loss 2.3190, time 5260.02ms 
iter 7169: loss 2.6739, time 5272.65ms 
iter 7170: loss 2.5567, time 5267.60ms 
iter 7171: loss 2.6515, time 5257.60ms 
iter 7172: loss 2.5439, time 5261.88ms 
iter 7173: loss 2.3563, time 5257.63ms 
iter 7174: loss 2.5543, time 5300.78ms 
iter 7175: loss 2.5388, time 5252.87ms 
iter 7176: loss 2.5707, time 5251.79ms 
iter 7177: loss 2.6196, time 5270.74ms 
iter 7178: loss 2.3890, time 5311.96ms 
iter 7179: loss 2.5019, time 5257.30ms 
iter 7180: loss 2.4118, time 5283.60ms 
iter 7181: loss 2.8636, time 5250.52ms 
iter 7182: loss 2.5807, time 5261.44ms 
iter 7183: loss 2.6038, time 5258.11ms 
iter 7184: loss 2.6010, time 5259.48ms 
iter 7185: loss 2.4855, time 5255.72ms 
iter 7186: loss 2.6794, time 5265.04ms 
iter 7187: loss 2.4021, time 5251.67ms 
iter 7188: loss 2.4134, time 5296.93ms 
iter 7189: loss 2.5077, time 5268.82ms 
iter 7190: loss 2.7885, time 5266.74ms 
iter 7191: loss 2.3995, time 5262.03ms 
iter 7192: loss 2.5269, time 5253.52ms 
iter 7193: loss 2.4890, time 5242.03ms 
iter 7194: loss 2.3393, time 5271.09ms 
iter 7195: loss 2.5108, time 5267.79ms 
iter 7196: loss 2.7065, time 5262.11ms 
iter 7197: loss 2.3662, time 5258.58ms 
iter 7198: loss 2.6731, time 5267.02ms 
iter 7199: loss 2.6131, time 5265.65ms 
step 7200: train loss 2.5018, val loss 2.8342
iter 7200: loss 2.2771, time 20050.84ms 
iter 7201: loss 2.2806, time 5269.23ms 
iter 7202: loss 2.5766, time 5300.62ms 
iter 7203: loss 2.3615, time 5290.52ms 
iter 7204: loss 2.5851, time 5273.06ms 
iter 7205: loss 2.4620, time 5269.27ms 
iter 7206: loss 2.6339, time 5276.75ms 
iter 7207: loss 2.4210, time 5294.01ms 
iter 7208: loss 2.4704, time 5267.42ms 
iter 7209: loss 2.4830, time 5252.96ms 
iter 7210: loss 2.5581, time 5249.58ms 
iter 7211: loss 2.5547, time 5273.44ms 
iter 7212: loss 2.5934, time 5305.67ms 
iter 7213: loss 2.3924, time 5287.47ms 
iter 7214: loss 2.4375, time 5261.70ms 
iter 7215: loss 2.2788, time 5272.26ms 
iter 7216: loss 2.6163, time 5255.18ms 
iter 7217: loss 2.3788, time 5254.15ms 
iter 7218: loss 2.4966, time 5267.27ms 
iter 7219: loss 2.7085, time 5269.03ms 
iter 7220: loss 2.4714, time 5274.88ms 
iter 7221: loss 2.5896, time 5265.28ms 
iter 7222: loss 2.6488, time 5255.77ms 
iter 7223: loss 2.5419, time 5262.07ms 
iter 7224: loss 2.4680, time 5264.66ms 
iter 7225: loss 2.5270, time 5260.21ms 
iter 7226: loss 2.8576, time 5281.76ms 
iter 7227: loss 2.5266, time 5266.79ms 
iter 7228: loss 2.5208, time 5275.86ms 
iter 7229: loss 2.3603, time 5268.39ms 
iter 7230: loss 2.3014, time 5259.54ms 
iter 7231: loss 2.3361, time 5265.98ms 
iter 7232: loss 2.5239, time 5323.72ms 
iter 7233: loss 2.5694, time 5261.51ms 
iter 7234: loss 2.5615, time 5261.83ms 
iter 7235: loss 2.4834, time 5255.61ms 
iter 7236: loss 2.3014, time 5266.95ms 
iter 7237: loss 2.5117, time 5274.87ms 
iter 7238: loss 2.5512, time 5258.86ms 
iter 7239: loss 2.4159, time 5251.63ms 
iter 7240: loss 2.4834, time 5252.95ms 
iter 7241: loss 2.6444, time 5325.74ms 
iter 7242: loss 2.6741, time 5316.06ms 
iter 7243: loss 2.5778, time 5315.09ms 
iter 7244: loss 2.4526, time 5254.46ms 
iter 7245: loss 2.4116, time 5250.89ms 
iter 7246: loss 2.6138, time 5247.53ms 
iter 7247: loss 2.4711, time 5253.40ms 
iter 7248: loss 2.3140, time 5254.39ms 
iter 7249: loss 2.4081, time 5265.03ms 
step 7250: train loss 2.5130, val loss 2.8384
iter 7250: loss 2.3719, time 19959.38ms 
iter 7251: loss 2.4398, time 5244.93ms 
iter 7252: loss 2.6807, time 5246.26ms 
iter 7253: loss 2.4682, time 5252.52ms 
iter 7254: loss 2.5266, time 5136.94ms 
iter 7255: loss 2.4706, time 5167.45ms 
iter 7256: loss 2.3211, time 5196.42ms 
iter 7257: loss 2.4236, time 5229.69ms 
iter 7258: loss 2.3921, time 5251.33ms 
iter 7259: loss 2.6333, time 5252.48ms 
iter 7260: loss 2.4791, time 5254.75ms 
iter 7261: loss 2.5218, time 5261.82ms 
iter 7262: loss 2.3190, time 5252.41ms 
iter 7263: loss 2.2519, time 5266.15ms 
iter 7264: loss 2.3975, time 5259.02ms 
iter 7265: loss 2.3372, time 5254.12ms 
iter 7266: loss 2.6559, time 5253.59ms 
iter 7267: loss 2.5715, time 5259.58ms 
iter 7268: loss 2.4446, time 5250.00ms 
iter 7269: loss 2.5049, time 5237.62ms 
iter 7270: loss 2.5424, time 5241.34ms 
iter 7271: loss 2.5851, time 5249.50ms 
iter 7272: loss 2.4545, time 5246.68ms 
iter 7273: loss 2.5586, time 5248.14ms 
iter 7274: loss 2.6088, time 5193.36ms 
iter 7275: loss 2.5560, time 5158.81ms 
iter 7276: loss 2.4690, time 5157.11ms 
iter 7277: loss 2.4525, time 5116.24ms 
iter 7278: loss 2.5188, time 5081.92ms 
iter 7279: loss 2.7125, time 5177.26ms 
iter 7280: loss 2.5043, time 5154.03ms 
iter 7281: loss 2.5169, time 5260.32ms 
iter 7282: loss 2.6179, time 5239.52ms 
iter 7283: loss 2.5145, time 5250.46ms 
iter 7284: loss 2.4580, time 5215.89ms 
iter 7285: loss 2.5346, time 5277.77ms 
iter 7286: loss 2.4117, time 5252.53ms 
iter 7287: loss 2.3119, time 5260.98ms 
iter 7288: loss 2.5761, time 5256.84ms 
iter 7289: loss 2.6458, time 5257.28ms 
iter 7290: loss 2.3950, time 5259.66ms 
iter 7291: loss 2.2810, time 5255.71ms 
iter 7292: loss 2.4595, time 5256.88ms 
iter 7293: loss 2.4957, time 5262.70ms 
iter 7294: loss 2.5203, time 5258.27ms 
iter 7295: loss 2.4953, time 5236.82ms 
iter 7296: loss 2.6073, time 5248.31ms 
iter 7297: loss 2.6418, time 5245.92ms 
iter 7298: loss 2.4609, time 5248.26ms 
iter 7299: loss 2.7132, time 5208.01ms 
step 7300: train loss 2.4814, val loss 2.8374
iter 7300: loss 2.5392, time 19978.97ms 
iter 7301: loss 2.3995, time 5244.56ms 
iter 7302: loss 2.4448, time 5250.67ms 
iter 7303: loss 2.4326, time 5251.36ms 
iter 7304: loss 2.5323, time 5239.53ms 
iter 7305: loss 2.4626, time 5242.78ms 
iter 7306: loss 2.5239, time 5258.30ms 
iter 7307: loss 2.5604, time 5246.42ms 
iter 7308: loss 2.5546, time 5248.93ms 
iter 7309: loss 2.3821, time 5245.38ms 
iter 7310: loss 2.6604, time 5253.16ms 
iter 7311: loss 2.5441, time 5263.22ms 
iter 7312: loss 2.5420, time 5249.78ms 
iter 7313: loss 2.4172, time 5246.42ms 
iter 7314: loss 2.5783, time 5246.25ms 
iter 7315: loss 2.6268, time 5247.56ms 
iter 7316: loss 2.5180, time 5246.08ms 
iter 7317: loss 2.5610, time 5198.17ms 
iter 7318: loss 2.5538, time 5142.83ms 
iter 7319: loss 2.4836, time 5215.29ms 
iter 7320: loss 2.3021, time 5207.03ms 
iter 7321: loss 2.6860, time 5165.99ms 
iter 7322: loss 2.5401, time 5247.38ms 
iter 7323: loss 2.6254, time 5246.25ms 
iter 7324: loss 2.4527, time 5275.22ms 
iter 7325: loss 2.5254, time 5263.47ms 
iter 7326: loss 2.1883, time 5263.93ms 
iter 7327: loss 2.5739, time 5268.87ms 
iter 7328: loss 2.4580, time 5273.65ms 
iter 7329: loss 2.6083, time 5279.09ms 
iter 7330: loss 2.2352, time 5263.43ms 
iter 7331: loss 2.5980, time 5251.19ms 
iter 7332: loss 2.4585, time 5266.15ms 
iter 7333: loss 2.4557, time 5277.34ms 
iter 7334: loss 2.5127, time 5273.81ms 
iter 7335: loss 2.5561, time 5293.78ms 
iter 7336: loss 2.5135, time 5287.89ms 
iter 7337: loss 2.2322, time 5270.54ms 
iter 7338: loss 2.5249, time 5256.14ms 
iter 7339: loss 2.2747, time 5260.12ms 
iter 7340: loss 2.2995, time 5266.40ms 
iter 7341: loss 2.5078, time 5265.43ms 
iter 7342: loss 2.4620, time 5251.91ms 
iter 7343: loss 2.4684, time 5253.47ms 
iter 7344: loss 2.5191, time 5257.28ms 
iter 7345: loss 2.4017, time 5266.38ms 
iter 7346: loss 2.3252, time 5252.65ms 
iter 7347: loss 2.5741, time 5251.74ms 
iter 7348: loss 2.5594, time 5220.29ms 
iter 7349: loss 2.4900, time 5251.16ms 
step 7350: train loss 2.5115, val loss 2.8339
iter 7350: loss 2.6050, time 20014.69ms 
iter 7351: loss 2.4506, time 5254.72ms 
iter 7352: loss 2.6062, time 5262.60ms 
iter 7353: loss 2.3660, time 5259.86ms 
iter 7354: loss 2.7054, time 5246.48ms 
iter 7355: loss 2.5002, time 5250.71ms 
iter 7356: loss 2.4796, time 5250.64ms 
iter 7357: loss 2.5347, time 5263.80ms 
iter 7358: loss 2.5738, time 5254.37ms 
iter 7359: loss 2.4386, time 5262.30ms 
iter 7360: loss 2.7190, time 5252.13ms 
iter 7361: loss 2.5310, time 5236.62ms 
iter 7362: loss 2.4618, time 5248.28ms 
iter 7363: loss 2.4510, time 5254.81ms 
iter 7364: loss 2.2015, time 5256.33ms 
iter 7365: loss 2.4610, time 5289.05ms 
iter 7366: loss 2.3235, time 5249.28ms 
iter 7367: loss 2.4705, time 5265.07ms 
iter 7368: loss 2.5550, time 5316.89ms 
iter 7369: loss 2.5136, time 5305.77ms 
iter 7370: loss 2.5375, time 5264.46ms 
iter 7371: loss 2.7704, time 5289.35ms 
iter 7372: loss 2.5591, time 5331.36ms 
iter 7373: loss 2.4747, time 5330.26ms 
iter 7374: loss 2.5681, time 5346.10ms 
iter 7375: loss 2.4985, time 5276.07ms 
iter 7376: loss 2.5541, time 5246.39ms 
iter 7377: loss 2.4768, time 5248.51ms 
iter 7378: loss 2.6829, time 5249.36ms 
iter 7379: loss 2.4444, time 5237.70ms 
iter 7380: loss 2.3230, time 5245.25ms 
iter 7381: loss 2.5335, time 5246.00ms 
iter 7382: loss 2.5756, time 5260.18ms 
iter 7383: loss 2.5501, time 5242.44ms 
iter 7384: loss 2.4394, time 5249.86ms 
iter 7385: loss 2.4247, time 5243.64ms 
iter 7386: loss 2.4442, time 5250.74ms 
iter 7387: loss 2.4238, time 5251.21ms 
iter 7388: loss 2.6292, time 5234.49ms 
iter 7389: loss 2.6636, time 5221.66ms 
iter 7390: loss 2.6451, time 5232.93ms 
iter 7391: loss 2.5887, time 5252.90ms 
iter 7392: loss 2.7718, time 5229.88ms 
iter 7393: loss 2.5548, time 5239.68ms 
iter 7394: loss 2.6197, time 5206.54ms 
iter 7395: loss 2.3859, time 5215.41ms 
iter 7396: loss 2.4236, time 5226.34ms 
iter 7397: loss 2.5295, time 5208.33ms 
iter 7398: loss 2.4996, time 5228.77ms 
iter 7399: loss 2.6313, time 5240.01ms 
step 7400: train loss 2.5098, val loss 2.8403
iter 7400: loss 2.2973, time 20010.37ms 
iter 7401: loss 2.4870, time 5276.50ms 
iter 7402: loss 2.2919, time 5270.21ms 
iter 7403: loss 2.4736, time 5264.88ms 
iter 7404: loss 2.3849, time 5256.96ms 
iter 7405: loss 2.7209, time 5267.06ms 
iter 7406: loss 2.4586, time 5260.34ms 
iter 7407: loss 2.6479, time 5261.79ms 
iter 7408: loss 2.5380, time 5257.37ms 
iter 7409: loss 2.5736, time 5265.02ms 
iter 7410: loss 2.3837, time 5269.24ms 
iter 7411: loss 2.4192, time 5258.22ms 
iter 7412: loss 2.5962, time 5259.30ms 
iter 7413: loss 2.4068, time 5251.67ms 
iter 7414: loss 2.5289, time 5270.58ms 
iter 7415: loss 2.8482, time 5258.28ms 
iter 7416: loss 2.5142, time 5263.02ms 
iter 7417: loss 2.5174, time 5261.79ms 
iter 7418: loss 2.4130, time 5260.73ms 
iter 7419: loss 2.4675, time 5257.98ms 
iter 7420: loss 2.3287, time 5266.56ms 
iter 7421: loss 2.4886, time 5262.36ms 
iter 7422: loss 2.3587, time 5316.38ms 
iter 7423: loss 2.4616, time 5280.76ms 
iter 7424: loss 2.4729, time 5255.79ms 
iter 7425: loss 2.5150, time 5253.93ms 
iter 7426: loss 2.4470, time 5266.15ms 
iter 7427: loss 2.4654, time 5321.35ms 
iter 7428: loss 2.6690, time 5338.35ms 
iter 7429: loss 2.4148, time 5317.44ms 
iter 7430: loss 2.5024, time 5265.66ms 
iter 7431: loss 2.4276, time 5250.33ms 
iter 7432: loss 2.3340, time 5290.74ms 
iter 7433: loss 2.4491, time 5163.61ms 
iter 7434: loss 2.4231, time 5341.76ms 
iter 7435: loss 2.2514, time 5347.07ms 
iter 7436: loss 2.4238, time 5324.34ms 
iter 7437: loss 2.2431, time 5266.39ms 
iter 7438: loss 2.3724, time 5267.18ms 
iter 7439: loss 2.4093, time 5253.89ms 
iter 7440: loss 2.3737, time 5226.62ms 
iter 7441: loss 2.6047, time 5258.89ms 
iter 7442: loss 2.5158, time 5263.48ms 
iter 7443: loss 2.4164, time 5263.21ms 
iter 7444: loss 2.6628, time 5240.75ms 
iter 7445: loss 2.6890, time 5264.34ms 
iter 7446: loss 2.4644, time 5272.68ms 
iter 7447: loss 2.8300, time 5313.96ms 
iter 7448: loss 2.1999, time 5255.62ms 
iter 7449: loss 2.4041, time 5258.98ms 
step 7450: train loss 2.5008, val loss 2.8331
iter 7450: loss 2.5380, time 20020.52ms 
iter 7451: loss 2.3947, time 5261.62ms 
iter 7452: loss 2.4398, time 5228.94ms 
iter 7453: loss 2.6112, time 5241.61ms 
iter 7454: loss 2.4910, time 5254.02ms 
iter 7455: loss 2.4709, time 5246.09ms 
iter 7456: loss 2.3886, time 5220.43ms 
iter 7457: loss 2.4281, time 5213.12ms 
iter 7458: loss 2.3122, time 5220.90ms 
iter 7459: loss 2.4501, time 5218.18ms 
iter 7460: loss 2.4979, time 5232.30ms 
iter 7461: loss 2.5584, time 5234.47ms 
iter 7462: loss 2.6933, time 5243.34ms 
iter 7463: loss 2.4415, time 5234.54ms 
iter 7464: loss 2.6931, time 5234.86ms 
iter 7465: loss 2.6978, time 5233.25ms 
iter 7466: loss 2.4917, time 5247.63ms 
iter 7467: loss 2.4131, time 5244.52ms 
iter 7468: loss 2.4763, time 5236.54ms 
iter 7469: loss 2.6742, time 5235.34ms 
iter 7470: loss 2.7616, time 5244.42ms 
iter 7471: loss 2.6113, time 5243.96ms 
iter 7472: loss 2.3026, time 5245.49ms 
iter 7473: loss 2.4986, time 5164.20ms 
iter 7474: loss 2.4625, time 5258.63ms 
iter 7475: loss 2.5495, time 5245.82ms 
iter 7476: loss 2.6332, time 5248.53ms 
iter 7477: loss 2.6553, time 5253.94ms 
iter 7478: loss 2.5890, time 5267.08ms 
iter 7479: loss 2.4516, time 5252.11ms 
iter 7480: loss 2.5441, time 5242.85ms 
iter 7481: loss 2.4404, time 5262.81ms 
iter 7482: loss 2.7978, time 5261.40ms 
iter 7483: loss 2.5865, time 5254.85ms 
iter 7484: loss 2.6068, time 5244.65ms 
iter 7485: loss 2.2293, time 5240.15ms 
iter 7486: loss 2.4138, time 5250.28ms 
iter 7487: loss 2.5305, time 5266.38ms 
iter 7488: loss 2.4307, time 5268.03ms 
iter 7489: loss 2.4034, time 5253.76ms 
iter 7490: loss 2.5168, time 5266.20ms 
iter 7491: loss 2.7102, time 5261.61ms 
iter 7492: loss 2.4511, time 5263.84ms 
iter 7493: loss 2.6437, time 5087.68ms 
iter 7494: loss 2.5040, time 5264.09ms 
iter 7495: loss 2.5037, time 5187.48ms 
iter 7496: loss 2.3918, time 5231.94ms 
iter 7497: loss 2.5580, time 5106.58ms 
iter 7498: loss 2.5158, time 5260.61ms 
iter 7499: loss 2.5067, time 5130.02ms 
step 7500: train loss 2.5035, val loss 2.8283
iter 7500: loss 2.4233, time 19974.23ms 
iter 7501: loss 2.3614, time 5124.61ms 
iter 7502: loss 2.3437, time 5099.16ms 
iter 7503: loss 2.5918, time 5107.22ms 
iter 7504: loss 2.5181, time 5087.25ms 
iter 7505: loss 2.5130, time 5238.25ms 
iter 7506: loss 2.7247, time 5249.03ms 
iter 7507: loss 2.4144, time 5254.00ms 
iter 7508: loss 2.4508, time 5258.89ms 
iter 7509: loss 2.4873, time 5255.50ms 
iter 7510: loss 2.5106, time 5296.97ms 
iter 7511: loss 2.5893, time 5257.33ms 
iter 7512: loss 2.5794, time 5266.54ms 
iter 7513: loss 2.5575, time 5250.16ms 
iter 7514: loss 2.5300, time 5250.57ms 
iter 7515: loss 2.5035, time 5256.76ms 
iter 7516: loss 2.6037, time 5258.45ms 
iter 7517: loss 2.4170, time 5269.90ms 
iter 7518: loss 2.6400, time 5255.66ms 
iter 7519: loss 2.6448, time 5261.37ms 
iter 7520: loss 2.2236, time 5269.91ms 
iter 7521: loss 2.4627, time 5260.02ms 
iter 7522: loss 2.4652, time 5257.02ms 
iter 7523: loss 2.5588, time 5259.40ms 
iter 7524: loss 2.5114, time 5256.95ms 
iter 7525: loss 2.4701, time 5270.35ms 
iter 7526: loss 2.3748, time 5273.00ms 
iter 7527: loss 2.6252, time 5312.07ms 
iter 7528: loss 2.5199, time 5331.40ms 
iter 7529: loss 2.6126, time 5341.80ms 
iter 7530: loss 2.3327, time 5287.84ms 
iter 7531: loss 2.2720, time 5307.77ms 
iter 7532: loss 2.5316, time 5335.80ms 
iter 7533: loss 2.5682, time 5311.32ms 
iter 7534: loss 2.3655, time 5254.51ms 
iter 7535: loss 2.4961, time 5288.46ms 
iter 7536: loss 2.5971, time 5254.89ms 
iter 7537: loss 2.5910, time 5247.45ms 
iter 7538: loss 2.4400, time 5257.46ms 
iter 7539: loss 2.3395, time 5297.52ms 
iter 7540: loss 2.4135, time 5254.49ms 
iter 7541: loss 2.5400, time 5251.91ms 
iter 7542: loss 2.5170, time 5253.94ms 
iter 7543: loss 2.2183, time 5259.64ms 
iter 7544: loss 2.4671, time 5248.96ms 
iter 7545: loss 2.6710, time 5250.95ms 
iter 7546: loss 2.5459, time 5251.16ms 
iter 7547: loss 2.4905, time 5259.37ms 
iter 7548: loss 2.3915, time 5261.53ms 
iter 7549: loss 2.4562, time 5248.94ms 
step 7550: train loss 2.4884, val loss 2.8532
iter 7550: loss 2.5674, time 19981.97ms 
iter 7551: loss 2.7064, time 5248.06ms 
iter 7552: loss 2.5797, time 5255.48ms 
iter 7553: loss 2.7337, time 5273.65ms 
iter 7554: loss 2.4142, time 5258.40ms 
iter 7555: loss 2.6482, time 5254.91ms 
iter 7556: loss 2.6051, time 5251.11ms 
iter 7557: loss 2.6121, time 5262.93ms 
iter 7558: loss 2.3629, time 5250.80ms 
iter 7559: loss 2.5488, time 5256.43ms 
iter 7560: loss 2.4129, time 5282.99ms 
iter 7561: loss 2.4891, time 5260.30ms 
iter 7562: loss 2.6584, time 5252.89ms 
iter 7563: loss 2.3686, time 5272.32ms 
iter 7564: loss 2.3558, time 5295.84ms 
iter 7565: loss 2.6063, time 5260.42ms 
iter 7566: loss 2.5115, time 5237.62ms 
iter 7567: loss 2.5232, time 5252.07ms 
iter 7568: loss 2.5189, time 5255.21ms 
iter 7569: loss 2.7574, time 5253.27ms 
iter 7570: loss 2.4888, time 5252.94ms 
iter 7571: loss 2.5051, time 5249.47ms 
iter 7572: loss 2.4615, time 5263.26ms 
iter 7573: loss 2.4559, time 5225.49ms 
iter 7574: loss 2.5135, time 5243.96ms 
iter 7575: loss 2.7893, time 5254.20ms 
iter 7576: loss 2.4388, time 5264.02ms 
iter 7577: loss 2.4778, time 5252.02ms 
iter 7578: loss 2.5928, time 5225.41ms 
iter 7579: loss 2.3917, time 5202.95ms 
iter 7580: loss 2.4680, time 5144.92ms 
iter 7581: loss 2.5660, time 5228.01ms 
iter 7582: loss 2.4925, time 5246.58ms 
iter 7583: loss 2.1965, time 5245.32ms 
iter 7584: loss 2.5660, time 5196.80ms 
iter 7585: loss 2.4464, time 5248.71ms 
iter 7586: loss 2.6185, time 5318.35ms 
iter 7587: loss 2.6094, time 5253.73ms 
iter 7588: loss 2.3376, time 5254.65ms 
iter 7589: loss 2.5930, time 5255.47ms 
iter 7590: loss 2.3418, time 5249.21ms 
iter 7591: loss 2.5888, time 5177.42ms 
iter 7592: loss 2.6387, time 5075.53ms 
iter 7593: loss 2.5586, time 5213.84ms 
iter 7594: loss 2.4608, time 5183.72ms 
iter 7595: loss 2.5063, time 5254.21ms 
iter 7596: loss 2.4633, time 5234.44ms 
iter 7597: loss 2.4009, time 5240.01ms 
iter 7598: loss 2.3980, time 5249.01ms 
iter 7599: loss 2.4109, time 5260.03ms 
step 7600: train loss 2.4894, val loss 2.8316
iter 7600: loss 2.4883, time 19945.79ms 
iter 7601: loss 2.3671, time 5260.77ms 
iter 7602: loss 2.4691, time 5332.00ms 
iter 7603: loss 2.3713, time 5312.55ms 
iter 7604: loss 2.3937, time 5293.92ms 
iter 7605: loss 2.4870, time 5316.50ms 
iter 7606: loss 2.5968, time 5235.62ms 
iter 7607: loss 2.4073, time 5277.28ms 
iter 7608: loss 2.4946, time 5265.14ms 
iter 7609: loss 2.5175, time 5253.78ms 
iter 7610: loss 2.3383, time 5270.33ms 
iter 7611: loss 2.4695, time 5265.16ms 
iter 7612: loss 2.4584, time 5284.99ms 
iter 7613: loss 2.5762, time 5326.20ms 
iter 7614: loss 2.5289, time 5298.84ms 
iter 7615: loss 2.4679, time 5270.01ms 
iter 7616: loss 2.4921, time 5270.79ms 
iter 7617: loss 2.4965, time 5266.68ms 
iter 7618: loss 2.4446, time 5268.61ms 
iter 7619: loss 2.3601, time 5280.34ms 
iter 7620: loss 2.6715, time 5273.50ms 
iter 7621: loss 2.5463, time 5263.92ms 
iter 7622: loss 2.3779, time 5263.00ms 
iter 7623: loss 2.6370, time 5275.03ms 
iter 7624: loss 2.4995, time 5268.61ms 
iter 7625: loss 2.4025, time 5267.34ms 
iter 7626: loss 2.4703, time 5247.90ms 
iter 7627: loss 2.4884, time 5262.92ms 
iter 7628: loss 2.4892, time 5248.05ms 
iter 7629: loss 2.4399, time 5251.40ms 
iter 7630: loss 2.4002, time 5250.14ms 
iter 7631: loss 2.4100, time 5257.81ms 
iter 7632: loss 2.6089, time 5247.70ms 
iter 7633: loss 2.6445, time 5246.47ms 
iter 7634: loss 2.4252, time 5255.67ms 
iter 7635: loss 2.5315, time 5264.89ms 
iter 7636: loss 2.7982, time 5252.07ms 
iter 7637: loss 2.5464, time 5318.78ms 
iter 7638: loss 2.4826, time 5312.39ms 
iter 7639: loss 2.4937, time 5322.89ms 
iter 7640: loss 2.3118, time 5318.59ms 
iter 7641: loss 2.5579, time 5290.49ms 
iter 7642: loss 2.6502, time 5322.82ms 
iter 7643: loss 2.5201, time 5319.73ms 
iter 7644: loss 2.4237, time 5324.87ms 
iter 7645: loss 2.4176, time 5331.75ms 
iter 7646: loss 2.5315, time 5284.34ms 
iter 7647: loss 2.3835, time 5200.25ms 
iter 7648: loss 2.4083, time 5252.61ms 
iter 7649: loss 2.4856, time 5288.39ms 
step 7650: train loss 2.4840, val loss 2.8388
iter 7650: loss 2.5308, time 20117.62ms 
iter 7651: loss 2.5567, time 5335.00ms 
iter 7652: loss 2.5503, time 5325.36ms 
iter 7653: loss 2.6105, time 5268.27ms 
iter 7654: loss 2.4383, time 5257.06ms 
iter 7655: loss 2.4390, time 5260.07ms 
iter 7656: loss 2.3560, time 5261.28ms 
iter 7657: loss 2.5637, time 5248.69ms 
iter 7658: loss 2.4713, time 5255.83ms 
iter 7659: loss 2.5601, time 5264.03ms 
iter 7660: loss 2.4484, time 5253.83ms 
iter 7661: loss 2.5470, time 5256.25ms 
iter 7662: loss 2.3361, time 5272.52ms 
iter 7663: loss 2.4715, time 5256.52ms 
iter 7664: loss 2.5950, time 5258.18ms 
iter 7665: loss 2.6088, time 5264.37ms 
iter 7666: loss 2.5881, time 5259.16ms 
iter 7667: loss 2.4995, time 5260.54ms 
iter 7668: loss 2.4561, time 5254.82ms 
iter 7669: loss 2.3024, time 5312.58ms 
iter 7670: loss 2.4823, time 5282.03ms 
iter 7671: loss 2.5372, time 5339.08ms 
iter 7672: loss 2.3505, time 5333.74ms 
iter 7673: loss 2.5819, time 5267.42ms 
iter 7674: loss 2.4039, time 5307.62ms 
iter 7675: loss 2.4369, time 5277.92ms 
iter 7676: loss 2.5107, time 5253.01ms 
iter 7677: loss 2.4368, time 5254.03ms 
iter 7678: loss 2.5032, time 5253.60ms 
iter 7679: loss 2.5835, time 5265.37ms 
iter 7680: loss 2.4210, time 5255.86ms 
iter 7681: loss 2.6156, time 5251.02ms 
iter 7682: loss 2.6186, time 5248.60ms 
iter 7683: loss 2.5229, time 5259.10ms 
iter 7684: loss 2.5826, time 5264.47ms 
iter 7685: loss 2.5500, time 5259.90ms 
iter 7686: loss 2.2665, time 5259.83ms 
iter 7687: loss 2.7726, time 5261.12ms 
iter 7688: loss 2.5615, time 5312.63ms 
iter 7689: loss 2.5255, time 5340.57ms 
iter 7690: loss 2.5593, time 5306.31ms 
iter 7691: loss 2.4249, time 5265.87ms 
iter 7692: loss 2.5999, time 5263.15ms 
iter 7693: loss 2.4031, time 5240.32ms 
iter 7694: loss 2.4246, time 5267.04ms 
iter 7695: loss 2.8748, time 5332.98ms 
iter 7696: loss 2.2039, time 5255.29ms 
iter 7697: loss 2.4553, time 5233.94ms 
iter 7698: loss 2.4477, time 5215.29ms 
iter 7699: loss 2.5971, time 5227.99ms 
step 7700: train loss 2.4868, val loss 2.8333
iter 7700: loss 2.3692, time 19968.39ms 
iter 7701: loss 2.4071, time 5227.20ms 
iter 7702: loss 2.2388, time 5240.39ms 
iter 7703: loss 2.3459, time 5243.16ms 
iter 7704: loss 2.4396, time 5262.87ms 
iter 7705: loss 2.5654, time 5248.97ms 
iter 7706: loss 2.4605, time 5249.69ms 
iter 7707: loss 2.5264, time 5252.59ms 
iter 7708: loss 2.2823, time 5269.31ms 
iter 7709: loss 2.4748, time 5252.04ms 
iter 7710: loss 2.5240, time 5250.11ms 
iter 7711: loss 2.6498, time 5252.75ms 
iter 7712: loss 2.5119, time 5260.35ms 
iter 7713: loss 2.6742, time 5255.47ms 
iter 7714: loss 2.3495, time 5253.93ms 
iter 7715: loss 2.3385, time 5251.36ms 
iter 7716: loss 2.4685, time 5318.58ms 
iter 7717: loss 2.3710, time 5346.08ms 
iter 7718: loss 2.4801, time 5299.07ms 
iter 7719: loss 2.2924, time 5330.57ms 
iter 7720: loss 2.4740, time 5313.22ms 
iter 7721: loss 2.4617, time 5273.66ms 
iter 7722: loss 2.3366, time 5249.22ms 
iter 7723: loss 2.3598, time 5287.28ms 
iter 7724: loss 2.3225, time 5339.33ms 
iter 7725: loss 2.3761, time 5292.25ms 
iter 7726: loss 2.5851, time 5269.27ms 
iter 7727: loss 2.5597, time 5253.91ms 
iter 7728: loss 2.4737, time 5250.50ms 
iter 7729: loss 2.3524, time 5244.65ms 
iter 7730: loss 2.4954, time 5217.39ms 
iter 7731: loss 2.6205, time 5208.53ms 
iter 7732: loss 2.4809, time 5208.48ms 
iter 7733: loss 2.5455, time 5213.86ms 
iter 7734: loss 2.4636, time 5218.57ms 
iter 7735: loss 2.5490, time 5220.11ms 
iter 7736: loss 2.5944, time 5248.34ms 
iter 7737: loss 2.3574, time 5298.57ms 
iter 7738: loss 2.3096, time 5308.33ms 
iter 7739: loss 2.4669, time 5336.00ms 
iter 7740: loss 2.4604, time 5288.49ms 
iter 7741: loss 2.3006, time 5209.82ms 
iter 7742: loss 2.4746, time 5262.69ms 
iter 7743: loss 2.5250, time 5260.36ms 
iter 7744: loss 2.5263, time 5252.16ms 
iter 7745: loss 2.5544, time 5253.72ms 
iter 7746: loss 2.4638, time 5264.80ms 
iter 7747: loss 2.4878, time 5317.98ms 
iter 7748: loss 2.4766, time 5286.59ms 
iter 7749: loss 2.5219, time 5279.85ms 
step 7750: train loss 2.4951, val loss 2.8419
iter 7750: loss 2.5474, time 20035.76ms 
iter 7751: loss 2.6659, time 5273.94ms 
iter 7752: loss 2.6063, time 5285.60ms 
iter 7753: loss 2.7412, time 5247.68ms 
iter 7754: loss 2.2714, time 5259.89ms 
iter 7755: loss 2.3118, time 5265.82ms 
iter 7756: loss 2.6700, time 5268.47ms 
iter 7757: loss 2.5548, time 5315.78ms 
iter 7758: loss 2.5051, time 5271.29ms 
iter 7759: loss 2.5917, time 5279.34ms 
iter 7760: loss 2.5155, time 5247.40ms 
iter 7761: loss 2.5928, time 5253.08ms 
iter 7762: loss 2.4628, time 5256.79ms 
iter 7763: loss 2.4942, time 5277.35ms 
iter 7764: loss 2.4012, time 5340.35ms 
iter 7765: loss 2.4599, time 5334.40ms 
iter 7766: loss 2.3967, time 5309.41ms 
iter 7767: loss 2.4336, time 5297.42ms 
iter 7768: loss 2.6615, time 5254.45ms 
iter 7769: loss 2.6242, time 5253.56ms 
iter 7770: loss 2.5333, time 5257.95ms 
iter 7771: loss 2.5078, time 5265.23ms 
iter 7772: loss 2.7682, time 5256.38ms 
iter 7773: loss 2.6073, time 5254.60ms 
iter 7774: loss 2.2840, time 5253.89ms 
iter 7775: loss 2.5337, time 5273.29ms 
iter 7776: loss 2.5118, time 5246.21ms 
iter 7777: loss 2.3460, time 5250.78ms 
iter 7778: loss 2.6698, time 5249.80ms 
iter 7779: loss 2.3229, time 5262.93ms 
iter 7780: loss 2.5642, time 5339.16ms 
iter 7781: loss 2.2867, time 5330.46ms 
iter 7782: loss 2.3916, time 5298.05ms 
iter 7783: loss 2.4042, time 5288.04ms 
iter 7784: loss 2.6033, time 5253.07ms 
iter 7785: loss 2.3107, time 5248.71ms 
iter 7786: loss 2.1093, time 5252.25ms 
iter 7787: loss 2.6015, time 5259.48ms 
iter 7788: loss 2.4349, time 5250.75ms 
iter 7789: loss 2.5077, time 5247.93ms 
iter 7790: loss 2.5129, time 5250.99ms 
iter 7791: loss 2.7553, time 5258.62ms 
iter 7792: loss 2.5210, time 5258.36ms 
iter 7793: loss 2.5485, time 5251.70ms 
iter 7794: loss 2.4989, time 5262.01ms 
iter 7795: loss 2.5987, time 5277.20ms 
iter 7796: loss 2.6400, time 5268.99ms 
iter 7797: loss 2.4909, time 5257.27ms 
iter 7798: loss 2.3993, time 5261.57ms 
iter 7799: loss 2.5259, time 5272.71ms 
step 7800: train loss 2.4828, val loss 2.8394
iter 7800: loss 2.5990, time 19891.16ms 
iter 7801: loss 2.5197, time 5257.87ms 
iter 7802: loss 2.5229, time 5274.72ms 
iter 7803: loss 2.5553, time 5267.82ms 
iter 7804: loss 2.4556, time 5264.29ms 
iter 7805: loss 2.5786, time 5280.48ms 
iter 7806: loss 2.5568, time 5248.95ms 
iter 7807: loss 2.3156, time 5251.73ms 
iter 7808: loss 2.4177, time 5257.22ms 
iter 7809: loss 2.6463, time 5247.79ms 
iter 7810: loss 2.4170, time 5250.40ms 
iter 7811: loss 2.2774, time 5259.95ms 
iter 7812: loss 2.5228, time 5249.41ms 
iter 7813: loss 2.5811, time 5260.96ms 
iter 7814: loss 2.4559, time 5250.21ms 
iter 7815: loss 2.4478, time 5250.45ms 
iter 7816: loss 2.6337, time 5250.76ms 
iter 7817: loss 2.5281, time 5258.23ms 
iter 7818: loss 2.3718, time 5253.16ms 
iter 7819: loss 2.4952, time 5251.04ms 
iter 7820: loss 2.3906, time 5251.62ms 
iter 7821: loss 2.2736, time 5252.33ms 
iter 7822: loss 2.6559, time 5265.05ms 
iter 7823: loss 2.5663, time 5257.01ms 
iter 7824: loss 2.4318, time 5249.83ms 
iter 7825: loss 2.6609, time 5249.55ms 
iter 7826: loss 2.5755, time 5267.64ms 
iter 7827: loss 2.6405, time 5256.56ms 
iter 7828: loss 2.5595, time 5323.47ms 
iter 7829: loss 2.4029, time 5328.00ms 
iter 7830: loss 2.5206, time 5309.79ms 
iter 7831: loss 2.1415, time 5259.60ms 
iter 7832: loss 2.3974, time 5262.83ms 
iter 7833: loss 2.5650, time 5312.12ms 
iter 7834: loss 2.6111, time 5249.57ms 
iter 7835: loss 2.3008, time 5260.31ms 
iter 7836: loss 2.4547, time 5258.79ms 
iter 7837: loss 2.6571, time 5254.72ms 
iter 7838: loss 2.5136, time 5240.51ms 
iter 7839: loss 2.7197, time 5251.35ms 
iter 7840: loss 2.5705, time 5260.35ms 
iter 7841: loss 2.6444, time 5257.28ms 
iter 7842: loss 2.7405, time 5255.80ms 
iter 7843: loss 2.4683, time 5252.31ms 
iter 7844: loss 2.3145, time 5267.96ms 
iter 7845: loss 2.4188, time 5251.43ms 
iter 7846: loss 2.5390, time 5251.38ms 
iter 7847: loss 2.3540, time 5249.35ms 
iter 7848: loss 2.5235, time 5269.20ms 
iter 7849: loss 2.4338, time 5242.88ms 
step 7850: train loss 2.4624, val loss 2.8409
iter 7850: loss 2.4995, time 20094.80ms 
iter 7851: loss 2.5658, time 5314.34ms 
iter 7852: loss 2.5257, time 5303.09ms 
iter 7853: loss 2.5965, time 5341.30ms 
iter 7854: loss 2.5712, time 5340.23ms 
iter 7855: loss 2.5287, time 5338.20ms 
iter 7856: loss 2.3941, time 5346.23ms 
iter 7857: loss 2.4490, time 5262.61ms 
iter 7858: loss 2.4669, time 5256.32ms 
iter 7859: loss 2.5126, time 5248.52ms 
iter 7860: loss 2.2892, time 5255.81ms 
iter 7861: loss 2.2722, time 5266.08ms 
iter 7862: loss 2.5324, time 5255.12ms 
iter 7863: loss 2.4875, time 5254.69ms 
iter 7864: loss 2.4600, time 5315.73ms 
iter 7865: loss 2.4853, time 5251.46ms 
iter 7866: loss 2.6067, time 5251.88ms 
iter 7867: loss 2.4244, time 5257.51ms 
iter 7868: loss 2.5122, time 5257.74ms 
iter 7869: loss 2.5931, time 5266.65ms 
iter 7870: loss 2.6432, time 5254.14ms 
iter 7871: loss 2.5001, time 5259.24ms 
iter 7872: loss 2.7337, time 5258.92ms 
iter 7873: loss 2.3648, time 5270.19ms 
iter 7874: loss 2.4418, time 5255.40ms 
iter 7875: loss 2.5725, time 5252.02ms 
iter 7876: loss 2.5431, time 5261.98ms 
iter 7877: loss 2.4644, time 5266.28ms 
iter 7878: loss 2.7021, time 5255.55ms 
iter 7879: loss 2.5712, time 5257.10ms 
iter 7880: loss 2.5432, time 5255.83ms 
iter 7881: loss 2.7405, time 5269.66ms 
iter 7882: loss 2.5403, time 5263.13ms 
iter 7883: loss 2.5475, time 5253.63ms 
iter 7884: loss 2.3745, time 5252.22ms 
iter 7885: loss 2.7225, time 5273.74ms 
iter 7886: loss 2.4940, time 5256.87ms 
iter 7887: loss 2.3283, time 5281.63ms 
iter 7888: loss 2.5236, time 5287.68ms 
iter 7889: loss 2.3927, time 5299.76ms 
iter 7890: loss 2.4828, time 5290.29ms 
iter 7891: loss 2.5259, time 5279.63ms 
iter 7892: loss 2.3873, time 5276.27ms 
iter 7893: loss 2.4691, time 5247.14ms 
iter 7894: loss 2.5043, time 5269.34ms 
iter 7895: loss 2.3754, time 5256.05ms 
iter 7896: loss 2.6310, time 5250.22ms 
iter 7897: loss 2.4175, time 5255.41ms 
iter 7898: loss 2.7017, time 5268.98ms 
iter 7899: loss 2.3930, time 5259.75ms 
step 7900: train loss 2.4878, val loss 2.8286
iter 7900: loss 2.4493, time 20103.11ms 
iter 7901: loss 2.6215, time 5255.70ms 
iter 7902: loss 2.4592, time 5249.37ms 
iter 7903: loss 2.3717, time 5263.95ms 
iter 7904: loss 2.5189, time 5237.70ms 
iter 7905: loss 2.5039, time 5244.77ms 
iter 7906: loss 2.5504, time 5249.45ms 
iter 7907: loss 2.6029, time 5257.28ms 
iter 7908: loss 2.6343, time 5301.59ms 
iter 7909: loss 2.3822, time 5256.18ms 
iter 7910: loss 2.4864, time 5252.85ms 
iter 7911: loss 2.5896, time 5265.13ms 
iter 7912: loss 2.3982, time 5261.16ms 
iter 7913: loss 2.5126, time 5255.69ms 
iter 7914: loss 2.5356, time 5254.02ms 
iter 7915: loss 2.4859, time 5255.50ms 
iter 7916: loss 2.6238, time 5341.40ms 
iter 7917: loss 2.5787, time 5321.88ms 
iter 7918: loss 2.4359, time 5262.69ms 
iter 7919: loss 2.3940, time 5253.22ms 
iter 7920: loss 2.3870, time 5269.85ms 
iter 7921: loss 2.6427, time 5253.08ms 
iter 7922: loss 2.2950, time 5255.78ms 
iter 7923: loss 2.5864, time 5251.99ms 
iter 7924: loss 2.2102, time 5260.64ms 
iter 7925: loss 2.4272, time 5253.64ms 
iter 7926: loss 2.3770, time 5246.43ms 
iter 7927: loss 2.4786, time 5259.18ms 
iter 7928: loss 2.6676, time 5256.04ms 
iter 7929: loss 2.3918, time 5266.28ms 
iter 7930: loss 2.5530, time 5240.50ms 
iter 7931: loss 2.6356, time 5250.62ms 
iter 7932: loss 2.6129, time 5243.60ms 
iter 7933: loss 2.5031, time 5266.93ms 
iter 7934: loss 2.5169, time 5252.44ms 
iter 7935: loss 2.4526, time 5252.73ms 
iter 7936: loss 2.5240, time 5258.88ms 
iter 7937: loss 2.4191, time 5265.52ms 
iter 7938: loss 2.5484, time 5270.37ms 
iter 7939: loss 2.4946, time 5265.43ms 
iter 7940: loss 2.5703, time 5261.94ms 
iter 7941: loss 2.5092, time 5254.39ms 
iter 7942: loss 2.3486, time 5265.28ms 
iter 7943: loss 2.5577, time 5258.66ms 
iter 7944: loss 2.3940, time 5254.46ms 
iter 7945: loss 2.5642, time 5250.01ms 
iter 7946: loss 2.4743, time 5267.38ms 
iter 7947: loss 2.5487, time 5271.56ms 
iter 7948: loss 2.5769, time 5254.87ms 
iter 7949: loss 2.5155, time 5250.10ms 
step 7950: train loss 2.4792, val loss 2.8268
iter 7950: loss 2.4211, time 20118.24ms 
iter 7951: loss 2.4737, time 5277.79ms 
iter 7952: loss 2.5105, time 5267.92ms 
iter 7953: loss 2.4092, time 5274.68ms 
iter 7954: loss 2.5968, time 5258.69ms 
iter 7955: loss 2.7378, time 5271.41ms 
iter 7956: loss 2.3798, time 5277.00ms 
iter 7957: loss 2.5134, time 5277.65ms 
iter 7958: loss 2.5648, time 5323.22ms 
iter 7959: loss 2.4933, time 5320.44ms 
iter 7960: loss 2.2349, time 5278.60ms 
iter 7961: loss 2.4032, time 5280.66ms 
iter 7962: loss 2.4728, time 5308.29ms 
iter 7963: loss 2.4564, time 5273.37ms 
iter 7964: loss 2.3226, time 5284.80ms 
iter 7965: loss 2.6167, time 5275.26ms 
iter 7966: loss 2.5752, time 5274.17ms 
iter 7967: loss 2.6391, time 5277.64ms 
iter 7968: loss 2.5923, time 5274.61ms 
iter 7969: loss 2.3163, time 5272.76ms 
iter 7970: loss 2.3614, time 5263.55ms 
iter 7971: loss 2.6357, time 5241.88ms 
iter 7972: loss 2.5084, time 5263.36ms 
iter 7973: loss 2.5512, time 5267.25ms 
iter 7974: loss 2.7109, time 5252.54ms 
iter 7975: loss 2.5285, time 5249.24ms 
iter 7976: loss 2.2151, time 5303.70ms 
iter 7977: loss 2.4832, time 5332.45ms 
iter 7978: loss 2.4247, time 5344.75ms 
iter 7979: loss 2.4172, time 5327.72ms 
iter 7980: loss 2.6367, time 5317.59ms 
iter 7981: loss 2.4537, time 5284.06ms 
iter 7982: loss 2.6419, time 5284.75ms 
iter 7983: loss 2.4110, time 5294.63ms 
iter 7984: loss 2.6516, time 5285.58ms 
iter 7985: loss 2.4462, time 5280.62ms 
iter 7986: loss 2.3682, time 5262.95ms 
iter 7987: loss 2.4713, time 5269.04ms 
iter 7988: loss 2.6099, time 5292.47ms 
iter 7989: loss 2.4092, time 5274.25ms 
iter 7990: loss 2.8368, time 5253.22ms 
iter 7991: loss 2.4060, time 5264.49ms 
iter 7992: loss 2.5394, time 5258.74ms 
iter 7993: loss 2.4359, time 5276.08ms 
iter 7994: loss 2.1428, time 5322.32ms 
iter 7995: loss 2.6277, time 5325.20ms 
iter 7996: loss 2.5744, time 5262.86ms 
iter 7997: loss 2.4077, time 5269.11ms 
iter 7998: loss 2.5079, time 5282.68ms 
iter 7999: loss 2.2914, time 5210.13ms 
step 8000: train loss 2.4900, val loss 2.8456
iter 8000: loss 2.4492, time 19983.49ms 
iter 8001: loss 2.6151, time 5256.62ms 
iter 8002: loss 2.4805, time 5274.41ms 
iter 8003: loss 2.3319, time 5272.80ms 
iter 8004: loss 2.4457, time 5260.35ms 
iter 8005: loss 2.4443, time 5237.56ms 
iter 8006: loss 2.5301, time 5262.46ms 
iter 8007: loss 2.3603, time 5259.80ms 
iter 8008: loss 2.2191, time 5258.00ms 
iter 8009: loss 2.3643, time 5269.97ms 
iter 8010: loss 2.3974, time 5296.23ms 
iter 8011: loss 2.4460, time 5344.74ms 
iter 8012: loss 2.4308, time 5263.40ms 
iter 8013: loss 2.2312, time 5267.39ms 
iter 8014: loss 2.6274, time 5270.37ms 
iter 8015: loss 2.4691, time 5255.64ms 
iter 8016: loss 2.3134, time 5256.44ms 
iter 8017: loss 2.4213, time 5263.38ms 
iter 8018: loss 2.1833, time 5279.87ms 
iter 8019: loss 2.4977, time 5269.53ms 
iter 8020: loss 2.5790, time 5254.32ms 
iter 8021: loss 2.5834, time 5254.95ms 
iter 8022: loss 2.5076, time 5259.84ms 
iter 8023: loss 2.6570, time 5309.94ms 
iter 8024: loss 2.5162, time 5250.43ms 
iter 8025: loss 2.5615, time 5250.60ms 
iter 8026: loss 2.4703, time 5267.04ms 
iter 8027: loss 2.4832, time 5253.35ms 
iter 8028: loss 2.4119, time 5263.00ms 
iter 8029: loss 2.3547, time 5251.45ms 
iter 8030: loss 2.4636, time 5261.96ms 
iter 8031: loss 2.4820, time 5263.59ms 
iter 8032: loss 2.4809, time 5259.29ms 
iter 8033: loss 2.4651, time 5250.91ms 
iter 8034: loss 2.5901, time 5256.21ms 
iter 8035: loss 2.6451, time 5262.45ms 
iter 8036: loss 2.3155, time 5253.39ms 
iter 8037: loss 2.5378, time 5250.53ms 
iter 8038: loss 2.4335, time 5261.18ms 
iter 8039: loss 2.4184, time 5260.06ms 
iter 8040: loss 2.5496, time 5257.26ms 
iter 8041: loss 2.4562, time 5248.62ms 
iter 8042: loss 2.2981, time 5263.17ms 
iter 8043: loss 2.5265, time 5258.10ms 
iter 8044: loss 2.5444, time 5249.25ms 
iter 8045: loss 2.5599, time 5252.23ms 
iter 8046: loss 2.4187, time 5262.49ms 
iter 8047: loss 2.4495, time 5332.91ms 
iter 8048: loss 2.3822, time 5302.84ms 
iter 8049: loss 2.5753, time 5252.75ms 
step 8050: train loss 2.4774, val loss 2.8423
iter 8050: loss 2.6282, time 20058.73ms 
iter 8051: loss 2.4875, time 5261.27ms 
iter 8052: loss 2.5314, time 5261.38ms 
iter 8053: loss 2.5623, time 5257.65ms 
iter 8054: loss 2.5129, time 5271.26ms 
iter 8055: loss 2.3340, time 5258.16ms 
iter 8056: loss 2.7590, time 5260.04ms 
iter 8057: loss 2.3373, time 5253.26ms 
iter 8058: loss 2.4217, time 5269.55ms 
iter 8059: loss 2.2744, time 5265.65ms 
iter 8060: loss 2.3914, time 5257.59ms 
iter 8061: loss 2.1453, time 5250.10ms 
iter 8062: loss 2.3763, time 5277.44ms 
iter 8063: loss 2.6061, time 5253.05ms 
iter 8064: loss 2.6468, time 5266.04ms 
iter 8065: loss 2.3291, time 5329.03ms 
iter 8066: loss 2.6166, time 5337.03ms 
iter 8067: loss 2.5279, time 5215.72ms 
iter 8068: loss 2.1962, time 5120.49ms 
iter 8069: loss 2.5479, time 5116.86ms 
iter 8070: loss 2.4204, time 5248.95ms 
iter 8071: loss 2.4907, time 5212.95ms 
iter 8072: loss 2.2936, time 5262.89ms 
iter 8073: loss 2.4916, time 5240.05ms 
iter 8074: loss 2.7483, time 5245.38ms 
iter 8075: loss 2.6515, time 5239.20ms 
iter 8076: loss 2.4183, time 5242.35ms 
iter 8077: loss 2.4607, time 5236.64ms 
iter 8078: loss 2.4156, time 5247.19ms 
iter 8079: loss 2.4982, time 5245.97ms 
iter 8080: loss 2.5286, time 5243.74ms 
iter 8081: loss 2.4138, time 5250.11ms 
iter 8082: loss 2.1383, time 5256.18ms 
iter 8083: loss 2.5852, time 5253.70ms 
iter 8084: loss 2.4660, time 5238.40ms 
iter 8085: loss 2.3861, time 5234.56ms 
iter 8086: loss 2.3818, time 5241.47ms 
iter 8087: loss 2.6056, time 5241.27ms 
iter 8088: loss 2.4254, time 5234.04ms 
iter 8089: loss 2.5783, time 5235.18ms 
iter 8090: loss 2.4214, time 5247.97ms 
iter 8091: loss 2.5985, time 5250.15ms 
iter 8092: loss 2.4483, time 5238.81ms 
iter 8093: loss 2.5244, time 5242.35ms 
iter 8094: loss 2.4380, time 5235.44ms 
iter 8095: loss 2.6097, time 5243.99ms 
iter 8096: loss 2.4482, time 5259.70ms 
iter 8097: loss 2.5506, time 5261.64ms 
iter 8098: loss 2.5203, time 5278.81ms 
iter 8099: loss 2.6746, time 5279.47ms 
step 8100: train loss 2.4800, val loss 2.8499
iter 8100: loss 2.6724, time 20102.41ms 
iter 8101: loss 2.6209, time 5269.41ms 
iter 8102: loss 2.6199, time 5255.96ms 
iter 8103: loss 2.5468, time 5256.10ms 
iter 8104: loss 2.3684, time 5260.16ms 
iter 8105: loss 2.4200, time 5265.07ms 
iter 8106: loss 2.4910, time 5243.35ms 
iter 8107: loss 2.6067, time 5236.25ms 
iter 8108: loss 2.5451, time 5248.32ms 
iter 8109: loss 2.7288, time 5251.33ms 
iter 8110: loss 2.4025, time 5251.66ms 
iter 8111: loss 2.3078, time 5253.33ms 
iter 8112: loss 2.4594, time 5247.39ms 
iter 8113: loss 2.4469, time 5247.34ms 
iter 8114: loss 2.6402, time 5250.98ms 
iter 8115: loss 2.5804, time 5252.00ms 
iter 8116: loss 2.3793, time 5236.66ms 
iter 8117: loss 2.4014, time 5228.89ms 
iter 8118: loss 2.6216, time 5263.27ms 
iter 8119: loss 2.2238, time 5249.74ms 
iter 8120: loss 2.5649, time 5253.21ms 
iter 8121: loss 2.4204, time 5251.31ms 
iter 8122: loss 2.5016, time 5233.01ms 
iter 8123: loss 2.5504, time 5334.34ms 
iter 8124: loss 2.3310, time 5270.73ms 
iter 8125: loss 2.5632, time 5254.56ms 
iter 8126: loss 2.3848, time 5288.16ms 
iter 8127: loss 2.5601, time 5272.89ms 
iter 8128: loss 2.2132, time 5282.58ms 
iter 8129: loss 2.4655, time 5264.11ms 
iter 8130: loss 2.3221, time 5264.04ms 
iter 8131: loss 2.3819, time 5266.58ms 
iter 8132: loss 2.4687, time 5257.51ms 
iter 8133: loss 2.2357, time 5260.27ms 
iter 8134: loss 2.5942, time 5268.95ms 
iter 8135: loss 2.4067, time 5284.26ms 
iter 8136: loss 2.5900, time 5252.62ms 
iter 8137: loss 2.4687, time 5250.59ms 
iter 8138: loss 2.5984, time 5265.36ms 
iter 8139: loss 2.4023, time 5263.92ms 
iter 8140: loss 2.3000, time 5257.18ms 
iter 8141: loss 2.3956, time 5269.06ms 
iter 8142: loss 2.2428, time 5264.97ms 
iter 8143: loss 2.3942, time 5269.06ms 
iter 8144: loss 2.3172, time 5259.00ms 
iter 8145: loss 2.6739, time 5261.71ms 
iter 8146: loss 2.4469, time 5280.37ms 
iter 8147: loss 2.2816, time 5337.42ms 
iter 8148: loss 2.4819, time 5335.88ms 
iter 8149: loss 2.5929, time 5259.65ms 
step 8150: train loss 2.4786, val loss 2.8578
iter 8150: loss 2.3623, time 20012.60ms 
iter 8151: loss 2.4680, time 5265.25ms 
iter 8152: loss 2.4681, time 5259.25ms 
iter 8153: loss 2.6528, time 5258.70ms 
iter 8154: loss 2.6086, time 5283.21ms 
iter 8155: loss 2.4251, time 5262.40ms 
iter 8156: loss 2.4164, time 5264.13ms 
iter 8157: loss 2.6328, time 5263.21ms 
iter 8158: loss 2.4398, time 5270.89ms 
iter 8159: loss 2.3823, time 5274.37ms 
iter 8160: loss 2.3972, time 5262.48ms 
iter 8161: loss 2.5196, time 5253.96ms 
iter 8162: loss 2.3488, time 5260.76ms 
iter 8163: loss 2.4949, time 5251.35ms 
iter 8164: loss 2.4355, time 5259.35ms 
iter 8165: loss 2.4997, time 5264.14ms 
iter 8166: loss 2.5902, time 5268.08ms 
iter 8167: loss 2.5258, time 5256.18ms 
iter 8168: loss 2.3739, time 5254.40ms 
iter 8169: loss 2.2753, time 5259.69ms 
iter 8170: loss 2.7229, time 5273.30ms 
iter 8171: loss 2.4441, time 5226.61ms 
iter 8172: loss 2.4386, time 5247.79ms 
iter 8173: loss 2.6348, time 5259.44ms 
iter 8174: loss 2.5133, time 5283.13ms 
iter 8175: loss 2.4354, time 5322.69ms 
iter 8176: loss 2.7026, time 5245.87ms 
iter 8177: loss 2.6746, time 5319.38ms 
iter 8178: loss 2.6902, time 5308.82ms 
iter 8179: loss 2.3851, time 5252.73ms 
iter 8180: loss 2.4431, time 5261.07ms 
iter 8181: loss 2.6774, time 5298.90ms 
iter 8182: loss 2.6770, time 5255.56ms 
iter 8183: loss 2.3751, time 5253.70ms 
iter 8184: loss 2.4272, time 5274.40ms 
iter 8185: loss 2.5065, time 5265.40ms 
iter 8186: loss 2.3109, time 5263.17ms 
iter 8187: loss 2.6501, time 5250.87ms 
iter 8188: loss 2.5368, time 5249.07ms 
iter 8189: loss 2.7834, time 5259.75ms 
iter 8190: loss 2.4262, time 5257.11ms 
iter 8191: loss 2.4473, time 5250.87ms 
iter 8192: loss 2.5666, time 5260.67ms 
iter 8193: loss 2.4571, time 5272.69ms 
iter 8194: loss 2.5806, time 5250.49ms 
iter 8195: loss 2.5137, time 5245.47ms 
iter 8196: loss 2.2874, time 5247.93ms 
iter 8197: loss 2.3688, time 5264.79ms 
iter 8198: loss 2.6087, time 5255.91ms 
iter 8199: loss 2.3356, time 5255.96ms 
step 8200: train loss 2.4825, val loss 2.8364
iter 8200: loss 2.3333, time 20013.38ms 
iter 8201: loss 2.5400, time 5264.87ms 
iter 8202: loss 2.5792, time 5254.45ms 
iter 8203: loss 2.3499, time 5263.18ms 
iter 8204: loss 2.4458, time 5291.02ms 
iter 8205: loss 2.4782, time 5274.57ms 
iter 8206: loss 2.4445, time 5269.15ms 
iter 8207: loss 2.2417, time 5266.61ms 
iter 8208: loss 2.5942, time 5270.28ms 
iter 8209: loss 2.5052, time 5277.45ms 
iter 8210: loss 2.3843, time 5260.80ms 
iter 8211: loss 2.7342, time 5260.41ms 
iter 8212: loss 2.7147, time 5264.08ms 
iter 8213: loss 2.4904, time 5174.54ms 
iter 8214: loss 2.6721, time 5223.78ms 
iter 8215: loss 2.5719, time 5235.83ms 
iter 8216: loss 2.3987, time 5241.68ms 
iter 8217: loss 2.4871, time 5241.12ms 
iter 8218: loss 2.5429, time 5231.05ms 
iter 8219: loss 2.5895, time 5234.07ms 
iter 8220: loss 2.4741, time 5233.02ms 
iter 8221: loss 2.3948, time 5247.43ms 
iter 8222: loss 2.4456, time 5225.34ms 
iter 8223: loss 2.3330, time 5231.89ms 
iter 8224: loss 2.3042, time 5232.06ms 
iter 8225: loss 2.3783, time 5252.62ms 
iter 8226: loss 2.5778, time 5245.80ms 
iter 8227: loss 2.2550, time 5240.51ms 
iter 8228: loss 2.3332, time 5232.31ms 
iter 8229: loss 2.5058, time 5249.91ms 
iter 8230: loss 2.5213, time 5245.48ms 
iter 8231: loss 2.5542, time 5237.72ms 
iter 8232: loss 2.5485, time 5108.56ms 
iter 8233: loss 2.4347, time 5243.02ms 
iter 8234: loss 2.4113, time 5238.19ms 
iter 8235: loss 2.6085, time 5241.76ms 
iter 8236: loss 2.3236, time 5236.19ms 
iter 8237: loss 2.4910, time 5249.45ms 
iter 8238: loss 2.2384, time 5243.86ms 
iter 8239: loss 2.5086, time 5246.38ms 
iter 8240: loss 2.2223, time 5234.91ms 
iter 8241: loss 2.4549, time 5246.71ms 
iter 8242: loss 2.5895, time 5246.63ms 
iter 8243: loss 2.3448, time 5233.53ms 
iter 8244: loss 2.4374, time 5233.95ms 
iter 8245: loss 2.3805, time 5265.99ms 
iter 8246: loss 2.3637, time 5218.32ms 
iter 8247: loss 2.5543, time 5263.45ms 
iter 8248: loss 2.4949, time 5277.31ms 
iter 8249: loss 2.3835, time 5266.12ms 
step 8250: train loss 2.4828, val loss 2.8352
iter 8250: loss 2.4810, time 19980.83ms 
iter 8251: loss 2.5150, time 5248.98ms 
iter 8252: loss 2.5765, time 5241.51ms 
iter 8253: loss 2.4563, time 5243.67ms 
iter 8254: loss 2.3467, time 5246.52ms 
iter 8255: loss 2.5643, time 5267.36ms 
iter 8256: loss 2.3604, time 5252.53ms 
iter 8257: loss 2.3113, time 5247.35ms 
iter 8258: loss 2.5156, time 5208.55ms 
iter 8259: loss 2.4614, time 5263.05ms 
iter 8260: loss 2.4910, time 5252.93ms 
iter 8261: loss 2.3676, time 5252.65ms 
iter 8262: loss 2.3440, time 5258.61ms 
iter 8263: loss 2.4637, time 5255.90ms 
iter 8264: loss 2.4419, time 5244.53ms 
iter 8265: loss 2.5123, time 5254.86ms 
iter 8266: loss 2.4285, time 5245.99ms 
iter 8267: loss 2.2624, time 5268.71ms 
iter 8268: loss 2.5182, time 5166.18ms 
iter 8269: loss 2.5290, time 5204.23ms 
iter 8270: loss 2.5535, time 5261.96ms 
iter 8271: loss 2.7781, time 5267.06ms 
iter 8272: loss 2.5770, time 5271.61ms 
iter 8273: loss 2.3782, time 5267.88ms 
iter 8274: loss 2.4676, time 5273.89ms 
iter 8275: loss 2.7570, time 5256.02ms 
iter 8276: loss 2.5306, time 5253.29ms 
iter 8277: loss 2.6160, time 5260.83ms 
iter 8278: loss 2.5947, time 5275.45ms 
iter 8279: loss 2.6114, time 5268.57ms 
iter 8280: loss 2.3608, time 5267.97ms 
iter 8281: loss 2.4659, time 5266.62ms 
iter 8282: loss 2.3965, time 5268.65ms 
iter 8283: loss 2.5271, time 5228.59ms 
iter 8284: loss 2.6433, time 5080.46ms 
iter 8285: loss 2.4737, time 5109.89ms 
iter 8286: loss 2.5452, time 5252.87ms 
iter 8287: loss 2.3682, time 5251.00ms 
iter 8288: loss 2.5383, time 5238.80ms 
iter 8289: loss 2.3928, time 5232.37ms 
iter 8290: loss 2.5076, time 5193.61ms 
iter 8291: loss 2.8250, time 5225.75ms 
iter 8292: loss 2.2046, time 5242.67ms 
iter 8293: loss 2.4345, time 5198.63ms 
iter 8294: loss 2.3596, time 5256.58ms 
iter 8295: loss 2.5931, time 5231.99ms 
iter 8296: loss 2.6790, time 5232.42ms 
iter 8297: loss 2.3771, time 5233.40ms 
iter 8298: loss 2.4455, time 5251.20ms 
iter 8299: loss 2.4604, time 5233.06ms 
step 8300: train loss 2.4643, val loss 2.8292
iter 8300: loss 2.3961, time 20004.35ms 
iter 8301: loss 2.4975, time 5241.82ms 
iter 8302: loss 2.4820, time 5252.88ms 
iter 8303: loss 2.5409, time 5255.31ms 
iter 8304: loss 2.2204, time 5259.68ms 
iter 8305: loss 2.6129, time 5251.77ms 
iter 8306: loss 2.4647, time 5278.40ms 
iter 8307: loss 2.5133, time 5255.09ms 
iter 8308: loss 2.3051, time 5249.11ms 
iter 8309: loss 2.5851, time 5259.18ms 
iter 8310: loss 2.6776, time 5246.22ms 
iter 8311: loss 2.5460, time 5248.82ms 
iter 8312: loss 2.7007, time 5180.16ms 
iter 8313: loss 2.3447, time 5087.35ms 
iter 8314: loss 2.2147, time 5172.58ms 
iter 8315: loss 2.3922, time 5149.81ms 
iter 8316: loss 2.4395, time 5174.12ms 
iter 8317: loss 2.3425, time 5149.12ms 
iter 8318: loss 2.4077, time 5266.71ms 
iter 8319: loss 2.5395, time 5268.33ms 
iter 8320: loss 2.4790, time 5338.29ms 
iter 8321: loss 2.4541, time 5334.42ms 
iter 8322: loss 2.4700, time 5329.67ms 
iter 8323: loss 2.6130, time 5266.33ms 
iter 8324: loss 2.5317, time 5260.79ms 
iter 8325: loss 2.5308, time 5271.45ms 
iter 8326: loss 2.4580, time 5254.63ms 
iter 8327: loss 2.4824, time 5257.82ms 
iter 8328: loss 2.6681, time 5266.39ms 
iter 8329: loss 2.4672, time 5260.04ms 
iter 8330: loss 2.6186, time 5254.95ms 
iter 8331: loss 2.5809, time 5262.76ms 
iter 8332: loss 2.3132, time 5261.53ms 
iter 8333: loss 2.3078, time 5256.38ms 
iter 8334: loss 2.5023, time 5248.65ms 
iter 8335: loss 2.3553, time 5256.23ms 
iter 8336: loss 2.4622, time 5269.65ms 
iter 8337: loss 2.4011, time 5261.94ms 
iter 8338: loss 2.5573, time 5255.82ms 
iter 8339: loss 2.5458, time 5274.11ms 
iter 8340: loss 2.5152, time 5336.87ms 
iter 8341: loss 2.5128, time 5332.35ms 
iter 8342: loss 2.3541, time 5308.76ms 
iter 8343: loss 2.5180, time 5263.98ms 
iter 8344: loss 2.5197, time 5271.13ms 
iter 8345: loss 2.3376, time 5250.68ms 
iter 8346: loss 2.6079, time 5256.87ms 
iter 8347: loss 2.6275, time 5260.06ms 
iter 8348: loss 2.5049, time 5225.08ms 
iter 8349: loss 2.3639, time 5246.62ms 
step 8350: train loss 2.4702, val loss 2.8345
iter 8350: loss 2.3294, time 20208.33ms 
iter 8351: loss 2.1947, time 5325.86ms 
iter 8352: loss 2.4877, time 5229.20ms 
iter 8353: loss 2.4570, time 5015.08ms 
iter 8354: loss 2.4658, time 5284.45ms 
iter 8355: loss 2.2806, time 5259.78ms 
iter 8356: loss 2.5351, time 5259.78ms 
iter 8357: loss 2.4453, time 5253.38ms 
iter 8358: loss 2.4546, time 5249.10ms 
iter 8359: loss 2.3994, time 5257.33ms 
iter 8360: loss 2.6280, time 5306.12ms 
iter 8361: loss 2.3547, time 5282.76ms 
iter 8362: loss 2.5343, time 5320.97ms 
iter 8363: loss 2.6071, time 5297.60ms 
iter 8364: loss 2.4438, time 5340.04ms 
iter 8365: loss 2.4293, time 5292.62ms 
iter 8366: loss 2.5118, time 5285.26ms 
iter 8367: loss 1.9339, time 5322.31ms 
iter 8368: loss 2.5050, time 5328.92ms 
iter 8369: loss 2.5496, time 5278.84ms 
iter 8370: loss 2.3283, time 5325.23ms 
iter 8371: loss 2.6587, time 5271.36ms 
iter 8372: loss 2.5598, time 5261.48ms 
iter 8373: loss 2.2124, time 5269.86ms 
iter 8374: loss 2.4066, time 5123.30ms 
iter 8375: loss 2.5506, time 5247.87ms 
iter 8376: loss 2.4781, time 5247.23ms 
iter 8377: loss 2.3001, time 5251.73ms 
iter 8378: loss 2.2776, time 5262.55ms 
iter 8379: loss 2.5743, time 5263.55ms 
iter 8380: loss 2.4148, time 5267.73ms 
iter 8381: loss 2.4866, time 5255.42ms 
iter 8382: loss 2.4756, time 5249.52ms 
iter 8383: loss 2.4050, time 5240.93ms 
iter 8384: loss 2.3912, time 5254.75ms 
iter 8385: loss 2.7063, time 5252.18ms 
iter 8386: loss 2.3876, time 5246.33ms 
iter 8387: loss 2.4048, time 5244.63ms 
iter 8388: loss 2.4672, time 5244.44ms 
iter 8389: loss 2.5728, time 5238.46ms 
iter 8390: loss 2.5262, time 5257.60ms 
iter 8391: loss 2.7457, time 5249.74ms 
iter 8392: loss 2.5073, time 5251.85ms 
iter 8393: loss 2.5540, time 5245.24ms 
iter 8394: loss 2.5553, time 5314.24ms 
iter 8395: loss 2.4486, time 5319.83ms 
iter 8396: loss 2.4358, time 5326.99ms 
iter 8397: loss 2.3375, time 5313.50ms 
iter 8398: loss 2.6308, time 5258.87ms 
iter 8399: loss 2.3480, time 5254.68ms 
step 8400: train loss 2.4546, val loss 2.8444
iter 8400: loss 2.5900, time 20024.78ms 
iter 8401: loss 2.5046, time 5328.87ms 
iter 8402: loss 2.5383, time 5256.78ms 
iter 8403: loss 2.3241, time 5288.47ms 
iter 8404: loss 2.2669, time 5259.12ms 
iter 8405: loss 2.4445, time 5257.42ms 
iter 8406: loss 2.3979, time 5257.68ms 
iter 8407: loss 2.5564, time 5276.03ms 
iter 8408: loss 2.5806, time 5301.45ms 
iter 8409: loss 2.4879, time 5250.12ms 
iter 8410: loss 2.5664, time 5253.96ms 
iter 8411: loss 2.6129, time 5255.60ms 
iter 8412: loss 2.4991, time 5269.35ms 
iter 8413: loss 2.4392, time 5261.45ms 
iter 8414: loss 2.4948, time 5267.65ms 
iter 8415: loss 2.3790, time 5252.24ms 
iter 8416: loss 2.5041, time 5250.76ms 
iter 8417: loss 2.3601, time 5257.78ms 
iter 8418: loss 2.4813, time 5256.54ms 
iter 8419: loss 2.3321, time 5258.97ms 
iter 8420: loss 2.4613, time 5279.19ms 
iter 8421: loss 2.3438, time 5251.55ms 
iter 8422: loss 2.5629, time 5265.01ms 
iter 8423: loss 2.5971, time 5266.23ms 
iter 8424: loss 2.6042, time 5264.78ms 
iter 8425: loss 2.5363, time 5262.73ms 
iter 8426: loss 2.5028, time 5278.28ms 
iter 8427: loss 2.3750, time 5258.66ms 
iter 8428: loss 2.4451, time 5306.02ms 
iter 8429: loss 2.4830, time 5314.71ms 
iter 8430: loss 2.4575, time 5288.58ms 
iter 8431: loss 2.5668, time 5279.82ms 
iter 8432: loss 2.3284, time 5255.25ms 
iter 8433: loss 2.6409, time 5260.84ms 
iter 8434: loss 2.4334, time 5227.11ms 
iter 8435: loss 2.6379, time 5245.54ms 
iter 8436: loss 2.4950, time 5224.45ms 
iter 8437: loss 2.4462, time 5261.64ms 
iter 8438: loss 2.4438, time 5247.45ms 
iter 8439: loss 2.4924, time 5210.98ms 
iter 8440: loss 2.5914, time 5250.03ms 
iter 8441: loss 2.4506, time 5265.93ms 
iter 8442: loss 2.5678, time 5251.44ms 
iter 8443: loss 2.3639, time 5248.64ms 
iter 8444: loss 2.2042, time 5253.04ms 
iter 8445: loss 2.6324, time 5231.25ms 
iter 8446: loss 2.5649, time 5248.94ms 
iter 8447: loss 2.5966, time 5255.28ms 
iter 8448: loss 2.4848, time 5263.26ms 
iter 8449: loss 2.5393, time 5314.15ms 
step 8450: train loss 2.4724, val loss 2.8536
iter 8450: loss 2.5971, time 20040.83ms 
iter 8451: loss 2.6191, time 5251.67ms 
iter 8452: loss 2.4638, time 5263.82ms 
iter 8453: loss 2.5669, time 5338.24ms 
iter 8454: loss 2.4938, time 5330.99ms 
iter 8455: loss 2.4905, time 5308.49ms 
iter 8456: loss 2.2131, time 5316.42ms 
iter 8457: loss 2.6995, time 5313.79ms 
iter 8458: loss 2.5192, time 5329.82ms 
iter 8459: loss 2.6196, time 5325.83ms 
iter 8460: loss 2.1917, time 5271.25ms 
iter 8461: loss 2.3430, time 5258.49ms 
iter 8462: loss 2.5052, time 5252.92ms 
iter 8463: loss 2.6563, time 5260.46ms 
iter 8464: loss 2.1780, time 5276.83ms 
iter 8465: loss 2.4295, time 5311.21ms 
iter 8466: loss 2.3441, time 5341.28ms 
iter 8467: loss 2.4057, time 5334.68ms 
iter 8468: loss 2.3406, time 5323.39ms 
iter 8469: loss 2.4933, time 5275.02ms 
iter 8470: loss 2.3538, time 5272.13ms 
iter 8471: loss 2.5798, time 5274.59ms 
iter 8472: loss 2.5714, time 5322.91ms 
iter 8473: loss 2.4521, time 5267.80ms 
iter 8474: loss 2.5218, time 5309.69ms 
iter 8475: loss 2.4968, time 5281.14ms 
iter 8476: loss 2.3895, time 5305.96ms 
iter 8477: loss 2.6926, time 5258.35ms 
iter 8478: loss 2.4705, time 5260.62ms 
iter 8479: loss 2.4788, time 5276.49ms 
iter 8480: loss 2.5886, time 5273.39ms 
iter 8481: loss 2.4606, time 5270.31ms 
iter 8482: loss 2.2484, time 5261.93ms 
iter 8483: loss 2.5502, time 5271.12ms 
iter 8484: loss 2.3651, time 5259.06ms 
iter 8485: loss 2.2609, time 5261.34ms 
iter 8486: loss 2.5273, time 5251.95ms 
iter 8487: loss 2.5111, time 5270.72ms 
iter 8488: loss 2.4785, time 5259.59ms 
iter 8489: loss 2.4709, time 5256.79ms 
iter 8490: loss 2.5090, time 5248.41ms 
iter 8491: loss 2.3664, time 5269.34ms 
iter 8492: loss 2.3329, time 5257.44ms 
iter 8493: loss 2.6239, time 5263.31ms 
iter 8494: loss 2.5290, time 5259.18ms 
iter 8495: loss 2.2245, time 5266.93ms 
iter 8496: loss 2.4165, time 5255.74ms 
iter 8497: loss 2.3836, time 5257.93ms 
iter 8498: loss 2.2507, time 5260.65ms 
iter 8499: loss 2.1402, time 5271.23ms 
step 8500: train loss 2.4648, val loss 2.8460
iter 8500: loss 2.7024, time 20031.34ms 
iter 8501: loss 2.3984, time 5266.55ms 
iter 8502: loss 2.3425, time 5263.71ms 
iter 8503: loss 2.4277, time 5261.66ms 
iter 8504: loss 2.3428, time 5295.25ms 
iter 8505: loss 2.6043, time 5296.92ms 
iter 8506: loss 2.5437, time 5252.39ms 
iter 8507: loss 2.3921, time 5260.03ms 
iter 8508: loss 2.4639, time 5260.62ms 
iter 8509: loss 2.4164, time 5265.27ms 
iter 8510: loss 2.3037, time 5250.23ms 
iter 8511: loss 2.3221, time 5296.80ms 
iter 8512: loss 2.5296, time 5269.80ms 
iter 8513: loss 2.3813, time 5251.57ms 
iter 8514: loss 2.7027, time 5248.46ms 
iter 8515: loss 2.5852, time 5246.81ms 
iter 8516: loss 2.5842, time 5276.59ms 
iter 8517: loss 2.4206, time 5254.61ms 
iter 8518: loss 2.5081, time 5252.58ms 
iter 8519: loss 2.3945, time 5248.42ms 
iter 8520: loss 2.5249, time 5252.41ms 
iter 8521: loss 2.3347, time 5254.07ms 
iter 8522: loss 2.5336, time 5249.94ms 
iter 8523: loss 2.1393, time 5252.82ms 
iter 8524: loss 2.4414, time 5250.02ms 
iter 8525: loss 2.2379, time 5264.75ms 
iter 8526: loss 2.5934, time 5219.35ms 
iter 8527: loss 2.4108, time 5249.21ms 
iter 8528: loss 2.6021, time 5248.69ms 
iter 8529: loss 2.4958, time 5264.38ms 
iter 8530: loss 2.4446, time 5258.36ms 
iter 8531: loss 2.5224, time 5248.83ms 
iter 8532: loss 2.5361, time 5256.62ms 
iter 8533: loss 2.4500, time 5252.81ms 
iter 8534: loss 2.7150, time 5258.85ms 
iter 8535: loss 2.5054, time 5256.77ms 
iter 8536: loss 2.4893, time 5252.51ms 
iter 8537: loss 2.5165, time 5266.15ms 
iter 8538: loss 2.4954, time 5276.02ms 
iter 8539: loss 2.3933, time 5364.57ms 
iter 8540: loss 2.6475, time 5266.68ms 
iter 8541: loss 2.2496, time 5255.62ms 
iter 8542: loss 2.5030, time 5268.80ms 
iter 8543: loss 2.5204, time 5249.02ms 
iter 8544: loss 2.6717, time 5261.54ms 
iter 8545: loss 2.3742, time 5216.68ms 
iter 8546: loss 2.6840, time 5250.95ms 
iter 8547: loss 2.4768, time 5259.85ms 
iter 8548: loss 2.5308, time 5257.39ms 
iter 8549: loss 2.5357, time 5257.29ms 
step 8550: train loss 2.4804, val loss 2.8438
iter 8550: loss 2.4545, time 20036.02ms 
iter 8551: loss 2.4670, time 5254.58ms 
iter 8552: loss 2.7197, time 5273.06ms 
iter 8553: loss 2.4223, time 5298.56ms 
iter 8554: loss 2.4507, time 5264.77ms 
iter 8555: loss 2.4565, time 5247.50ms 
iter 8556: loss 2.4263, time 5255.38ms 
iter 8557: loss 2.5107, time 5255.53ms 
iter 8558: loss 2.4555, time 5258.23ms 
iter 8559: loss 2.5142, time 5231.70ms 
iter 8560: loss 2.5537, time 5263.07ms 
iter 8561: loss 2.4007, time 5260.24ms 
iter 8562: loss 2.5028, time 5254.37ms 
iter 8563: loss 2.4610, time 5255.94ms 
iter 8564: loss 2.5004, time 5408.02ms 
iter 8565: loss 2.3741, time 5389.20ms 
iter 8566: loss 2.5263, time 5216.87ms 
iter 8567: loss 2.3804, time 5248.13ms 
iter 8568: loss 2.2450, time 5256.61ms 
iter 8569: loss 2.0754, time 5336.66ms 
iter 8570: loss 2.4045, time 5247.34ms 
iter 8571: loss 2.3437, time 5249.12ms 
iter 8572: loss 2.5583, time 5325.21ms 
iter 8573: loss 2.4682, time 5262.23ms 
iter 8574: loss 2.5153, time 5247.86ms 
iter 8575: loss 2.5101, time 5331.29ms 
iter 8576: loss 2.4283, time 5405.89ms 
iter 8577: loss 2.3635, time 5259.47ms 
iter 8578: loss 2.7275, time 5255.26ms 
iter 8579: loss 2.3155, time 5251.47ms 
iter 8580: loss 2.2916, time 5295.58ms 
iter 8581: loss 2.5670, time 5334.10ms 
iter 8582: loss 2.3281, time 5333.32ms 
iter 8583: loss 2.4990, time 5311.52ms 
iter 8584: loss 2.5475, time 5330.92ms 
iter 8585: loss 2.4285, time 5324.70ms 
iter 8586: loss 2.3626, time 5329.29ms 
iter 8587: loss 2.6043, time 5399.38ms 
iter 8588: loss 2.4236, time 5395.08ms 
iter 8589: loss 2.5479, time 5384.23ms 
iter 8590: loss 2.5949, time 5262.73ms 
iter 8591: loss 2.3278, time 5329.07ms 
iter 8592: loss 2.3874, time 5283.66ms 
iter 8593: loss 2.2495, time 5244.39ms 
iter 8594: loss 2.4350, time 5245.89ms 
iter 8595: loss 2.5811, time 5261.16ms 
iter 8596: loss 2.5037, time 5248.07ms 
iter 8597: loss 2.2903, time 5245.64ms 
iter 8598: loss 2.6449, time 5256.27ms 
iter 8599: loss 2.5245, time 5272.29ms 
step 8600: train loss 2.4735, val loss 2.8439
iter 8600: loss 2.3060, time 19992.61ms 
iter 8601: loss 2.6330, time 5249.82ms 
iter 8602: loss 2.4854, time 5252.70ms 
iter 8603: loss 2.5804, time 5255.19ms 
iter 8604: loss 2.4270, time 5252.64ms 
iter 8605: loss 2.5730, time 5255.89ms 
iter 8606: loss 2.3737, time 5273.82ms 
iter 8607: loss 2.4608, time 5338.33ms 
iter 8608: loss 2.5330, time 5325.68ms 
iter 8609: loss 2.4747, time 5339.80ms 
iter 8610: loss 2.5565, time 5280.98ms 
iter 8611: loss 2.4646, time 5231.55ms 
iter 8612: loss 2.4319, time 5254.33ms 
iter 8613: loss 2.4120, time 5265.36ms 
iter 8614: loss 2.4256, time 5256.39ms 
iter 8615: loss 2.3331, time 5256.56ms 
iter 8616: loss 2.5830, time 5252.68ms 
iter 8617: loss 2.4939, time 5300.53ms 
iter 8618: loss 2.4814, time 5267.70ms 
iter 8619: loss 2.4002, time 5308.48ms 
iter 8620: loss 2.4847, time 5301.35ms 
iter 8621: loss 2.3151, time 5340.15ms 
iter 8622: loss 2.4638, time 5265.66ms 
iter 8623: loss 2.4378, time 5263.47ms 
iter 8624: loss 2.5361, time 5267.66ms 
iter 8625: loss 2.3407, time 5263.46ms 
iter 8626: loss 2.1956, time 5259.73ms 
iter 8627: loss 2.6765, time 5261.05ms 
iter 8628: loss 2.5740, time 5264.12ms 
iter 8629: loss 2.2697, time 5261.33ms 
iter 8630: loss 2.6381, time 5250.39ms 
iter 8631: loss 2.5851, time 5288.45ms 
iter 8632: loss 2.4220, time 5281.41ms 
iter 8633: loss 2.4090, time 5322.62ms 
iter 8634: loss 2.3847, time 5319.20ms 
iter 8635: loss 2.6685, time 5302.99ms 
iter 8636: loss 2.4566, time 5295.68ms 
iter 8637: loss 2.7673, time 5277.58ms 
iter 8638: loss 2.3680, time 5254.18ms 
iter 8639: loss 2.4868, time 5261.35ms 
iter 8640: loss 2.7450, time 5301.48ms 
iter 8641: loss 2.5992, time 5276.13ms 
iter 8642: loss 2.5862, time 5258.00ms 
iter 8643: loss 2.4801, time 5268.30ms 
iter 8644: loss 2.6001, time 5283.82ms 
iter 8645: loss 2.5592, time 5310.67ms 
iter 8646: loss 2.5813, time 5269.36ms 
iter 8647: loss 2.4209, time 5257.25ms 
iter 8648: loss 2.5608, time 5260.19ms 
iter 8649: loss 2.3590, time 5249.13ms 
step 8650: train loss 2.4757, val loss 2.8349
iter 8650: loss 2.3122, time 20023.34ms 
iter 8651: loss 2.5990, time 5272.21ms 
iter 8652: loss 2.3321, time 5264.16ms 
iter 8653: loss 2.6390, time 5266.59ms 
iter 8654: loss 2.6401, time 5262.56ms 
iter 8655: loss 2.6152, time 5275.70ms 
iter 8656: loss 2.3233, time 5265.18ms 
iter 8657: loss 2.4117, time 5255.79ms 
iter 8658: loss 2.4774, time 5272.28ms 
iter 8659: loss 2.2707, time 5262.95ms 
iter 8660: loss 2.4749, time 5251.36ms 
iter 8661: loss 2.3505, time 5260.05ms 
iter 8662: loss 2.6299, time 5265.52ms 
iter 8663: loss 2.2032, time 5268.46ms 
iter 8664: loss 2.6578, time 5262.25ms 
iter 8665: loss 2.5002, time 5263.14ms 
iter 8666: loss 2.5604, time 5256.96ms 
iter 8667: loss 2.6104, time 5271.05ms 
iter 8668: loss 2.6330, time 5256.83ms 
iter 8669: loss 2.3377, time 5250.88ms 
iter 8670: loss 2.6089, time 5260.77ms 
iter 8671: loss 2.4007, time 5256.75ms 
iter 8672: loss 2.5911, time 5250.53ms 
iter 8673: loss 2.6676, time 5250.48ms 
iter 8674: loss 2.3345, time 5254.82ms 
iter 8675: loss 2.1986, time 5260.19ms 
iter 8676: loss 2.2591, time 5305.73ms 
iter 8677: loss 2.3567, time 5300.13ms 
iter 8678: loss 2.4819, time 5276.53ms 
iter 8679: loss 2.5164, time 5277.28ms 
iter 8680: loss 2.2296, time 5328.38ms 
iter 8681: loss 2.6336, time 5334.59ms 
iter 8682: loss 2.4634, time 5305.97ms 
iter 8683: loss 2.4671, time 5283.26ms 
iter 8684: loss 2.3357, time 5260.19ms 
iter 8685: loss 2.5528, time 5262.36ms 
iter 8686: loss 2.4116, time 5266.96ms 
iter 8687: loss 2.5938, time 5265.47ms 
iter 8688: loss 2.6064, time 5266.29ms 
iter 8689: loss 2.4878, time 5271.91ms 
iter 8690: loss 2.4472, time 5256.56ms 
iter 8691: loss 2.3671, time 5285.66ms 
iter 8692: loss 2.8120, time 5280.53ms 
iter 8693: loss 2.5772, time 5252.26ms 
iter 8694: loss 2.4178, time 5283.01ms 
iter 8695: loss 2.4803, time 5320.55ms 
iter 8696: loss 2.4709, time 5264.36ms 
iter 8697: loss 2.4427, time 5298.08ms 
iter 8698: loss 2.5821, time 5327.00ms 
iter 8699: loss 2.5027, time 5264.72ms 
step 8700: train loss 2.4860, val loss 2.8581
iter 8700: loss 2.4848, time 20031.74ms 
iter 8701: loss 2.4708, time 5259.44ms 
iter 8702: loss 2.3820, time 5255.47ms 
iter 8703: loss 2.5343, time 5267.66ms 
iter 8704: loss 2.4466, time 5267.75ms 
iter 8705: loss 2.5339, time 5267.80ms 
iter 8706: loss 2.4689, time 5267.55ms 
iter 8707: loss 2.5962, time 5267.77ms 
iter 8708: loss 2.3216, time 5270.37ms 
iter 8709: loss 2.4837, time 5256.71ms 
iter 8710: loss 2.5510, time 5263.49ms 
iter 8711: loss 2.3544, time 5231.52ms 
iter 8712: loss 2.2248, time 5256.80ms 
iter 8713: loss 2.5379, time 5247.65ms 
iter 8714: loss 2.5375, time 5259.39ms 
iter 8715: loss 2.3579, time 5269.69ms 
iter 8716: loss 2.4313, time 5258.41ms 
iter 8717: loss 2.4338, time 5267.69ms 
iter 8718: loss 2.3695, time 5260.34ms 
iter 8719: loss 2.5914, time 5268.42ms 
iter 8720: loss 2.2218, time 5270.94ms 
iter 8721: loss 2.4706, time 5255.93ms 
iter 8722: loss 2.3526, time 5256.99ms 
iter 8723: loss 2.4415, time 5281.22ms 
iter 8724: loss 2.4735, time 5247.32ms 
iter 8725: loss 2.5448, time 5244.07ms 
iter 8726: loss 2.2801, time 5262.85ms 
iter 8727: loss 2.4397, time 5262.35ms 
iter 8728: loss 2.2172, time 5258.57ms 
iter 8729: loss 2.6032, time 5257.29ms 
iter 8730: loss 2.3631, time 5262.37ms 
iter 8731: loss 2.2859, time 5270.04ms 
iter 8732: loss 2.4244, time 5263.15ms 
iter 8733: loss 2.5129, time 5239.02ms 
iter 8734: loss 2.3361, time 5245.31ms 
iter 8735: loss 2.6342, time 5262.39ms 
iter 8736: loss 2.2495, time 5252.37ms 
iter 8737: loss 2.5606, time 5257.76ms 
iter 8738: loss 2.3349, time 5263.77ms 
iter 8739: loss 2.6486, time 5275.11ms 
iter 8740: loss 2.3507, time 5262.11ms 
iter 8741: loss 2.3394, time 5252.54ms 
iter 8742: loss 2.5596, time 5269.36ms 
iter 8743: loss 2.6564, time 5260.70ms 
iter 8744: loss 2.5709, time 5252.12ms 
iter 8745: loss 2.5007, time 5226.64ms 
iter 8746: loss 2.3943, time 5259.09ms 
iter 8747: loss 2.4054, time 5250.46ms 
iter 8748: loss 2.4582, time 5253.68ms 
iter 8749: loss 2.4894, time 5267.74ms 
step 8750: train loss 2.4549, val loss 2.8458
iter 8750: loss 2.3953, time 20015.71ms 
iter 8751: loss 2.4780, time 5245.68ms 
iter 8752: loss 2.5050, time 5252.30ms 
iter 8753: loss 2.4100, time 5264.36ms 
iter 8754: loss 2.4805, time 5247.21ms 
iter 8755: loss 2.4220, time 5247.52ms 
iter 8756: loss 2.3641, time 5252.40ms 
iter 8757: loss 2.6163, time 5261.88ms 
iter 8758: loss 2.6157, time 5256.16ms 
iter 8759: loss 2.4740, time 5256.06ms 
iter 8760: loss 2.5120, time 5261.34ms 
iter 8761: loss 2.2565, time 5232.23ms 
iter 8762: loss 2.5496, time 5246.90ms 
iter 8763: loss 2.6086, time 5247.69ms 
iter 8764: loss 2.3860, time 5262.17ms 
iter 8765: loss 2.6213, time 5249.84ms 
iter 8766: loss 2.6029, time 5250.65ms 
iter 8767: loss 2.3297, time 5264.41ms 
iter 8768: loss 2.5846, time 5264.09ms 
iter 8769: loss 2.4680, time 5256.24ms 
iter 8770: loss 2.5909, time 5260.87ms 
iter 8771: loss 2.3925, time 5262.91ms 
iter 8772: loss 2.5144, time 5247.49ms 
iter 8773: loss 2.4567, time 5225.36ms 
iter 8774: loss 2.3769, time 5258.78ms 
iter 8775: loss 2.4330, time 5269.34ms 
iter 8776: loss 2.4312, time 5257.48ms 
iter 8777: loss 2.5621, time 5250.86ms 
iter 8778: loss 2.6277, time 5250.32ms 
iter 8779: loss 2.3149, time 5269.65ms 
iter 8780: loss 2.5981, time 5252.50ms 
iter 8781: loss 2.3448, time 5254.10ms 
iter 8782: loss 2.4182, time 5257.65ms 
iter 8783: loss 2.2575, time 5260.68ms 
iter 8784: loss 2.6981, time 5259.65ms 
iter 8785: loss 2.5594, time 5261.02ms 
iter 8786: loss 2.4216, time 5258.21ms 
iter 8787: loss 2.6739, time 5268.22ms 
iter 8788: loss 2.1568, time 5261.61ms 
iter 8789: loss 2.3500, time 5253.88ms 
iter 8790: loss 2.5002, time 5256.21ms 
iter 8791: loss 2.3956, time 5262.97ms 
iter 8792: loss 2.6234, time 5249.60ms 
iter 8793: loss 2.5413, time 5251.64ms 
iter 8794: loss 2.5235, time 5248.37ms 
iter 8795: loss 2.5425, time 5221.59ms 
iter 8796: loss 2.5419, time 5248.70ms 
iter 8797: loss 2.5728, time 5255.50ms 
iter 8798: loss 2.4580, time 5258.44ms 
iter 8799: loss 2.4963, time 5253.19ms 
step 8800: train loss 2.4623, val loss 2.8359
iter 8800: loss 2.5558, time 19979.24ms 
iter 8801: loss 2.2393, time 5248.46ms 
iter 8802: loss 2.3915, time 5260.46ms 
iter 8803: loss 2.4160, time 5249.30ms 
iter 8804: loss 2.5644, time 5234.17ms 
iter 8805: loss 2.3513, time 5248.42ms 
iter 8806: loss 2.3321, time 5260.56ms 
iter 8807: loss 2.3930, time 5259.73ms 
iter 8808: loss 2.4104, time 5248.68ms 
iter 8809: loss 2.7437, time 5252.26ms 
iter 8810: loss 2.4896, time 5263.50ms 
iter 8811: loss 2.4369, time 5259.34ms 
iter 8812: loss 2.3087, time 5250.96ms 
iter 8813: loss 2.4503, time 5257.33ms 
iter 8814: loss 2.5221, time 5265.03ms 
iter 8815: loss 2.3444, time 5276.75ms 
iter 8816: loss 2.6185, time 5258.66ms 
iter 8817: loss 2.5525, time 5250.15ms 
iter 8818: loss 2.2510, time 5247.81ms 
iter 8819: loss 2.4669, time 5200.59ms 
iter 8820: loss 2.3653, time 5259.47ms 
iter 8821: loss 2.4199, time 5247.65ms 
iter 8822: loss 2.3397, time 5244.78ms 
iter 8823: loss 2.2691, time 4987.07ms 
iter 8824: loss 2.5736, time 5036.05ms 
iter 8825: loss 2.5553, time 4983.99ms 
iter 8826: loss 2.4135, time 5033.48ms 
iter 8827: loss 2.2672, time 5016.68ms 
iter 8828: loss 2.3353, time 4986.74ms 
iter 8829: loss 2.2558, time 4982.51ms 
iter 8830: loss 2.6571, time 5011.87ms 
iter 8831: loss 2.6951, time 5037.52ms 
iter 8832: loss 2.3725, time 5243.33ms 
iter 8833: loss 2.5431, time 5246.13ms 
iter 8834: loss 2.6749, time 5248.43ms 
iter 8835: loss 2.4476, time 5242.67ms 
iter 8836: loss 2.5758, time 5254.77ms 
iter 8837: loss 2.4547, time 5251.95ms 
iter 8838: loss 2.5212, time 5257.44ms 
iter 8839: loss 2.5899, time 5253.72ms 
iter 8840: loss 2.4319, time 5241.93ms 
iter 8841: loss 2.6125, time 5248.76ms 
iter 8842: loss 2.4444, time 5254.15ms 
iter 8843: loss 2.5493, time 5252.31ms 
iter 8844: loss 2.3023, time 5252.04ms 
iter 8845: loss 2.4702, time 5218.46ms 
iter 8846: loss 2.4873, time 5247.14ms 
iter 8847: loss 2.2603, time 5242.43ms 
iter 8848: loss 2.5103, time 5245.52ms 
iter 8849: loss 2.4048, time 5246.20ms 
step 8850: train loss 2.4546, val loss 2.8512
iter 8850: loss 2.4695, time 19998.11ms 
iter 8851: loss 2.5078, time 5214.68ms 
iter 8852: loss 2.5756, time 5239.56ms 
iter 8853: loss 2.3704, time 5250.62ms 
iter 8854: loss 2.5494, time 5261.36ms 
iter 8855: loss 2.4408, time 5215.35ms 
iter 8856: loss 2.4179, time 5221.47ms 
iter 8857: loss 2.5340, time 5257.99ms 
iter 8858: loss 2.3404, time 5259.61ms 
iter 8859: loss 2.4231, time 5244.69ms 
iter 8860: loss 2.4618, time 5253.10ms 
iter 8861: loss 2.5977, time 5244.31ms 
iter 8862: loss 2.5290, time 5260.45ms 
iter 8863: loss 2.4051, time 5236.48ms 
iter 8864: loss 2.5797, time 5215.71ms 
iter 8865: loss 2.4202, time 5207.65ms 
iter 8866: loss 2.2605, time 5245.25ms 
iter 8867: loss 2.5219, time 5239.62ms 
iter 8868: loss 2.2400, time 5241.70ms 
iter 8869: loss 2.2660, time 5241.43ms 
iter 8870: loss 2.5886, time 5263.87ms 
iter 8871: loss 2.5643, time 5240.22ms 
iter 8872: loss 2.4264, time 5243.52ms 
iter 8873: loss 2.6599, time 5247.43ms 
iter 8874: loss 2.4420, time 5256.14ms 
iter 8875: loss 2.6151, time 5248.65ms 
iter 8876: loss 2.5739, time 5254.51ms 
iter 8877: loss 2.4673, time 5261.10ms 
iter 8878: loss 2.3682, time 5267.02ms 
iter 8879: loss 2.3973, time 5253.99ms 
iter 8880: loss 2.3183, time 5254.46ms 
iter 8881: loss 2.3575, time 5261.27ms 
iter 8882: loss 2.6313, time 5263.69ms 
iter 8883: loss 2.4896, time 5247.70ms 
iter 8884: loss 2.4257, time 5256.50ms 
iter 8885: loss 2.6767, time 5264.60ms 
iter 8886: loss 2.4114, time 5264.05ms 
iter 8887: loss 2.5106, time 5245.85ms 
iter 8888: loss 2.7232, time 5260.61ms 
iter 8889: loss 2.6059, time 5270.42ms 
iter 8890: loss 2.4368, time 5271.55ms 
iter 8891: loss 2.3550, time 5256.07ms 
iter 8892: loss 2.4230, time 5246.32ms 
iter 8893: loss 2.5209, time 5255.64ms 
iter 8894: loss 2.2817, time 5264.94ms 
iter 8895: loss 2.5921, time 5246.24ms 
iter 8896: loss 2.5754, time 5245.30ms 
iter 8897: loss 2.5547, time 5250.19ms 
iter 8898: loss 2.6048, time 5265.28ms 
iter 8899: loss 2.4121, time 5252.05ms 
step 8900: train loss 2.4524, val loss 2.8316
iter 8900: loss 2.4416, time 19980.86ms 
iter 8901: loss 2.6149, time 5254.60ms 
iter 8902: loss 2.4646, time 5270.99ms 
iter 8903: loss 2.5457, time 5254.38ms 
iter 8904: loss 2.4848, time 5254.60ms 
iter 8905: loss 2.9907, time 5262.27ms 
iter 8906: loss 2.4221, time 5277.84ms 
iter 8907: loss 2.6400, time 5254.11ms 
iter 8908: loss 2.5649, time 5247.95ms 
iter 8909: loss 2.1752, time 5250.58ms 
iter 8910: loss 2.5677, time 5272.00ms 
iter 8911: loss 2.5056, time 5265.46ms 
iter 8912: loss 2.5550, time 5248.43ms 
iter 8913: loss 2.4853, time 5251.16ms 
iter 8914: loss 2.3495, time 5271.21ms 
iter 8915: loss 2.5489, time 5251.18ms 
iter 8916: loss 2.5881, time 5249.25ms 
iter 8917: loss 2.4327, time 5246.88ms 
iter 8918: loss 2.4800, time 5260.06ms 
iter 8919: loss 2.2695, time 5257.77ms 
iter 8920: loss 2.4368, time 5248.79ms 
iter 8921: loss 2.4393, time 5251.59ms 
iter 8922: loss 2.6237, time 5262.26ms 
iter 8923: loss 2.5371, time 5258.95ms 
iter 8924: loss 2.1602, time 5251.88ms 
iter 8925: loss 2.6089, time 5256.94ms 
iter 8926: loss 2.4544, time 5268.65ms 
iter 8927: loss 2.5086, time 5264.57ms 
iter 8928: loss 2.3609, time 5254.71ms 
iter 8929: loss 2.6128, time 5254.68ms 
iter 8930: loss 2.4753, time 5256.59ms 
iter 8931: loss 2.3929, time 5277.64ms 
iter 8932: loss 2.6389, time 5256.73ms 
iter 8933: loss 2.7541, time 5256.21ms 
iter 8934: loss 2.5749, time 5252.53ms 
iter 8935: loss 2.4707, time 5262.02ms 
iter 8936: loss 2.4066, time 5252.26ms 
iter 8937: loss 2.3846, time 5250.82ms 
iter 8938: loss 2.5380, time 5237.68ms 
iter 8939: loss 2.4997, time 5255.46ms 
iter 8940: loss 2.5789, time 5095.51ms 
iter 8941: loss 2.3467, time 5132.06ms 
iter 8942: loss 2.2479, time 5259.74ms 
iter 8943: loss 2.5240, time 5261.88ms 
iter 8944: loss 2.4098, time 5229.96ms 
iter 8945: loss 2.1589, time 5256.59ms 
iter 8946: loss 2.3896, time 5261.90ms 
iter 8947: loss 2.6185, time 5249.64ms 
iter 8948: loss 2.2887, time 5265.13ms 
iter 8949: loss 2.3515, time 5245.45ms 
step 8950: train loss 2.4595, val loss 2.8376
iter 8950: loss 2.5315, time 19999.40ms 
iter 8951: loss 2.4628, time 5248.06ms 
iter 8952: loss 2.4447, time 5259.53ms 
iter 8953: loss 2.2708, time 5252.58ms 
iter 8954: loss 2.3019, time 5251.13ms 
iter 8955: loss 2.4388, time 5251.38ms 
iter 8956: loss 2.4232, time 5256.90ms 
iter 8957: loss 2.3816, time 5265.37ms 
iter 8958: loss 2.5787, time 5249.28ms 
iter 8959: loss 2.3520, time 5254.89ms 
iter 8960: loss 2.6649, time 5261.88ms 
iter 8961: loss 2.3060, time 5261.21ms 
iter 8962: loss 2.4792, time 5257.89ms 
iter 8963: loss 2.7218, time 5254.16ms 
iter 8964: loss 2.4727, time 5256.66ms 
iter 8965: loss 2.6078, time 5281.44ms 
iter 8966: loss 2.5352, time 5277.17ms 
iter 8967: loss 2.3124, time 5253.39ms 
iter 8968: loss 2.2079, time 5252.67ms 
iter 8969: loss 2.4332, time 5266.22ms 
iter 8970: loss 2.5248, time 5247.92ms 
iter 8971: loss 2.5136, time 5253.42ms 
iter 8972: loss 2.5606, time 5274.55ms 
iter 8973: loss 2.4539, time 5263.06ms 
iter 8974: loss 2.6648, time 5263.30ms 
iter 8975: loss 2.3554, time 5250.52ms 
iter 8976: loss 2.6114, time 5260.20ms 
iter 8977: loss 2.4702, time 5259.78ms 
iter 8978: loss 2.6529, time 5256.62ms 
iter 8979: loss 2.4081, time 5264.37ms 
iter 8980: loss 2.6535, time 5275.13ms 
iter 8981: loss 2.2868, time 5271.14ms 
iter 8982: loss 2.5662, time 5283.97ms 
iter 8983: loss 2.5101, time 5264.14ms 
iter 8984: loss 2.4944, time 5266.81ms 
iter 8985: loss 2.4235, time 5270.96ms 
iter 8986: loss 2.4655, time 5310.87ms 
iter 8987: loss 2.4652, time 5274.83ms 
iter 8988: loss 2.4493, time 5267.41ms 
iter 8989: loss 2.3762, time 5257.13ms 
iter 8990: loss 2.5925, time 5249.20ms 
iter 8991: loss 2.5249, time 5257.68ms 
iter 8992: loss 2.5423, time 5260.12ms 
iter 8993: loss 2.4125, time 5106.61ms 
iter 8994: loss 2.4046, time 5015.81ms 
iter 8995: loss 2.4561, time 5248.54ms 
iter 8996: loss 2.4962, time 5247.11ms 
iter 8997: loss 2.5542, time 5246.42ms 
iter 8998: loss 2.3994, time 5257.39ms 
iter 8999: loss 2.7154, time 5262.09ms 
step 9000: train loss 2.4484, val loss 2.8504
iter 9000: loss 2.3720, time 20032.08ms 
iter 9001: loss 2.4801, time 5247.79ms 
iter 9002: loss 2.3018, time 5247.34ms 
iter 9003: loss 2.4427, time 5257.83ms 
iter 9004: loss 2.4860, time 5249.32ms 
iter 9005: loss 2.3506, time 5250.90ms 
iter 9006: loss 2.6122, time 5257.64ms 
iter 9007: loss 2.5241, time 5256.07ms 
iter 9008: loss 2.5568, time 5250.25ms 
iter 9009: loss 2.6677, time 5252.01ms 
iter 9010: loss 2.6371, time 5260.36ms 
iter 9011: loss 2.5022, time 5257.49ms 
iter 9012: loss 2.4444, time 5262.28ms 
iter 9013: loss 2.2686, time 5266.49ms 
iter 9014: loss 2.5879, time 5260.92ms 
iter 9015: loss 2.2324, time 5264.86ms 
iter 9016: loss 2.3630, time 5253.26ms 
iter 9017: loss 2.2679, time 5274.40ms 
iter 9018: loss 2.5172, time 5257.71ms 
iter 9019: loss 2.3958, time 5275.97ms 
iter 9020: loss 2.5656, time 5262.64ms 
iter 9021: loss 2.4738, time 5259.73ms 
iter 9022: loss 2.3627, time 5269.01ms 
iter 9023: loss 2.5131, time 5260.05ms 
iter 9024: loss 2.5167, time 5256.51ms 
iter 9025: loss 2.5187, time 5250.30ms 
iter 9026: loss 2.3684, time 5261.82ms 
iter 9027: loss 2.4504, time 5153.11ms 
iter 9028: loss 2.5612, time 5257.13ms 
iter 9029: loss 2.4867, time 5217.83ms 
iter 9030: loss 2.5427, time 5012.44ms 
iter 9031: loss 2.7127, time 5127.33ms 
iter 9032: loss 2.2915, time 5252.78ms 
iter 9033: loss 2.1892, time 5252.63ms 
iter 9034: loss 2.3945, time 5266.64ms 
iter 9035: loss 2.4518, time 5141.59ms 
iter 9036: loss 2.4099, time 5015.28ms 
iter 9037: loss 2.5048, time 5258.25ms 
iter 9038: loss 2.6536, time 5259.67ms 
iter 9039: loss 2.4660, time 5254.41ms 
iter 9040: loss 2.6646, time 5233.93ms 
iter 9041: loss 2.4829, time 5268.46ms 
iter 9042: loss 2.4212, time 5255.54ms 
iter 9043: loss 2.6127, time 5254.74ms 
iter 9044: loss 2.3438, time 5252.41ms 
iter 9045: loss 2.4810, time 5263.08ms 
iter 9046: loss 2.4116, time 5253.26ms 
iter 9047: loss 2.5453, time 5246.58ms 
iter 9048: loss 2.3810, time 5254.46ms 
iter 9049: loss 2.5316, time 5227.90ms 
step 9050: train loss 2.4688, val loss 2.8607
iter 9050: loss 2.4044, time 19997.97ms 
iter 9051: loss 2.2981, time 5249.53ms 
iter 9052: loss 2.6569, time 5265.44ms 
iter 9053: loss 2.5271, time 5259.82ms 
iter 9054: loss 2.5147, time 5255.60ms 
iter 9055: loss 2.6547, time 5254.36ms 
iter 9056: loss 2.4871, time 5269.54ms 
iter 9057: loss 2.5773, time 5250.98ms 
iter 9058: loss 2.4857, time 5267.03ms 
iter 9059: loss 2.4219, time 5234.04ms 
iter 9060: loss 2.3742, time 5274.92ms 
iter 9061: loss 2.7276, time 5272.31ms 
iter 9062: loss 2.3448, time 5257.87ms 
iter 9063: loss 2.5745, time 5262.68ms 
iter 9064: loss 2.3467, time 5273.29ms 
iter 9065: loss 2.5310, time 5265.40ms 
iter 9066: loss 2.3567, time 5264.61ms 
iter 9067: loss 2.3574, time 5256.51ms 
iter 9068: loss 2.6428, time 5273.04ms 
iter 9069: loss 2.3736, time 5257.45ms 
iter 9070: loss 2.5665, time 5249.87ms 
iter 9071: loss 2.4230, time 5253.75ms 
iter 9072: loss 2.4751, time 5269.38ms 
iter 9073: loss 2.5338, time 5254.10ms 
iter 9074: loss 2.5192, time 5262.80ms 
iter 9075: loss 2.3713, time 5260.19ms 
iter 9076: loss 2.5683, time 5270.79ms 
iter 9077: loss 2.7438, time 5265.62ms 
iter 9078: loss 2.5180, time 5257.90ms 
iter 9079: loss 2.6300, time 5249.28ms 
iter 9080: loss 2.6181, time 5266.12ms 
iter 9081: loss 2.6310, time 5249.99ms 
iter 9082: loss 2.3608, time 5259.34ms 
iter 9083: loss 2.5005, time 5262.31ms 
iter 9084: loss 2.3800, time 5261.20ms 
iter 9085: loss 2.4522, time 5256.35ms 
iter 9086: loss 2.4887, time 5257.55ms 
iter 9087: loss 2.4096, time 5257.30ms 
iter 9088: loss 2.2249, time 5269.36ms 
iter 9089: loss 2.4952, time 5261.31ms 
iter 9090: loss 2.5511, time 5255.87ms 
iter 9091: loss 2.2660, time 5251.71ms 
iter 9092: loss 2.4355, time 5266.76ms 
iter 9093: loss 2.4975, time 5253.51ms 
iter 9094: loss 2.3229, time 5250.28ms 
iter 9095: loss 2.4305, time 5247.21ms 
iter 9096: loss 2.5552, time 5258.34ms 
iter 9097: loss 2.3441, time 5253.98ms 
iter 9098: loss 2.4926, time 5248.62ms 
iter 9099: loss 2.8621, time 5240.91ms 
step 9100: train loss 2.4500, val loss 2.8482
iter 9100: loss 2.5787, time 20022.24ms 
iter 9101: loss 2.5264, time 5255.12ms 
iter 9102: loss 2.6208, time 5256.69ms 
iter 9103: loss 2.4215, time 5253.31ms 
iter 9104: loss 2.3493, time 5252.03ms 
iter 9105: loss 2.4460, time 5264.40ms 
iter 9106: loss 2.4055, time 5271.12ms 
iter 9107: loss 2.4309, time 5249.76ms 
iter 9108: loss 2.4321, time 5240.94ms 
iter 9109: loss 2.6502, time 5262.02ms 
iter 9110: loss 2.4843, time 5276.75ms 
iter 9111: loss 2.3001, time 5258.10ms 
iter 9112: loss 2.5540, time 5254.71ms 
iter 9113: loss 2.5970, time 5248.04ms 
iter 9114: loss 2.4993, time 5266.24ms 
iter 9115: loss 2.5396, time 5220.60ms 
iter 9116: loss 2.5588, time 5249.34ms 
iter 9117: loss 2.5020, time 5213.30ms 
iter 9118: loss 2.3624, time 5270.17ms 
iter 9119: loss 2.5454, time 5247.90ms 
iter 9120: loss 2.4054, time 5247.81ms 
iter 9121: loss 2.3694, time 5261.96ms 
iter 9122: loss 2.3340, time 5249.58ms 
iter 9123: loss 2.3805, time 5245.11ms 
iter 9124: loss 2.2099, time 5254.99ms 
iter 9125: loss 2.4012, time 5253.22ms 
iter 9126: loss 2.5655, time 5260.06ms 
iter 9127: loss 2.4971, time 5249.66ms 
iter 9128: loss 2.4127, time 5248.03ms 
iter 9129: loss 2.4744, time 5255.43ms 
iter 9130: loss 2.4647, time 5259.66ms 
iter 9131: loss 2.4363, time 5248.67ms 
iter 9132: loss 2.3659, time 5257.40ms 
iter 9133: loss 2.3274, time 5262.66ms 
iter 9134: loss 2.3833, time 5255.86ms 
iter 9135: loss 2.3824, time 5255.23ms 
iter 9136: loss 2.4465, time 5257.07ms 
iter 9137: loss 2.3265, time 5277.09ms 
iter 9138: loss 2.5958, time 5257.47ms 
iter 9139: loss 2.5036, time 5257.97ms 
iter 9140: loss 2.6546, time 5260.57ms 
iter 9141: loss 2.5953, time 5266.76ms 
iter 9142: loss 2.2175, time 5238.66ms 
iter 9143: loss 2.3950, time 5264.15ms 
iter 9144: loss 2.4868, time 5274.19ms 
iter 9145: loss 2.6815, time 5281.54ms 
iter 9146: loss 2.6316, time 5263.78ms 
iter 9147: loss 2.4315, time 5253.09ms 
iter 9148: loss 2.3815, time 5270.37ms 
iter 9149: loss 2.2940, time 5252.71ms 
step 9150: train loss 2.4667, val loss 2.8493
iter 9150: loss 2.4390, time 20022.77ms 
iter 9151: loss 2.6198, time 5255.61ms 
iter 9152: loss 2.4524, time 5255.88ms 
iter 9153: loss 2.4445, time 5247.31ms 
iter 9154: loss 2.3299, time 5250.80ms 
iter 9155: loss 2.5457, time 5234.19ms 
iter 9156: loss 2.5077, time 5268.21ms 
iter 9157: loss 2.3597, time 5249.91ms 
iter 9158: loss 2.5028, time 5250.92ms 
iter 9159: loss 2.6566, time 5254.78ms 
iter 9160: loss 2.3289, time 5264.12ms 
iter 9161: loss 2.7199, time 5212.54ms 
iter 9162: loss 2.4622, time 5235.97ms 
iter 9163: loss 2.2973, time 5264.20ms 
iter 9164: loss 2.3808, time 5257.27ms 
iter 9165: loss 2.4006, time 5253.98ms 
iter 9166: loss 2.4664, time 5248.95ms 
iter 9167: loss 2.6428, time 5257.90ms 
iter 9168: loss 2.3882, time 5261.39ms 
iter 9169: loss 2.5059, time 5258.33ms 
iter 9170: loss 2.4936, time 5258.58ms 
iter 9171: loss 2.5299, time 5266.45ms 
iter 9172: loss 2.4550, time 5249.04ms 
iter 9173: loss 2.2847, time 5253.20ms 
iter 9174: loss 2.4386, time 5265.57ms 
iter 9175: loss 2.2373, time 5259.25ms 
iter 9176: loss 2.3997, time 5253.55ms 
iter 9177: loss 2.3963, time 5235.86ms 
iter 9178: loss 2.5794, time 5266.86ms 
iter 9179: loss 2.3272, time 5258.09ms 
iter 9180: loss 2.4553, time 5223.46ms 
iter 9181: loss 2.5084, time 5260.26ms 
iter 9182: loss 2.4282, time 5265.74ms 
iter 9183: loss 2.5675, time 5258.57ms 
iter 9184: loss 2.4508, time 5250.99ms 
iter 9185: loss 2.7589, time 5260.05ms 
iter 9186: loss 2.5375, time 5260.87ms 
iter 9187: loss 2.4623, time 5250.32ms 
iter 9188: loss 2.3672, time 5250.13ms 
iter 9189: loss 2.5728, time 5274.04ms 
iter 9190: loss 2.3145, time 5271.73ms 
iter 9191: loss 2.5295, time 5265.54ms 
iter 9192: loss 2.3640, time 5262.99ms 
iter 9193: loss 2.5102, time 5265.41ms 
iter 9194: loss 2.4762, time 5258.41ms 
iter 9195: loss 2.6082, time 5254.90ms 
iter 9196: loss 2.4368, time 5265.09ms 
iter 9197: loss 2.3146, time 5259.40ms 
iter 9198: loss 2.3010, time 5255.02ms 
iter 9199: loss 2.5870, time 5250.25ms 
step 9200: train loss 2.4628, val loss 2.8417
iter 9200: loss 2.5359, time 20033.86ms 
iter 9201: loss 2.4611, time 5263.02ms 
iter 9202: loss 2.2359, time 5216.19ms 
iter 9203: loss 2.5689, time 5248.37ms 
iter 9204: loss 2.6374, time 5262.77ms 
iter 9205: loss 2.3916, time 5268.85ms 
iter 9206: loss 2.4480, time 5244.37ms 
iter 9207: loss 2.4204, time 5250.14ms 
iter 9208: loss 2.4323, time 5264.08ms 
iter 9209: loss 2.4197, time 5252.89ms 
iter 9210: loss 2.4969, time 5252.88ms 
iter 9211: loss 2.3070, time 5261.05ms 
iter 9212: loss 2.4558, time 5266.50ms 
iter 9213: loss 2.3188, time 5254.05ms 
iter 9214: loss 2.4574, time 5224.60ms 
iter 9215: loss 2.5073, time 5248.67ms 
iter 9216: loss 2.7117, time 5264.07ms 
iter 9217: loss 2.2740, time 5255.57ms 
iter 9218: loss 2.4617, time 5251.08ms 
iter 9219: loss 2.1979, time 5257.72ms 
iter 9220: loss 2.6076, time 5268.72ms 
iter 9221: loss 2.5848, time 5260.54ms 
iter 9222: loss 2.5591, time 5260.30ms 
iter 9223: loss 2.5433, time 5256.91ms 
iter 9224: loss 2.3700, time 5263.19ms 
iter 9225: loss 2.6489, time 5262.32ms 
iter 9226: loss 2.4955, time 5255.60ms 
iter 9227: loss 2.2706, time 5249.46ms 
iter 9228: loss 2.6691, time 5257.51ms 
iter 9229: loss 2.7458, time 5263.24ms 
iter 9230: loss 2.4688, time 5257.44ms 
iter 9231: loss 2.3907, time 5252.23ms 
iter 9232: loss 2.4780, time 5268.25ms 
iter 9233: loss 2.4535, time 5251.58ms 
iter 9234: loss 2.1725, time 5282.75ms 
iter 9235: loss 2.3685, time 5260.78ms 
iter 9236: loss 2.4085, time 5269.26ms 
iter 9237: loss 2.3734, time 5259.20ms 
iter 9238: loss 2.4253, time 5256.87ms 
iter 9239: loss 2.5303, time 5264.69ms 
iter 9240: loss 2.5336, time 5273.05ms 
iter 9241: loss 2.5785, time 5272.24ms 
iter 9242: loss 2.5309, time 5258.00ms 
iter 9243: loss 2.0922, time 5258.91ms 
iter 9244: loss 2.4572, time 5269.73ms 
iter 9245: loss 2.7937, time 5240.74ms 
iter 9246: loss 2.2862, time 5248.82ms 
iter 9247: loss 2.5789, time 5248.02ms 
iter 9248: loss 2.6726, time 5257.77ms 
iter 9249: loss 2.7050, time 5260.82ms 
step 9250: train loss 2.4649, val loss 2.8432
iter 9250: loss 2.6227, time 20009.67ms 
iter 9251: loss 2.2794, time 5294.93ms 
iter 9252: loss 2.4183, time 5413.77ms 
iter 9253: loss 2.2783, time 5257.52ms 
iter 9254: loss 2.7552, time 5252.49ms 
iter 9255: loss 2.4287, time 5255.21ms 
iter 9256: loss 2.4490, time 5258.32ms 
iter 9257: loss 2.4676, time 5258.09ms 
iter 9258: loss 2.6588, time 5262.55ms 
iter 9259: loss 2.5786, time 5256.51ms 
iter 9260: loss 2.4088, time 5267.44ms 
iter 9261: loss 2.4365, time 5262.90ms 
iter 9262: loss 2.5398, time 5249.85ms 
iter 9263: loss 2.5298, time 5253.75ms 
iter 9264: loss 2.5489, time 5260.18ms 
iter 9265: loss 2.3147, time 5257.74ms 
iter 9266: loss 2.3285, time 5251.18ms 
iter 9267: loss 2.2740, time 5266.83ms 
iter 9268: loss 2.5744, time 5255.34ms 
iter 9269: loss 2.3688, time 5252.66ms 
iter 9270: loss 2.4314, time 5227.32ms 
iter 9271: loss 2.3855, time 5260.55ms 
iter 9272: loss 2.3083, time 5248.90ms 
iter 9273: loss 2.5400, time 5250.85ms 
iter 9274: loss 2.3642, time 5256.01ms 
iter 9275: loss 2.5467, time 5255.11ms 
iter 9276: loss 2.4690, time 5250.83ms 
iter 9277: loss 2.5758, time 5250.15ms 
iter 9278: loss 2.1430, time 5257.86ms 
iter 9279: loss 2.5038, time 5257.09ms 
iter 9280: loss 2.4619, time 5248.59ms 
iter 9281: loss 2.5152, time 5249.05ms 
iter 9282: loss 2.4968, time 5274.25ms 
iter 9283: loss 2.5392, time 5250.90ms 
iter 9284: loss 2.3428, time 5252.35ms 
iter 9285: loss 2.4745, time 5255.24ms 
iter 9286: loss 2.3833, time 5263.01ms 
iter 9287: loss 2.2613, time 5247.82ms 
iter 9288: loss 2.5525, time 5257.26ms 
iter 9289: loss 2.5084, time 5261.28ms 
iter 9290: loss 2.3888, time 5262.91ms 
iter 9291: loss 2.4406, time 5259.61ms 
iter 9292: loss 2.3804, time 5262.92ms 
iter 9293: loss 2.5691, time 5276.21ms 
iter 9294: loss 2.4060, time 5265.63ms 
iter 9295: loss 2.5101, time 5259.97ms 
iter 9296: loss 2.3586, time 5255.01ms 
iter 9297: loss 2.5478, time 5261.62ms 
iter 9298: loss 2.3896, time 5249.56ms 
iter 9299: loss 2.3579, time 5200.59ms 
step 9300: train loss 2.4525, val loss 2.8460
iter 9300: loss 2.7176, time 19978.19ms 
iter 9301: loss 2.3262, time 5265.27ms 
iter 9302: loss 2.2231, time 5254.12ms 
iter 9303: loss 2.4711, time 5005.12ms 
iter 9304: loss 2.6343, time 5232.22ms 
iter 9305: loss 2.4107, time 5250.94ms 
iter 9306: loss 2.3183, time 5265.64ms 
iter 9307: loss 2.6397, time 5259.09ms 
iter 9308: loss 2.7228, time 5261.63ms 
iter 9309: loss 2.4676, time 5266.47ms 
iter 9310: loss 2.5192, time 5278.32ms 
iter 9311: loss 2.4675, time 5265.42ms 
iter 9312: loss 2.6134, time 5249.01ms 
iter 9313: loss 2.5943, time 5265.62ms 
iter 9314: loss 2.3425, time 5260.46ms 
iter 9315: loss 2.4707, time 5268.45ms 
iter 9316: loss 2.5244, time 5258.62ms 
iter 9317: loss 2.5555, time 5260.06ms 
iter 9318: loss 2.6668, time 5258.16ms 
iter 9319: loss 2.4405, time 5251.14ms 
iter 9320: loss 2.5355, time 5253.90ms 
iter 9321: loss 2.5267, time 5265.31ms 
iter 9322: loss 2.3751, time 5261.70ms 
iter 9323: loss 2.6247, time 5254.96ms 
iter 9324: loss 2.5244, time 5260.45ms 
iter 9325: loss 2.2536, time 5275.92ms 
iter 9326: loss 2.4300, time 5256.01ms 
iter 9327: loss 2.0898, time 5254.26ms 
iter 9328: loss 2.4992, time 5248.80ms 
iter 9329: loss 2.4657, time 5267.50ms 
iter 9330: loss 2.4595, time 5252.22ms 
iter 9331: loss 2.5320, time 5250.17ms 
iter 9332: loss 2.5262, time 5258.67ms 
iter 9333: loss 2.2979, time 5262.78ms 
iter 9334: loss 2.3901, time 5248.93ms 
iter 9335: loss 2.4988, time 5261.47ms 
iter 9336: loss 2.4631, time 5257.52ms 
iter 9337: loss 2.4796, time 5273.90ms 
iter 9338: loss 2.3817, time 5236.32ms 
iter 9339: loss 2.3919, time 5260.33ms 
iter 9340: loss 2.4928, time 5264.58ms 
iter 9341: loss 2.0513, time 5270.55ms 
iter 9342: loss 2.4968, time 5258.95ms 
iter 9343: loss 2.5779, time 5261.29ms 
iter 9344: loss 2.2123, time 5260.61ms 
iter 9345: loss 2.6072, time 5265.07ms 
iter 9346: loss 2.3160, time 5256.83ms 
iter 9347: loss 2.2947, time 5265.77ms 
iter 9348: loss 2.6980, time 5270.23ms 
iter 9349: loss 2.5530, time 5265.21ms 
step 9350: train loss 2.4545, val loss 2.8363
iter 9350: loss 2.5466, time 20025.98ms 
iter 9351: loss 2.4844, time 5258.82ms 
iter 9352: loss 2.5663, time 5255.86ms 
iter 9353: loss 2.4718, time 5265.69ms 
iter 9354: loss 2.4243, time 5265.45ms 
iter 9355: loss 2.3258, time 5262.54ms 
iter 9356: loss 2.3402, time 5254.14ms 
iter 9357: loss 2.4560, time 5271.47ms 
iter 9358: loss 2.4423, time 5257.94ms 
iter 9359: loss 2.2883, time 5254.39ms 
iter 9360: loss 2.3364, time 5265.07ms 
iter 9361: loss 2.0328, time 5268.68ms 
iter 9362: loss 2.3845, time 5254.86ms 
iter 9363: loss 2.4013, time 5252.62ms 
iter 9364: loss 2.4601, time 5253.79ms 
iter 9365: loss 2.2834, time 5266.36ms 
iter 9366: loss 2.8226, time 5253.12ms 
iter 9367: loss 2.5293, time 5256.76ms 
iter 9368: loss 2.4998, time 5251.13ms 
iter 9369: loss 2.2423, time 5263.87ms 
iter 9370: loss 2.4739, time 5257.07ms 
iter 9371: loss 2.4579, time 5260.22ms 
iter 9372: loss 2.6599, time 5261.40ms 
iter 9373: loss 2.4338, time 5280.45ms 
iter 9374: loss 2.4937, time 5256.88ms 
iter 9375: loss 2.3395, time 5249.26ms 
iter 9376: loss 2.4477, time 5249.38ms 
iter 9377: loss 2.3035, time 5256.68ms 
iter 9378: loss 2.6005, time 5256.08ms 
iter 9379: loss 2.5502, time 5247.50ms 
iter 9380: loss 2.2049, time 5249.41ms 
iter 9381: loss 2.3115, time 5264.51ms 
iter 9382: loss 2.1459, time 5252.83ms 
iter 9383: loss 2.4749, time 5254.40ms 
iter 9384: loss 2.4970, time 5253.90ms 
iter 9385: loss 2.6204, time 5260.89ms 
iter 9386: loss 2.1735, time 5255.31ms 
iter 9387: loss 2.3233, time 5249.16ms 
iter 9388: loss 2.5683, time 5250.71ms 
iter 9389: loss 2.5515, time 5262.02ms 
iter 9390: loss 2.3920, time 5257.45ms 
iter 9391: loss 2.5128, time 5249.95ms 
iter 9392: loss 2.5928, time 5255.44ms 
iter 9393: loss 2.5628, time 5271.21ms 
iter 9394: loss 2.6125, time 5259.08ms 
iter 9395: loss 2.4641, time 5263.20ms 
iter 9396: loss 2.4232, time 5280.95ms 
iter 9397: loss 2.4724, time 5277.02ms 
iter 9398: loss 2.2613, time 5256.05ms 
iter 9399: loss 2.2706, time 5274.38ms 
step 9400: train loss 2.4455, val loss 2.8485
iter 9400: loss 2.4531, time 20016.85ms 
iter 9401: loss 2.6311, time 5256.99ms 
iter 9402: loss 2.3947, time 5251.99ms 
iter 9403: loss 2.3463, time 5253.12ms 
iter 9404: loss 2.2576, time 5257.98ms 
iter 9405: loss 2.5416, time 5263.05ms 
iter 9406: loss 2.5150, time 5254.78ms 
iter 9407: loss 2.6513, time 5251.46ms 
iter 9408: loss 2.4902, time 5269.23ms 
iter 9409: loss 2.5687, time 5255.82ms 
iter 9410: loss 2.3018, time 5251.13ms 
iter 9411: loss 2.4340, time 5258.18ms 
iter 9412: loss 2.4266, time 5267.98ms 
iter 9413: loss 2.2486, time 5260.08ms 
iter 9414: loss 2.4851, time 5262.11ms 
iter 9415: loss 2.5703, time 5272.55ms 
iter 9416: loss 2.6129, time 5268.69ms 
iter 9417: loss 1.9883, time 5259.00ms 
iter 9418: loss 2.5056, time 5269.90ms 
iter 9419: loss 2.8068, time 5266.45ms 
iter 9420: loss 2.3794, time 5260.14ms 
iter 9421: loss 2.3193, time 5255.64ms 
iter 9422: loss 2.5858, time 5227.02ms 
iter 9423: loss 2.6265, time 5256.96ms 
iter 9424: loss 2.4043, time 5259.73ms 
iter 9425: loss 2.4912, time 5251.38ms 
iter 9426: loss 2.2607, time 5268.02ms 
iter 9427: loss 2.3933, time 5261.00ms 
iter 9428: loss 2.2776, time 5254.76ms 
iter 9429: loss 2.4228, time 5264.39ms 
iter 9430: loss 2.5899, time 5258.97ms 
iter 9431: loss 2.5829, time 5261.34ms 
iter 9432: loss 2.5732, time 5248.36ms 
iter 9433: loss 2.3681, time 5262.48ms 
iter 9434: loss 2.4615, time 5252.07ms 
iter 9435: loss 2.1433, time 5248.44ms 
iter 9436: loss 2.6070, time 5246.42ms 
iter 9437: loss 2.3074, time 5256.62ms 
iter 9438: loss 2.5065, time 5224.24ms 
iter 9439: loss 2.3469, time 5249.67ms 
iter 9440: loss 2.4868, time 5258.90ms 
iter 9441: loss 2.5899, time 5267.37ms 
iter 9442: loss 2.7570, time 5258.00ms 
iter 9443: loss 2.5855, time 5260.13ms 
iter 9444: loss 2.5893, time 5251.89ms 
iter 9445: loss 2.6848, time 5270.03ms 
iter 9446: loss 2.3378, time 5251.01ms 
iter 9447: loss 2.5351, time 5248.34ms 
iter 9448: loss 2.6928, time 5260.02ms 
iter 9449: loss 2.4918, time 5255.07ms 
step 9450: train loss 2.4576, val loss 2.8496
iter 9450: loss 2.5300, time 19965.44ms 
iter 9451: loss 2.5586, time 5254.36ms 
iter 9452: loss 2.5277, time 5272.07ms 
iter 9453: loss 2.4600, time 5264.62ms 
iter 9454: loss 2.5597, time 5256.38ms 
iter 9455: loss 2.3620, time 5278.22ms 
iter 9456: loss 2.3305, time 5277.43ms 
iter 9457: loss 2.1926, time 5261.54ms 
iter 9458: loss 2.4441, time 5261.13ms 
iter 9459: loss 2.3735, time 5276.35ms 
iter 9460: loss 2.3984, time 5264.62ms 
iter 9461: loss 2.2855, time 5264.25ms 
iter 9462: loss 2.6200, time 5258.17ms 
iter 9463: loss 2.3894, time 5239.98ms 
iter 9464: loss 2.4601, time 5254.03ms 
iter 9465: loss 2.2543, time 5252.04ms 
iter 9466: loss 2.6517, time 5268.24ms 
iter 9467: loss 2.4411, time 5263.30ms 
iter 9468: loss 2.5004, time 5254.48ms 
iter 9469: loss 2.5583, time 5267.18ms 
iter 9470: loss 2.4771, time 5278.55ms 
iter 9471: loss 2.3488, time 5260.87ms 
iter 9472: loss 2.5970, time 5256.75ms 
iter 9473: loss 2.3347, time 5264.83ms 
iter 9474: loss 2.5681, time 5258.05ms 
iter 9475: loss 2.4086, time 5253.73ms 
iter 9476: loss 2.5366, time 5253.34ms 
iter 9477: loss 2.6341, time 5267.88ms 
iter 9478: loss 2.6089, time 5257.90ms 
iter 9479: loss 2.2470, time 5250.56ms 
iter 9480: loss 2.4478, time 5252.37ms 
iter 9481: loss 2.4463, time 5263.77ms 
iter 9482: loss 2.5611, time 5249.29ms 
iter 9483: loss 2.1842, time 5253.02ms 
iter 9484: loss 2.3239, time 5258.51ms 
iter 9485: loss 2.5524, time 5254.97ms 
iter 9486: loss 2.4025, time 5249.80ms 
iter 9487: loss 2.3112, time 5261.06ms 
iter 9488: loss 2.5165, time 5259.64ms 
iter 9489: loss 2.5495, time 5255.57ms 
iter 9490: loss 2.3885, time 5211.04ms 
iter 9491: loss 2.4583, time 5249.54ms 
iter 9492: loss 2.3530, time 5254.37ms 
iter 9493: loss 2.4324, time 5224.66ms 
iter 9494: loss 2.3443, time 5240.38ms 
iter 9495: loss 2.2247, time 5242.98ms 
iter 9496: loss 2.4195, time 5134.70ms 
iter 9497: loss 2.4075, time 5263.50ms 
iter 9498: loss 2.3795, time 5259.32ms 
iter 9499: loss 2.4348, time 5261.67ms 
step 9500: train loss 2.4457, val loss 2.8540
iter 9500: loss 2.5928, time 20024.56ms 
iter 9501: loss 2.3526, time 5256.99ms 
iter 9502: loss 2.7555, time 5229.60ms 
iter 9503: loss 2.4592, time 5253.66ms 
iter 9504: loss 2.3335, time 5264.31ms 
iter 9505: loss 2.5181, time 5274.16ms 
iter 9506: loss 2.4002, time 5268.17ms 
iter 9507: loss 2.4984, time 5262.41ms 
iter 9508: loss 2.2879, time 5265.40ms 
iter 9509: loss 2.2645, time 5267.19ms 
iter 9510: loss 2.3107, time 5261.72ms 
iter 9511: loss 2.4853, time 5263.14ms 
iter 9512: loss 2.2672, time 5252.74ms 
iter 9513: loss 2.3114, time 5268.69ms 
iter 9514: loss 2.6459, time 5258.69ms 
iter 9515: loss 2.2964, time 5254.45ms 
iter 9516: loss 2.2909, time 5247.06ms 
iter 9517: loss 2.3221, time 5268.47ms 
iter 9518: loss 2.4269, time 5256.64ms 
iter 9519: loss 2.5388, time 5262.95ms 
iter 9520: loss 2.4456, time 5257.27ms 
iter 9521: loss 2.4060, time 5275.39ms 
iter 9522: loss 2.4112, time 5264.63ms 
iter 9523: loss 2.4051, time 5262.23ms 
iter 9524: loss 2.3242, time 5260.74ms 
iter 9525: loss 2.3220, time 5271.97ms 
iter 9526: loss 2.6543, time 5263.71ms 
iter 9527: loss 2.4822, time 5259.76ms 
iter 9528: loss 2.5903, time 5254.47ms 
iter 9529: loss 2.5043, time 5255.04ms 
iter 9530: loss 2.2874, time 5259.56ms 
iter 9531: loss 2.3404, time 5246.71ms 
iter 9532: loss 2.4245, time 5247.22ms 
iter 9533: loss 2.6049, time 5254.48ms 
iter 9534: loss 2.2565, time 5257.08ms 
iter 9535: loss 2.4414, time 5249.34ms 
iter 9536: loss 2.6897, time 5250.94ms 
iter 9537: loss 2.3616, time 5262.37ms 
iter 9538: loss 2.3769, time 5251.41ms 
iter 9539: loss 2.3454, time 5245.86ms 
iter 9540: loss 2.5295, time 5258.26ms 
iter 9541: loss 2.3513, time 5258.09ms 
iter 9542: loss 2.1429, time 5249.58ms 
iter 9543: loss 2.3923, time 5256.07ms 
iter 9544: loss 2.0906, time 5264.87ms 
iter 9545: loss 2.4774, time 5260.00ms 
iter 9546: loss 2.5113, time 5256.05ms 
iter 9547: loss 2.4842, time 5243.29ms 
iter 9548: loss 2.3386, time 5268.58ms 
iter 9549: loss 2.3581, time 5251.11ms 
step 9550: train loss 2.4499, val loss 2.8454
iter 9550: loss 2.4872, time 19988.90ms 
iter 9551: loss 2.0959, time 5255.46ms 
iter 9552: loss 2.2527, time 5258.17ms 
iter 9553: loss 2.6357, time 5248.28ms 
iter 9554: loss 2.6824, time 5257.66ms 
iter 9555: loss 2.3497, time 5271.71ms 
iter 9556: loss 2.4326, time 5258.32ms 
iter 9557: loss 2.4833, time 5254.27ms 
iter 9558: loss 2.5576, time 5256.03ms 
iter 9559: loss 2.6714, time 5267.12ms 
iter 9560: loss 2.3803, time 5255.05ms 
iter 9561: loss 2.4954, time 5256.17ms 
iter 9562: loss 2.3130, time 5271.46ms 
iter 9563: loss 2.5369, time 5262.48ms 
iter 9564: loss 2.3797, time 5260.07ms 
iter 9565: loss 2.2415, time 5263.14ms 
iter 9566: loss 2.7314, time 5260.21ms 
iter 9567: loss 2.4348, time 5255.35ms 
iter 9568: loss 2.6516, time 5251.50ms 
iter 9569: loss 2.2640, time 5260.32ms 
iter 9570: loss 2.6433, time 5253.31ms 
iter 9571: loss 2.5859, time 5253.01ms 
iter 9572: loss 2.0836, time 5258.97ms 
iter 9573: loss 2.5549, time 5252.82ms 
iter 9574: loss 2.4805, time 5251.79ms 
iter 9575: loss 2.2320, time 5251.57ms 
iter 9576: loss 2.7837, time 5262.22ms 
iter 9577: loss 2.3492, time 5263.45ms 
iter 9578: loss 2.4956, time 5237.25ms 
iter 9579: loss 2.4443, time 5253.55ms 
iter 9580: loss 2.4341, time 5267.65ms 
iter 9581: loss 2.6058, time 5267.71ms 
iter 9582: loss 2.4359, time 5269.47ms 
iter 9583: loss 2.2603, time 5258.94ms 
iter 9584: loss 2.2287, time 5277.65ms 
iter 9585: loss 2.5745, time 5249.99ms 
iter 9586: loss 2.8534, time 5258.40ms 
iter 9587: loss 2.3469, time 5247.04ms 
iter 9588: loss 2.4725, time 5261.77ms 
iter 9589: loss 2.4826, time 5251.72ms 
iter 9590: loss 2.5977, time 5247.35ms 
iter 9591: loss 2.3609, time 5256.24ms 
iter 9592: loss 2.2832, time 5252.99ms 
iter 9593: loss 2.3779, time 5249.79ms 
iter 9594: loss 2.3850, time 5246.78ms 
iter 9595: loss 2.5191, time 5265.13ms 
iter 9596: loss 2.4667, time 5254.38ms 
iter 9597: loss 2.4086, time 5256.97ms 
iter 9598: loss 2.5206, time 5259.44ms 
iter 9599: loss 2.4469, time 5269.26ms 
step 9600: train loss 2.4536, val loss 2.8504
iter 9600: loss 2.3830, time 20004.59ms 
iter 9601: loss 2.3502, time 5251.73ms 
iter 9602: loss 2.5576, time 5268.01ms 
iter 9603: loss 2.3678, time 5252.43ms 
iter 9604: loss 2.3955, time 5255.42ms 
iter 9605: loss 2.3329, time 5266.38ms 
iter 9606: loss 2.2241, time 5251.00ms 
iter 9607: loss 2.5608, time 5248.55ms 
iter 9608: loss 2.4943, time 5256.81ms 
iter 9609: loss 2.3768, time 5268.62ms 
iter 9610: loss 2.3394, time 5259.03ms 
iter 9611: loss 2.4597, time 5251.96ms 
iter 9612: loss 2.4055, time 5255.63ms 
iter 9613: loss 2.4854, time 5255.05ms 
iter 9614: loss 2.3713, time 5248.55ms 
iter 9615: loss 2.4826, time 5263.42ms 
