tokens per iteration will be: 491,520
Initializing a new model from scratch
config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.75,
    2.0,
    2.25
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    4,
    4,
    4,
    4,
    5,
    5
  ],
  "num_query_heads": [
    10,
    12,
    12,
    14,
    16,
    18,
    18,
    20
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.5,
    1.75,
    2.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 954,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    5
  ],
  "num_query_heads": [
    6,
    6,
    6,
    6,
    8,
    8,
    8,
    10
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

num decayed parameter tensors: 33, with 87,875,802 parameters
num non-decayed parameter tensors: 33, with 17,242 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
number of parameters: 87.89M
number of transformer parameters: 39.95M
step 0: train loss 10.5025, val loss 10.4970
iter 0: loss 10.5799, time 50514.26ms 
iter 1: loss 10.5796, time 5267.38ms 
iter 2: loss 10.4688, time 5301.32ms 
iter 3: loss 10.2551, time 5300.24ms 
iter 4: loss 10.1043, time 5298.93ms 
iter 5: loss 9.8233, time 5299.22ms 
iter 6: loss 9.7248, time 5281.53ms 
iter 7: loss 9.2209, time 5289.52ms 
iter 8: loss 8.6346, time 5299.19ms 
iter 9: loss 8.3333, time 5295.37ms 
iter 10: loss 8.1003, time 5289.99ms 
iter 11: loss 7.3845, time 5301.90ms 
iter 12: loss 7.5899, time 5277.67ms 
iter 13: loss 6.9994, time 5113.44ms 
iter 14: loss 7.0138, time 5283.53ms 
iter 15: loss 7.2620, time 5293.81ms 
iter 16: loss 7.2665, time 5292.05ms 
iter 17: loss 6.8987, time 5287.16ms 
iter 18: loss 6.8298, time 5288.46ms 
iter 19: loss 6.9406, time 5291.48ms 
iter 20: loss 6.7965, time 5291.57ms 
iter 21: loss 6.9763, time 5292.38ms 
iter 22: loss 6.9215, time 5284.01ms 
iter 23: loss 6.8027, time 5291.84ms 
iter 24: loss 6.5395, time 5269.46ms 
iter 25: loss 6.6460, time 5293.48ms 
iter 26: loss 6.4320, time 5293.32ms 
iter 27: loss 6.4304, time 5295.61ms 
iter 28: loss 6.3867, time 5291.06ms 
iter 29: loss 6.3342, time 5289.20ms 
iter 30: loss 6.4646, time 5297.35ms 
iter 31: loss 6.8895, time 5252.56ms 
iter 32: loss 6.3118, time 5288.59ms 
iter 33: loss 6.2903, time 5296.09ms 
iter 34: loss 6.1535, time 5258.45ms 
iter 35: loss 6.4720, time 5239.84ms 
iter 36: loss 6.4104, time 5257.06ms 
iter 37: loss 6.0654, time 5206.63ms 
iter 38: loss 6.3762, time 5208.80ms 
iter 39: loss 6.4116, time 5301.80ms 
iter 40: loss 6.2060, time 5227.05ms 
iter 41: loss 5.9968, time 5212.28ms 
iter 42: loss 5.9413, time 5279.62ms 
iter 43: loss 5.8623, time 5220.76ms 
iter 44: loss 6.1146, time 5219.19ms 
iter 45: loss 5.8951, time 5250.05ms 
iter 46: loss 5.8522, time 5261.26ms 
iter 47: loss 6.2237, time 5283.05ms 
iter 48: loss 6.0658, time 5282.20ms 
iter 49: loss 5.8246, time 5276.89ms 
step 50: train loss 5.9289, val loss 5.8790
iter 50: loss 5.9678, time 20106.67ms 
iter 51: loss 5.7347, time 5269.56ms 
iter 52: loss 6.0861, time 5257.35ms 
iter 53: loss 5.9711, time 5229.39ms 
iter 54: loss 5.9827, time 5208.56ms 
iter 55: loss 5.7979, time 5295.84ms 
iter 56: loss 6.0502, time 5254.88ms 
iter 57: loss 5.8065, time 5285.22ms 
iter 58: loss 5.7372, time 5287.82ms 
iter 59: loss 5.5935, time 5295.84ms 
iter 60: loss 5.7002, time 5293.62ms 
iter 61: loss 5.6113, time 5303.61ms 
iter 62: loss 5.8951, time 5289.68ms 
iter 63: loss 5.7612, time 5280.18ms 
iter 64: loss 5.5591, time 5274.39ms 
iter 65: loss 5.9734, time 5242.45ms 
iter 66: loss 6.0447, time 5257.26ms 
iter 67: loss 5.5758, time 5246.98ms 
iter 68: loss 5.8282, time 5251.25ms 
iter 69: loss 5.7603, time 5237.57ms 
iter 70: loss 5.5142, time 5273.55ms 
iter 71: loss 6.0232, time 5306.48ms 
iter 72: loss 5.6006, time 5277.22ms 
iter 73: loss 5.3392, time 5285.10ms 
iter 74: loss 5.5770, time 5289.45ms 
iter 75: loss 5.6204, time 5283.30ms 
iter 76: loss 5.4328, time 5282.45ms 
iter 77: loss 5.4639, time 5249.41ms 
iter 78: loss 5.5202, time 5285.81ms 
iter 79: loss 5.9883, time 5292.05ms 
iter 80: loss 5.4416, time 5278.23ms 
iter 81: loss 5.5187, time 5263.81ms 
iter 82: loss 5.2628, time 5310.00ms 
iter 83: loss 5.2009, time 5225.97ms 
iter 84: loss 5.6814, time 5315.33ms 
iter 85: loss 5.1933, time 5294.43ms 
iter 86: loss 5.5920, time 5286.42ms 
iter 87: loss 5.2097, time 5294.50ms 
iter 88: loss 5.3572, time 5305.24ms 
iter 89: loss 5.3541, time 5282.78ms 
iter 90: loss 5.2127, time 5293.74ms 
iter 91: loss 5.0246, time 5281.60ms 
iter 92: loss 5.1990, time 5283.53ms 
iter 93: loss 5.2273, time 5291.00ms 
iter 94: loss 5.1269, time 5281.85ms 
iter 95: loss 5.2411, time 5290.85ms 
iter 96: loss 5.2423, time 5289.15ms 
iter 97: loss 4.9649, time 5289.87ms 
iter 98: loss 4.9649, time 5285.74ms 
iter 99: loss 5.0221, time 5280.04ms 
step 100: train loss 5.1374, val loss 5.0762
iter 100: loss 4.8922, time 20087.20ms 
iter 101: loss 5.0608, time 5291.66ms 
iter 102: loss 5.1675, time 5288.88ms 
iter 103: loss 4.9385, time 5288.69ms 
iter 104: loss 5.4637, time 5283.50ms 
iter 105: loss 5.3337, time 5277.78ms 
iter 106: loss 5.0369, time 5288.01ms 
iter 107: loss 5.0450, time 5281.40ms 
iter 108: loss 4.9562, time 5286.33ms 
iter 109: loss 4.8118, time 5228.49ms 
iter 110: loss 5.0391, time 5264.16ms 
iter 111: loss 4.8499, time 5244.43ms 
iter 112: loss 4.8218, time 5222.52ms 
iter 113: loss 4.7303, time 5276.00ms 
iter 114: loss 4.9421, time 5303.60ms 
iter 115: loss 4.9804, time 5285.33ms 
iter 116: loss 4.8070, time 5280.86ms 
iter 117: loss 4.8664, time 5264.81ms 
iter 118: loss 5.1704, time 5283.86ms 
iter 119: loss 4.5921, time 5284.95ms 
iter 120: loss 5.0369, time 5290.65ms 
iter 121: loss 4.6919, time 5284.66ms 
iter 122: loss 4.8570, time 5285.40ms 
iter 123: loss 4.7037, time 5254.03ms 
iter 124: loss 4.7557, time 5258.03ms 
iter 125: loss 4.6509, time 5287.41ms 
iter 126: loss 4.5294, time 5279.12ms 
iter 127: loss 4.6617, time 5289.10ms 
iter 128: loss 4.6532, time 5291.44ms 
iter 129: loss 4.7039, time 5291.43ms 
iter 130: loss 4.7743, time 5283.47ms 
iter 131: loss 4.5231, time 5280.37ms 
iter 132: loss 4.5607, time 5253.59ms 
iter 133: loss 4.3947, time 5247.23ms 
iter 134: loss 4.7989, time 5287.29ms 
iter 135: loss 4.4520, time 5293.55ms 
iter 136: loss 4.5068, time 5283.87ms 
iter 137: loss 4.3841, time 5282.08ms 
iter 138: loss 4.6078, time 5297.94ms 
iter 139: loss 4.4405, time 5284.98ms 
iter 140: loss 4.7342, time 5296.39ms 
iter 141: loss 4.2865, time 5284.48ms 
iter 142: loss 4.3657, time 5247.89ms 
iter 143: loss 4.5631, time 5285.68ms 
iter 144: loss 4.6628, time 5285.00ms 
iter 145: loss 4.3509, time 5285.88ms 
iter 146: loss 4.3003, time 5281.50ms 
iter 147: loss 4.4828, time 5273.33ms 
iter 148: loss 4.4609, time 5278.74ms 
iter 149: loss 4.3271, time 5238.53ms 
step 150: train loss 4.3862, val loss 4.3170
iter 150: loss 4.1814, time 20089.72ms 
iter 151: loss 4.2038, time 5274.18ms 
iter 152: loss 4.2638, time 5288.74ms 
iter 153: loss 4.2353, time 5299.08ms 
iter 154: loss 4.6315, time 5293.02ms 
iter 155: loss 4.2162, time 5290.98ms 
iter 156: loss 4.4238, time 5286.30ms 
iter 157: loss 4.4471, time 5292.96ms 
iter 158: loss 4.1934, time 5285.96ms 
iter 159: loss 4.2849, time 5291.07ms 
iter 160: loss 4.0757, time 5284.01ms 
iter 161: loss 4.4180, time 5296.96ms 
iter 162: loss 4.4125, time 5297.62ms 
iter 163: loss 4.1900, time 5258.11ms 
iter 164: loss 4.3185, time 5291.74ms 
iter 165: loss 4.3391, time 5299.57ms 
iter 166: loss 4.3764, time 5289.32ms 
iter 167: loss 4.1854, time 5288.82ms 
iter 168: loss 4.0860, time 5297.94ms 
iter 169: loss 4.1842, time 5308.80ms 
iter 170: loss 4.2042, time 5288.07ms 
iter 171: loss 4.4905, time 5289.52ms 
iter 172: loss 4.1873, time 5217.54ms 
iter 173: loss 4.4149, time 5297.41ms 
iter 174: loss 4.2194, time 5220.58ms 
iter 175: loss 3.9516, time 5234.55ms 
iter 176: loss 4.1361, time 5283.53ms 
iter 177: loss 4.2259, time 5300.78ms 
iter 178: loss 3.9104, time 5295.07ms 
iter 179: loss 4.0216, time 5304.03ms 
iter 180: loss 4.2769, time 5124.23ms 
iter 181: loss 3.9853, time 5133.37ms 
iter 182: loss 4.1329, time 5122.09ms 
iter 183: loss 4.0146, time 5144.37ms 
iter 184: loss 4.1114, time 5133.58ms 
iter 185: loss 4.0504, time 5280.31ms 
iter 186: loss 4.0335, time 5278.04ms 
iter 187: loss 4.1477, time 5282.29ms 
iter 188: loss 4.2878, time 5261.63ms 
iter 189: loss 3.8799, time 5236.64ms 
iter 190: loss 4.2104, time 5291.55ms 
iter 191: loss 4.0414, time 5290.89ms 
iter 192: loss 3.9351, time 5310.85ms 
iter 193: loss 4.2574, time 5287.47ms 
iter 194: loss 4.3393, time 5288.42ms 
iter 195: loss 3.9253, time 5295.68ms 
iter 196: loss 4.0379, time 5294.60ms 
iter 197: loss 4.2203, time 5299.97ms 
iter 198: loss 3.9760, time 5265.09ms 
iter 199: loss 4.2576, time 5294.11ms 
step 200: train loss 4.0612, val loss 3.9847
iter 200: loss 3.9103, time 20095.93ms 
iter 201: loss 3.7938, time 5312.73ms 
iter 202: loss 4.1233, time 5295.72ms 
iter 203: loss 4.0998, time 5288.07ms 
iter 204: loss 4.0649, time 5294.14ms 
iter 205: loss 3.9975, time 5295.48ms 
iter 206: loss 3.9306, time 5291.72ms 
iter 207: loss 4.1351, time 5296.32ms 
iter 208: loss 4.2825, time 5291.45ms 
iter 209: loss 3.9111, time 5298.48ms 
iter 210: loss 4.0173, time 5302.46ms 
iter 211: loss 3.8778, time 5284.24ms 
iter 212: loss 3.9443, time 5293.97ms 
iter 213: loss 4.1738, time 5296.34ms 
iter 214: loss 3.8756, time 5296.08ms 
iter 215: loss 3.9526, time 5276.29ms 
iter 216: loss 3.9033, time 5278.35ms 
iter 217: loss 3.8178, time 5286.43ms 
iter 218: loss 3.9570, time 5305.47ms 
iter 219: loss 3.9942, time 5299.57ms 
iter 220: loss 4.1269, time 5292.51ms 
iter 221: loss 3.8796, time 5287.40ms 
iter 222: loss 4.0895, time 5277.64ms 
iter 223: loss 3.8840, time 5290.48ms 
iter 224: loss 4.0916, time 5300.49ms 
iter 225: loss 3.9297, time 5292.87ms 
iter 226: loss 3.7994, time 5288.53ms 
iter 227: loss 3.9140, time 5281.52ms 
iter 228: loss 3.9067, time 5288.78ms 
iter 229: loss 3.9129, time 5283.06ms 
iter 230: loss 3.8993, time 5291.26ms 
iter 231: loss 3.9935, time 5283.65ms 
iter 232: loss 3.7897, time 5285.62ms 
iter 233: loss 3.8807, time 5294.02ms 
iter 234: loss 4.0587, time 5270.26ms 
iter 235: loss 3.8016, time 5293.44ms 
iter 236: loss 3.9962, time 5293.07ms 
iter 237: loss 4.0742, time 5292.02ms 
iter 238: loss 4.2351, time 5287.15ms 
iter 239: loss 3.9434, time 5301.96ms 
iter 240: loss 3.8674, time 5292.99ms 
iter 241: loss 3.9612, time 5280.61ms 
iter 242: loss 3.9968, time 5308.75ms 
iter 243: loss 4.0473, time 5297.46ms 
iter 244: loss 3.8879, time 5295.38ms 
iter 245: loss 3.9620, time 5262.11ms 
iter 246: loss 3.8928, time 5303.89ms 
iter 247: loss 3.7215, time 5344.30ms 
iter 248: loss 3.7958, time 5413.32ms 
iter 249: loss 3.8340, time 5272.67ms 
step 250: train loss 3.8871, val loss 3.8346
iter 250: loss 3.8946, time 20072.86ms 
iter 251: loss 3.7994, time 5257.42ms 
iter 252: loss 3.6735, time 5291.10ms 
iter 253: loss 3.7274, time 5282.69ms 
iter 254: loss 3.8015, time 5290.27ms 
iter 255: loss 3.9616, time 5299.70ms 
iter 256: loss 3.9889, time 5292.69ms 
iter 257: loss 3.8041, time 5289.28ms 
iter 258: loss 3.8793, time 5283.82ms 
iter 259: loss 4.0172, time 5305.60ms 
iter 260: loss 3.8150, time 5300.81ms 
iter 261: loss 3.9137, time 5297.01ms 
iter 262: loss 4.0127, time 5291.40ms 
iter 263: loss 4.0508, time 5297.71ms 
iter 264: loss 3.8081, time 5283.79ms 
iter 265: loss 3.9198, time 5241.99ms 
iter 266: loss 3.8748, time 5293.40ms 
iter 267: loss 3.8722, time 5281.15ms 
iter 268: loss 3.8173, time 5282.92ms 
iter 269: loss 3.7159, time 5291.00ms 
iter 270: loss 3.7508, time 5283.76ms 
iter 271: loss 3.8464, time 5296.39ms 
iter 272: loss 3.8432, time 5365.20ms 
iter 273: loss 4.0518, time 5364.25ms 
iter 274: loss 3.8626, time 5318.99ms 
iter 275: loss 3.6941, time 5034.96ms 
iter 276: loss 4.1131, time 5196.95ms 
iter 277: loss 3.7398, time 5279.28ms 
iter 278: loss 4.1939, time 5267.46ms 
iter 279: loss 3.8093, time 5302.04ms 
iter 280: loss 3.8982, time 5285.47ms 
iter 281: loss 3.7169, time 5287.66ms 
iter 282: loss 3.8433, time 5281.59ms 
iter 283: loss 3.7540, time 5278.96ms 
iter 284: loss 3.8810, time 5286.20ms 
iter 285: loss 3.7505, time 5284.13ms 
iter 286: loss 4.1051, time 5257.65ms 
iter 287: loss 3.7933, time 5277.35ms 
iter 288: loss 4.0661, time 5278.51ms 
iter 289: loss 3.7785, time 5283.72ms 
iter 290: loss 3.8121, time 5279.79ms 
iter 291: loss 3.7744, time 5286.79ms 
iter 292: loss 3.6288, time 5291.56ms 
iter 293: loss 3.9321, time 5283.51ms 
iter 294: loss 3.9925, time 5300.44ms 
iter 295: loss 3.7168, time 5279.64ms 
iter 296: loss 3.6689, time 5279.03ms 
iter 297: loss 3.9138, time 5278.37ms 
iter 298: loss 3.9537, time 5291.57ms 
iter 299: loss 3.6847, time 5258.11ms 
step 300: train loss 3.8103, val loss 3.7483
iter 300: loss 4.0898, time 20084.23ms 
iter 301: loss 3.6918, time 5279.41ms 
iter 302: loss 3.6576, time 5278.48ms 
iter 303: loss 3.6911, time 5283.94ms 
iter 304: loss 3.8225, time 5282.55ms 
iter 305: loss 3.9578, time 5300.80ms 
iter 306: loss 3.7150, time 5298.41ms 
iter 307: loss 3.7045, time 5293.28ms 
iter 308: loss 3.7760, time 5299.53ms 
iter 309: loss 3.7467, time 5280.38ms 
iter 310: loss 3.8698, time 5286.30ms 
iter 311: loss 3.8453, time 5286.57ms 
iter 312: loss 3.9024, time 5274.41ms 
iter 313: loss 3.9631, time 5285.59ms 
iter 314: loss 3.9265, time 5278.48ms 
iter 315: loss 3.8962, time 5280.99ms 
iter 316: loss 3.7709, time 5303.58ms 
iter 317: loss 3.5963, time 5292.83ms 
iter 318: loss 3.9519, time 5296.21ms 
iter 319: loss 3.8086, time 5277.37ms 
iter 320: loss 3.6186, time 5281.08ms 
iter 321: loss 3.7674, time 5286.68ms 
iter 322: loss 3.8301, time 5255.54ms 
iter 323: loss 3.7500, time 5302.60ms 
iter 324: loss 3.7613, time 5286.15ms 
iter 325: loss 3.7769, time 5292.16ms 
iter 326: loss 3.7197, time 5295.41ms 
iter 327: loss 3.7357, time 5283.45ms 
iter 328: loss 3.7203, time 5284.15ms 
iter 329: loss 3.6372, time 5277.97ms 
iter 330: loss 3.6575, time 5269.36ms 
iter 331: loss 3.8396, time 5284.26ms 
iter 332: loss 3.8055, time 5278.11ms 
iter 333: loss 3.7722, time 5279.40ms 
iter 334: loss 3.6814, time 5263.50ms 
iter 335: loss 3.6505, time 5282.02ms 
iter 336: loss 3.6905, time 5285.63ms 
iter 337: loss 3.6058, time 5291.12ms 
iter 338: loss 3.8366, time 5368.81ms 
iter 339: loss 3.8618, time 5396.96ms 
iter 340: loss 3.7918, time 5346.73ms 
iter 341: loss 3.5819, time 5288.03ms 
iter 342: loss 3.9550, time 5266.80ms 
iter 343: loss 3.5919, time 5279.40ms 
iter 344: loss 3.7546, time 5290.06ms 
iter 345: loss 3.6451, time 5280.66ms 
iter 346: loss 3.6483, time 5293.25ms 
iter 347: loss 3.7861, time 5286.45ms 
iter 348: loss 3.7756, time 5289.70ms 
iter 349: loss 3.8252, time 5281.95ms 
step 350: train loss 3.7509, val loss 3.7071
iter 350: loss 3.9170, time 20029.60ms 
iter 351: loss 3.7509, time 5284.41ms 
iter 352: loss 3.8913, time 5299.49ms 
iter 353: loss 3.7758, time 5295.06ms 
iter 354: loss 3.7015, time 5300.76ms 
iter 355: loss 3.5247, time 5286.78ms 
iter 356: loss 3.6133, time 5279.09ms 
iter 357: loss 3.7071, time 5280.11ms 
iter 358: loss 3.8757, time 5253.29ms 
iter 359: loss 3.6887, time 5283.64ms 
iter 360: loss 3.5284, time 5283.52ms 
iter 361: loss 3.5929, time 5280.24ms 
iter 362: loss 3.6036, time 5290.69ms 
iter 363: loss 3.7170, time 5287.91ms 
iter 364: loss 3.6523, time 5279.16ms 
iter 365: loss 3.6918, time 5293.85ms 
iter 366: loss 3.8593, time 5294.53ms 
iter 367: loss 3.5634, time 5300.19ms 
iter 368: loss 3.6820, time 5281.94ms 
iter 369: loss 3.5545, time 5281.73ms 
iter 370: loss 3.5217, time 5282.76ms 
iter 371: loss 3.8500, time 5289.62ms 
iter 372: loss 3.7573, time 5293.43ms 
iter 373: loss 3.5469, time 5311.10ms 
iter 374: loss 3.6360, time 5304.54ms 
iter 375: loss 3.7730, time 5285.22ms 
iter 376: loss 3.7457, time 5341.09ms 
iter 377: loss 3.6025, time 5286.55ms 
iter 378: loss 3.6096, time 5285.70ms 
iter 379: loss 3.7827, time 5286.70ms 
iter 380: loss 3.7704, time 5309.23ms 
iter 381: loss 3.6555, time 5294.94ms 
iter 382: loss 3.5779, time 5280.92ms 
iter 383: loss 3.5920, time 5407.67ms 
iter 384: loss 3.7685, time 5415.20ms 
iter 385: loss 3.7401, time 5409.23ms 
iter 386: loss 3.9069, time 5417.39ms 
iter 387: loss 3.7947, time 5287.76ms 
iter 388: loss 3.6233, time 5314.66ms 
iter 389: loss 3.8463, time 5283.87ms 
iter 390: loss 3.4273, time 5286.27ms 
iter 391: loss 3.4926, time 5266.52ms 
iter 392: loss 3.6409, time 5282.63ms 
iter 393: loss 3.5268, time 5282.23ms 
iter 394: loss 3.5198, time 5293.70ms 
iter 395: loss 3.8794, time 5285.06ms 
iter 396: loss 3.7591, time 5291.78ms 
iter 397: loss 3.5580, time 5293.74ms 
iter 398: loss 3.7325, time 5280.90ms 
iter 399: loss 3.6733, time 5247.42ms 
step 400: train loss 3.6339, val loss 3.5982
iter 400: loss 3.5876, time 20011.16ms 
iter 401: loss 3.7434, time 5285.60ms 
iter 402: loss 3.5541, time 5266.42ms 
iter 403: loss 3.6958, time 5298.10ms 
iter 404: loss 3.7249, time 5303.39ms 
iter 405: loss 3.5793, time 5293.20ms 
iter 406: loss 3.4979, time 5290.36ms 
iter 407: loss 3.5402, time 5252.73ms 
iter 408: loss 3.6740, time 5301.81ms 
iter 409: loss 3.6316, time 5282.05ms 
iter 410: loss 3.7927, time 5286.44ms 
iter 411: loss 3.4990, time 5282.48ms 
iter 412: loss 3.5909, time 5291.28ms 
iter 413: loss 3.5001, time 5265.63ms 
iter 414: loss 3.8604, time 5283.09ms 
iter 415: loss 3.6636, time 5286.47ms 
iter 416: loss 3.5356, time 5283.99ms 
iter 417: loss 3.8102, time 5296.44ms 
iter 418: loss 3.7510, time 5271.60ms 
iter 419: loss 3.4721, time 5255.07ms 
iter 420: loss 3.6407, time 5272.10ms 
iter 421: loss 3.5142, time 5278.16ms 
iter 422: loss 3.4000, time 5281.27ms 
iter 423: loss 3.5288, time 5279.98ms 
iter 424: loss 3.7567, time 5279.90ms 
iter 425: loss 3.5407, time 5285.10ms 
iter 426: loss 3.4616, time 5298.53ms 
iter 427: loss 3.4223, time 5299.59ms 
iter 428: loss 3.5276, time 5294.23ms 
iter 429: loss 3.8134, time 5290.52ms 
iter 430: loss 3.6555, time 5284.46ms 
iter 431: loss 3.6116, time 5292.10ms 
iter 432: loss 3.6116, time 5293.15ms 
iter 433: loss 3.4388, time 5285.68ms 
iter 434: loss 3.6529, time 5300.08ms 
iter 435: loss 3.3515, time 5141.43ms 
iter 436: loss 3.5211, time 5097.70ms 
iter 437: loss 3.6545, time 5187.76ms 
iter 438: loss 3.5418, time 5284.97ms 
iter 439: loss 3.6543, time 5283.41ms 
iter 440: loss 3.4477, time 5285.28ms 
iter 441: loss 3.6048, time 5279.70ms 
iter 442: loss 3.7519, time 5297.73ms 
iter 443: loss 3.6523, time 5287.40ms 
iter 444: loss 3.5254, time 5283.07ms 
iter 445: loss 3.8187, time 5288.43ms 
iter 446: loss 3.5137, time 5290.78ms 
iter 447: loss 3.8184, time 5282.76ms 
iter 448: loss 3.5065, time 5277.06ms 
iter 449: loss 3.6348, time 5285.06ms 
step 450: train loss 3.5681, val loss 3.5542
iter 450: loss 3.5598, time 20088.88ms 
iter 451: loss 3.5922, time 5288.87ms 
iter 452: loss 3.3197, time 5287.22ms 
iter 453: loss 3.5270, time 5287.35ms 
iter 454: loss 3.4360, time 5283.03ms 
iter 455: loss 3.4597, time 5299.96ms 
iter 456: loss 3.4946, time 5278.89ms 
iter 457: loss 3.7100, time 5286.13ms 
iter 458: loss 3.4499, time 5291.13ms 
iter 459: loss 3.6328, time 5277.69ms 
iter 460: loss 3.6083, time 5285.29ms 
iter 461: loss 3.4326, time 5284.72ms 
iter 462: loss 3.5179, time 5399.60ms 
iter 463: loss 3.5853, time 5312.44ms 
iter 464: loss 3.5395, time 5279.95ms 
iter 465: loss 3.2556, time 5284.28ms 
iter 466: loss 3.5462, time 5271.71ms 
iter 467: loss 3.4900, time 5359.57ms 
iter 468: loss 3.5826, time 5393.60ms 
iter 469: loss 3.5650, time 5325.80ms 
iter 470: loss 3.4657, time 5284.00ms 
iter 471: loss 3.3829, time 5273.60ms 
iter 472: loss 3.9393, time 5332.47ms 
iter 473: loss 3.5783, time 5387.36ms 
iter 474: loss 3.6505, time 5279.32ms 
iter 475: loss 3.4959, time 5288.63ms 
iter 476: loss 3.4916, time 5290.42ms 
iter 477: loss 3.6074, time 5282.17ms 
iter 478: loss 3.4509, time 5279.91ms 
iter 479: loss 3.5545, time 5280.49ms 
iter 480: loss 3.5784, time 5291.24ms 
iter 481: loss 3.4025, time 5293.23ms 
iter 482: loss 3.5236, time 5283.27ms 
iter 483: loss 3.6652, time 5284.76ms 
iter 484: loss 3.4313, time 5283.19ms 
iter 485: loss 3.6054, time 5298.37ms 
iter 486: loss 3.4742, time 5294.81ms 
iter 487: loss 3.6365, time 5291.96ms 
iter 488: loss 3.6323, time 5297.70ms 
iter 489: loss 3.5862, time 5282.03ms 
iter 490: loss 3.5584, time 5287.78ms 
iter 491: loss 3.6290, time 5289.99ms 
iter 492: loss 3.6449, time 5279.71ms 
iter 493: loss 3.5373, time 5276.39ms 
iter 494: loss 3.3467, time 5287.64ms 
iter 495: loss 3.3744, time 5286.59ms 
iter 496: loss 3.5827, time 5297.43ms 
iter 497: loss 3.4572, time 5252.21ms 
iter 498: loss 3.5583, time 5294.63ms 
iter 499: loss 3.3417, time 5252.29ms 
step 500: train loss 3.5120, val loss 3.4876
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 500: loss 3.5279, time 21110.30ms 
iter 501: loss 3.6119, time 5296.27ms 
iter 502: loss 3.5751, time 5291.59ms 
iter 503: loss 3.3949, time 5302.87ms 
iter 504: loss 3.5535, time 5292.74ms 
iter 505: loss 3.5078, time 5217.61ms 
iter 506: loss 3.2838, time 5108.10ms 
iter 507: loss 3.5371, time 5069.24ms 
iter 508: loss 3.3906, time 5073.39ms 
iter 509: loss 3.5518, time 5116.62ms 
iter 510: loss 3.4347, time 5105.02ms 
iter 511: loss 3.5796, time 5129.42ms 
iter 512: loss 3.6538, time 5284.66ms 
iter 513: loss 3.4886, time 5294.82ms 
iter 514: loss 3.6960, time 5289.56ms 
iter 515: loss 3.4220, time 5278.66ms 
iter 516: loss 3.4836, time 5287.26ms 
iter 517: loss 3.4123, time 5251.02ms 
iter 518: loss 3.4194, time 5302.31ms 
iter 519: loss 3.4941, time 5298.85ms 
iter 520: loss 3.4533, time 5205.54ms 
iter 521: loss 3.4952, time 5283.88ms 
iter 522: loss 3.4641, time 5280.33ms 
iter 523: loss 3.6099, time 5292.36ms 
iter 524: loss 3.3619, time 5310.14ms 
iter 525: loss 3.4829, time 5290.56ms 
iter 526: loss 3.4325, time 5289.45ms 
iter 527: loss 3.3706, time 5295.39ms 
iter 528: loss 3.6023, time 5286.22ms 
iter 529: loss 3.2579, time 5217.13ms 
iter 530: loss 3.3792, time 5294.58ms 
iter 531: loss 3.4390, time 5281.21ms 
iter 532: loss 3.3817, time 5281.96ms 
iter 533: loss 3.4708, time 5292.07ms 
iter 534: loss 3.4493, time 5288.59ms 
iter 535: loss 3.3554, time 5304.97ms 
iter 536: loss 3.4052, time 5289.18ms 
iter 537: loss 3.3727, time 5286.74ms 
iter 538: loss 3.4710, time 5297.19ms 
iter 539: loss 3.2388, time 5307.79ms 
iter 540: loss 3.3648, time 5055.78ms 
iter 541: loss 3.3913, time 5035.35ms 
iter 542: loss 3.3868, time 5205.10ms 
iter 543: loss 3.5213, time 5284.02ms 
iter 544: loss 3.5318, time 5301.93ms 
iter 545: loss 3.4242, time 5264.04ms 
iter 546: loss 3.5066, time 5264.48ms 
iter 547: loss 3.3958, time 5224.28ms 
iter 548: loss 3.3896, time 5247.39ms 
iter 549: loss 3.4505, time 5233.80ms 
step 550: train loss 3.4674, val loss 3.4473
iter 550: loss 3.3616, time 20093.86ms 
iter 551: loss 3.3954, time 5304.63ms 
iter 552: loss 3.5158, time 5296.85ms 
iter 553: loss 3.5039, time 5293.62ms 
iter 554: loss 3.5152, time 5299.96ms 
iter 555: loss 3.5131, time 5301.00ms 
iter 556: loss 3.4279, time 5298.63ms 
iter 557: loss 3.5962, time 5247.26ms 
iter 558: loss 3.4540, time 5070.67ms 
iter 559: loss 3.2868, time 5050.05ms 
iter 560: loss 3.6046, time 5058.65ms 
iter 561: loss 3.4289, time 5307.32ms 
iter 562: loss 3.6101, time 5162.88ms 
iter 563: loss 3.4369, time 4993.36ms 
iter 564: loss 3.4851, time 5009.80ms 
iter 565: loss 3.4441, time 5292.28ms 
iter 566: loss 3.3843, time 5293.75ms 
iter 567: loss 3.4937, time 5308.53ms 
iter 568: loss 3.2796, time 5305.74ms 
iter 569: loss 3.2354, time 5301.12ms 
iter 570: loss 3.3808, time 5290.03ms 
iter 571: loss 3.4957, time 5287.42ms 
iter 572: loss 3.5525, time 5294.25ms 
iter 573: loss 3.5844, time 5293.72ms 
iter 574: loss 3.6437, time 5298.47ms 
iter 575: loss 3.3897, time 5306.23ms 
iter 576: loss 3.2384, time 5307.29ms 
iter 577: loss 3.2296, time 5296.54ms 
iter 578: loss 3.3764, time 5287.61ms 
iter 579: loss 3.4055, time 5286.80ms 
iter 580: loss 3.5172, time 5294.33ms 
iter 581: loss 3.3559, time 5306.62ms 
iter 582: loss 3.3482, time 5297.84ms 
iter 583: loss 3.4842, time 5292.93ms 
iter 584: loss 3.4396, time 5289.17ms 
iter 585: loss 3.5711, time 5271.58ms 
iter 586: loss 3.4996, time 5277.40ms 
iter 587: loss 3.6026, time 5286.89ms 
iter 588: loss 3.4351, time 5305.35ms 
iter 589: loss 3.2897, time 5286.38ms 
iter 590: loss 3.3579, time 5290.14ms 
iter 591: loss 3.3753, time 5279.80ms 
iter 592: loss 3.5763, time 5297.44ms 
iter 593: loss 3.3246, time 5288.65ms 
iter 594: loss 3.3891, time 5305.82ms 
iter 595: loss 3.4494, time 5292.25ms 
iter 596: loss 3.4223, time 5286.90ms 
iter 597: loss 3.4523, time 5251.94ms 
iter 598: loss 3.3455, time 5280.74ms 
iter 599: loss 3.4421, time 5291.32ms 
step 600: train loss 3.4272, val loss 3.4242
iter 600: loss 3.5819, time 20072.63ms 
iter 601: loss 3.4728, time 5289.26ms 
iter 602: loss 3.4031, time 5285.01ms 
iter 603: loss 3.3488, time 5285.63ms 
iter 604: loss 3.5053, time 5295.12ms 
iter 605: loss 3.5065, time 5293.18ms 
iter 606: loss 3.3654, time 5301.20ms 
iter 607: loss 3.3463, time 5300.45ms 
iter 608: loss 3.2816, time 5298.32ms 
iter 609: loss 3.2373, time 5296.26ms 
iter 610: loss 3.3109, time 5296.16ms 
iter 611: loss 3.5064, time 5275.82ms 
iter 612: loss 3.2655, time 5299.11ms 
iter 613: loss 3.4261, time 5289.89ms 
iter 614: loss 3.4900, time 5287.63ms 
iter 615: loss 3.4094, time 5308.80ms 
iter 616: loss 3.3982, time 5293.02ms 
iter 617: loss 3.4291, time 5153.71ms 
iter 618: loss 3.3822, time 5146.67ms 
iter 619: loss 3.4235, time 5289.17ms 
iter 620: loss 3.3431, time 5290.87ms 
iter 621: loss 3.2842, time 5284.26ms 
iter 622: loss 3.2787, time 5292.05ms 
iter 623: loss 3.2857, time 5290.90ms 
iter 624: loss 3.4078, time 5286.58ms 
iter 625: loss 3.4844, time 5284.59ms 
iter 626: loss 3.3389, time 5272.91ms 
iter 627: loss 3.5250, time 5300.28ms 
iter 628: loss 3.4264, time 5293.58ms 
iter 629: loss 3.4818, time 5272.41ms 
iter 630: loss 3.2965, time 5283.51ms 
iter 631: loss 3.4337, time 5293.66ms 
iter 632: loss 3.5042, time 5297.36ms 
iter 633: loss 3.5776, time 5292.82ms 
iter 634: loss 3.3493, time 5304.21ms 
iter 635: loss 3.3910, time 5285.13ms 
iter 636: loss 3.4476, time 5289.03ms 
iter 637: loss 3.4887, time 5294.52ms 
iter 638: loss 3.3811, time 5293.78ms 
iter 639: loss 3.4830, time 5289.02ms 
iter 640: loss 3.4112, time 5288.14ms 
iter 641: loss 3.4375, time 5288.60ms 
iter 642: loss 3.6602, time 5291.01ms 
iter 643: loss 3.3693, time 5290.56ms 
iter 644: loss 3.3682, time 5293.92ms 
iter 645: loss 3.5427, time 5289.65ms 
iter 646: loss 3.5143, time 5286.43ms 
iter 647: loss 3.6093, time 5293.59ms 
iter 648: loss 3.4593, time 5283.02ms 
iter 649: loss 3.3785, time 5282.53ms 
step 650: train loss 3.3749, val loss 3.3596
iter 650: loss 3.4281, time 20066.39ms 
iter 651: loss 3.2865, time 5278.53ms 
iter 652: loss 3.6178, time 5284.35ms 
iter 653: loss 3.2999, time 5288.21ms 
iter 654: loss 3.2557, time 5290.59ms 
iter 655: loss 3.3215, time 5290.46ms 
iter 656: loss 3.3653, time 5311.87ms 
iter 657: loss 3.3923, time 5298.27ms 
iter 658: loss 3.4311, time 5282.39ms 
iter 659: loss 3.2258, time 5288.69ms 
iter 660: loss 3.1079, time 5280.45ms 
iter 661: loss 3.3403, time 5285.21ms 
iter 662: loss 3.4319, time 5289.16ms 
iter 663: loss 3.6012, time 5291.70ms 
iter 664: loss 3.3625, time 5287.84ms 
iter 665: loss 3.5228, time 5285.15ms 
iter 666: loss 3.5695, time 5291.03ms 
iter 667: loss 3.4521, time 5289.29ms 
iter 668: loss 3.3773, time 5289.45ms 
iter 669: loss 3.3845, time 5288.00ms 
iter 670: loss 3.1907, time 5293.87ms 
iter 671: loss 3.3208, time 5295.17ms 
iter 672: loss 3.4110, time 5284.83ms 
iter 673: loss 3.2404, time 5291.92ms 
iter 674: loss 3.3079, time 5066.93ms 
iter 675: loss 3.3065, time 5004.95ms 
iter 676: loss 3.4429, time 5001.64ms 
iter 677: loss 3.4073, time 5008.30ms 
iter 678: loss 3.2407, time 5140.64ms 
iter 679: loss 3.4309, time 5291.09ms 
iter 680: loss 3.3713, time 5292.95ms 
iter 681: loss 3.6076, time 5280.44ms 
iter 682: loss 3.4029, time 5297.34ms 
iter 683: loss 3.2032, time 5280.74ms 
iter 684: loss 3.3377, time 5296.13ms 
iter 685: loss 3.2180, time 5255.78ms 
iter 686: loss 3.5373, time 5263.92ms 
iter 687: loss 3.3402, time 5289.41ms 
iter 688: loss 3.3050, time 5248.27ms 
iter 689: loss 3.3697, time 5309.46ms 
iter 690: loss 3.4414, time 5338.92ms 
iter 691: loss 3.1119, time 5294.87ms 
iter 692: loss 3.2263, time 5270.32ms 
iter 693: loss 3.3775, time 5344.98ms 
iter 694: loss 3.3029, time 5322.73ms 
iter 695: loss 3.4894, time 5345.37ms 
iter 696: loss 3.3782, time 5291.06ms 
iter 697: loss 3.4850, time 5298.45ms 
iter 698: loss 3.2895, time 5262.94ms 
iter 699: loss 3.2134, time 5286.58ms 
step 700: train loss 3.3415, val loss 3.3397
iter 700: loss 3.4075, time 20087.21ms 
iter 701: loss 3.3534, time 5296.89ms 
iter 702: loss 3.1935, time 5268.56ms 
iter 703: loss 3.2914, time 5280.88ms 
iter 704: loss 3.4719, time 5286.64ms 
iter 705: loss 3.3413, time 5290.24ms 
iter 706: loss 3.2498, time 5290.41ms 
iter 707: loss 3.2512, time 5298.36ms 
iter 708: loss 3.3006, time 5297.70ms 
iter 709: loss 3.4333, time 5289.42ms 
iter 710: loss 3.2340, time 5291.19ms 
iter 711: loss 3.3871, time 5290.19ms 
iter 712: loss 3.3390, time 5302.21ms 
iter 713: loss 3.2835, time 5288.90ms 
iter 714: loss 3.2371, time 5292.33ms 
iter 715: loss 3.4411, time 5290.20ms 
iter 716: loss 3.3053, time 5295.33ms 
iter 717: loss 3.3127, time 5307.57ms 
iter 718: loss 3.5058, time 5307.53ms 
iter 719: loss 3.2070, time 5299.66ms 
iter 720: loss 3.4045, time 5301.74ms 
iter 721: loss 3.0330, time 5306.29ms 
iter 722: loss 3.4029, time 5304.41ms 
iter 723: loss 3.5370, time 5260.69ms 
iter 724: loss 3.4299, time 5294.12ms 
iter 725: loss 3.2738, time 5295.46ms 
iter 726: loss 3.4444, time 5305.45ms 
iter 727: loss 3.3747, time 5298.64ms 
iter 728: loss 3.2411, time 5295.46ms 
iter 729: loss 3.1890, time 5277.28ms 
iter 730: loss 3.3954, time 5104.34ms 
iter 731: loss 3.2330, time 5075.82ms 
iter 732: loss 3.2042, time 5070.71ms 
iter 733: loss 3.2788, time 5047.74ms 
iter 734: loss 3.0034, time 5161.90ms 
iter 735: loss 3.2582, time 5285.30ms 
iter 736: loss 3.4432, time 5286.62ms 
iter 737: loss 3.3424, time 5285.55ms 
iter 738: loss 3.6162, time 5290.06ms 
iter 739: loss 3.3676, time 5300.13ms 
iter 740: loss 3.2784, time 5292.90ms 
iter 741: loss 3.3081, time 5295.51ms 
iter 742: loss 3.1957, time 5292.53ms 
iter 743: loss 3.2085, time 5290.57ms 
iter 744: loss 3.4254, time 5289.70ms 
iter 745: loss 3.2468, time 5302.41ms 
iter 746: loss 3.0197, time 5301.77ms 
iter 747: loss 3.2094, time 5311.45ms 
iter 748: loss 3.3935, time 5307.87ms 
iter 749: loss 3.3615, time 5307.31ms 
step 750: train loss 3.2951, val loss 3.2882
iter 750: loss 3.2198, time 20125.65ms 
iter 751: loss 3.1951, time 5263.12ms 
iter 752: loss 3.3055, time 5314.58ms 
iter 753: loss 3.2948, time 5305.80ms 
iter 754: loss 3.3618, time 5299.59ms 
iter 755: loss 3.1623, time 5303.97ms 
iter 756: loss 3.3347, time 5308.16ms 
iter 757: loss 3.2550, time 5312.02ms 
iter 758: loss 3.4819, time 5308.14ms 
iter 759: loss 3.2116, time 5166.98ms 
iter 760: loss 3.1747, time 5115.60ms 
iter 761: loss 3.3361, time 5078.12ms 
iter 762: loss 3.2512, time 5079.95ms 
iter 763: loss 3.2999, time 5097.54ms 
iter 764: loss 3.1290, time 5291.33ms 
iter 765: loss 3.2177, time 5297.33ms 
iter 766: loss 3.3076, time 5297.02ms 
iter 767: loss 3.2257, time 5296.60ms 
iter 768: loss 3.3532, time 5292.02ms 
iter 769: loss 3.4260, time 5291.55ms 
iter 770: loss 3.4871, time 5300.86ms 
iter 771: loss 3.3942, time 5297.16ms 
iter 772: loss 3.2918, time 5305.04ms 
iter 773: loss 3.3183, time 5299.81ms 
iter 774: loss 3.2248, time 5298.51ms 
iter 775: loss 3.2507, time 5299.02ms 
iter 776: loss 3.2022, time 5295.74ms 
iter 777: loss 3.3721, time 5296.12ms 
iter 778: loss 3.1791, time 5300.81ms 
iter 779: loss 3.3115, time 5295.39ms 
iter 780: loss 3.4897, time 5295.61ms 
iter 781: loss 3.4437, time 5301.55ms 
iter 782: loss 3.2846, time 5295.66ms 
iter 783: loss 3.1184, time 5281.75ms 
iter 784: loss 3.3522, time 5303.87ms 
iter 785: loss 3.2213, time 5301.23ms 
iter 786: loss 3.2449, time 5315.13ms 
iter 787: loss 3.4552, time 5318.51ms 
iter 788: loss 3.3617, time 5307.27ms 
iter 789: loss 3.2032, time 5301.38ms 
iter 790: loss 3.3648, time 5293.12ms 
iter 791: loss 3.5052, time 5293.74ms 
iter 792: loss 3.2447, time 5307.00ms 
iter 793: loss 3.2461, time 5302.13ms 
iter 794: loss 3.2100, time 5308.64ms 
iter 795: loss 3.3669, time 5297.98ms 
iter 796: loss 3.3259, time 5298.68ms 
iter 797: loss 3.2278, time 5292.43ms 
iter 798: loss 3.0766, time 5307.93ms 
iter 799: loss 3.3054, time 5305.27ms 
step 800: train loss 3.2546, val loss 3.2378
iter 800: loss 3.4147, time 20151.33ms 
iter 801: loss 3.2891, time 5305.92ms 
iter 802: loss 3.2177, time 5302.63ms 
iter 803: loss 3.1414, time 5301.35ms 
iter 804: loss 3.1248, time 5303.27ms 
iter 805: loss 3.3789, time 5281.85ms 
iter 806: loss 3.2989, time 5303.84ms 
iter 807: loss 3.2438, time 5297.99ms 
iter 808: loss 3.3112, time 5296.21ms 
iter 809: loss 3.2465, time 5301.52ms 
iter 810: loss 3.3843, time 5307.73ms 
iter 811: loss 3.2397, time 5309.92ms 
iter 812: loss 3.2670, time 5314.25ms 
iter 813: loss 3.2615, time 5309.53ms 
iter 814: loss 3.3005, time 5157.38ms 
iter 815: loss 3.2835, time 5104.41ms 
iter 816: loss 3.0480, time 5074.60ms 
iter 817: loss 3.3884, time 5225.98ms 
iter 818: loss 3.4579, time 5302.13ms 
iter 819: loss 3.1039, time 5295.55ms 
iter 820: loss 3.0478, time 5300.29ms 
iter 821: loss 3.3380, time 5293.97ms 
iter 822: loss 3.1743, time 5298.46ms 
iter 823: loss 3.2579, time 5289.96ms 
iter 824: loss 3.1505, time 5292.13ms 
iter 825: loss 3.1356, time 5295.35ms 
iter 826: loss 3.1351, time 5298.01ms 
iter 827: loss 3.2769, time 5308.67ms 
iter 828: loss 3.2517, time 5300.99ms 
iter 829: loss 3.3150, time 5165.42ms 
iter 830: loss 3.3588, time 5113.22ms 
iter 831: loss 3.2920, time 5078.20ms 
iter 832: loss 3.3308, time 5236.24ms 
iter 833: loss 3.1734, time 5313.72ms 
iter 834: loss 3.0776, time 5316.95ms 
iter 835: loss 3.5534, time 5298.37ms 
iter 836: loss 3.3165, time 5296.84ms 
iter 837: loss 3.1127, time 5303.08ms 
iter 838: loss 3.2842, time 5307.04ms 
iter 839: loss 3.2558, time 5288.49ms 
iter 840: loss 3.2242, time 5291.93ms 
iter 841: loss 3.2314, time 5303.90ms 
iter 842: loss 3.3880, time 5291.43ms 
iter 843: loss 3.4827, time 5262.88ms 
iter 844: loss 3.3578, time 5159.17ms 
iter 845: loss 3.0195, time 5292.33ms 
iter 846: loss 3.2685, time 5297.48ms 
iter 847: loss 3.3113, time 5293.49ms 
iter 848: loss 3.3446, time 5289.92ms 
iter 849: loss 3.2631, time 5289.47ms 
step 850: train loss 3.2210, val loss 3.2363
iter 850: loss 3.1216, time 20151.64ms 
iter 851: loss 3.0330, time 5291.69ms 
iter 852: loss 3.2877, time 5296.20ms 
iter 853: loss 3.2523, time 5295.89ms 
iter 854: loss 3.4804, time 5294.82ms 
iter 855: loss 3.2028, time 5293.71ms 
iter 856: loss 3.3192, time 5295.30ms 
iter 857: loss 3.2305, time 5293.27ms 
iter 858: loss 3.1819, time 5277.48ms 
iter 859: loss 3.2440, time 5308.24ms 
iter 860: loss 3.4353, time 5298.46ms 
iter 861: loss 3.3443, time 5290.67ms 
iter 862: loss 3.2132, time 5343.35ms 
iter 863: loss 3.0585, time 5321.17ms 
iter 864: loss 3.1674, time 5306.21ms 
iter 865: loss 3.2271, time 5296.62ms 
iter 866: loss 3.2784, time 5297.52ms 
iter 867: loss 3.1051, time 5304.39ms 
iter 868: loss 3.2715, time 5282.46ms 
iter 869: loss 3.2787, time 5292.28ms 
iter 870: loss 3.2241, time 5302.09ms 
iter 871: loss 3.1648, time 5307.25ms 
iter 872: loss 3.0664, time 5300.66ms 
iter 873: loss 3.0437, time 5307.50ms 
iter 874: loss 3.2112, time 5278.26ms 
iter 875: loss 3.2458, time 5293.27ms 
iter 876: loss 3.1815, time 5301.86ms 
iter 877: loss 3.3684, time 5261.46ms 
iter 878: loss 3.2457, time 5302.29ms 
iter 879: loss 3.2491, time 5299.74ms 
iter 880: loss 3.1715, time 5183.87ms 
iter 881: loss 3.2100, time 5289.53ms 
iter 882: loss 3.3083, time 5296.12ms 
iter 883: loss 3.3180, time 5292.60ms 
iter 884: loss 3.4122, time 5287.91ms 
iter 885: loss 3.1895, time 5295.64ms 
iter 886: loss 3.2376, time 5291.72ms 
iter 887: loss 3.1321, time 5288.29ms 
iter 888: loss 3.2468, time 5292.70ms 
iter 889: loss 3.1730, time 5288.66ms 
iter 890: loss 3.4050, time 5293.25ms 
iter 891: loss 3.1187, time 5288.01ms 
iter 892: loss 3.2178, time 5287.13ms 
iter 893: loss 3.1841, time 5304.95ms 
iter 894: loss 3.2027, time 5289.42ms 
iter 895: loss 3.2042, time 5293.62ms 
iter 896: loss 3.4099, time 5295.65ms 
iter 897: loss 3.0940, time 5295.45ms 
iter 898: loss 3.2095, time 5297.25ms 
iter 899: loss 3.4571, time 5298.71ms 
step 900: train loss 3.2012, val loss 3.2229
iter 900: loss 3.3506, time 20088.77ms 
iter 901: loss 3.0963, time 5284.40ms 
iter 902: loss 3.1123, time 5280.69ms 
iter 903: loss 3.2747, time 5307.55ms 
iter 904: loss 3.2481, time 5297.28ms 
iter 905: loss 3.2857, time 5291.54ms 
iter 906: loss 3.2788, time 5287.53ms 
iter 907: loss 3.3743, time 5292.83ms 
iter 908: loss 3.0550, time 5296.51ms 
iter 909: loss 3.2459, time 5268.26ms 
iter 910: loss 3.1833, time 5289.10ms 
iter 911: loss 3.0703, time 5289.39ms 
iter 912: loss 3.1116, time 5257.08ms 
iter 913: loss 3.3654, time 5261.89ms 
iter 914: loss 3.4474, time 5315.69ms 
iter 915: loss 3.1999, time 5306.31ms 
iter 916: loss 3.2177, time 5298.12ms 
iter 917: loss 3.3444, time 5292.60ms 
iter 918: loss 3.2893, time 5275.41ms 
iter 919: loss 3.2156, time 5295.82ms 
iter 920: loss 3.1910, time 5290.24ms 
iter 921: loss 2.9277, time 5298.10ms 
iter 922: loss 3.2396, time 5300.44ms 
iter 923: loss 3.3010, time 5293.61ms 
iter 924: loss 3.2868, time 5296.81ms 
iter 925: loss 3.3096, time 5295.35ms 
iter 926: loss 3.2359, time 5292.96ms 
iter 927: loss 3.1821, time 5301.60ms 
iter 928: loss 3.2501, time 5307.39ms 
iter 929: loss 3.2303, time 5298.86ms 
iter 930: loss 3.2348, time 5297.25ms 
iter 931: loss 3.2065, time 5309.15ms 
iter 932: loss 3.3705, time 5090.87ms 
iter 933: loss 3.1227, time 5222.96ms 
iter 934: loss 3.2791, time 5281.18ms 
iter 935: loss 3.2717, time 5296.61ms 
iter 936: loss 3.2884, time 5299.13ms 
iter 937: loss 3.2583, time 5304.85ms 
iter 938: loss 3.0109, time 5293.19ms 
iter 939: loss 3.0681, time 5290.05ms 
iter 940: loss 3.0974, time 5303.92ms 
iter 941: loss 3.3022, time 5303.76ms 
iter 942: loss 3.1048, time 5297.82ms 
iter 943: loss 3.3048, time 5301.41ms 
iter 944: loss 2.9710, time 5265.24ms 
iter 945: loss 3.0229, time 5259.09ms 
iter 946: loss 3.2351, time 5288.82ms 
iter 947: loss 3.3516, time 5294.72ms 
iter 948: loss 3.0868, time 5295.90ms 
iter 949: loss 3.0544, time 5249.67ms 
step 950: train loss 3.1920, val loss 3.1920
iter 950: loss 3.2835, time 20135.67ms 
iter 951: loss 3.1327, time 5285.66ms 
iter 952: loss 3.1540, time 5289.56ms 
iter 953: loss 3.0274, time 5284.69ms 
iter 954: loss 3.1004, time 5286.07ms 
iter 955: loss 3.1913, time 5297.68ms 
iter 956: loss 3.2612, time 5288.45ms 
iter 957: loss 3.1655, time 5290.07ms 
iter 958: loss 3.0399, time 5282.35ms 
iter 959: loss 3.1601, time 5285.55ms 
iter 960: loss 3.1602, time 5294.42ms 
iter 961: loss 3.0507, time 5294.47ms 
iter 962: loss 3.1109, time 5290.36ms 
iter 963: loss 3.3973, time 5284.52ms 
iter 964: loss 3.2340, time 5291.32ms 
iter 965: loss 3.2304, time 5296.25ms 
iter 966: loss 3.2535, time 5305.95ms 
iter 967: loss 3.1329, time 5317.22ms 
iter 968: loss 3.1800, time 5336.40ms 
iter 969: loss 3.1098, time 5267.29ms 
iter 970: loss 3.2335, time 5284.55ms 
iter 971: loss 3.3297, time 5294.67ms 
iter 972: loss 3.3329, time 5304.83ms 
iter 973: loss 3.2142, time 5305.81ms 
iter 974: loss 3.3101, time 5304.36ms 
iter 975: loss 3.2292, time 5295.54ms 
iter 976: loss 3.1549, time 5296.14ms 
iter 977: loss 3.4090, time 5303.34ms 
iter 978: loss 3.3776, time 5304.38ms 
iter 979: loss 3.2173, time 5314.76ms 
iter 980: loss 3.2145, time 5310.60ms 
iter 981: loss 3.0596, time 5297.63ms 
iter 982: loss 3.2478, time 5261.00ms 
iter 983: loss 3.2939, time 5281.55ms 
iter 984: loss 3.2150, time 5282.40ms 
iter 985: loss 3.1424, time 5285.43ms 
iter 986: loss 3.0670, time 5291.10ms 
iter 987: loss 3.1456, time 5290.76ms 
iter 988: loss 3.0787, time 5297.02ms 
iter 989: loss 3.0890, time 5303.32ms 
iter 990: loss 3.1891, time 5291.62ms 
iter 991: loss 3.1397, time 5293.34ms 
iter 992: loss 3.0198, time 5289.55ms 
iter 993: loss 3.1452, time 5302.82ms 
iter 994: loss 3.0502, time 5306.68ms 
iter 995: loss 3.1606, time 5317.30ms 
iter 996: loss 3.2672, time 5332.88ms 
iter 997: loss 3.1033, time 5381.57ms 
iter 998: loss 3.1020, time 5290.37ms 
iter 999: loss 3.0445, time 5272.41ms 
step 1000: train loss 3.1759, val loss 3.1672
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1000: loss 3.1483, time 21345.70ms 
iter 1001: loss 3.1514, time 5292.96ms 
iter 1002: loss 3.1282, time 5300.39ms 
iter 1003: loss 3.1549, time 5298.45ms 
iter 1004: loss 3.1577, time 5294.69ms 
iter 1005: loss 3.1820, time 5289.86ms 
iter 1006: loss 3.0627, time 5287.33ms 
iter 1007: loss 3.1739, time 5294.00ms 
iter 1008: loss 3.0712, time 5297.57ms 
iter 1009: loss 3.2400, time 5290.05ms 
iter 1010: loss 3.4425, time 5287.72ms 
iter 1011: loss 3.0410, time 5295.14ms 
iter 1012: loss 3.0082, time 5284.02ms 
iter 1013: loss 3.0854, time 5293.14ms 
iter 1014: loss 3.0903, time 5290.75ms 
iter 1015: loss 3.1800, time 5301.29ms 
iter 1016: loss 3.4460, time 5299.17ms 
iter 1017: loss 3.1172, time 5287.50ms 
iter 1018: loss 3.2117, time 5294.43ms 
iter 1019: loss 3.3405, time 5298.40ms 
iter 1020: loss 3.2240, time 5290.59ms 
iter 1021: loss 3.2572, time 5300.76ms 
iter 1022: loss 3.1697, time 5306.06ms 
iter 1023: loss 3.1350, time 5303.26ms 
iter 1024: loss 3.2194, time 5298.91ms 
iter 1025: loss 3.3410, time 5295.67ms 
iter 1026: loss 3.2775, time 5300.36ms 
iter 1027: loss 3.2920, time 5302.56ms 
iter 1028: loss 3.0023, time 5290.73ms 
iter 1029: loss 3.1884, time 5291.87ms 
iter 1030: loss 3.0715, time 5293.98ms 
iter 1031: loss 3.0630, time 5306.50ms 
iter 1032: loss 3.1017, time 5297.62ms 
iter 1033: loss 3.2488, time 5266.52ms 
iter 1034: loss 3.5411, time 5290.29ms 
iter 1035: loss 3.2633, time 5298.28ms 
iter 1036: loss 2.9936, time 5297.07ms 
iter 1037: loss 3.3234, time 5294.10ms 
iter 1038: loss 3.1734, time 5305.36ms 
iter 1039: loss 3.1739, time 5300.73ms 
iter 1040: loss 3.0204, time 5301.49ms 
iter 1041: loss 2.9446, time 5303.51ms 
iter 1042: loss 3.1401, time 5296.15ms 
iter 1043: loss 3.1411, time 5288.72ms 
iter 1044: loss 3.2844, time 5296.27ms 
iter 1045: loss 2.8868, time 5282.25ms 
iter 1046: loss 3.0927, time 5290.07ms 
iter 1047: loss 3.2120, time 5344.80ms 
iter 1048: loss 3.0080, time 5299.29ms 
iter 1049: loss 3.2813, time 5379.26ms 
step 1050: train loss 3.1344, val loss 3.1498
iter 1050: loss 3.2818, time 20134.18ms 
iter 1051: loss 3.0979, time 5286.34ms 
iter 1052: loss 3.0478, time 5301.20ms 
iter 1053: loss 3.0219, time 5288.33ms 
iter 1054: loss 3.1172, time 5294.76ms 
iter 1055: loss 3.1865, time 5293.16ms 
iter 1056: loss 3.1814, time 5296.12ms 
iter 1057: loss 3.3165, time 5404.65ms 
iter 1058: loss 3.1867, time 5421.12ms 
iter 1059: loss 3.1672, time 5316.78ms 
iter 1060: loss 3.0417, time 5289.24ms 
iter 1061: loss 3.2080, time 5288.90ms 
iter 1062: loss 2.9953, time 5287.36ms 
iter 1063: loss 2.9594, time 5298.49ms 
iter 1064: loss 3.0245, time 5305.84ms 
iter 1065: loss 2.9544, time 5304.06ms 
iter 1066: loss 3.2325, time 5302.15ms 
iter 1067: loss 3.1082, time 5301.87ms 
iter 1068: loss 3.2181, time 5294.83ms 
iter 1069: loss 3.1450, time 5216.28ms 
iter 1070: loss 3.1931, time 5037.50ms 
iter 1071: loss 3.0910, time 5051.98ms 
iter 1072: loss 3.2242, time 5278.85ms 
iter 1073: loss 3.1512, time 5299.01ms 
iter 1074: loss 3.1737, time 5287.96ms 
iter 1075: loss 3.0778, time 5293.88ms 
iter 1076: loss 3.0667, time 5292.58ms 
iter 1077: loss 3.0480, time 5304.03ms 
iter 1078: loss 3.1501, time 5292.93ms 
iter 1079: loss 3.3424, time 5303.78ms 
iter 1080: loss 3.2313, time 5302.95ms 
iter 1081: loss 3.0546, time 5313.11ms 
iter 1082: loss 3.3031, time 5303.68ms 
iter 1083: loss 3.2491, time 5299.51ms 
iter 1084: loss 3.1930, time 5302.85ms 
iter 1085: loss 3.2478, time 5305.46ms 
iter 1086: loss 3.2236, time 5303.55ms 
iter 1087: loss 3.1580, time 5296.66ms 
iter 1088: loss 3.0848, time 5300.59ms 
iter 1089: loss 3.0671, time 5297.31ms 
iter 1090: loss 3.1022, time 5296.18ms 
iter 1091: loss 3.1540, time 5306.85ms 
iter 1092: loss 3.2661, time 5292.60ms 
iter 1093: loss 2.8950, time 5298.89ms 
iter 1094: loss 2.9004, time 5293.45ms 
iter 1095: loss 3.1329, time 5321.16ms 
iter 1096: loss 3.0808, time 5302.87ms 
iter 1097: loss 3.2019, time 5313.54ms 
iter 1098: loss 3.3068, time 5304.31ms 
iter 1099: loss 3.0844, time 5298.78ms 
step 1100: train loss 3.1210, val loss 3.1372
iter 1100: loss 3.1599, time 20132.93ms 
iter 1101: loss 3.0503, time 5294.39ms 
iter 1102: loss 3.0178, time 5304.05ms 
iter 1103: loss 3.1183, time 5293.48ms 
iter 1104: loss 2.9369, time 5292.75ms 
iter 1105: loss 3.2889, time 5297.37ms 
iter 1106: loss 2.8890, time 5297.88ms 
iter 1107: loss 3.0540, time 5297.64ms 
iter 1108: loss 3.0766, time 5303.31ms 
iter 1109: loss 3.0075, time 5300.76ms 
iter 1110: loss 3.0638, time 5291.81ms 
iter 1111: loss 3.2894, time 5293.09ms 
iter 1112: loss 3.0730, time 5290.03ms 
iter 1113: loss 3.1754, time 5294.23ms 
iter 1114: loss 3.0049, time 5293.69ms 
iter 1115: loss 3.2683, time 5287.61ms 
iter 1116: loss 2.9427, time 5301.36ms 
iter 1117: loss 3.0859, time 5295.83ms 
iter 1118: loss 3.2288, time 5299.39ms 
iter 1119: loss 3.0154, time 5295.18ms 
iter 1120: loss 3.0402, time 5304.65ms 
iter 1121: loss 3.1133, time 5298.06ms 
iter 1122: loss 3.3700, time 5296.10ms 
iter 1123: loss 3.2359, time 5291.72ms 
iter 1124: loss 3.1239, time 5292.92ms 
iter 1125: loss 3.1518, time 5295.96ms 
iter 1126: loss 2.8553, time 5304.74ms 
iter 1127: loss 2.9755, time 5304.60ms 
iter 1128: loss 2.9545, time 5297.44ms 
iter 1129: loss 2.9885, time 5305.49ms 
iter 1130: loss 3.2004, time 5300.17ms 
iter 1131: loss 3.1597, time 5300.86ms 
iter 1132: loss 3.1418, time 5297.32ms 
iter 1133: loss 2.9690, time 5295.04ms 
iter 1134: loss 3.1344, time 5257.81ms 
iter 1135: loss 3.3511, time 5293.52ms 
iter 1136: loss 3.0609, time 5287.97ms 
iter 1137: loss 3.0285, time 5289.94ms 
iter 1138: loss 3.0969, time 5289.80ms 
iter 1139: loss 2.9988, time 5290.81ms 
iter 1140: loss 3.0869, time 5248.03ms 
iter 1141: loss 2.9832, time 5289.87ms 
iter 1142: loss 3.0283, time 5286.04ms 
iter 1143: loss 3.0322, time 5305.97ms 
iter 1144: loss 3.3828, time 5304.47ms 
iter 1145: loss 2.9928, time 5294.68ms 
iter 1146: loss 3.1705, time 5290.55ms 
iter 1147: loss 3.0932, time 5294.98ms 
iter 1148: loss 3.0302, time 5295.52ms 
iter 1149: loss 3.0935, time 5297.20ms 
step 1150: train loss 3.0840, val loss 3.1124
iter 1150: loss 2.9892, time 20149.35ms 
iter 1151: loss 2.9598, time 5280.74ms 
iter 1152: loss 2.9822, time 5283.91ms 
iter 1153: loss 3.0551, time 5290.57ms 
iter 1154: loss 3.0770, time 5294.88ms 
iter 1155: loss 3.1548, time 5293.72ms 
iter 1156: loss 3.0352, time 5285.59ms 
iter 1157: loss 3.2632, time 5300.14ms 
iter 1158: loss 2.9601, time 5297.47ms 
iter 1159: loss 3.1161, time 5302.47ms 
iter 1160: loss 2.9638, time 5290.81ms 
iter 1161: loss 2.9270, time 5302.94ms 
iter 1162: loss 3.0873, time 5287.80ms 
iter 1163: loss 3.0545, time 5292.72ms 
iter 1164: loss 2.9338, time 5275.18ms 
iter 1165: loss 3.0934, time 5278.05ms 
iter 1166: loss 3.1735, time 5302.38ms 
iter 1167: loss 3.1228, time 5282.70ms 
iter 1168: loss 3.0754, time 5305.33ms 
iter 1169: loss 3.3349, time 5309.45ms 
iter 1170: loss 3.0258, time 5303.89ms 
iter 1171: loss 3.0235, time 5293.96ms 
iter 1172: loss 3.2788, time 5240.32ms 
iter 1173: loss 3.1705, time 5080.77ms 
iter 1174: loss 3.0588, time 5134.57ms 
iter 1175: loss 3.1427, time 5117.26ms 
iter 1176: loss 2.9348, time 5205.86ms 
iter 1177: loss 2.8817, time 5291.83ms 
iter 1178: loss 2.9951, time 5291.90ms 
iter 1179: loss 3.0022, time 5299.19ms 
iter 1180: loss 2.9959, time 5299.49ms 
iter 1181: loss 3.0798, time 5294.17ms 
iter 1182: loss 3.0202, time 5294.96ms 
iter 1183: loss 2.9432, time 5303.59ms 
iter 1184: loss 3.0359, time 5294.79ms 
iter 1185: loss 3.0248, time 5290.35ms 
iter 1186: loss 3.3256, time 5299.67ms 
iter 1187: loss 3.0909, time 5294.95ms 
iter 1188: loss 3.0283, time 5251.21ms 
iter 1189: loss 3.0393, time 5297.26ms 
iter 1190: loss 3.1100, time 5298.86ms 
iter 1191: loss 3.2757, time 5299.56ms 
iter 1192: loss 3.3030, time 5296.61ms 
iter 1193: loss 3.1016, time 5309.25ms 
iter 1194: loss 2.8991, time 5297.56ms 
iter 1195: loss 3.2666, time 5304.13ms 
iter 1196: loss 2.9284, time 5303.99ms 
iter 1197: loss 3.0011, time 5306.43ms 
iter 1198: loss 3.1753, time 5292.75ms 
iter 1199: loss 3.1117, time 5301.07ms 
step 1200: train loss 3.0703, val loss 3.1028
iter 1200: loss 2.9734, time 20191.06ms 
iter 1201: loss 3.1454, time 5303.12ms 
iter 1202: loss 3.1317, time 5296.83ms 
iter 1203: loss 3.1007, time 5304.98ms 
iter 1204: loss 2.9635, time 5281.50ms 
iter 1205: loss 3.1134, time 5285.23ms 
iter 1206: loss 2.9641, time 5305.61ms 
iter 1207: loss 2.9750, time 5303.00ms 
iter 1208: loss 3.0092, time 5288.09ms 
iter 1209: loss 3.1999, time 5188.22ms 
iter 1210: loss 2.9828, time 5295.81ms 
iter 1211: loss 3.1921, time 5301.49ms 
iter 1212: loss 2.9478, time 5307.30ms 
iter 1213: loss 3.0044, time 5300.03ms 
iter 1214: loss 3.0340, time 5307.32ms 
iter 1215: loss 2.9141, time 5307.30ms 
iter 1216: loss 3.1327, time 5293.05ms 
iter 1217: loss 2.9471, time 5306.52ms 
iter 1218: loss 3.0599, time 5307.65ms 
iter 1219: loss 2.8552, time 5269.98ms 
iter 1220: loss 3.1283, time 5308.07ms 
iter 1221: loss 3.1455, time 5302.83ms 
iter 1222: loss 3.0167, time 5305.75ms 
iter 1223: loss 3.2009, time 5308.36ms 
iter 1224: loss 3.0043, time 5304.59ms 
iter 1225: loss 3.0721, time 5314.42ms 
iter 1226: loss 3.0592, time 5311.36ms 
iter 1227: loss 3.0639, time 5223.30ms 
iter 1228: loss 3.0434, time 5053.98ms 
iter 1229: loss 3.1587, time 5055.12ms 
iter 1230: loss 2.9041, time 5071.07ms 
iter 1231: loss 3.1999, time 5300.58ms 
iter 1232: loss 3.0805, time 5303.39ms 
iter 1233: loss 3.1392, time 5212.35ms 
iter 1234: loss 3.0982, time 5103.64ms 
iter 1235: loss 2.9112, time 5169.62ms 
iter 1236: loss 2.8798, time 5061.80ms 
iter 1237: loss 2.9472, time 5073.48ms 
iter 1238: loss 3.0691, time 5058.87ms 
iter 1239: loss 3.1377, time 5057.98ms 
iter 1240: loss 3.0419, time 5212.18ms 
iter 1241: loss 3.1163, time 5301.05ms 
iter 1242: loss 2.8699, time 5303.18ms 
iter 1243: loss 3.1786, time 5299.47ms 
iter 1244: loss 2.9543, time 5110.09ms 
iter 1245: loss 2.9407, time 5061.33ms 
iter 1246: loss 2.9860, time 5056.83ms 
iter 1247: loss 3.1026, time 5097.19ms 
iter 1248: loss 2.9850, time 5283.55ms 
iter 1249: loss 3.0396, time 5295.33ms 
step 1250: train loss 3.0573, val loss 3.0824
iter 1250: loss 3.1758, time 20158.80ms 
iter 1251: loss 2.8745, time 5292.28ms 
iter 1252: loss 3.0612, time 5289.06ms 
iter 1253: loss 3.1801, time 5291.96ms 
iter 1254: loss 2.9877, time 5374.22ms 
iter 1255: loss 3.2579, time 5393.45ms 
iter 1256: loss 2.9064, time 5402.78ms 
iter 1257: loss 3.0603, time 5411.37ms 
iter 1258: loss 3.0301, time 5404.02ms 
iter 1259: loss 2.9935, time 5250.66ms 
iter 1260: loss 3.0718, time 5302.86ms 
iter 1261: loss 3.1535, time 5270.36ms 
iter 1262: loss 3.0118, time 5289.40ms 
iter 1263: loss 3.0245, time 5298.41ms 
iter 1264: loss 3.1608, time 5289.40ms 
iter 1265: loss 3.1419, time 5293.33ms 
iter 1266: loss 3.2732, time 5170.14ms 
iter 1267: loss 3.0524, time 5145.96ms 
iter 1268: loss 2.9107, time 5214.00ms 
iter 1269: loss 3.1162, time 5304.07ms 
iter 1270: loss 3.0996, time 5210.65ms 
iter 1271: loss 2.9835, time 5211.69ms 
iter 1272: loss 2.9812, time 5156.62ms 
iter 1273: loss 3.0956, time 5174.87ms 
iter 1274: loss 2.9113, time 5184.76ms 
iter 1275: loss 3.0975, time 5108.04ms 
iter 1276: loss 3.1307, time 5235.22ms 
iter 1277: loss 3.0943, time 5270.45ms 
iter 1278: loss 3.1142, time 5288.77ms 
iter 1279: loss 2.9444, time 5310.14ms 
iter 1280: loss 3.1082, time 5296.32ms 
iter 1281: loss 3.2305, time 5264.14ms 
iter 1282: loss 2.9019, time 5277.88ms 
iter 1283: loss 3.0517, time 5288.27ms 
iter 1284: loss 2.8703, time 5299.97ms 
iter 1285: loss 2.8635, time 5286.18ms 
iter 1286: loss 2.9061, time 5312.24ms 
iter 1287: loss 2.8502, time 5317.77ms 
iter 1288: loss 3.2190, time 5303.31ms 
iter 1289: loss 3.1435, time 5301.07ms 
iter 1290: loss 2.9546, time 5306.38ms 
iter 1291: loss 2.8515, time 5308.10ms 
iter 1292: loss 3.1000, time 5297.20ms 
iter 1293: loss 3.1193, time 5294.53ms 
iter 1294: loss 3.0492, time 5296.15ms 
iter 1295: loss 3.0301, time 5310.52ms 
iter 1296: loss 3.0664, time 5286.87ms 
iter 1297: loss 3.1627, time 5291.17ms 
iter 1298: loss 3.0075, time 5294.79ms 
iter 1299: loss 3.0131, time 5301.29ms 
step 1300: train loss 3.0394, val loss 3.0829
iter 1300: loss 2.8630, time 20164.26ms 
iter 1301: loss 3.1256, time 5300.58ms 
iter 1302: loss 3.1144, time 5308.82ms 
iter 1303: loss 2.9491, time 5269.15ms 
iter 1304: loss 2.9562, time 5301.23ms 
iter 1305: loss 2.9875, time 5295.16ms 
iter 1306: loss 2.9937, time 5297.79ms 
iter 1307: loss 3.0762, time 5301.59ms 
iter 1308: loss 3.0525, time 5306.11ms 
iter 1309: loss 2.9484, time 5294.68ms 
iter 1310: loss 3.2465, time 5293.22ms 
iter 1311: loss 2.9810, time 5307.23ms 
iter 1312: loss 3.1814, time 5299.37ms 
iter 1313: loss 2.9668, time 5291.96ms 
iter 1314: loss 3.0000, time 5296.49ms 
iter 1315: loss 3.1750, time 5290.91ms 
iter 1316: loss 2.9925, time 5071.45ms 
iter 1317: loss 3.0522, time 5204.16ms 
iter 1318: loss 3.0918, time 5299.92ms 
iter 1319: loss 2.9619, time 5294.84ms 
iter 1320: loss 3.0240, time 5295.27ms 
iter 1321: loss 2.8274, time 5288.88ms 
iter 1322: loss 3.0885, time 5290.70ms 
iter 1323: loss 3.2627, time 5300.56ms 
iter 1324: loss 2.9407, time 5286.79ms 
iter 1325: loss 2.9829, time 5301.35ms 
iter 1326: loss 2.9836, time 5292.14ms 
iter 1327: loss 3.0050, time 5250.50ms 
iter 1328: loss 3.1182, time 5298.85ms 
iter 1329: loss 2.9330, time 5311.62ms 
iter 1330: loss 2.9048, time 5253.68ms 
iter 1331: loss 3.0676, time 5284.20ms 
iter 1332: loss 3.0125, time 5253.22ms 
iter 1333: loss 2.9812, time 5290.27ms 
iter 1334: loss 2.8920, time 5288.55ms 
iter 1335: loss 3.0274, time 5283.27ms 
iter 1336: loss 3.0173, time 5089.11ms 
iter 1337: loss 3.0132, time 5168.93ms 
iter 1338: loss 3.0486, time 5244.71ms 
iter 1339: loss 2.9895, time 5231.27ms 
iter 1340: loss 3.1943, time 5286.15ms 
iter 1341: loss 2.9067, time 5222.60ms 
iter 1342: loss 2.9880, time 5257.29ms 
iter 1343: loss 2.6832, time 5214.97ms 
iter 1344: loss 3.1359, time 5124.15ms 
iter 1345: loss 2.9458, time 5111.18ms 
iter 1346: loss 2.8902, time 5149.49ms 
iter 1347: loss 2.9783, time 5285.15ms 
iter 1348: loss 3.0340, time 5279.60ms 
iter 1349: loss 2.9242, time 5268.90ms 
step 1350: train loss 3.0235, val loss 3.0587
iter 1350: loss 2.9934, time 20107.03ms 
iter 1351: loss 3.0215, time 5276.58ms 
iter 1352: loss 3.0141, time 5264.43ms 
iter 1353: loss 3.1792, time 5160.57ms 
iter 1354: loss 3.0118, time 5139.16ms 
iter 1355: loss 2.9657, time 5248.08ms 
iter 1356: loss 3.0460, time 5302.74ms 
iter 1357: loss 2.8477, time 5301.83ms 
iter 1358: loss 2.9767, time 5295.30ms 
iter 1359: loss 2.9713, time 5289.54ms 
iter 1360: loss 3.0574, time 5290.05ms 
iter 1361: loss 2.9606, time 5286.97ms 
iter 1362: loss 3.0251, time 5295.13ms 
iter 1363: loss 3.2079, time 5295.88ms 
iter 1364: loss 2.9138, time 5298.76ms 
iter 1365: loss 3.0017, time 5299.17ms 
iter 1366: loss 3.0393, time 5293.68ms 
iter 1367: loss 2.9077, time 5286.39ms 
iter 1368: loss 2.9747, time 5290.23ms 
iter 1369: loss 2.8663, time 5294.62ms 
iter 1370: loss 3.0745, time 5305.20ms 
iter 1371: loss 3.0869, time 5282.38ms 
iter 1372: loss 3.0786, time 5302.44ms 
iter 1373: loss 2.8452, time 5301.00ms 
iter 1374: loss 3.0123, time 5300.03ms 
iter 1375: loss 3.0297, time 5270.08ms 
iter 1376: loss 2.9739, time 5289.39ms 
iter 1377: loss 2.9641, time 5297.04ms 
iter 1378: loss 2.9952, time 5292.61ms 
iter 1379: loss 2.8390, time 5301.40ms 
iter 1380: loss 2.9954, time 5302.50ms 
iter 1381: loss 2.8106, time 5316.84ms 
iter 1382: loss 3.0586, time 5311.90ms 
iter 1383: loss 2.9463, time 5297.87ms 
iter 1384: loss 3.1943, time 5303.47ms 
iter 1385: loss 2.8508, time 5301.55ms 
iter 1386: loss 2.8728, time 5301.70ms 
iter 1387: loss 3.0346, time 5293.88ms 
iter 1388: loss 2.8755, time 5300.02ms 
iter 1389: loss 2.9684, time 5232.38ms 
iter 1390: loss 3.0343, time 5255.87ms 
iter 1391: loss 2.8696, time 5240.64ms 
iter 1392: loss 2.9773, time 5299.15ms 
iter 1393: loss 3.0104, time 5308.37ms 
iter 1394: loss 2.9548, time 5301.56ms 
iter 1395: loss 3.0554, time 5289.46ms 
iter 1396: loss 2.8994, time 5289.42ms 
iter 1397: loss 2.8904, time 5152.23ms 
iter 1398: loss 2.9306, time 5258.66ms 
iter 1399: loss 2.9106, time 5292.29ms 
step 1400: train loss 3.0028, val loss 3.0520
iter 1400: loss 2.9420, time 20155.96ms 
iter 1401: loss 2.8286, time 5306.02ms 
iter 1402: loss 2.9813, time 5297.65ms 
iter 1403: loss 3.0952, time 5291.90ms 
iter 1404: loss 2.9196, time 5286.55ms 
iter 1405: loss 3.0282, time 5290.24ms 
iter 1406: loss 2.9472, time 5291.63ms 
iter 1407: loss 2.8725, time 5292.23ms 
iter 1408: loss 3.1080, time 5300.10ms 
iter 1409: loss 2.9097, time 5305.09ms 
iter 1410: loss 2.9873, time 5311.17ms 
iter 1411: loss 2.9649, time 5322.48ms 
iter 1412: loss 2.9699, time 5307.45ms 
iter 1413: loss 3.0320, time 5288.83ms 
iter 1414: loss 2.9051, time 5299.79ms 
iter 1415: loss 2.9931, time 5301.72ms 
iter 1416: loss 2.8807, time 5290.75ms 
iter 1417: loss 3.1504, time 5289.14ms 
iter 1418: loss 3.0517, time 5295.22ms 
iter 1419: loss 2.9468, time 5298.73ms 
iter 1420: loss 2.9263, time 5292.61ms 
iter 1421: loss 2.9141, time 5305.47ms 
iter 1422: loss 2.6985, time 5288.25ms 
iter 1423: loss 3.0895, time 5292.56ms 
iter 1424: loss 2.9075, time 5302.11ms 
iter 1425: loss 3.0217, time 5283.20ms 
iter 1426: loss 2.9021, time 5293.74ms 
iter 1427: loss 2.9854, time 5293.55ms 
iter 1428: loss 2.9547, time 5293.04ms 
iter 1429: loss 3.0129, time 5281.60ms 
iter 1430: loss 2.8186, time 5292.55ms 
iter 1431: loss 2.8837, time 5305.81ms 
iter 1432: loss 3.0733, time 5292.26ms 
iter 1433: loss 2.7784, time 5306.46ms 
iter 1434: loss 3.1389, time 5301.53ms 
iter 1435: loss 2.8340, time 5306.32ms 
iter 1436: loss 3.0094, time 5289.50ms 
iter 1437: loss 3.0571, time 5296.59ms 
iter 1438: loss 2.9416, time 5264.94ms 
iter 1439: loss 3.1065, time 5259.64ms 
iter 1440: loss 3.0724, time 5246.27ms 
iter 1441: loss 3.0237, time 5294.59ms 
iter 1442: loss 2.9511, time 5197.97ms 
iter 1443: loss 2.8187, time 5271.89ms 
iter 1444: loss 2.9852, time 5241.91ms 
iter 1445: loss 2.8756, time 5276.02ms 
iter 1446: loss 2.8456, time 5257.36ms 
iter 1447: loss 3.0966, time 5213.88ms 
iter 1448: loss 3.2095, time 5198.58ms 
iter 1449: loss 3.1321, time 5263.23ms 
step 1450: train loss 2.9916, val loss 3.0498
iter 1450: loss 2.9864, time 20098.00ms 
iter 1451: loss 2.9855, time 5298.91ms 
iter 1452: loss 3.0104, time 5287.54ms 
iter 1453: loss 2.8615, time 5280.81ms 
iter 1454: loss 2.8181, time 5296.64ms 
iter 1455: loss 2.9813, time 5233.60ms 
iter 1456: loss 2.9040, time 5280.11ms 
iter 1457: loss 3.0647, time 5153.85ms 
iter 1458: loss 3.0963, time 5302.86ms 
iter 1459: loss 2.9136, time 5342.32ms 
iter 1460: loss 3.0444, time 5295.68ms 
iter 1461: loss 3.0696, time 5285.04ms 
iter 1462: loss 2.9820, time 5291.21ms 
iter 1463: loss 3.4209, time 5291.45ms 
iter 1464: loss 3.3059, time 5269.18ms 
iter 1465: loss 2.9298, time 5309.34ms 
iter 1466: loss 2.8903, time 5234.53ms 
iter 1467: loss 2.9824, time 5291.11ms 
iter 1468: loss 2.7872, time 5301.89ms 
iter 1469: loss 3.1677, time 5294.95ms 
iter 1470: loss 2.9603, time 5297.44ms 
iter 1471: loss 2.9558, time 5290.13ms 
iter 1472: loss 2.8746, time 5280.40ms 
iter 1473: loss 2.8651, time 5155.13ms 
iter 1474: loss 2.9822, time 5288.02ms 
iter 1475: loss 3.2431, time 5294.32ms 
iter 1476: loss 3.1480, time 5284.90ms 
iter 1477: loss 3.1053, time 5300.23ms 
iter 1478: loss 2.6559, time 5286.01ms 
iter 1479: loss 3.0276, time 5288.73ms 
iter 1480: loss 2.8750, time 5249.76ms 
iter 1481: loss 2.9752, time 5285.94ms 
iter 1482: loss 3.0953, time 5302.45ms 
iter 1483: loss 2.9542, time 5294.78ms 
iter 1484: loss 3.0026, time 5292.25ms 
iter 1485: loss 2.9893, time 5294.15ms 
iter 1486: loss 2.7576, time 5291.37ms 
iter 1487: loss 2.9986, time 5234.90ms 
iter 1488: loss 2.9619, time 5289.53ms 
iter 1489: loss 2.8911, time 5286.88ms 
iter 1490: loss 2.8475, time 5286.37ms 
iter 1491: loss 2.9300, time 5293.11ms 
iter 1492: loss 2.9289, time 5289.59ms 
iter 1493: loss 3.1849, time 5255.33ms 
iter 1494: loss 2.8852, time 5234.86ms 
iter 1495: loss 3.0325, time 5302.25ms 
iter 1496: loss 2.8414, time 5291.22ms 
iter 1497: loss 3.0718, time 5348.71ms 
iter 1498: loss 3.0329, time 5298.16ms 
iter 1499: loss 3.0478, time 5298.11ms 
step 1500: train loss 2.9970, val loss 3.0475
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1500: loss 3.1668, time 21222.14ms 
iter 1501: loss 2.8226, time 5284.70ms 
iter 1502: loss 2.8608, time 5297.58ms 
iter 1503: loss 3.0145, time 5302.71ms 
iter 1504: loss 3.0352, time 5234.13ms 
iter 1505: loss 3.0568, time 5284.79ms 
iter 1506: loss 3.0216, time 5291.62ms 
iter 1507: loss 2.8940, time 5288.31ms 
iter 1508: loss 2.9005, time 5294.74ms 
iter 1509: loss 2.8273, time 5292.18ms 
iter 1510: loss 3.1225, time 5296.38ms 
iter 1511: loss 2.8929, time 5295.19ms 
iter 1512: loss 3.0401, time 5276.91ms 
iter 1513: loss 3.1807, time 5294.37ms 
iter 1514: loss 3.0650, time 5289.66ms 
iter 1515: loss 2.8861, time 5295.25ms 
iter 1516: loss 2.7902, time 5291.51ms 
iter 1517: loss 3.0925, time 5284.32ms 
iter 1518: loss 3.0759, time 5290.71ms 
iter 1519: loss 2.9813, time 5073.42ms 
iter 1520: loss 2.8646, time 5275.85ms 
iter 1521: loss 2.9249, time 5292.09ms 
iter 1522: loss 2.8851, time 5295.73ms 
iter 1523: loss 3.2078, time 5299.19ms 
iter 1524: loss 2.9017, time 5292.84ms 
iter 1525: loss 2.9685, time 5300.05ms 
iter 1526: loss 2.8562, time 5155.34ms 
iter 1527: loss 2.9503, time 5231.23ms 
iter 1528: loss 3.0509, time 5290.24ms 
iter 1529: loss 3.2099, time 5290.28ms 
iter 1530: loss 2.9685, time 5300.37ms 
iter 1531: loss 2.8717, time 5294.99ms 
iter 1532: loss 3.0039, time 5285.02ms 
iter 1533: loss 2.9822, time 5040.21ms 
iter 1534: loss 2.8443, time 5017.32ms 
iter 1535: loss 2.8217, time 5053.17ms 
iter 1536: loss 2.9619, time 5045.00ms 
iter 1537: loss 3.0767, time 5068.52ms 
iter 1538: loss 2.7942, time 5047.31ms 
iter 1539: loss 3.0036, time 5104.92ms 
iter 1540: loss 2.8803, time 5061.87ms 
iter 1541: loss 3.1533, time 5030.15ms 
iter 1542: loss 2.9396, time 5071.20ms 
iter 1543: loss 2.9432, time 5068.18ms 
iter 1544: loss 2.9121, time 5073.38ms 
iter 1545: loss 2.7792, time 5060.56ms 
iter 1546: loss 2.9580, time 5057.45ms 
iter 1547: loss 2.9097, time 5193.56ms 
iter 1548: loss 2.8654, time 5037.45ms 
iter 1549: loss 3.0061, time 5250.82ms 
step 1550: train loss 2.9688, val loss 3.0278
iter 1550: loss 2.8337, time 20109.85ms 
iter 1551: loss 3.0605, time 5288.18ms 
iter 1552: loss 2.9651, time 5242.10ms 
iter 1553: loss 3.1316, time 5284.06ms 
iter 1554: loss 3.1052, time 5216.17ms 
iter 1555: loss 3.1361, time 5242.74ms 
iter 1556: loss 3.1056, time 5288.52ms 
iter 1557: loss 2.8825, time 5285.05ms 
iter 1558: loss 2.9958, time 5293.99ms 
iter 1559: loss 2.7497, time 5300.50ms 
iter 1560: loss 3.0766, time 5289.36ms 
iter 1561: loss 3.1846, time 5292.96ms 
iter 1562: loss 2.8513, time 5289.43ms 
iter 1563: loss 3.1179, time 5305.55ms 
iter 1564: loss 2.7873, time 5310.63ms 
iter 1565: loss 3.0265, time 5291.81ms 
iter 1566: loss 3.0760, time 5248.96ms 
iter 1567: loss 3.0881, time 5295.00ms 
iter 1568: loss 2.9613, time 5301.09ms 
iter 1569: loss 3.0279, time 5300.95ms 
iter 1570: loss 2.8399, time 5295.12ms 
iter 1571: loss 3.1518, time 5292.13ms 
iter 1572: loss 3.0198, time 5291.82ms 
iter 1573: loss 3.0290, time 5287.09ms 
iter 1574: loss 3.0207, time 5258.42ms 
iter 1575: loss 2.8477, time 5288.10ms 
iter 1576: loss 3.0561, time 5303.82ms 
iter 1577: loss 3.1900, time 5310.06ms 
iter 1578: loss 2.9558, time 5281.74ms 
iter 1579: loss 2.9599, time 5283.24ms 
iter 1580: loss 2.8636, time 5281.84ms 
iter 1581: loss 2.9145, time 5205.99ms 
iter 1582: loss 2.9601, time 5261.68ms 
iter 1583: loss 2.9280, time 5279.49ms 
iter 1584: loss 3.0760, time 5282.53ms 
iter 1585: loss 2.9348, time 5282.37ms 
iter 1586: loss 2.9423, time 5262.31ms 
iter 1587: loss 3.0125, time 5285.18ms 
iter 1588: loss 2.8019, time 5280.11ms 
iter 1589: loss 3.0767, time 5288.10ms 
iter 1590: loss 2.9340, time 5263.10ms 
iter 1591: loss 2.8904, time 5282.16ms 
iter 1592: loss 2.9322, time 5239.88ms 
iter 1593: loss 2.9863, time 5223.61ms 
iter 1594: loss 3.1388, time 5261.78ms 
iter 1595: loss 3.0885, time 5240.50ms 
iter 1596: loss 3.1149, time 5274.73ms 
iter 1597: loss 2.9586, time 5282.05ms 
iter 1598: loss 2.6409, time 5200.01ms 
iter 1599: loss 2.8214, time 5271.49ms 
step 1600: train loss 2.9623, val loss 3.0150
iter 1600: loss 2.8568, time 20132.60ms 
iter 1601: loss 3.0695, time 5278.63ms 
iter 1602: loss 2.9496, time 5272.00ms 
iter 1603: loss 3.0595, time 5288.84ms 
iter 1604: loss 2.7913, time 5276.80ms 
iter 1605: loss 2.8547, time 5281.90ms 
iter 1606: loss 2.9087, time 5265.88ms 
iter 1607: loss 3.0894, time 5280.59ms 
iter 1608: loss 2.9893, time 5239.83ms 
iter 1609: loss 2.7198, time 5269.07ms 
iter 1610: loss 2.8460, time 5292.99ms 
iter 1611: loss 2.8513, time 5262.46ms 
iter 1612: loss 2.8814, time 5280.41ms 
iter 1613: loss 2.9054, time 5275.74ms 
iter 1614: loss 2.7839, time 5278.40ms 
iter 1615: loss 3.1204, time 5237.43ms 
iter 1616: loss 2.9103, time 5118.44ms 
iter 1617: loss 2.9292, time 5076.75ms 
iter 1618: loss 3.0363, time 5271.74ms 
iter 1619: loss 2.9806, time 5282.81ms 
iter 1620: loss 3.0281, time 5289.62ms 
iter 1621: loss 2.8675, time 5286.80ms 
iter 1622: loss 3.0468, time 5286.39ms 
iter 1623: loss 2.9093, time 5288.56ms 
iter 1624: loss 3.1418, time 5260.50ms 
iter 1625: loss 2.7807, time 5293.84ms 
iter 1626: loss 2.7728, time 5287.55ms 
iter 1627: loss 2.9583, time 5253.18ms 
iter 1628: loss 2.6904, time 5116.13ms 
iter 1629: loss 2.9411, time 5102.54ms 
iter 1630: loss 2.9248, time 5283.65ms 
iter 1631: loss 3.0263, time 5306.25ms 
iter 1632: loss 3.0436, time 5290.52ms 
iter 1633: loss 3.0242, time 5305.24ms 
iter 1634: loss 2.9909, time 5248.02ms 
iter 1635: loss 3.0700, time 5292.82ms 
iter 1636: loss 2.9383, time 5265.31ms 
iter 1637: loss 3.0542, time 5283.99ms 
iter 1638: loss 2.9485, time 5301.63ms 
iter 1639: loss 3.1597, time 5274.89ms 
iter 1640: loss 2.9331, time 5311.49ms 
iter 1641: loss 2.9065, time 5281.87ms 
iter 1642: loss 3.0971, time 5297.23ms 
iter 1643: loss 3.0872, time 5227.87ms 
iter 1644: loss 2.8423, time 5247.93ms 
iter 1645: loss 2.9886, time 5249.55ms 
iter 1646: loss 2.9053, time 5282.39ms 
iter 1647: loss 2.9208, time 5217.50ms 
iter 1648: loss 2.9116, time 5208.65ms 
iter 1649: loss 2.8297, time 5310.69ms 
step 1650: train loss 2.9507, val loss 3.0010
iter 1650: loss 2.9173, time 20141.02ms 
iter 1651: loss 3.1575, time 5346.65ms 
iter 1652: loss 2.7851, time 5320.47ms 
iter 1653: loss 3.0180, time 5244.00ms 
iter 1654: loss 3.0456, time 5273.68ms 
iter 1655: loss 3.0082, time 5254.87ms 
iter 1656: loss 2.7886, time 5293.38ms 
iter 1657: loss 2.9423, time 5249.84ms 
iter 1658: loss 3.1463, time 5071.16ms 
iter 1659: loss 2.9908, time 5065.63ms 
iter 1660: loss 2.8114, time 5154.90ms 
iter 1661: loss 2.9202, time 5249.89ms 
iter 1662: loss 2.9250, time 5293.32ms 
iter 1663: loss 2.9161, time 5086.30ms 
iter 1664: loss 2.9842, time 5073.09ms 
iter 1665: loss 3.0131, time 5074.72ms 
iter 1666: loss 2.8063, time 5077.73ms 
iter 1667: loss 2.9489, time 5230.00ms 
iter 1668: loss 2.8688, time 5331.46ms 
iter 1669: loss 2.9556, time 5256.60ms 
iter 1670: loss 2.9923, time 5251.94ms 
iter 1671: loss 2.8988, time 5293.78ms 
iter 1672: loss 3.0976, time 5197.92ms 
iter 1673: loss 2.8357, time 5213.75ms 
iter 1674: loss 3.0926, time 5299.03ms 
iter 1675: loss 3.0403, time 5292.30ms 
iter 1676: loss 3.1915, time 5311.32ms 
iter 1677: loss 2.9002, time 5288.24ms 
iter 1678: loss 2.8418, time 5311.51ms 
iter 1679: loss 2.8822, time 5320.60ms 
iter 1680: loss 2.9300, time 5305.49ms 
iter 1681: loss 2.9360, time 5310.47ms 
iter 1682: loss 2.9240, time 5298.39ms 
iter 1683: loss 2.9370, time 5278.15ms 
iter 1684: loss 2.9344, time 5320.61ms 
iter 1685: loss 3.0568, time 5305.01ms 
iter 1686: loss 2.7956, time 5220.64ms 
iter 1687: loss 2.9117, time 5272.05ms 
iter 1688: loss 2.9529, time 5267.88ms 
iter 1689: loss 2.7813, time 5303.27ms 
iter 1690: loss 3.2579, time 5256.01ms 
iter 1691: loss 2.8757, time 5216.29ms 
iter 1692: loss 2.9375, time 5143.31ms 
iter 1693: loss 2.6699, time 5241.58ms 
iter 1694: loss 2.9723, time 5264.90ms 
iter 1695: loss 2.9191, time 5300.49ms 
iter 1696: loss 2.7875, time 5306.69ms 
iter 1697: loss 2.8293, time 5312.45ms 
iter 1698: loss 3.1378, time 5290.81ms 
iter 1699: loss 2.9422, time 5216.39ms 
step 1700: train loss 2.9399, val loss 2.9882
iter 1700: loss 2.9683, time 20150.54ms 
iter 1701: loss 2.8466, time 5252.12ms 
iter 1702: loss 2.8146, time 5255.12ms 
iter 1703: loss 2.7158, time 5179.93ms 
iter 1704: loss 2.9271, time 5249.36ms 
iter 1705: loss 2.8682, time 5249.49ms 
iter 1706: loss 2.8917, time 5251.86ms 
iter 1707: loss 2.9384, time 5255.19ms 
iter 1708: loss 2.9532, time 5245.83ms 
iter 1709: loss 2.8828, time 5295.53ms 
iter 1710: loss 2.8635, time 5229.99ms 
iter 1711: loss 2.8530, time 5256.81ms 
iter 1712: loss 3.0111, time 5271.27ms 
iter 1713: loss 2.9201, time 5272.51ms 
iter 1714: loss 3.2044, time 5307.36ms 
iter 1715: loss 3.0072, time 5298.04ms 
iter 1716: loss 2.8097, time 5316.39ms 
iter 1717: loss 3.1238, time 5299.25ms 
iter 1718: loss 2.9273, time 5303.49ms 
iter 1719: loss 2.7229, time 5268.99ms 
iter 1720: loss 2.7677, time 5247.88ms 
iter 1721: loss 2.8426, time 5250.35ms 
iter 1722: loss 2.7681, time 5252.87ms 
iter 1723: loss 2.8570, time 5220.42ms 
iter 1724: loss 2.9822, time 5247.10ms 
iter 1725: loss 3.0758, time 5245.70ms 
iter 1726: loss 3.0940, time 5250.46ms 
iter 1727: loss 2.8761, time 5252.69ms 
iter 1728: loss 2.8853, time 5225.89ms 
iter 1729: loss 2.7149, time 5249.38ms 
iter 1730: loss 2.9626, time 5259.87ms 
iter 1731: loss 2.9680, time 5250.70ms 
iter 1732: loss 2.8391, time 5257.94ms 
iter 1733: loss 2.9629, time 5251.29ms 
iter 1734: loss 3.0430, time 5255.30ms 
iter 1735: loss 2.9583, time 5255.31ms 
iter 1736: loss 2.8508, time 5247.32ms 
iter 1737: loss 2.7431, time 5258.73ms 
iter 1738: loss 2.9165, time 5249.85ms 
iter 1739: loss 2.9407, time 5250.28ms 
iter 1740: loss 3.0386, time 5250.29ms 
iter 1741: loss 2.8281, time 5251.57ms 
iter 1742: loss 2.9895, time 5253.06ms 
iter 1743: loss 2.8361, time 5173.59ms 
iter 1744: loss 3.0737, time 5250.94ms 
iter 1745: loss 2.8604, time 5253.29ms 
iter 1746: loss 3.1274, time 5248.78ms 
iter 1747: loss 2.9760, time 5261.24ms 
iter 1748: loss 2.7912, time 5247.11ms 
iter 1749: loss 2.9506, time 5236.57ms 
step 1750: train loss 2.9271, val loss 2.9998
iter 1750: loss 3.1858, time 19945.44ms 
iter 1751: loss 2.9081, time 5251.74ms 
iter 1752: loss 2.8560, time 5242.45ms 
iter 1753: loss 2.9908, time 5232.40ms 
iter 1754: loss 2.8356, time 5256.61ms 
iter 1755: loss 3.0266, time 5265.42ms 
iter 1756: loss 2.9101, time 5250.83ms 
iter 1757: loss 3.0171, time 5259.92ms 
iter 1758: loss 2.9570, time 5257.78ms 
iter 1759: loss 2.9619, time 5248.12ms 
iter 1760: loss 2.9901, time 5246.21ms 
iter 1761: loss 3.0727, time 5257.22ms 
iter 1762: loss 2.7564, time 5211.96ms 
iter 1763: loss 2.7239, time 5250.73ms 
iter 1764: loss 2.7633, time 5256.71ms 
iter 1765: loss 2.9997, time 5269.88ms 
iter 1766: loss 2.8558, time 5277.14ms 
iter 1767: loss 3.0364, time 5305.06ms 
iter 1768: loss 2.9707, time 5291.51ms 
iter 1769: loss 3.1281, time 5267.69ms 
iter 1770: loss 2.9555, time 5286.59ms 
iter 1771: loss 3.0170, time 5296.75ms 
iter 1772: loss 2.8448, time 5271.52ms 
iter 1773: loss 2.8774, time 5245.61ms 
iter 1774: loss 2.9248, time 5196.39ms 
iter 1775: loss 3.0679, time 5219.89ms 
iter 1776: loss 2.9634, time 5315.90ms 
iter 1777: loss 3.1229, time 5256.85ms 
iter 1778: loss 3.1227, time 5295.39ms 
iter 1779: loss 2.8038, time 5272.74ms 
iter 1780: loss 3.1484, time 5297.54ms 
iter 1781: loss 2.8768, time 5295.08ms 
iter 1782: loss 2.9558, time 5264.94ms 
iter 1783: loss 3.2531, time 5114.07ms 
iter 1784: loss 2.8967, time 5062.43ms 
iter 1785: loss 3.0820, time 5064.34ms 
iter 1786: loss 2.8183, time 5274.93ms 
iter 1787: loss 2.9263, time 5305.67ms 
iter 1788: loss 3.0760, time 5299.44ms 
iter 1789: loss 2.8220, time 5291.67ms 
iter 1790: loss 3.0861, time 5294.76ms 
iter 1791: loss 2.8131, time 5298.27ms 
iter 1792: loss 3.0085, time 5289.09ms 
iter 1793: loss 3.0491, time 5298.41ms 
iter 1794: loss 3.0446, time 5293.53ms 
iter 1795: loss 2.9733, time 5293.10ms 
iter 1796: loss 2.9062, time 5303.50ms 
iter 1797: loss 2.9567, time 5241.16ms 
iter 1798: loss 2.8704, time 5312.18ms 
iter 1799: loss 2.8093, time 5315.31ms 
step 1800: train loss 2.9029, val loss 2.9834
iter 1800: loss 3.0663, time 20104.14ms 
iter 1801: loss 3.1431, time 5310.30ms 
iter 1802: loss 2.8606, time 5298.32ms 
iter 1803: loss 2.9277, time 5304.45ms 
iter 1804: loss 2.9964, time 5306.13ms 
iter 1805: loss 3.0764, time 5298.08ms 
iter 1806: loss 2.7536, time 5299.06ms 
iter 1807: loss 2.8184, time 5309.64ms 
iter 1808: loss 2.8601, time 5294.94ms 
iter 1809: loss 2.8232, time 5292.08ms 
iter 1810: loss 2.7361, time 5297.00ms 
iter 1811: loss 2.9843, time 5306.88ms 
iter 1812: loss 2.9062, time 5299.62ms 
iter 1813: loss 2.9564, time 5318.70ms 
iter 1814: loss 2.8224, time 5303.13ms 
iter 1815: loss 3.1223, time 5306.98ms 
iter 1816: loss 3.0165, time 5307.91ms 
iter 1817: loss 2.8255, time 5315.87ms 
iter 1818: loss 2.8146, time 5311.99ms 
iter 1819: loss 2.8576, time 5309.43ms 
iter 1820: loss 2.9250, time 5310.10ms 
iter 1821: loss 2.9272, time 5310.59ms 
iter 1822: loss 2.8990, time 5304.46ms 
iter 1823: loss 3.1122, time 5299.47ms 
iter 1824: loss 2.8597, time 5299.74ms 
iter 1825: loss 2.8250, time 5296.74ms 
iter 1826: loss 2.9237, time 5306.63ms 
iter 1827: loss 3.0837, time 5300.54ms 
iter 1828: loss 3.0636, time 5303.23ms 
iter 1829: loss 2.9410, time 5161.63ms 
iter 1830: loss 2.9740, time 5111.18ms 
iter 1831: loss 2.9272, time 5080.39ms 
iter 1832: loss 2.9413, time 5171.79ms 
iter 1833: loss 3.1173, time 5191.32ms 
iter 1834: loss 2.8965, time 5114.96ms 
iter 1835: loss 2.8878, time 5075.90ms 
iter 1836: loss 2.9548, time 5092.68ms 
iter 1837: loss 2.7949, time 5296.50ms 
iter 1838: loss 3.1622, time 5298.72ms 
iter 1839: loss 3.0415, time 5302.50ms 
iter 1840: loss 2.8648, time 5282.65ms 
iter 1841: loss 2.9213, time 5286.03ms 
iter 1842: loss 2.7932, time 5299.84ms 
iter 1843: loss 2.8618, time 5304.28ms 
iter 1844: loss 2.9337, time 5264.84ms 
iter 1845: loss 2.9297, time 5310.89ms 
iter 1846: loss 2.9708, time 5308.93ms 
iter 1847: loss 2.8301, time 5306.20ms 
iter 1848: loss 3.0519, time 5296.47ms 
iter 1849: loss 2.9066, time 5236.37ms 
step 1850: train loss 2.8855, val loss 2.9715
iter 1850: loss 2.9586, time 20158.12ms 
iter 1851: loss 2.8698, time 5304.43ms 
iter 1852: loss 2.9283, time 5302.91ms 
iter 1853: loss 2.5737, time 5302.32ms 
iter 1854: loss 2.8054, time 5307.62ms 
iter 1855: loss 2.9919, time 5307.27ms 
iter 1856: loss 3.0597, time 5305.58ms 
iter 1857: loss 3.1122, time 5307.82ms 
iter 1858: loss 3.0864, time 5300.39ms 
iter 1859: loss 3.0490, time 5155.65ms 
iter 1860: loss 2.8045, time 5243.71ms 
iter 1861: loss 2.9274, time 5279.19ms 
iter 1862: loss 2.9327, time 5317.73ms 
iter 1863: loss 2.9592, time 5160.88ms 
iter 1864: loss 2.6155, time 5190.90ms 
iter 1865: loss 3.0273, time 5282.19ms 
iter 1866: loss 2.9307, time 5235.64ms 
iter 1867: loss 2.9308, time 5307.40ms 
iter 1868: loss 2.9967, time 5270.88ms 
iter 1869: loss 2.8097, time 5191.78ms 
iter 1870: loss 2.9893, time 5247.65ms 
iter 1871: loss 2.7889, time 5244.62ms 
iter 1872: loss 2.7463, time 5236.45ms 
iter 1873: loss 2.8959, time 5258.87ms 
iter 1874: loss 2.9542, time 5314.10ms 
iter 1875: loss 2.9117, time 5330.64ms 
iter 1876: loss 2.7966, time 5270.81ms 
iter 1877: loss 3.0711, time 5293.07ms 
iter 1878: loss 2.9026, time 5290.38ms 
iter 1879: loss 3.0028, time 5299.71ms 
iter 1880: loss 2.8811, time 5303.31ms 
iter 1881: loss 2.7559, time 5182.98ms 
iter 1882: loss 2.8646, time 5157.17ms 
iter 1883: loss 2.8635, time 5295.92ms 
iter 1884: loss 3.1013, time 5297.55ms 
iter 1885: loss 2.9893, time 5286.23ms 
iter 1886: loss 3.0206, time 5309.21ms 
iter 1887: loss 2.8784, time 5294.29ms 
iter 1888: loss 2.8806, time 5301.47ms 
iter 1889: loss 2.7981, time 5313.32ms 
iter 1890: loss 2.7811, time 5298.00ms 
iter 1891: loss 3.0200, time 5052.43ms 
iter 1892: loss 2.7835, time 5274.60ms 
iter 1893: loss 2.9529, time 5352.51ms 
iter 1894: loss 2.8420, time 5300.55ms 
iter 1895: loss 2.8618, time 5302.71ms 
iter 1896: loss 2.7949, time 5272.76ms 
iter 1897: loss 2.8774, time 5308.19ms 
iter 1898: loss 2.8550, time 5243.76ms 
iter 1899: loss 2.8547, time 5303.06ms 
step 1900: train loss 2.8982, val loss 2.9733
iter 1900: loss 3.0865, time 20226.79ms 
iter 1901: loss 2.9844, time 5308.30ms 
iter 1902: loss 2.8782, time 5302.97ms 
iter 1903: loss 2.8338, time 5296.13ms 
iter 1904: loss 3.1149, time 5293.36ms 
iter 1905: loss 2.6949, time 5298.78ms 
iter 1906: loss 2.9275, time 5284.23ms 
iter 1907: loss 3.0255, time 5292.72ms 
iter 1908: loss 2.7319, time 5301.83ms 
iter 1909: loss 2.7382, time 5287.66ms 
iter 1910: loss 2.8781, time 5292.28ms 
iter 1911: loss 2.8743, time 5291.31ms 
iter 1912: loss 2.7877, time 5245.73ms 
iter 1913: loss 3.0799, time 5303.84ms 
iter 1914: loss 2.7357, time 5302.77ms 
iter 1915: loss 2.9176, time 5304.53ms 
iter 1916: loss 2.8381, time 5307.62ms 
iter 1917: loss 2.8999, time 5305.10ms 
iter 1918: loss 2.7965, time 5315.41ms 
iter 1919: loss 3.0546, time 5307.64ms 
iter 1920: loss 2.6970, time 5296.46ms 
iter 1921: loss 2.8290, time 5295.56ms 
iter 1922: loss 2.6559, time 5303.49ms 
iter 1923: loss 2.7552, time 5301.92ms 
iter 1924: loss 2.8425, time 5303.51ms 
iter 1925: loss 2.8074, time 5286.67ms 
iter 1926: loss 2.8847, time 5290.47ms 
iter 1927: loss 2.6834, time 5288.12ms 
iter 1928: loss 2.9661, time 5291.76ms 
iter 1929: loss 2.9539, time 5296.51ms 
iter 1930: loss 2.9670, time 5297.51ms 
iter 1931: loss 2.8564, time 5304.95ms 
iter 1932: loss 2.9712, time 5295.48ms 
iter 1933: loss 3.0466, time 5269.14ms 
iter 1934: loss 2.9832, time 5055.59ms 
iter 1935: loss 2.6104, time 5057.78ms 
iter 1936: loss 2.6223, time 5061.46ms 
iter 1937: loss 2.9072, time 5075.55ms 
iter 1938: loss 2.8865, time 5067.56ms 
iter 1939: loss 3.0387, time 5075.25ms 
iter 1940: loss 3.1055, time 5259.78ms 
iter 1941: loss 2.8183, time 5280.42ms 
iter 1942: loss 2.8858, time 5273.42ms 
iter 1943: loss 2.8389, time 5291.61ms 
iter 1944: loss 2.8169, time 5277.12ms 
iter 1945: loss 2.8982, time 5294.21ms 
iter 1946: loss 2.7174, time 5279.20ms 
iter 1947: loss 3.0313, time 5295.93ms 
iter 1948: loss 2.8799, time 5292.88ms 
iter 1949: loss 2.9924, time 5289.80ms 
step 1950: train loss 2.8910, val loss 2.9661
iter 1950: loss 2.8584, time 20184.60ms 
iter 1951: loss 2.8282, time 5309.15ms 
iter 1952: loss 2.8594, time 5300.00ms 
iter 1953: loss 2.9384, time 5304.28ms 
iter 1954: loss 2.8361, time 5295.17ms 
iter 1955: loss 2.8582, time 5297.60ms 
iter 1956: loss 2.7529, time 5294.99ms 
iter 1957: loss 3.1733, time 5305.54ms 
iter 1958: loss 3.0519, time 5292.38ms 
iter 1959: loss 2.7375, time 5273.42ms 
iter 1960: loss 2.8345, time 5294.59ms 
iter 1961: loss 2.7188, time 5298.04ms 
iter 1962: loss 2.7618, time 5294.42ms 
iter 1963: loss 2.8607, time 5300.88ms 
iter 1964: loss 2.8196, time 5272.88ms 
iter 1965: loss 2.9577, time 5310.29ms 
iter 1966: loss 2.8048, time 5322.27ms 
iter 1967: loss 2.8865, time 5304.57ms 
iter 1968: loss 2.9780, time 5302.75ms 
iter 1969: loss 3.1036, time 5310.51ms 
iter 1970: loss 2.9263, time 5293.77ms 
iter 1971: loss 2.8806, time 5282.03ms 
iter 1972: loss 2.9611, time 5285.55ms 
iter 1973: loss 2.9545, time 5272.01ms 
iter 1974: loss 2.7707, time 5293.77ms 
iter 1975: loss 2.9429, time 5291.56ms 
iter 1976: loss 3.0150, time 5297.09ms 
iter 1977: loss 2.8279, time 5290.77ms 
iter 1978: loss 2.8870, time 5286.80ms 
iter 1979: loss 2.7802, time 5275.22ms 
iter 1980: loss 2.7141, time 5269.82ms 
iter 1981: loss 2.7673, time 5285.10ms 
iter 1982: loss 2.7809, time 5282.02ms 
iter 1983: loss 2.8782, time 5299.24ms 
iter 1984: loss 2.7436, time 5287.38ms 
iter 1985: loss 2.8762, time 5298.12ms 
iter 1986: loss 2.8294, time 5174.85ms 
iter 1987: loss 2.8260, time 5295.98ms 
iter 1988: loss 2.9357, time 5302.82ms 
iter 1989: loss 3.0422, time 5306.59ms 
iter 1990: loss 3.0121, time 5311.38ms 
iter 1991: loss 2.7982, time 5286.26ms 
iter 1992: loss 2.8747, time 5301.38ms 
iter 1993: loss 2.7491, time 5309.65ms 
iter 1994: loss 2.7483, time 5250.95ms 
iter 1995: loss 3.0316, time 5302.91ms 
iter 1996: loss 2.8397, time 5263.35ms 
iter 1997: loss 3.0118, time 5246.61ms 
iter 1998: loss 2.9188, time 5306.20ms 
iter 1999: loss 2.8672, time 5312.45ms 
step 2000: train loss 2.8815, val loss 2.9515
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2000: loss 2.8862, time 21289.30ms 
iter 2001: loss 3.0219, time 5310.95ms 
iter 2002: loss 2.8082, time 5303.84ms 
iter 2003: loss 2.6089, time 5305.88ms 
iter 2004: loss 2.9557, time 5302.41ms 
iter 2005: loss 2.7650, time 5297.48ms 
iter 2006: loss 2.9323, time 5310.50ms 
iter 2007: loss 2.9719, time 5301.51ms 
iter 2008: loss 2.8545, time 5302.63ms 
iter 2009: loss 3.1312, time 5301.35ms 
iter 2010: loss 2.8354, time 5309.11ms 
iter 2011: loss 3.0510, time 5311.14ms 
iter 2012: loss 3.0618, time 5300.99ms 
iter 2013: loss 3.0926, time 5236.54ms 
iter 2014: loss 2.9954, time 5234.80ms 
iter 2015: loss 2.9050, time 5282.52ms 
iter 2016: loss 2.9414, time 5305.37ms 
iter 2017: loss 2.9151, time 5301.62ms 
iter 2018: loss 2.8090, time 5311.69ms 
iter 2019: loss 2.8692, time 5289.33ms 
iter 2020: loss 2.9570, time 5294.91ms 
iter 2021: loss 2.8141, time 5294.97ms 
iter 2022: loss 2.7286, time 5300.35ms 
iter 2023: loss 2.6966, time 5296.12ms 
iter 2024: loss 2.9324, time 5241.77ms 
iter 2025: loss 2.7336, time 5218.64ms 
iter 2026: loss 2.7738, time 5255.49ms 
iter 2027: loss 3.0149, time 5297.61ms 
iter 2028: loss 2.8923, time 5211.18ms 
iter 2029: loss 2.7458, time 5241.63ms 
iter 2030: loss 2.9143, time 5240.06ms 
iter 2031: loss 2.8595, time 5151.17ms 
iter 2032: loss 2.8835, time 5159.25ms 
iter 2033: loss 2.8207, time 5204.68ms 
iter 2034: loss 2.8159, time 5224.86ms 
iter 2035: loss 2.8498, time 5279.19ms 
iter 2036: loss 3.0000, time 5291.21ms 
iter 2037: loss 2.9787, time 5307.05ms 
iter 2038: loss 2.8619, time 5302.89ms 
iter 2039: loss 2.8573, time 5313.60ms 
iter 2040: loss 2.8473, time 5098.55ms 
iter 2041: loss 2.9739, time 5240.54ms 
iter 2042: loss 2.7757, time 5307.10ms 
iter 2043: loss 2.9257, time 5304.56ms 
iter 2044: loss 2.9131, time 5236.50ms 
iter 2045: loss 2.9129, time 5289.22ms 
iter 2046: loss 2.8056, time 5287.49ms 
iter 2047: loss 2.9647, time 5299.92ms 
iter 2048: loss 2.9059, time 5282.20ms 
iter 2049: loss 2.7787, time 5289.99ms 
step 2050: train loss 2.8797, val loss 2.9510
iter 2050: loss 2.8849, time 20150.63ms 
iter 2051: loss 2.8185, time 5262.65ms 
iter 2052: loss 2.8308, time 5292.24ms 
iter 2053: loss 2.8012, time 5304.65ms 
iter 2054: loss 2.8408, time 5297.00ms 
iter 2055: loss 2.9650, time 5317.81ms 
iter 2056: loss 3.0040, time 5298.49ms 
iter 2057: loss 2.7819, time 5305.64ms 
iter 2058: loss 2.9217, time 5275.14ms 
iter 2059: loss 2.9679, time 5304.69ms 
iter 2060: loss 2.7477, time 5296.47ms 
iter 2061: loss 2.9962, time 5096.17ms 
iter 2062: loss 2.8050, time 5121.13ms 
iter 2063: loss 2.7680, time 5296.45ms 
iter 2064: loss 2.9622, time 5299.36ms 
iter 2065: loss 2.7694, time 5287.73ms 
iter 2066: loss 2.8314, time 5297.71ms 
iter 2067: loss 2.9965, time 5302.72ms 
iter 2068: loss 2.7084, time 5294.13ms 
iter 2069: loss 2.7102, time 5286.71ms 
iter 2070: loss 2.7979, time 5305.50ms 
iter 2071: loss 2.7481, time 5327.45ms 
iter 2072: loss 2.8493, time 5298.98ms 
iter 2073: loss 2.8997, time 5288.29ms 
iter 2074: loss 2.8658, time 5297.45ms 
iter 2075: loss 2.9399, time 5288.30ms 
iter 2076: loss 2.7129, time 5296.97ms 
iter 2077: loss 2.7304, time 5294.64ms 
iter 2078: loss 2.9066, time 5276.55ms 
iter 2079: loss 2.7384, time 5290.79ms 
iter 2080: loss 3.0546, time 5298.17ms 
iter 2081: loss 2.9231, time 5257.41ms 
iter 2082: loss 2.8332, time 5271.45ms 
iter 2083: loss 2.8006, time 5270.50ms 
iter 2084: loss 2.9201, time 5285.72ms 
iter 2085: loss 2.8429, time 5296.93ms 
iter 2086: loss 2.9548, time 5293.02ms 
iter 2087: loss 2.9444, time 5287.98ms 
iter 2088: loss 2.8394, time 5296.76ms 
iter 2089: loss 2.9009, time 5305.50ms 
iter 2090: loss 2.8285, time 5298.65ms 
iter 2091: loss 3.0705, time 5300.22ms 
iter 2092: loss 2.8996, time 5295.45ms 
iter 2093: loss 2.7185, time 5301.70ms 
iter 2094: loss 3.0979, time 5300.56ms 
iter 2095: loss 3.0714, time 5295.82ms 
iter 2096: loss 3.0149, time 5296.66ms 
iter 2097: loss 2.7309, time 5308.33ms 
iter 2098: loss 3.0699, time 5192.78ms 
iter 2099: loss 2.8960, time 5055.70ms 
step 2100: train loss 2.8489, val loss 2.9515
iter 2100: loss 2.7544, time 19908.27ms 
iter 2101: loss 2.6925, time 5278.70ms 
iter 2102: loss 2.8233, time 5312.18ms 
iter 2103: loss 2.9831, time 5300.98ms 
iter 2104: loss 3.0891, time 5300.18ms 
iter 2105: loss 2.6903, time 5293.75ms 
iter 2106: loss 2.8674, time 5297.90ms 
iter 2107: loss 2.7985, time 5292.73ms 
iter 2108: loss 2.8381, time 5284.58ms 
iter 2109: loss 2.8001, time 5167.05ms 
iter 2110: loss 2.7327, time 5266.28ms 
iter 2111: loss 2.6517, time 5315.33ms 
iter 2112: loss 2.9103, time 5303.86ms 
iter 2113: loss 2.9692, time 5191.44ms 
iter 2114: loss 2.8791, time 5291.85ms 
iter 2115: loss 2.7971, time 5064.53ms 
iter 2116: loss 3.0964, time 5243.30ms 
iter 2117: loss 2.9098, time 5297.74ms 
iter 2118: loss 2.8555, time 5300.02ms 
iter 2119: loss 2.9798, time 5304.41ms 
iter 2120: loss 2.9240, time 5316.43ms 
iter 2121: loss 2.9508, time 5300.92ms 
iter 2122: loss 2.8475, time 5314.73ms 
iter 2123: loss 2.8811, time 5325.87ms 
iter 2124: loss 2.8299, time 5305.46ms 
iter 2125: loss 2.8072, time 5298.80ms 
iter 2126: loss 2.7735, time 5261.66ms 
iter 2127: loss 2.8921, time 5283.94ms 
iter 2128: loss 2.7994, time 5300.99ms 
iter 2129: loss 2.8705, time 5307.71ms 
iter 2130: loss 2.8675, time 5305.37ms 
iter 2131: loss 2.9001, time 5313.81ms 
iter 2132: loss 2.7382, time 5310.16ms 
iter 2133: loss 2.8465, time 5309.34ms 
iter 2134: loss 2.8819, time 5295.85ms 
iter 2135: loss 2.7704, time 5297.14ms 
iter 2136: loss 2.8635, time 5295.08ms 
iter 2137: loss 2.7435, time 5296.70ms 
iter 2138: loss 2.9446, time 5293.71ms 
iter 2139: loss 2.8624, time 5298.82ms 
iter 2140: loss 2.7357, time 5302.02ms 
iter 2141: loss 2.7698, time 5313.46ms 
iter 2142: loss 2.8736, time 5313.88ms 
iter 2143: loss 2.8587, time 5266.02ms 
iter 2144: loss 3.0195, time 5293.40ms 
iter 2145: loss 2.7809, time 5302.05ms 
iter 2146: loss 2.9944, time 5294.47ms 
iter 2147: loss 2.7965, time 5261.79ms 
iter 2148: loss 2.9241, time 5298.92ms 
iter 2149: loss 2.8407, time 5285.47ms 
step 2150: train loss 2.8500, val loss 2.9435
iter 2150: loss 2.8329, time 20180.06ms 
iter 2151: loss 2.9273, time 5307.82ms 
iter 2152: loss 3.1487, time 5302.50ms 
iter 2153: loss 2.8411, time 5291.45ms 
iter 2154: loss 2.9203, time 5086.49ms 
iter 2155: loss 2.8633, time 5125.61ms 
iter 2156: loss 2.8836, time 5265.47ms 
iter 2157: loss 2.6260, time 5177.85ms 
iter 2158: loss 2.7724, time 5319.64ms 
iter 2159: loss 2.7343, time 5191.56ms 
iter 2160: loss 2.6578, time 5344.10ms 
iter 2161: loss 2.8335, time 5333.86ms 
iter 2162: loss 2.9264, time 5332.19ms 
iter 2163: loss 2.8485, time 5322.77ms 
iter 2164: loss 2.7392, time 5338.13ms 
iter 2165: loss 2.6790, time 5315.65ms 
iter 2166: loss 2.8535, time 5303.62ms 
iter 2167: loss 2.8077, time 5304.66ms 
iter 2168: loss 2.7489, time 5309.54ms 
iter 2169: loss 2.7566, time 5320.63ms 
iter 2170: loss 3.1267, time 5299.08ms 
iter 2171: loss 2.7786, time 5263.55ms 
iter 2172: loss 2.8060, time 5290.40ms 
iter 2173: loss 2.9875, time 5314.50ms 
iter 2174: loss 2.7758, time 5269.15ms 
iter 2175: loss 2.8219, time 5286.54ms 
iter 2176: loss 2.8091, time 5235.76ms 
iter 2177: loss 2.6928, time 5239.90ms 
iter 2178: loss 2.8389, time 5295.06ms 
iter 2179: loss 2.9015, time 5299.23ms 
iter 2180: loss 2.9241, time 5277.57ms 
iter 2181: loss 2.6609, time 5293.17ms 
iter 2182: loss 2.6328, time 5300.46ms 
iter 2183: loss 2.8593, time 5302.73ms 
iter 2184: loss 2.7803, time 5299.74ms 
iter 2185: loss 2.8677, time 5297.34ms 
iter 2186: loss 2.9440, time 5316.62ms 
iter 2187: loss 2.7964, time 5258.82ms 
iter 2188: loss 2.9697, time 5293.24ms 
iter 2189: loss 2.8401, time 5335.18ms 
iter 2190: loss 2.9521, time 5305.66ms 
iter 2191: loss 2.9294, time 5303.05ms 
iter 2192: loss 2.9996, time 5301.39ms 
iter 2193: loss 2.8380, time 5303.51ms 
iter 2194: loss 2.9232, time 5278.10ms 
iter 2195: loss 2.9367, time 5275.46ms 
iter 2196: loss 2.8420, time 5308.93ms 
iter 2197: loss 2.7943, time 5296.88ms 
iter 2198: loss 2.9459, time 5256.88ms 
iter 2199: loss 2.7707, time 5301.43ms 
step 2200: train loss 2.8356, val loss 2.9317
iter 2200: loss 2.9892, time 20199.90ms 
iter 2201: loss 2.7848, time 5316.61ms 
iter 2202: loss 2.8322, time 5301.60ms 
iter 2203: loss 2.7629, time 5227.73ms 
iter 2204: loss 2.8200, time 5183.07ms 
iter 2205: loss 2.8742, time 5300.36ms 
iter 2206: loss 2.9742, time 5285.83ms 
iter 2207: loss 2.7541, time 5309.62ms 
iter 2208: loss 2.8205, time 5308.61ms 
iter 2209: loss 2.9294, time 5280.23ms 
iter 2210: loss 3.0106, time 5307.06ms 
iter 2211: loss 2.6760, time 5277.75ms 
iter 2212: loss 3.0239, time 5304.96ms 
iter 2213: loss 2.9076, time 5292.88ms 
iter 2214: loss 2.8366, time 5265.84ms 
iter 2215: loss 2.9099, time 5243.87ms 
iter 2216: loss 2.9928, time 5251.95ms 
iter 2217: loss 2.9294, time 5277.17ms 
iter 2218: loss 2.9283, time 5260.52ms 
iter 2219: loss 2.9078, time 5267.57ms 
iter 2220: loss 2.7560, time 5271.49ms 
iter 2221: loss 2.7973, time 5295.29ms 
iter 2222: loss 2.9647, time 5291.23ms 
iter 2223: loss 2.7301, time 5308.56ms 
iter 2224: loss 2.8985, time 5282.26ms 
iter 2225: loss 2.9172, time 5312.16ms 
iter 2226: loss 2.8011, time 5313.88ms 
iter 2227: loss 2.8203, time 5300.13ms 
iter 2228: loss 2.6712, time 5304.84ms 
iter 2229: loss 2.7848, time 5319.83ms 
iter 2230: loss 2.7504, time 5312.25ms 
iter 2231: loss 2.8198, time 5283.00ms 
iter 2232: loss 2.8234, time 5294.18ms 
iter 2233: loss 2.7066, time 5292.51ms 
iter 2234: loss 2.6267, time 5296.90ms 
iter 2235: loss 2.7292, time 5303.26ms 
iter 2236: loss 2.8331, time 5283.29ms 
iter 2237: loss 2.9758, time 5266.61ms 
iter 2238: loss 2.8808, time 5231.46ms 
iter 2239: loss 2.8487, time 5248.19ms 
iter 2240: loss 2.8845, time 5242.56ms 
iter 2241: loss 2.9947, time 5245.16ms 
iter 2242: loss 2.8925, time 5242.77ms 
iter 2243: loss 2.8465, time 5250.77ms 
iter 2244: loss 2.7623, time 5260.81ms 
iter 2245: loss 2.9100, time 5244.68ms 
iter 2246: loss 2.9581, time 5247.07ms 
iter 2247: loss 2.6585, time 5252.73ms 
iter 2248: loss 2.8625, time 5222.40ms 
iter 2249: loss 2.8321, time 5246.78ms 
step 2250: train loss 2.8351, val loss 2.9245
iter 2250: loss 2.7926, time 19949.97ms 
iter 2251: loss 2.8084, time 5189.30ms 
iter 2252: loss 2.6705, time 5106.63ms 
iter 2253: loss 2.8965, time 5300.61ms 
iter 2254: loss 2.8718, time 5297.34ms 
iter 2255: loss 2.7787, time 5313.61ms 
iter 2256: loss 2.6673, time 5303.27ms 
iter 2257: loss 2.8551, time 5256.99ms 
iter 2258: loss 2.9742, time 5301.78ms 
iter 2259: loss 2.6981, time 5269.74ms 
iter 2260: loss 2.7763, time 5307.22ms 
iter 2261: loss 2.8441, time 5289.58ms 
iter 2262: loss 2.9563, time 5275.81ms 
iter 2263: loss 2.7578, time 5298.49ms 
iter 2264: loss 2.8303, time 5292.28ms 
iter 2265: loss 2.8436, time 5300.06ms 
iter 2266: loss 2.6025, time 5296.79ms 
iter 2267: loss 3.0460, time 5234.37ms 
iter 2268: loss 2.9041, time 5288.19ms 
iter 2269: loss 3.0579, time 5296.86ms 
iter 2270: loss 2.8231, time 5314.84ms 
iter 2271: loss 2.8243, time 5297.84ms 
iter 2272: loss 2.7960, time 5290.97ms 
iter 2273: loss 2.7305, time 5259.34ms 
iter 2274: loss 2.6209, time 5286.35ms 
iter 2275: loss 2.8678, time 5304.13ms 
iter 2276: loss 2.5729, time 5312.08ms 
iter 2277: loss 2.9930, time 5305.93ms 
iter 2278: loss 2.8775, time 5311.39ms 
iter 2279: loss 2.6904, time 5297.73ms 
iter 2280: loss 2.8258, time 5259.41ms 
iter 2281: loss 2.7235, time 5197.45ms 
iter 2282: loss 2.7413, time 5274.92ms 
iter 2283: loss 2.7399, time 5302.42ms 
iter 2284: loss 2.7735, time 5303.57ms 
iter 2285: loss 2.7372, time 5315.87ms 
iter 2286: loss 3.0574, time 5262.15ms 
iter 2287: loss 2.7727, time 5239.27ms 
iter 2288: loss 2.5762, time 5292.73ms 
iter 2289: loss 2.7935, time 5186.39ms 
iter 2290: loss 2.8574, time 5135.97ms 
iter 2291: loss 2.9506, time 5181.63ms 
iter 2292: loss 2.6912, time 5139.41ms 
iter 2293: loss 2.8434, time 5294.50ms 
iter 2294: loss 2.7640, time 5299.83ms 
iter 2295: loss 2.8821, time 5289.78ms 
iter 2296: loss 2.8141, time 5312.66ms 
iter 2297: loss 2.8589, time 5323.11ms 
iter 2298: loss 2.9751, time 5308.52ms 
iter 2299: loss 2.8603, time 5291.49ms 
step 2300: train loss 2.8231, val loss 2.9325
iter 2300: loss 2.7270, time 20167.39ms 
iter 2301: loss 2.9343, time 5246.39ms 
iter 2302: loss 2.6532, time 5309.25ms 
iter 2303: loss 2.9150, time 5312.98ms 
iter 2304: loss 2.8150, time 5239.22ms 
iter 2305: loss 2.8355, time 5309.15ms 
iter 2306: loss 2.8815, time 5247.01ms 
iter 2307: loss 2.9429, time 5212.59ms 
iter 2308: loss 2.8555, time 5297.75ms 
iter 2309: loss 2.7953, time 5312.41ms 
iter 2310: loss 2.7333, time 5316.79ms 
iter 2311: loss 2.7581, time 5302.25ms 
iter 2312: loss 2.7620, time 5285.35ms 
iter 2313: loss 2.6994, time 5291.44ms 
iter 2314: loss 2.5930, time 5306.57ms 
iter 2315: loss 2.6783, time 5346.37ms 
iter 2316: loss 3.0909, time 5308.51ms 
iter 2317: loss 2.8953, time 5277.06ms 
iter 2318: loss 2.8007, time 5293.45ms 
iter 2319: loss 3.0457, time 5306.98ms 
iter 2320: loss 3.0553, time 5302.86ms 
iter 2321: loss 2.8235, time 5202.12ms 
iter 2322: loss 2.9780, time 5300.41ms 
iter 2323: loss 2.6866, time 5303.76ms 
iter 2324: loss 2.8802, time 5286.46ms 
iter 2325: loss 2.7848, time 5307.87ms 
iter 2326: loss 2.9891, time 5328.67ms 
iter 2327: loss 3.0041, time 5283.09ms 
iter 2328: loss 2.5957, time 5292.70ms 
iter 2329: loss 2.9076, time 5303.64ms 
iter 2330: loss 2.6706, time 5289.20ms 
iter 2331: loss 2.7558, time 5291.01ms 
iter 2332: loss 2.7785, time 5300.83ms 
iter 2333: loss 2.9859, time 5290.65ms 
iter 2334: loss 2.7229, time 5273.52ms 
iter 2335: loss 2.7886, time 5296.69ms 
iter 2336: loss 2.8872, time 5287.08ms 
iter 2337: loss 2.8552, time 5265.63ms 
iter 2338: loss 2.9027, time 5197.49ms 
iter 2339: loss 2.7802, time 5281.99ms 
iter 2340: loss 2.7951, time 5331.30ms 
iter 2341: loss 2.9494, time 5322.62ms 
iter 2342: loss 2.7583, time 5350.08ms 
iter 2343: loss 2.8134, time 5326.64ms 
iter 2344: loss 2.8121, time 5302.42ms 
iter 2345: loss 2.8435, time 5117.42ms 
iter 2346: loss 2.6597, time 5236.46ms 
iter 2347: loss 2.7456, time 5302.88ms 
iter 2348: loss 2.7491, time 5242.88ms 
iter 2349: loss 2.7611, time 5196.82ms 
step 2350: train loss 2.8035, val loss 2.9209
iter 2350: loss 2.9115, time 20213.21ms 
iter 2351: loss 2.8683, time 5277.54ms 
iter 2352: loss 2.8127, time 5298.65ms 
iter 2353: loss 3.0962, time 5057.88ms 
iter 2354: loss 3.0916, time 5088.68ms 
iter 2355: loss 2.7232, time 5291.78ms 
iter 2356: loss 2.7995, time 5137.81ms 
iter 2357: loss 2.8763, time 5038.67ms 
iter 2358: loss 2.9132, time 5033.45ms 
iter 2359: loss 2.8431, time 5269.90ms 
iter 2360: loss 2.7266, time 5295.78ms 
iter 2361: loss 2.7330, time 5117.41ms 
iter 2362: loss 2.8703, time 5090.45ms 
iter 2363: loss 2.8475, time 5199.61ms 
iter 2364: loss 2.7253, time 5303.11ms 
iter 2365: loss 2.7861, time 5310.49ms 
iter 2366: loss 2.8696, time 5301.01ms 
iter 2367: loss 2.8158, time 5300.08ms 
iter 2368: loss 2.7079, time 5192.12ms 
iter 2369: loss 2.7459, time 5239.26ms 
iter 2370: loss 2.9032, time 5252.46ms 
iter 2371: loss 2.8879, time 5285.55ms 
iter 2372: loss 2.8013, time 5080.01ms 
iter 2373: loss 2.6751, time 5193.46ms 
iter 2374: loss 2.7314, time 5220.38ms 
iter 2375: loss 2.9536, time 5289.65ms 
iter 2376: loss 2.7792, time 5281.34ms 
iter 2377: loss 2.5373, time 5301.15ms 
iter 2378: loss 2.9978, time 5288.82ms 
iter 2379: loss 2.7549, time 5319.02ms 
iter 2380: loss 2.6400, time 5206.30ms 
iter 2381: loss 2.9105, time 5160.95ms 
iter 2382: loss 2.7263, time 5313.65ms 
iter 2383: loss 2.8335, time 5259.17ms 
iter 2384: loss 2.7433, time 5304.47ms 
iter 2385: loss 2.8692, time 5137.42ms 
iter 2386: loss 2.8697, time 5193.01ms 
iter 2387: loss 2.9390, time 4979.65ms 
iter 2388: loss 3.1210, time 4978.84ms 
iter 2389: loss 2.7011, time 5019.20ms 
iter 2390: loss 2.8347, time 5008.92ms 
iter 2391: loss 2.7750, time 5153.87ms 
iter 2392: loss 2.9686, time 5155.01ms 
iter 2393: loss 2.6607, time 5125.20ms 
iter 2394: loss 2.6582, time 5083.79ms 
iter 2395: loss 2.8982, time 5143.54ms 
iter 2396: loss 2.6961, time 5166.68ms 
iter 2397: loss 2.8201, time 5306.53ms 
iter 2398: loss 2.9050, time 5306.62ms 
iter 2399: loss 2.9731, time 5320.30ms 
step 2400: train loss 2.8255, val loss 2.9280
iter 2400: loss 2.9485, time 20029.57ms 
iter 2401: loss 2.9333, time 5004.60ms 
iter 2402: loss 2.6525, time 5152.27ms 
iter 2403: loss 2.7800, time 5161.84ms 
iter 2404: loss 2.9079, time 5279.62ms 
iter 2405: loss 2.9203, time 5282.51ms 
iter 2406: loss 2.7251, time 5227.70ms 
iter 2407: loss 2.7977, time 5305.70ms 
iter 2408: loss 2.8065, time 5297.92ms 
iter 2409: loss 2.9054, time 5144.60ms 
iter 2410: loss 2.8539, time 5090.17ms 
iter 2411: loss 2.8360, time 5085.72ms 
iter 2412: loss 2.6636, time 5002.22ms 
iter 2413: loss 2.7519, time 5230.37ms 
iter 2414: loss 2.9217, time 5316.51ms 
iter 2415: loss 2.7427, time 5308.91ms 
iter 2416: loss 2.6871, time 5289.77ms 
iter 2417: loss 2.9430, time 5317.01ms 
iter 2418: loss 2.7890, time 5202.91ms 
iter 2419: loss 2.7963, time 5150.19ms 
iter 2420: loss 3.0003, time 4986.40ms 
iter 2421: loss 2.7290, time 5062.83ms 
iter 2422: loss 2.7841, time 5317.87ms 
iter 2423: loss 2.7131, time 5307.89ms 
iter 2424: loss 2.7855, time 5192.66ms 
iter 2425: loss 2.9127, time 5326.32ms 
iter 2426: loss 2.7328, time 5303.93ms 
iter 2427: loss 2.8397, time 5310.78ms 
iter 2428: loss 2.8585, time 5164.48ms 
iter 2429: loss 2.8206, time 5115.83ms 
iter 2430: loss 2.6812, time 5078.69ms 
iter 2431: loss 2.7585, time 5273.26ms 
iter 2432: loss 2.6923, time 5311.14ms 
iter 2433: loss 2.7891, time 5308.53ms 
iter 2434: loss 2.6531, time 5324.41ms 
iter 2435: loss 2.9617, time 5325.26ms 
iter 2436: loss 2.6661, time 5309.64ms 
iter 2437: loss 2.6724, time 5286.61ms 
iter 2438: loss 2.7585, time 5069.11ms 
iter 2439: loss 2.6721, time 5296.00ms 
iter 2440: loss 2.7852, time 5316.08ms 
iter 2441: loss 2.9165, time 5314.56ms 
iter 2442: loss 2.6722, time 5306.51ms 
iter 2443: loss 2.6358, time 5316.42ms 
iter 2444: loss 2.7988, time 5315.16ms 
iter 2445: loss 2.7918, time 5242.12ms 
iter 2446: loss 2.6657, time 5146.55ms 
iter 2447: loss 2.7382, time 5114.84ms 
iter 2448: loss 3.0735, time 5127.71ms 
iter 2449: loss 2.8094, time 5311.77ms 
step 2450: train loss 2.7961, val loss 2.9077
iter 2450: loss 2.8786, time 20241.56ms 
iter 2451: loss 2.7886, time 5254.56ms 
iter 2452: loss 2.8107, time 5231.03ms 
iter 2453: loss 2.7814, time 5255.86ms 
iter 2454: loss 2.8115, time 5317.23ms 
iter 2455: loss 2.8000, time 5266.21ms 
iter 2456: loss 2.8824, time 5292.39ms 
iter 2457: loss 2.7851, time 5224.81ms 
iter 2458: loss 2.6017, time 5278.43ms 
iter 2459: loss 2.7799, time 5265.07ms 
iter 2460: loss 2.8289, time 5140.71ms 
iter 2461: loss 2.8254, time 5302.94ms 
iter 2462: loss 2.8944, time 5307.14ms 
iter 2463: loss 2.7688, time 5227.10ms 
iter 2464: loss 2.9578, time 5271.30ms 
iter 2465: loss 2.7464, time 5187.99ms 
iter 2466: loss 2.8063, time 5154.19ms 
iter 2467: loss 2.8500, time 5001.27ms 
iter 2468: loss 2.6249, time 5107.08ms 
iter 2469: loss 2.7484, time 5191.02ms 
iter 2470: loss 2.5916, time 5238.35ms 
iter 2471: loss 2.6641, time 5137.97ms 
iter 2472: loss 3.0622, time 5116.46ms 
iter 2473: loss 2.8959, time 5018.96ms 
iter 2474: loss 2.7030, time 5165.78ms 
iter 2475: loss 2.6747, time 5116.60ms 
iter 2476: loss 2.9031, time 5289.98ms 
iter 2477: loss 2.9752, time 5305.09ms 
iter 2478: loss 2.7722, time 5136.91ms 
iter 2479: loss 2.5513, time 5113.61ms 
iter 2480: loss 2.7190, time 5346.21ms 
iter 2481: loss 2.7127, time 5300.74ms 
iter 2482: loss 2.7580, time 5308.69ms 
iter 2483: loss 2.6674, time 5237.36ms 
iter 2484: loss 2.7190, time 5310.81ms 
iter 2485: loss 2.6289, time 5109.70ms 
iter 2486: loss 2.6968, time 5124.95ms 
iter 2487: loss 3.0141, time 5157.30ms 
iter 2488: loss 2.7861, time 5217.50ms 
iter 2489: loss 2.9371, time 5314.12ms 
iter 2490: loss 2.8364, time 5293.42ms 
iter 2491: loss 2.8249, time 5299.72ms 
iter 2492: loss 2.5366, time 5301.93ms 
iter 2493: loss 2.8524, time 5302.91ms 
iter 2494: loss 2.8038, time 5088.17ms 
iter 2495: loss 2.6239, time 5251.31ms 
iter 2496: loss 2.7010, time 5145.93ms 
iter 2497: loss 2.7982, time 5167.97ms 
iter 2498: loss 2.9934, time 5234.49ms 
iter 2499: loss 2.8538, time 5089.89ms 
step 2500: train loss 2.7922, val loss 2.9043
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2500: loss 2.7274, time 21166.83ms 
iter 2501: loss 2.8515, time 5071.53ms 
iter 2502: loss 2.9211, time 5037.58ms 
iter 2503: loss 3.0190, time 5293.38ms 
iter 2504: loss 2.8612, time 5290.70ms 
iter 2505: loss 2.7036, time 5297.54ms 
iter 2506: loss 2.7430, time 5307.55ms 
iter 2507: loss 2.5909, time 5314.06ms 
iter 2508: loss 2.8753, time 5296.42ms 
iter 2509: loss 2.6424, time 5230.55ms 
iter 2510: loss 2.6962, time 5187.80ms 
iter 2511: loss 2.6582, time 5297.52ms 
iter 2512: loss 2.6978, time 5299.29ms 
iter 2513: loss 2.8410, time 5181.25ms 
iter 2514: loss 2.5996, time 5335.66ms 
iter 2515: loss 2.6777, time 5285.23ms 
iter 2516: loss 2.8161, time 5129.01ms 
iter 2517: loss 2.7587, time 5252.08ms 
iter 2518: loss 2.8268, time 5315.86ms 
iter 2519: loss 2.8046, time 5299.38ms 
iter 2520: loss 2.7514, time 5304.94ms 
iter 2521: loss 2.8578, time 5317.83ms 
iter 2522: loss 2.9109, time 5209.32ms 
iter 2523: loss 2.6548, time 5301.71ms 
iter 2524: loss 2.7827, time 5319.90ms 
iter 2525: loss 2.8257, time 5170.94ms 
iter 2526: loss 2.8261, time 5106.27ms 
iter 2527: loss 2.6472, time 5239.05ms 
iter 2528: loss 2.8349, time 5287.91ms 
iter 2529: loss 2.6220, time 5240.86ms 
iter 2530: loss 2.7727, time 5267.13ms 
iter 2531: loss 2.8813, time 5283.35ms 
iter 2532: loss 3.0959, time 5206.92ms 
iter 2533: loss 2.6783, time 5184.48ms 
iter 2534: loss 2.8125, time 5042.54ms 
iter 2535: loss 2.6763, time 5221.31ms 
iter 2536: loss 2.8559, time 5245.65ms 
iter 2537: loss 2.8410, time 5324.02ms 
iter 2538: loss 2.8060, time 5322.92ms 
iter 2539: loss 2.5633, time 5299.90ms 
iter 2540: loss 2.9759, time 5291.66ms 
iter 2541: loss 2.8750, time 5133.58ms 
iter 2542: loss 2.6990, time 5173.83ms 
iter 2543: loss 2.6906, time 5303.78ms 
iter 2544: loss 2.7885, time 5304.71ms 
iter 2545: loss 2.8679, time 5303.90ms 
iter 2546: loss 2.6155, time 5298.73ms 
iter 2547: loss 2.7892, time 5220.78ms 
iter 2548: loss 2.8548, time 5315.76ms 
iter 2549: loss 2.9142, time 5259.23ms 
step 2550: train loss 2.7815, val loss 2.8947
iter 2550: loss 2.8719, time 20186.85ms 
iter 2551: loss 2.5935, time 5333.11ms 
iter 2552: loss 2.7574, time 5313.74ms 
iter 2553: loss 2.6506, time 5206.51ms 
iter 2554: loss 2.7113, time 5270.29ms 
iter 2555: loss 2.9411, time 5315.53ms 
iter 2556: loss 2.9114, time 5171.25ms 
iter 2557: loss 2.7389, time 5053.68ms 
iter 2558: loss 2.7861, time 5254.04ms 
iter 2559: loss 2.6397, time 5160.46ms 
iter 2560: loss 2.7505, time 5257.39ms 
iter 2561: loss 2.7992, time 5254.25ms 
iter 2562: loss 2.7206, time 5271.32ms 
iter 2563: loss 3.1952, time 5287.69ms 
iter 2564: loss 2.8114, time 5019.34ms 
iter 2565: loss 2.7595, time 5193.85ms 
iter 2566: loss 2.8640, time 5205.45ms 
iter 2567: loss 2.7080, time 5220.45ms 
iter 2568: loss 2.6462, time 5254.23ms 
iter 2569: loss 2.7174, time 5298.85ms 
iter 2570: loss 2.6355, time 5286.99ms 
iter 2571: loss 2.7917, time 5173.87ms 
iter 2572: loss 3.0040, time 5166.78ms 
iter 2573: loss 2.8300, time 5207.21ms 
iter 2574: loss 2.5488, time 5220.60ms 
iter 2575: loss 2.7555, time 5298.19ms 
iter 2576: loss 2.5779, time 5300.61ms 
iter 2577: loss 2.7687, time 5220.04ms 
iter 2578: loss 2.8731, time 5300.11ms 
iter 2579: loss 2.6473, time 5231.19ms 
iter 2580: loss 2.9080, time 5311.13ms 
iter 2581: loss 2.8122, time 5240.73ms 
iter 2582: loss 2.8208, time 5247.39ms 
iter 2583: loss 2.6703, time 5277.61ms 
iter 2584: loss 2.6030, time 5166.72ms 
iter 2585: loss 2.8968, time 5068.26ms 
iter 2586: loss 2.7472, time 5087.86ms 
iter 2587: loss 2.5966, time 5139.97ms 
iter 2588: loss 2.7087, time 5325.58ms 
iter 2589: loss 2.7563, time 5288.68ms 
iter 2590: loss 2.6903, time 5174.27ms 
iter 2591: loss 2.7105, time 5147.50ms 
iter 2592: loss 2.7133, time 5059.37ms 
iter 2593: loss 2.7969, time 5262.69ms 
iter 2594: loss 2.9660, time 5314.64ms 
iter 2595: loss 2.6543, time 5300.78ms 
iter 2596: loss 2.8120, time 5297.01ms 
iter 2597: loss 2.7379, time 5295.51ms 
iter 2598: loss 2.7664, time 5293.77ms 
iter 2599: loss 3.0321, time 5300.77ms 
step 2600: train loss 2.7757, val loss 2.9038
iter 2600: loss 2.7158, time 20105.98ms 
iter 2601: loss 2.8013, time 5245.51ms 
iter 2602: loss 2.7710, time 5227.21ms 
iter 2603: loss 2.9040, time 5240.96ms 
iter 2604: loss 2.6409, time 5233.58ms 
iter 2605: loss 2.8225, time 5252.61ms 
iter 2606: loss 2.8920, time 5113.17ms 
iter 2607: loss 2.7437, time 5181.44ms 
iter 2608: loss 2.9123, time 5239.92ms 
iter 2609: loss 2.7743, time 5256.99ms 
iter 2610: loss 2.8655, time 5284.04ms 
iter 2611: loss 2.7923, time 5314.55ms 
iter 2612: loss 2.8102, time 5295.94ms 
iter 2613: loss 2.7527, time 5298.42ms 
iter 2614: loss 2.7646, time 5140.29ms 
iter 2615: loss 2.7106, time 5126.04ms 
iter 2616: loss 2.6529, time 5057.16ms 
iter 2617: loss 2.8895, time 5078.66ms 
iter 2618: loss 2.8040, time 5206.21ms 
iter 2619: loss 2.8236, time 5289.86ms 
iter 2620: loss 2.8869, time 5285.97ms 
iter 2621: loss 2.8661, time 5284.31ms 
iter 2622: loss 2.9553, time 5252.21ms 
iter 2623: loss 2.6509, time 5026.54ms 
iter 2624: loss 2.9397, time 5173.45ms 
iter 2625: loss 2.7269, time 5314.88ms 
iter 2626: loss 2.5251, time 5313.22ms 
iter 2627: loss 2.8145, time 5321.06ms 
iter 2628: loss 2.8406, time 5273.73ms 
iter 2629: loss 2.8797, time 5297.44ms 
iter 2630: loss 2.6347, time 5291.57ms 
iter 2631: loss 2.7372, time 5054.44ms 
iter 2632: loss 2.8442, time 5307.25ms 
iter 2633: loss 2.8679, time 5296.86ms 
iter 2634: loss 2.6698, time 5293.60ms 
iter 2635: loss 2.7163, time 5293.12ms 
iter 2636: loss 3.0897, time 5313.71ms 
iter 2637: loss 2.8403, time 5338.80ms 
iter 2638: loss 2.7549, time 5397.42ms 
iter 2639: loss 2.8650, time 5111.25ms 
iter 2640: loss 2.7702, time 5172.54ms 
iter 2641: loss 3.0041, time 5237.80ms 
iter 2642: loss 2.7421, time 5315.24ms 
iter 2643: loss 2.6680, time 5313.75ms 
iter 2644: loss 2.8931, time 5307.79ms 
iter 2645: loss 2.9219, time 5341.13ms 
iter 2646: loss 2.8912, time 5233.47ms 
iter 2647: loss 2.7437, time 4994.10ms 
iter 2648: loss 2.5943, time 5086.10ms 
iter 2649: loss 2.7074, time 5203.04ms 
step 2650: train loss 2.7652, val loss 2.8827
iter 2650: loss 2.9240, time 20313.08ms 
iter 2651: loss 2.7288, time 5287.48ms 
iter 2652: loss 2.8241, time 5163.25ms 
iter 2653: loss 2.7106, time 5321.71ms 
iter 2654: loss 2.8166, time 5313.00ms 
iter 2655: loss 2.7370, time 5330.68ms 
iter 2656: loss 2.8127, time 5295.61ms 
iter 2657: loss 2.7097, time 5319.73ms 
iter 2658: loss 2.6821, time 5183.87ms 
iter 2659: loss 2.8447, time 5291.73ms 
iter 2660: loss 3.0290, time 5272.40ms 
iter 2661: loss 2.7845, time 5282.54ms 
iter 2662: loss 2.8190, time 5306.41ms 
iter 2663: loss 2.7314, time 5297.63ms 
iter 2664: loss 2.9638, time 5333.34ms 
iter 2665: loss 2.7277, time 5304.79ms 
iter 2666: loss 2.7884, time 5313.72ms 
iter 2667: loss 2.6885, time 5236.28ms 
iter 2668: loss 2.7233, time 5267.39ms 
iter 2669: loss 2.8346, time 5249.55ms 
iter 2670: loss 2.9614, time 5195.69ms 
iter 2671: loss 2.7650, time 5287.92ms 
iter 2672: loss 2.6962, time 5305.75ms 
iter 2673: loss 2.6546, time 5305.95ms 
iter 2674: loss 2.7910, time 5316.49ms 
iter 2675: loss 2.7281, time 5234.78ms 
iter 2676: loss 3.0092, time 5063.40ms 
iter 2677: loss 2.8028, time 5225.54ms 
iter 2678: loss 2.5690, time 5321.71ms 
iter 2679: loss 2.4922, time 5188.39ms 
iter 2680: loss 2.7575, time 5292.16ms 
iter 2681: loss 2.6957, time 5300.45ms 
iter 2682: loss 2.5532, time 5302.89ms 
iter 2683: loss 2.7887, time 5222.81ms 
iter 2684: loss 2.8008, time 5070.69ms 
iter 2685: loss 2.6202, time 5137.12ms 
iter 2686: loss 2.6890, time 5294.92ms 
iter 2687: loss 2.8822, time 5212.67ms 
iter 2688: loss 2.6601, time 5070.54ms 
iter 2689: loss 2.7100, time 5068.66ms 
iter 2690: loss 2.8483, time 5074.89ms 
iter 2691: loss 2.6448, time 5180.65ms 
iter 2692: loss 2.8318, time 5298.28ms 
iter 2693: loss 2.7561, time 5315.23ms 
iter 2694: loss 2.7889, time 5313.97ms 
iter 2695: loss 2.7481, time 5306.32ms 
iter 2696: loss 2.8532, time 5312.64ms 
iter 2697: loss 2.7125, time 5218.00ms 
iter 2698: loss 2.6617, time 5296.17ms 
iter 2699: loss 2.7423, time 5315.54ms 
step 2700: train loss 2.7587, val loss 2.8969
iter 2700: loss 2.8839, time 20013.28ms 
iter 2701: loss 2.7262, time 5259.34ms 
iter 2702: loss 2.7003, time 5289.34ms 
iter 2703: loss 2.8237, time 5298.92ms 
iter 2704: loss 2.8868, time 5290.27ms 
iter 2705: loss 2.7863, time 5295.30ms 
iter 2706: loss 2.7383, time 5160.46ms 
iter 2707: loss 2.7451, time 5296.72ms 
iter 2708: loss 2.7822, time 5288.64ms 
iter 2709: loss 2.7356, time 5282.40ms 
iter 2710: loss 2.7916, time 5317.37ms 
iter 2711: loss 2.8817, time 5267.01ms 
iter 2712: loss 2.7768, time 5168.26ms 
iter 2713: loss 2.7578, time 5303.78ms 
iter 2714: loss 2.6856, time 5291.06ms 
iter 2715: loss 2.5855, time 5308.38ms 
iter 2716: loss 2.7963, time 5274.15ms 
iter 2717: loss 2.7453, time 5289.83ms 
iter 2718: loss 2.6878, time 5289.98ms 
iter 2719: loss 2.6942, time 5257.80ms 
iter 2720: loss 2.8932, time 5256.20ms 
iter 2721: loss 2.7576, time 5296.13ms 
iter 2722: loss 2.7747, time 5290.11ms 
iter 2723: loss 2.6994, time 5280.36ms 
iter 2724: loss 2.9005, time 5222.44ms 
iter 2725: loss 2.8380, time 5313.66ms 
iter 2726: loss 2.8271, time 5240.35ms 
iter 2727: loss 2.6013, time 5322.71ms 
iter 2728: loss 2.7513, time 5301.32ms 
iter 2729: loss 2.7951, time 5203.99ms 
iter 2730: loss 2.8148, time 5123.22ms 
iter 2731: loss 2.6897, time 5125.08ms 
iter 2732: loss 2.6725, time 5104.45ms 
iter 2733: loss 2.7514, time 5218.57ms 
iter 2734: loss 2.8151, time 5238.57ms 
iter 2735: loss 2.5712, time 5186.74ms 
iter 2736: loss 2.9695, time 5182.01ms 
iter 2737: loss 2.6445, time 5332.83ms 
iter 2738: loss 2.7276, time 5290.19ms 
iter 2739: loss 2.5500, time 5355.42ms 
iter 2740: loss 2.8463, time 5296.67ms 
iter 2741: loss 2.7429, time 5297.57ms 
iter 2742: loss 2.5934, time 5171.49ms 
iter 2743: loss 2.8406, time 5256.19ms 
iter 2744: loss 2.7712, time 5265.80ms 
iter 2745: loss 2.6650, time 5269.60ms 
iter 2746: loss 2.8777, time 5317.61ms 
iter 2747: loss 2.7342, time 5196.42ms 
iter 2748: loss 2.6451, time 5244.92ms 
iter 2749: loss 2.7492, time 5343.60ms 
step 2750: train loss 2.7619, val loss 2.9059
iter 2750: loss 2.8139, time 20248.74ms 
iter 2751: loss 2.7829, time 5323.42ms 
iter 2752: loss 2.7372, time 5313.84ms 
iter 2753: loss 2.7650, time 5301.82ms 
iter 2754: loss 2.8291, time 5114.54ms 
iter 2755: loss 2.6901, time 5183.30ms 
iter 2756: loss 2.5830, time 5117.81ms 
iter 2757: loss 2.7985, time 5131.14ms 
iter 2758: loss 2.5181, time 5224.01ms 
iter 2759: loss 2.7967, time 5138.91ms 
iter 2760: loss 2.6583, time 5309.71ms 
iter 2761: loss 3.0072, time 5297.21ms 
iter 2762: loss 2.6146, time 5286.80ms 
iter 2763: loss 2.8202, time 5311.89ms 
iter 2764: loss 2.7409, time 5335.52ms 
iter 2765: loss 2.9312, time 5313.42ms 
iter 2766: loss 2.7136, time 5324.51ms 
iter 2767: loss 2.6536, time 5300.38ms 
iter 2768: loss 2.7685, time 5305.42ms 
iter 2769: loss 2.6929, time 5297.41ms 
iter 2770: loss 3.0151, time 5314.05ms 
iter 2771: loss 2.7806, time 5334.76ms 
iter 2772: loss 2.8860, time 5310.80ms 
iter 2773: loss 2.9138, time 5306.69ms 
iter 2774: loss 2.9770, time 5284.12ms 
iter 2775: loss 2.6831, time 5299.41ms 
iter 2776: loss 2.4745, time 5314.86ms 
iter 2777: loss 2.6884, time 5313.44ms 
iter 2778: loss 2.7683, time 5291.63ms 
iter 2779: loss 2.6015, time 5300.59ms 
iter 2780: loss 2.7310, time 5313.10ms 
iter 2781: loss 2.5748, time 5308.70ms 
iter 2782: loss 2.7927, time 5238.35ms 
iter 2783: loss 2.6938, time 5239.26ms 
iter 2784: loss 2.6968, time 5172.00ms 
iter 2785: loss 2.6673, time 5300.74ms 
iter 2786: loss 2.8099, time 5307.59ms 
iter 2787: loss 2.9101, time 5221.95ms 
iter 2788: loss 2.8501, time 5181.38ms 
iter 2789: loss 2.6829, time 5296.31ms 
iter 2790: loss 2.7535, time 5309.82ms 
iter 2791: loss 2.6376, time 5304.74ms 
iter 2792: loss 2.8477, time 5077.87ms 
iter 2793: loss 2.8314, time 5181.37ms 
iter 2794: loss 2.9161, time 5205.97ms 
iter 2795: loss 2.8178, time 5035.86ms 
iter 2796: loss 2.7395, time 5183.60ms 
iter 2797: loss 2.8320, time 5212.70ms 
iter 2798: loss 2.8396, time 5241.86ms 
iter 2799: loss 2.8823, time 5291.32ms 
step 2800: train loss 2.7347, val loss 2.8746
iter 2800: loss 2.7934, time 20005.59ms 
iter 2801: loss 2.8394, time 4983.77ms 
iter 2802: loss 2.5925, time 4998.22ms 
iter 2803: loss 2.8213, time 4990.10ms 
iter 2804: loss 2.8047, time 5179.00ms 
iter 2805: loss 2.9337, time 5314.66ms 
iter 2806: loss 2.7316, time 5324.24ms 
iter 2807: loss 2.4776, time 5317.52ms 
iter 2808: loss 2.7171, time 5248.06ms 
iter 2809: loss 2.6520, time 5278.97ms 
iter 2810: loss 2.6502, time 5175.28ms 
iter 2811: loss 2.6281, time 5098.87ms 
iter 2812: loss 2.8251, time 5044.78ms 
iter 2813: loss 2.6049, time 5059.30ms 
iter 2814: loss 2.9740, time 5179.27ms 
iter 2815: loss 2.7901, time 5297.32ms 
iter 2816: loss 2.6126, time 5179.73ms 
iter 2817: loss 2.6652, time 5077.10ms 
iter 2818: loss 2.7995, time 5254.03ms 
iter 2819: loss 2.6859, time 5306.95ms 
iter 2820: loss 2.6591, time 5101.28ms 
iter 2821: loss 2.8245, time 4994.44ms 
iter 2822: loss 2.9240, time 5075.38ms 
iter 2823: loss 2.6255, time 5261.76ms 
iter 2824: loss 2.7609, time 5305.50ms 
iter 2825: loss 2.6881, time 5258.58ms 
iter 2826: loss 2.6432, time 5238.07ms 
iter 2827: loss 3.0072, time 5294.06ms 
iter 2828: loss 2.9162, time 5319.48ms 
iter 2829: loss 2.7315, time 5195.89ms 
iter 2830: loss 2.8891, time 5299.63ms 
iter 2831: loss 2.7108, time 5208.38ms 
iter 2832: loss 2.7622, time 5321.85ms 
iter 2833: loss 2.6133, time 5240.06ms 
iter 2834: loss 2.9105, time 5206.93ms 
iter 2835: loss 2.7790, time 5058.56ms 
iter 2836: loss 2.8771, time 5028.82ms 
iter 2837: loss 2.6489, time 5263.84ms 
iter 2838: loss 2.8485, time 5198.47ms 
iter 2839: loss 2.6333, time 5235.98ms 
iter 2840: loss 2.6819, time 5312.45ms 
iter 2841: loss 2.6982, time 5293.01ms 
iter 2842: loss 2.6645, time 5293.77ms 
iter 2843: loss 2.7465, time 5282.95ms 
iter 2844: loss 2.8775, time 5175.11ms 
iter 2845: loss 2.8069, time 5299.56ms 
iter 2846: loss 2.8352, time 5306.70ms 
iter 2847: loss 2.6308, time 5257.18ms 
iter 2848: loss 2.8085, time 5306.68ms 
iter 2849: loss 2.6586, time 5298.12ms 
step 2850: train loss 2.7394, val loss 2.8791
iter 2850: loss 2.7413, time 20205.87ms 
iter 2851: loss 2.6499, time 5284.04ms 
iter 2852: loss 2.6481, time 5327.39ms 
iter 2853: loss 2.6559, time 5206.94ms 
iter 2854: loss 2.7248, time 5258.93ms 
iter 2855: loss 2.8277, time 5304.19ms 
iter 2856: loss 2.7382, time 5201.81ms 
iter 2857: loss 2.5810, time 5158.27ms 
iter 2858: loss 2.7284, time 5306.27ms 
iter 2859: loss 2.5594, time 5307.39ms 
iter 2860: loss 2.5946, time 5304.46ms 
iter 2861: loss 2.7608, time 5324.37ms 
iter 2862: loss 2.7292, time 5305.18ms 
iter 2863: loss 2.6542, time 5307.77ms 
iter 2864: loss 2.7271, time 5299.54ms 
iter 2865: loss 2.7764, time 5009.81ms 
iter 2866: loss 2.5840, time 5296.55ms 
iter 2867: loss 2.6553, time 5314.42ms 
iter 2868: loss 3.1512, time 5316.77ms 
iter 2869: loss 2.6342, time 5323.74ms 
iter 2870: loss 2.9638, time 5332.68ms 
iter 2871: loss 2.8370, time 5269.96ms 
iter 2872: loss 3.1096, time 5169.62ms 
iter 2873: loss 2.8034, time 5230.94ms 
iter 2874: loss 2.7806, time 5310.95ms 
iter 2875: loss 2.7360, time 5179.45ms 
iter 2876: loss 2.6990, time 5104.28ms 
iter 2877: loss 2.9217, time 5215.59ms 
iter 2878: loss 2.8664, time 5307.96ms 
iter 2879: loss 2.5647, time 5237.33ms 
iter 2880: loss 2.7314, time 5100.43ms 
iter 2881: loss 2.6431, time 5203.71ms 
iter 2882: loss 2.7263, time 5233.11ms 
iter 2883: loss 2.6618, time 5264.17ms 
iter 2884: loss 2.7143, time 5214.35ms 
iter 2885: loss 2.8285, time 5319.66ms 
iter 2886: loss 2.7408, time 5316.18ms 
iter 2887: loss 2.9827, time 5270.58ms 
iter 2888: loss 2.8227, time 5090.77ms 
iter 2889: loss 2.7716, time 5288.91ms 
iter 2890: loss 2.8270, time 5305.95ms 
iter 2891: loss 2.8008, time 5307.12ms 
iter 2892: loss 2.6679, time 5293.30ms 
iter 2893: loss 2.7428, time 5294.40ms 
iter 2894: loss 2.7537, time 5305.17ms 
iter 2895: loss 2.6377, time 5168.23ms 
iter 2896: loss 2.7422, time 5233.15ms 
iter 2897: loss 2.7082, time 5295.60ms 
iter 2898: loss 2.8336, time 5280.86ms 
iter 2899: loss 2.6719, time 5303.95ms 
step 2900: train loss 2.7503, val loss 2.8760
iter 2900: loss 2.6039, time 20058.66ms 
iter 2901: loss 2.7727, time 5304.66ms 
iter 2902: loss 2.8790, time 5296.73ms 
iter 2903: loss 2.7505, time 5223.01ms 
iter 2904: loss 2.7447, time 5302.06ms 
iter 2905: loss 2.9548, time 5306.28ms 
iter 2906: loss 2.8629, time 5299.05ms 
iter 2907: loss 2.6197, time 5084.27ms 
iter 2908: loss 2.7504, time 5304.25ms 
iter 2909: loss 2.5539, time 5301.45ms 
iter 2910: loss 2.7760, time 5256.30ms 
iter 2911: loss 2.7373, time 5307.15ms 
iter 2912: loss 2.6163, time 5314.75ms 
iter 2913: loss 2.7357, time 5326.74ms 
iter 2914: loss 2.6861, time 5193.35ms 
iter 2915: loss 2.6113, time 5261.64ms 
iter 2916: loss 2.7377, time 5272.39ms 
iter 2917: loss 2.7230, time 5323.26ms 
iter 2918: loss 2.5526, time 5296.12ms 
iter 2919: loss 2.9979, time 5214.57ms 
iter 2920: loss 2.8607, time 5309.98ms 
iter 2921: loss 2.8279, time 5320.82ms 
iter 2922: loss 2.7729, time 5316.27ms 
iter 2923: loss 2.7174, time 5289.40ms 
iter 2924: loss 2.7259, time 5210.18ms 
iter 2925: loss 2.7677, time 5092.29ms 
iter 2926: loss 2.7981, time 5285.47ms 
iter 2927: loss 2.6030, time 5180.08ms 
iter 2928: loss 2.6909, time 5311.39ms 
iter 2929: loss 2.6295, time 5275.47ms 
iter 2930: loss 2.6417, time 5223.21ms 
iter 2931: loss 2.6720, time 5236.31ms 
iter 2932: loss 2.7822, time 5254.59ms 
iter 2933: loss 2.6940, time 5293.63ms 
iter 2934: loss 2.8935, time 5255.17ms 
iter 2935: loss 2.7420, time 5304.23ms 
iter 2936: loss 2.7026, time 5314.84ms 
iter 2937: loss 2.7809, time 5282.87ms 
iter 2938: loss 2.7946, time 5295.61ms 
iter 2939: loss 2.7355, time 5295.89ms 
iter 2940: loss 2.6736, time 5297.00ms 
iter 2941: loss 2.7699, time 5274.00ms 
iter 2942: loss 2.6459, time 5054.18ms 
iter 2943: loss 2.6395, time 5073.15ms 
iter 2944: loss 2.6239, time 5183.17ms 
iter 2945: loss 2.6389, time 5237.50ms 
iter 2946: loss 2.6882, time 5308.09ms 
iter 2947: loss 2.6898, time 5314.63ms 
iter 2948: loss 2.8866, time 5252.58ms 
iter 2949: loss 2.7959, time 5327.03ms 
step 2950: train loss 2.7256, val loss 2.8815
iter 2950: loss 2.6608, time 20313.87ms 
iter 2951: loss 2.6662, time 5286.98ms 
iter 2952: loss 2.7694, time 5307.29ms 
iter 2953: loss 2.7983, time 5255.28ms 
iter 2954: loss 2.6767, time 5228.58ms 
iter 2955: loss 2.7247, time 5091.22ms 
iter 2956: loss 2.6203, time 5069.12ms 
iter 2957: loss 2.7012, time 5055.74ms 
iter 2958: loss 2.8962, time 5221.73ms 
iter 2959: loss 2.6239, time 5304.74ms 
iter 2960: loss 2.8011, time 5309.15ms 
iter 2961: loss 2.6275, time 5220.96ms 
iter 2962: loss 2.7066, time 5129.30ms 
iter 2963: loss 2.8791, time 5173.82ms 
iter 2964: loss 2.6832, time 5104.84ms 
iter 2965: loss 2.7162, time 5224.78ms 
iter 2966: loss 3.0195, time 5328.71ms 
iter 2967: loss 2.7448, time 5283.26ms 
iter 2968: loss 2.7727, time 5165.48ms 
iter 2969: loss 2.7904, time 5210.76ms 
iter 2970: loss 2.7733, time 5330.59ms 
iter 2971: loss 2.6037, time 5235.28ms 
iter 2972: loss 2.7573, time 5268.77ms 
iter 2973: loss 2.8159, time 5325.58ms 
iter 2974: loss 2.8810, time 5333.12ms 
iter 2975: loss 2.7283, time 5263.76ms 
iter 2976: loss 2.7667, time 5228.62ms 
iter 2977: loss 2.7169, time 5082.15ms 
iter 2978: loss 2.7443, time 5093.94ms 
iter 2979: loss 2.6841, time 5053.58ms 
iter 2980: loss 2.6535, time 5077.71ms 
iter 2981: loss 2.7443, time 5069.73ms 
iter 2982: loss 2.4994, time 5089.08ms 
iter 2983: loss 2.7371, time 5098.20ms 
iter 2984: loss 2.6165, time 5075.83ms 
iter 2985: loss 2.5698, time 5094.72ms 
iter 2986: loss 2.6657, time 5288.75ms 
iter 2987: loss 2.7466, time 5103.59ms 
iter 2988: loss 2.7142, time 5131.10ms 
iter 2989: loss 2.5763, time 5315.09ms 
iter 2990: loss 2.7481, time 5291.52ms 
iter 2991: loss 2.8109, time 5311.53ms 
iter 2992: loss 2.5205, time 5148.04ms 
iter 2993: loss 2.6997, time 5085.31ms 
iter 2994: loss 2.6522, time 5155.50ms 
iter 2995: loss 2.4009, time 5341.82ms 
iter 2996: loss 2.7273, time 5289.46ms 
iter 2997: loss 2.8956, time 5308.30ms 
iter 2998: loss 2.7783, time 5315.94ms 
iter 2999: loss 2.8107, time 5299.54ms 
step 3000: train loss 2.7255, val loss 2.8764
iter 3000: loss 2.8369, time 20280.24ms 
iter 3001: loss 2.6287, time 5317.81ms 
iter 3002: loss 2.6849, time 5313.84ms 
iter 3003: loss 2.7835, time 5231.21ms 
iter 3004: loss 2.5340, time 5046.55ms 
iter 3005: loss 2.7655, time 5222.48ms 
iter 3006: loss 2.7709, time 5332.29ms 
iter 3007: loss 2.6647, time 5281.84ms 
iter 3008: loss 2.6410, time 5320.33ms 
iter 3009: loss 2.8024, time 5126.45ms 
iter 3010: loss 2.5936, time 5274.42ms 
iter 3011: loss 2.7989, time 5232.18ms 
iter 3012: loss 2.8316, time 5233.18ms 
iter 3013: loss 2.7546, time 5319.11ms 
iter 3014: loss 3.0373, time 5293.19ms 
iter 3015: loss 2.8674, time 5306.90ms 
iter 3016: loss 2.7429, time 5311.01ms 
iter 3017: loss 2.7251, time 5335.08ms 
iter 3018: loss 2.7801, time 5323.43ms 
iter 3019: loss 2.8489, time 5314.85ms 
iter 3020: loss 2.6186, time 5325.15ms 
iter 3021: loss 2.6391, time 5311.25ms 
iter 3022: loss 2.5051, time 5317.57ms 
iter 3023: loss 2.7687, time 5285.62ms 
iter 3024: loss 2.8794, time 5304.49ms 
iter 3025: loss 2.5609, time 5213.03ms 
iter 3026: loss 2.8890, time 5203.42ms 
iter 3027: loss 2.6274, time 5301.71ms 
iter 3028: loss 2.7088, time 5272.89ms 
iter 3029: loss 2.8322, time 5294.38ms 
iter 3030: loss 2.6421, time 5272.45ms 
iter 3031: loss 2.7895, time 5075.59ms 
iter 3032: loss 2.7829, time 5123.17ms 
iter 3033: loss 2.6418, time 5229.65ms 
iter 3034: loss 2.6854, time 5312.66ms 
iter 3035: loss 2.7389, time 5309.98ms 
iter 3036: loss 2.5811, time 5322.90ms 
iter 3037: loss 2.8719, time 5311.35ms 
iter 3038: loss 2.6407, time 5260.65ms 
iter 3039: loss 2.5804, time 5303.90ms 
iter 3040: loss 2.7176, time 5185.15ms 
iter 3041: loss 2.6017, time 5270.01ms 
iter 3042: loss 2.6930, time 5250.91ms 
iter 3043: loss 2.7073, time 5308.48ms 
iter 3044: loss 2.6350, time 5163.70ms 
iter 3045: loss 2.4983, time 5270.63ms 
iter 3046: loss 2.9865, time 5330.47ms 
iter 3047: loss 3.0168, time 5121.26ms 
iter 3048: loss 2.9646, time 5298.70ms 
iter 3049: loss 2.7480, time 5292.90ms 
step 3050: train loss 2.7460, val loss 2.8699
iter 3050: loss 2.7104, time 20254.55ms 
iter 3051: loss 2.6921, time 5315.62ms 
iter 3052: loss 2.6736, time 5275.58ms 
iter 3053: loss 2.7282, time 5290.13ms 
iter 3054: loss 2.7395, time 5309.70ms 
iter 3055: loss 2.7113, time 5300.79ms 
iter 3056: loss 2.7197, time 5306.99ms 
iter 3057: loss 2.6198, time 5312.11ms 
iter 3058: loss 2.6722, time 5304.25ms 
iter 3059: loss 2.7294, time 5249.56ms 
iter 3060: loss 2.7185, time 5319.23ms 
iter 3061: loss 2.8745, time 5255.82ms 
iter 3062: loss 2.7005, time 5307.96ms 
iter 3063: loss 2.6776, time 5299.75ms 
iter 3064: loss 2.6381, time 5304.95ms 
iter 3065: loss 2.8163, time 5313.56ms 
iter 3066: loss 2.6728, time 5198.12ms 
iter 3067: loss 2.6299, time 5319.94ms 
iter 3068: loss 2.5929, time 5298.61ms 
iter 3069: loss 2.7776, time 5285.85ms 
iter 3070: loss 2.9066, time 5062.11ms 
iter 3071: loss 2.7608, time 5079.35ms 
iter 3072: loss 2.8100, time 5065.47ms 
iter 3073: loss 2.5909, time 5047.38ms 
iter 3074: loss 2.6950, time 5162.92ms 
iter 3075: loss 2.5960, time 5129.04ms 
iter 3076: loss 2.6494, time 5303.34ms 
iter 3077: loss 2.7782, time 5325.80ms 
iter 3078: loss 2.8389, time 5298.38ms 
iter 3079: loss 2.9176, time 5313.17ms 
iter 3080: loss 2.8134, time 5179.51ms 
iter 3081: loss 2.5902, time 5293.65ms 
iter 3082: loss 2.8004, time 5339.28ms 
iter 3083: loss 3.0053, time 5295.53ms 
iter 3084: loss 2.5493, time 5297.65ms 
iter 3085: loss 2.8067, time 5312.77ms 
iter 3086: loss 2.6597, time 5260.74ms 
iter 3087: loss 2.7943, time 5248.16ms 
iter 3088: loss 2.7438, time 5305.90ms 
iter 3089: loss 2.6794, time 5313.58ms 
iter 3090: loss 2.7419, time 5330.91ms 
iter 3091: loss 2.6372, time 5253.37ms 
iter 3092: loss 2.6843, time 5193.20ms 
iter 3093: loss 2.9579, time 5245.31ms 
iter 3094: loss 2.7961, time 5068.20ms 
iter 3095: loss 2.7207, time 5075.03ms 
iter 3096: loss 2.5058, time 5085.56ms 
iter 3097: loss 2.8586, time 5167.28ms 
iter 3098: loss 2.6154, time 5119.52ms 
iter 3099: loss 2.6660, time 5285.50ms 
step 3100: train loss 2.7155, val loss 2.8844
iter 3100: loss 2.6995, time 20380.54ms 
iter 3101: loss 2.7424, time 5314.94ms 
iter 3102: loss 2.9569, time 5314.53ms 
iter 3103: loss 2.5908, time 5303.30ms 
iter 3104: loss 2.7476, time 5102.07ms 
iter 3105: loss 2.7253, time 5314.09ms 
iter 3106: loss 2.6788, time 5274.42ms 
iter 3107: loss 2.8092, time 5302.68ms 
iter 3108: loss 2.7481, time 5302.43ms 
iter 3109: loss 2.9119, time 5308.73ms 
iter 3110: loss 2.8017, time 5312.58ms 
iter 3111: loss 2.7981, time 5307.34ms 
iter 3112: loss 2.8496, time 5320.82ms 
iter 3113: loss 2.5976, time 5151.21ms 
iter 3114: loss 2.6886, time 5318.24ms 
iter 3115: loss 2.5151, time 5232.34ms 
iter 3116: loss 2.5787, time 5321.29ms 
iter 3117: loss 2.6999, time 5307.24ms 
iter 3118: loss 2.8721, time 5308.56ms 
iter 3119: loss 2.6842, time 5288.31ms 
iter 3120: loss 2.7820, time 5249.40ms 
iter 3121: loss 2.6616, time 5282.66ms 
iter 3122: loss 2.8941, time 5298.99ms 
iter 3123: loss 2.7802, time 5037.81ms 
iter 3124: loss 2.6983, time 5190.94ms 
iter 3125: loss 2.6619, time 5287.41ms 
iter 3126: loss 2.7897, time 5300.28ms 
iter 3127: loss 2.7001, time 5314.02ms 
iter 3128: loss 2.5987, time 5293.97ms 
iter 3129: loss 2.5734, time 5295.16ms 
iter 3130: loss 2.8389, time 5053.68ms 
iter 3131: loss 2.7579, time 5060.16ms 
iter 3132: loss 2.8418, time 5051.31ms 
iter 3133: loss 2.7576, time 5123.83ms 
iter 3134: loss 2.4723, time 5152.67ms 
iter 3135: loss 2.8934, time 5264.21ms 
iter 3136: loss 2.9482, time 5304.26ms 
iter 3137: loss 2.6098, time 5158.62ms 
iter 3138: loss 2.7361, time 5077.62ms 
iter 3139: loss 2.6746, time 5084.79ms 
iter 3140: loss 2.7553, time 5003.18ms 
iter 3141: loss 2.8487, time 5022.70ms 
iter 3142: loss 2.8414, time 5322.46ms 
iter 3143: loss 2.7969, time 5231.45ms 
iter 3144: loss 2.7174, time 5326.50ms 
iter 3145: loss 2.7508, time 5309.50ms 
iter 3146: loss 2.6511, time 5302.66ms 
iter 3147: loss 2.6685, time 5287.20ms 
iter 3148: loss 2.8851, time 5094.71ms 
iter 3149: loss 2.6565, time 5110.31ms 
step 3150: train loss 2.7181, val loss 2.8647
iter 3150: loss 2.7420, time 20247.03ms 
iter 3151: loss 2.7606, time 5209.61ms 
iter 3152: loss 2.6935, time 5306.81ms 
iter 3153: loss 2.8186, time 5224.05ms 
iter 3154: loss 2.6441, time 5174.97ms 
iter 3155: loss 2.4310, time 5315.44ms 
iter 3156: loss 2.6199, time 5311.84ms 
iter 3157: loss 2.7360, time 5106.09ms 
iter 3158: loss 2.6524, time 5300.71ms 
iter 3159: loss 2.7452, time 5337.33ms 
iter 3160: loss 2.5334, time 5312.50ms 
iter 3161: loss 2.7539, time 5128.92ms 
iter 3162: loss 2.6131, time 5116.88ms 
iter 3163: loss 2.6755, time 5268.41ms 
iter 3164: loss 2.8205, time 5298.96ms 
iter 3165: loss 2.5454, time 5304.91ms 
iter 3166: loss 2.7537, time 5299.68ms 
iter 3167: loss 2.6152, time 5127.65ms 
iter 3168: loss 2.5484, time 5070.35ms 
iter 3169: loss 2.7268, time 5209.67ms 
iter 3170: loss 2.6989, time 5123.35ms 
iter 3171: loss 2.7600, time 5303.99ms 
iter 3172: loss 2.7522, time 5310.36ms 
iter 3173: loss 2.7369, time 5263.07ms 
iter 3174: loss 2.7563, time 5330.37ms 
iter 3175: loss 2.5807, time 5323.24ms 
iter 3176: loss 2.5269, time 5292.53ms 
iter 3177: loss 2.7863, time 5009.03ms 
iter 3178: loss 2.7552, time 5153.53ms 
iter 3179: loss 2.7166, time 5207.47ms 
iter 3180: loss 2.6957, time 5218.90ms 
iter 3181: loss 2.7392, time 5247.19ms 
iter 3182: loss 2.8282, time 5323.23ms 
iter 3183: loss 2.8980, time 5310.72ms 
iter 3184: loss 2.7350, time 5182.82ms 
iter 3185: loss 2.8258, time 5098.69ms 
iter 3186: loss 2.5317, time 5225.40ms 
iter 3187: loss 2.6389, time 5291.22ms 
iter 3188: loss 2.6829, time 5300.03ms 
iter 3189: loss 2.7446, time 5305.73ms 
iter 3190: loss 2.6929, time 5168.78ms 
iter 3191: loss 2.8338, time 5324.01ms 
iter 3192: loss 2.7038, time 5145.43ms 
iter 3193: loss 2.6318, time 5076.72ms 
iter 3194: loss 2.5556, time 5123.17ms 
iter 3195: loss 2.4986, time 5149.03ms 
iter 3196: loss 2.8277, time 5183.16ms 
iter 3197: loss 2.7752, time 5303.06ms 
iter 3198: loss 2.5829, time 5314.10ms 
iter 3199: loss 2.5271, time 5301.28ms 
step 3200: train loss 2.7052, val loss 2.8783
iter 3200: loss 2.7264, time 20292.49ms 
iter 3201: loss 2.7142, time 5298.41ms 
iter 3202: loss 2.6439, time 5326.70ms 
iter 3203: loss 2.6966, time 5298.65ms 
iter 3204: loss 2.9512, time 5228.25ms 
iter 3205: loss 2.8278, time 5078.79ms 
iter 3206: loss 2.6109, time 5034.07ms 
iter 3207: loss 2.7482, time 5212.39ms 
iter 3208: loss 2.6246, time 5307.56ms 
iter 3209: loss 2.7159, time 5346.61ms 
iter 3210: loss 2.6728, time 5271.93ms 
iter 3211: loss 2.5938, time 5324.25ms 
iter 3212: loss 2.5809, time 5304.61ms 
iter 3213: loss 2.4023, time 5065.45ms 
iter 3214: loss 2.7348, time 5113.91ms 
iter 3215: loss 2.6199, time 5254.09ms 
iter 3216: loss 2.7194, time 5230.16ms 
iter 3217: loss 2.7224, time 5278.39ms 
iter 3218: loss 2.6854, time 5333.78ms 
iter 3219: loss 2.6328, time 5221.11ms 
iter 3220: loss 2.8870, time 5217.93ms 
iter 3221: loss 2.8592, time 5124.52ms 
iter 3222: loss 2.5941, time 5150.41ms 
iter 3223: loss 2.8324, time 5260.20ms 
iter 3224: loss 2.7388, time 5249.99ms 
iter 3225: loss 2.5573, time 5332.15ms 
iter 3226: loss 2.6374, time 5315.23ms 
iter 3227: loss 2.5325, time 5301.72ms 
iter 3228: loss 2.7140, time 5295.48ms 
iter 3229: loss 2.7555, time 5117.19ms 
iter 3230: loss 2.4284, time 5281.92ms 
iter 3231: loss 2.6287, time 5302.87ms 
iter 3232: loss 2.9039, time 5252.75ms 
iter 3233: loss 2.9089, time 5102.61ms 
iter 3234: loss 2.5827, time 5258.47ms 
iter 3235: loss 2.8236, time 5218.35ms 
iter 3236: loss 2.6440, time 5090.93ms 
iter 3237: loss 2.6741, time 4993.60ms 
iter 3238: loss 2.8842, time 5147.85ms 
iter 3239: loss 2.5611, time 5316.20ms 
iter 3240: loss 2.6657, time 5319.79ms 
iter 3241: loss 2.6680, time 5310.74ms 
iter 3242: loss 2.6097, time 5171.44ms 
iter 3243: loss 2.5112, time 5043.95ms 
iter 3244: loss 2.9146, time 5081.97ms 
iter 3245: loss 2.7122, time 5031.17ms 
iter 3246: loss 2.6170, time 5051.38ms 
iter 3247: loss 2.7173, time 5077.61ms 
iter 3248: loss 2.7306, time 5190.93ms 
iter 3249: loss 2.9069, time 5310.48ms 
step 3250: train loss 2.7050, val loss 2.8779
iter 3250: loss 2.9602, time 20077.20ms 
iter 3251: loss 2.5188, time 5294.57ms 
iter 3252: loss 2.6697, time 5295.84ms 
iter 3253: loss 2.6190, time 5323.93ms 
iter 3254: loss 2.7139, time 5310.80ms 
iter 3255: loss 2.7401, time 5138.68ms 
iter 3256: loss 2.6067, time 5047.62ms 
iter 3257: loss 2.6731, time 5116.81ms 
iter 3258: loss 2.6602, time 5253.80ms 
iter 3259: loss 2.5003, time 5313.67ms 
iter 3260: loss 2.8864, time 5325.90ms 
iter 3261: loss 2.7512, time 5201.09ms 
iter 3262: loss 2.6410, time 5259.02ms 
iter 3263: loss 2.6510, time 5288.09ms 
iter 3264: loss 2.6411, time 5318.94ms 
iter 3265: loss 2.7791, time 5251.11ms 
iter 3266: loss 2.5578, time 5309.03ms 
iter 3267: loss 2.5499, time 5303.74ms 
iter 3268: loss 2.7503, time 5103.76ms 
iter 3269: loss 2.8534, time 5162.76ms 
iter 3270: loss 2.6467, time 5267.39ms 
iter 3271: loss 2.9591, time 5301.15ms 
iter 3272: loss 2.7440, time 5250.65ms 
iter 3273: loss 2.6293, time 5312.47ms 
iter 3274: loss 2.7598, time 5298.92ms 
iter 3275: loss 2.9120, time 5247.16ms 
iter 3276: loss 2.7893, time 5237.69ms 
iter 3277: loss 2.8039, time 5191.05ms 
iter 3278: loss 2.7014, time 5304.34ms 
iter 3279: loss 2.5811, time 5116.60ms 
iter 3280: loss 2.6056, time 5245.54ms 
iter 3281: loss 2.5535, time 5190.19ms 
iter 3282: loss 2.9077, time 5231.83ms 
iter 3283: loss 2.8749, time 5282.19ms 
iter 3284: loss 2.7162, time 5269.15ms 
iter 3285: loss 2.5315, time 5314.95ms 
iter 3286: loss 2.4358, time 5315.56ms 
iter 3287: loss 2.6600, time 5037.63ms 
iter 3288: loss 2.6576, time 5204.36ms 
iter 3289: loss 2.8216, time 5331.86ms 
iter 3290: loss 2.8520, time 5302.26ms 
iter 3291: loss 2.5090, time 5286.31ms 
iter 3292: loss 2.6529, time 5303.45ms 
iter 3293: loss 2.7999, time 5212.98ms 
iter 3294: loss 2.8267, time 5294.10ms 
iter 3295: loss 2.4967, time 5049.74ms 
iter 3296: loss 2.7298, time 5303.54ms 
iter 3297: loss 2.8253, time 5317.94ms 
iter 3298: loss 2.4954, time 5275.24ms 
iter 3299: loss 2.7126, time 5293.63ms 
step 3300: train loss 2.6994, val loss 2.8686
iter 3300: loss 2.5836, time 20185.24ms 
iter 3301: loss 2.7608, time 5245.47ms 
iter 3302: loss 2.6850, time 5116.78ms 
iter 3303: loss 2.8231, time 5316.80ms 
iter 3304: loss 2.6312, time 5312.91ms 
iter 3305: loss 2.6909, time 5291.80ms 
iter 3306: loss 2.5214, time 5303.68ms 
iter 3307: loss 2.8589, time 5105.59ms 
iter 3308: loss 2.5384, time 5152.29ms 
iter 3309: loss 2.7557, time 5302.36ms 
iter 3310: loss 2.6237, time 5351.33ms 
iter 3311: loss 2.4951, time 5253.54ms 
iter 3312: loss 2.7807, time 5065.66ms 
iter 3313: loss 2.6607, time 5249.08ms 
iter 3314: loss 2.6111, time 5304.15ms 
iter 3315: loss 2.5942, time 5107.86ms 
iter 3316: loss 2.7410, time 5318.46ms 
iter 3317: loss 2.5513, time 5311.04ms 
iter 3318: loss 2.6666, time 5238.59ms 
iter 3319: loss 2.7739, time 5213.63ms 
iter 3320: loss 2.5127, time 5256.13ms 
iter 3321: loss 2.7974, time 5321.68ms 
iter 3322: loss 2.8717, time 5116.89ms 
iter 3323: loss 2.6839, time 5227.01ms 
iter 3324: loss 2.8866, time 5209.71ms 
iter 3325: loss 2.8000, time 5111.29ms 
iter 3326: loss 2.6128, time 5257.68ms 
iter 3327: loss 2.7798, time 5293.51ms 
iter 3328: loss 2.7251, time 5310.52ms 
iter 3329: loss 2.6558, time 5311.47ms 
iter 3330: loss 2.7738, time 5236.32ms 
iter 3331: loss 2.9042, time 5259.18ms 
iter 3332: loss 2.4012, time 5243.45ms 
iter 3333: loss 2.4898, time 5307.05ms 
iter 3334: loss 2.6424, time 5320.23ms 
iter 3335: loss 2.6544, time 5282.41ms 
iter 3336: loss 2.6835, time 5218.75ms 
iter 3337: loss 2.7389, time 5182.76ms 
iter 3338: loss 2.7659, time 5099.68ms 
iter 3339: loss 2.9352, time 5274.18ms 
iter 3340: loss 2.6632, time 5308.63ms 
iter 3341: loss 2.6545, time 5288.31ms 
iter 3342: loss 2.7752, time 5230.89ms 
iter 3343: loss 2.6365, time 5287.97ms 
iter 3344: loss 2.6768, time 5249.76ms 
iter 3345: loss 2.7052, time 5141.39ms 
iter 3346: loss 2.5670, time 5201.60ms 
iter 3347: loss 2.6392, time 5260.65ms 
iter 3348: loss 2.5722, time 5291.57ms 
iter 3349: loss 2.5521, time 5287.86ms 
step 3350: train loss 2.6936, val loss 2.8578
iter 3350: loss 2.6937, time 19975.88ms 
iter 3351: loss 2.6640, time 5008.40ms 
iter 3352: loss 2.7670, time 5303.55ms 
iter 3353: loss 2.5299, time 5318.96ms 
iter 3354: loss 2.6891, time 5315.39ms 
iter 3355: loss 2.5119, time 5302.29ms 
iter 3356: loss 2.7795, time 5090.28ms 
iter 3357: loss 2.5323, time 5083.78ms 
iter 3358: loss 2.7344, time 5075.68ms 
iter 3359: loss 2.7302, time 5059.93ms 
iter 3360: loss 2.6271, time 5270.58ms 
iter 3361: loss 2.7297, time 5221.17ms 
iter 3362: loss 2.6672, time 5306.86ms 
iter 3363: loss 2.7604, time 5325.54ms 
iter 3364: loss 2.6496, time 5293.08ms 
iter 3365: loss 2.7101, time 5310.14ms 
iter 3366: loss 2.6156, time 5316.26ms 
iter 3367: loss 2.4806, time 5132.70ms 
iter 3368: loss 2.8551, time 5097.60ms 
iter 3369: loss 2.7131, time 5098.28ms 
iter 3370: loss 2.7765, time 5083.48ms 
iter 3371: loss 2.7623, time 5080.92ms 
iter 3372: loss 2.7780, time 5080.31ms 
iter 3373: loss 2.5662, time 5286.92ms 
iter 3374: loss 2.7150, time 5311.07ms 
iter 3375: loss 2.7781, time 5120.20ms 
iter 3376: loss 2.6292, time 5302.11ms 
iter 3377: loss 2.7201, time 5352.22ms 
iter 3378: loss 2.8123, time 5323.64ms 
iter 3379: loss 2.7877, time 5359.10ms 
iter 3380: loss 2.6049, time 5298.96ms 
iter 3381: loss 2.7779, time 5324.68ms 
iter 3382: loss 2.5554, time 5255.06ms 
iter 3383: loss 2.5274, time 5210.72ms 
iter 3384: loss 2.4997, time 5317.21ms 
iter 3385: loss 2.5361, time 5310.18ms 
iter 3386: loss 2.5573, time 5308.88ms 
iter 3387: loss 2.6922, time 5316.65ms 
iter 3388: loss 2.7981, time 5300.55ms 
iter 3389: loss 3.0148, time 5304.43ms 
iter 3390: loss 2.5625, time 5163.59ms 
iter 3391: loss 2.5226, time 5215.46ms 
iter 3392: loss 2.7722, time 5269.30ms 
iter 3393: loss 2.4755, time 5291.62ms 
iter 3394: loss 2.6362, time 5307.62ms 
iter 3395: loss 2.6079, time 5308.60ms 
iter 3396: loss 2.4564, time 5316.60ms 
iter 3397: loss 2.5391, time 5313.89ms 
iter 3398: loss 2.7397, time 5266.82ms 
iter 3399: loss 2.7282, time 5315.26ms 
step 3400: train loss 2.6917, val loss 2.8853
iter 3400: loss 2.8539, time 20133.40ms 
iter 3401: loss 2.4781, time 5267.56ms 
iter 3402: loss 2.4887, time 5222.00ms 
iter 3403: loss 2.6644, time 5297.42ms 
iter 3404: loss 2.5056, time 5292.69ms 
iter 3405: loss 2.7425, time 5298.26ms 
iter 3406: loss 2.6872, time 5186.00ms 
iter 3407: loss 2.6027, time 5301.95ms 
iter 3408: loss 2.7805, time 5344.58ms 
iter 3409: loss 2.8721, time 5176.15ms 
iter 3410: loss 2.4649, time 5329.46ms 
iter 3411: loss 2.7111, time 5119.04ms 
iter 3412: loss 2.6579, time 5333.76ms 
iter 3413: loss 2.7377, time 5362.43ms 
iter 3414: loss 2.7568, time 5293.22ms 
iter 3415: loss 2.6140, time 5240.82ms 
iter 3416: loss 2.6547, time 4979.93ms 
iter 3417: loss 2.6027, time 5104.73ms 
iter 3418: loss 2.6107, time 5140.33ms 
iter 3419: loss 2.8485, time 5197.62ms 
iter 3420: loss 2.6647, time 5309.32ms 
iter 3421: loss 2.9611, time 5333.02ms 
iter 3422: loss 2.7539, time 5281.71ms 
iter 3423: loss 2.8709, time 5322.44ms 
iter 3424: loss 2.6606, time 5252.02ms 
iter 3425: loss 2.7832, time 5302.07ms 
iter 3426: loss 2.6788, time 5124.12ms 
iter 3427: loss 2.7048, time 5211.91ms 
iter 3428: loss 2.7138, time 5306.06ms 
iter 3429: loss 2.8486, time 5311.50ms 
iter 3430: loss 2.8742, time 5311.59ms 
iter 3431: loss 2.7328, time 5329.20ms 
iter 3432: loss 2.8542, time 5305.09ms 
iter 3433: loss 2.7868, time 5074.50ms 
iter 3434: loss 2.5285, time 5027.52ms 
iter 3435: loss 2.6239, time 5066.53ms 
iter 3436: loss 2.7924, time 5097.67ms 
iter 3437: loss 2.6754, time 5071.95ms 
iter 3438: loss 2.6302, time 5318.50ms 
iter 3439: loss 2.5802, time 5285.86ms 
iter 3440: loss 2.6029, time 5310.35ms 
iter 3441: loss 2.6957, time 5150.36ms 
iter 3442: loss 2.6712, time 5260.67ms 
iter 3443: loss 2.4786, time 5201.42ms 
iter 3444: loss 2.6453, time 5296.48ms 
iter 3445: loss 2.7805, time 5207.84ms 
iter 3446: loss 2.6260, time 5276.62ms 
iter 3447: loss 2.7343, time 5309.10ms 
iter 3448: loss 2.5514, time 5285.14ms 
iter 3449: loss 2.8148, time 5324.38ms 
step 3450: train loss 2.6883, val loss 2.8554
iter 3450: loss 2.5378, time 20181.72ms 
iter 3451: loss 2.6994, time 5310.36ms 
iter 3452: loss 2.5850, time 5021.78ms 
iter 3453: loss 2.7640, time 4993.90ms 
iter 3454: loss 2.7550, time 5118.95ms 
iter 3455: loss 2.5833, time 5173.87ms 
iter 3456: loss 2.5760, time 5325.31ms 
iter 3457: loss 2.5774, time 5303.10ms 
iter 3458: loss 2.9488, time 5320.58ms 
iter 3459: loss 2.6508, time 5322.20ms 
iter 3460: loss 2.6958, time 5245.38ms 
iter 3461: loss 2.5902, time 5281.61ms 
iter 3462: loss 2.6866, time 5287.74ms 
iter 3463: loss 2.5609, time 5227.91ms 
iter 3464: loss 2.7941, time 5104.74ms 
iter 3465: loss 2.6562, time 5193.72ms 
iter 3466: loss 2.8453, time 5305.37ms 
iter 3467: loss 2.7386, time 5308.18ms 
iter 3468: loss 2.9433, time 5240.16ms 
iter 3469: loss 2.7382, time 5136.65ms 
iter 3470: loss 2.8497, time 5095.78ms 
iter 3471: loss 2.6002, time 5075.28ms 
iter 3472: loss 2.7539, time 5131.06ms 
iter 3473: loss 2.7078, time 5148.69ms 
iter 3474: loss 2.8439, time 5196.85ms 
iter 3475: loss 2.6846, time 5323.33ms 
iter 3476: loss 2.4868, time 5313.82ms 
iter 3477: loss 2.8713, time 5307.93ms 
iter 3478: loss 2.7135, time 5319.25ms 
iter 3479: loss 2.7912, time 5170.10ms 
iter 3480: loss 2.6131, time 5314.20ms 
iter 3481: loss 2.6744, time 5312.56ms 
iter 3482: loss 2.6073, time 5323.22ms 
iter 3483: loss 2.4914, time 5300.12ms 
iter 3484: loss 2.6518, time 5304.60ms 
iter 3485: loss 2.6221, time 5295.04ms 
iter 3486: loss 2.7569, time 5275.59ms 
iter 3487: loss 2.4950, time 5095.29ms 
iter 3488: loss 2.6596, time 5169.51ms 
iter 3489: loss 2.6625, time 5292.31ms 
iter 3490: loss 2.7127, time 5238.21ms 
iter 3491: loss 2.8768, time 5257.34ms 
iter 3492: loss 2.7372, time 5307.05ms 
iter 3493: loss 2.6268, time 5208.96ms 
iter 3494: loss 2.6683, time 5128.55ms 
iter 3495: loss 2.7088, time 5301.61ms 
iter 3496: loss 2.6330, time 5261.50ms 
iter 3497: loss 2.4854, time 5256.60ms 
iter 3498: loss 2.7832, time 5295.58ms 
iter 3499: loss 2.7237, time 5315.21ms 
step 3500: train loss 2.6709, val loss 2.8624
iter 3500: loss 2.7463, time 20141.81ms 
iter 3501: loss 2.6375, time 5320.21ms 
iter 3502: loss 2.5552, time 5308.95ms 
iter 3503: loss 2.6513, time 5264.12ms 
iter 3504: loss 2.7819, time 5302.53ms 
iter 3505: loss 2.4483, time 5313.41ms 
iter 3506: loss 2.7663, time 5315.58ms 
iter 3507: loss 2.5307, time 5276.32ms 
iter 3508: loss 2.7454, time 5289.65ms 
iter 3509: loss 2.5841, time 5318.76ms 
iter 3510: loss 2.5969, time 5306.69ms 
iter 3511: loss 2.5495, time 5288.95ms 
iter 3512: loss 2.8158, time 5300.90ms 
iter 3513: loss 2.9961, time 5295.68ms 
iter 3514: loss 2.6587, time 5067.40ms 
iter 3515: loss 2.6851, time 5084.90ms 
iter 3516: loss 2.8222, time 5306.72ms 
iter 3517: loss 2.7737, time 5193.90ms 
iter 3518: loss 2.7721, time 5060.46ms 
iter 3519: loss 2.6033, time 5240.60ms 
iter 3520: loss 2.7377, time 5262.28ms 
iter 3521: loss 2.6792, time 5250.99ms 
iter 3522: loss 2.7457, time 5068.95ms 
iter 3523: loss 2.5037, time 5146.84ms 
iter 3524: loss 2.7352, time 5111.23ms 
iter 3525: loss 2.7314, time 5304.71ms 
iter 3526: loss 2.6647, time 5310.83ms 
iter 3527: loss 2.8248, time 5323.50ms 
iter 3528: loss 2.6537, time 5310.68ms 
iter 3529: loss 2.5408, time 5284.55ms 
iter 3530: loss 2.6276, time 5111.27ms 
iter 3531: loss 2.5956, time 5313.22ms 
iter 3532: loss 2.5954, time 5272.61ms 
iter 3533: loss 2.7167, time 5305.32ms 
iter 3534: loss 2.7334, time 5126.91ms 
iter 3535: loss 2.7971, time 5184.78ms 
iter 3536: loss 2.3985, time 5322.51ms 
iter 3537: loss 2.7676, time 5233.28ms 
iter 3538: loss 2.6994, time 5056.90ms 
iter 3539: loss 2.6670, time 5277.24ms 
iter 3540: loss 2.4974, time 5288.39ms 
iter 3541: loss 2.5118, time 5312.60ms 
iter 3542: loss 2.6955, time 5322.90ms 
iter 3543: loss 2.5796, time 5322.52ms 
iter 3544: loss 2.6256, time 5242.84ms 
iter 3545: loss 2.6745, time 5162.11ms 
iter 3546: loss 2.8784, time 5268.84ms 
iter 3547: loss 2.7200, time 5317.80ms 
iter 3548: loss 2.6996, time 5320.45ms 
iter 3549: loss 3.0936, time 5305.94ms 
step 3550: train loss 2.6759, val loss 2.8587
iter 3550: loss 2.8306, time 19862.88ms 
iter 3551: loss 2.8227, time 5040.50ms 
iter 3552: loss 2.6982, time 5169.79ms 
iter 3553: loss 2.8292, time 5322.45ms 
iter 3554: loss 2.9635, time 5324.62ms 
iter 3555: loss 2.6889, time 5303.06ms 
iter 3556: loss 2.5110, time 5314.63ms 
iter 3557: loss 2.7396, time 5308.38ms 
iter 3558: loss 2.6413, time 5141.87ms 
iter 3559: loss 2.7767, time 5308.27ms 
iter 3560: loss 2.6660, time 5284.65ms 
iter 3561: loss 2.7266, time 5303.72ms 
iter 3562: loss 2.8553, time 5302.74ms 
iter 3563: loss 2.8423, time 5301.59ms 
iter 3564: loss 2.5166, time 5305.92ms 
iter 3565: loss 2.7848, time 5183.10ms 
iter 3566: loss 2.6603, time 5224.05ms 
iter 3567: loss 2.5348, time 5247.93ms 
iter 3568: loss 2.5641, time 5321.83ms 
iter 3569: loss 2.7765, time 5318.79ms 
iter 3570: loss 2.4295, time 5172.39ms 
iter 3571: loss 2.5800, time 5306.93ms 
iter 3572: loss 2.6949, time 5254.75ms 
iter 3573: loss 2.8039, time 5115.64ms 
iter 3574: loss 2.5566, time 5197.75ms 
iter 3575: loss 2.6669, time 5151.53ms 
iter 3576: loss 2.6615, time 5292.19ms 
iter 3577: loss 2.4981, time 5307.26ms 
iter 3578: loss 2.5850, time 5309.06ms 
iter 3579: loss 2.8004, time 5293.42ms 
iter 3580: loss 2.7937, time 5296.03ms 
iter 3581: loss 2.5926, time 5107.77ms 
iter 3582: loss 2.7968, time 5139.04ms 
iter 3583: loss 2.7165, time 5325.72ms 
iter 3584: loss 2.6762, time 5222.22ms 
iter 3585: loss 2.6299, time 5311.68ms 
iter 3586: loss 2.8459, time 5297.14ms 
iter 3587: loss 2.7520, time 5309.37ms 
iter 3588: loss 2.7563, time 5310.49ms 
iter 3589: loss 2.6337, time 5174.27ms 
iter 3590: loss 2.6345, time 5290.95ms 
iter 3591: loss 2.8184, time 5301.71ms 
iter 3592: loss 2.6833, time 5307.80ms 
iter 3593: loss 2.6556, time 5311.20ms 
iter 3594: loss 2.5659, time 5303.56ms 
iter 3595: loss 2.7397, time 5279.29ms 
iter 3596: loss 2.5759, time 5309.78ms 
iter 3597: loss 2.7276, time 5250.52ms 
iter 3598: loss 2.6499, time 5317.73ms 
iter 3599: loss 2.4721, time 5322.87ms 
step 3600: train loss 2.6739, val loss 2.8389
iter 3600: loss 2.5806, time 20093.13ms 
iter 3601: loss 2.5009, time 5108.67ms 
iter 3602: loss 2.6266, time 5118.69ms 
iter 3603: loss 2.7016, time 5322.98ms 
iter 3604: loss 2.4836, time 5310.31ms 
iter 3605: loss 2.6750, time 5167.52ms 
iter 3606: loss 2.3413, time 5206.23ms 
iter 3607: loss 2.4933, time 5053.56ms 
iter 3608: loss 2.6656, time 5193.73ms 
iter 3609: loss 2.7250, time 5167.26ms 
iter 3610: loss 2.6272, time 5092.57ms 
iter 3611: loss 2.4564, time 5222.69ms 
iter 3612: loss 2.6404, time 5108.70ms 
iter 3613: loss 2.6083, time 5248.56ms 
iter 3614: loss 2.7551, time 5256.67ms 
iter 3615: loss 2.7248, time 5307.10ms 
iter 3616: loss 2.4957, time 5295.85ms 
iter 3617: loss 2.6456, time 5128.79ms 
iter 3618: loss 2.8212, time 5013.01ms 
iter 3619: loss 2.7589, time 5037.37ms 
iter 3620: loss 2.6296, time 5255.76ms 
iter 3621: loss 2.5818, time 5150.59ms 
iter 3622: loss 2.4245, time 5219.61ms 
iter 3623: loss 2.7335, time 5223.40ms 
iter 3624: loss 2.6421, time 5308.46ms 
iter 3625: loss 2.7325, time 5095.28ms 
iter 3626: loss 2.7581, time 4992.55ms 
iter 3627: loss 2.6325, time 5343.45ms 
iter 3628: loss 2.8446, time 5246.41ms 
iter 3629: loss 2.6861, time 5294.66ms 
iter 3630: loss 2.4418, time 5290.59ms 
iter 3631: loss 2.7496, time 5295.82ms 
iter 3632: loss 2.8128, time 5304.47ms 
iter 3633: loss 2.9160, time 5319.00ms 
iter 3634: loss 2.7399, time 5231.76ms 
iter 3635: loss 2.5448, time 5051.09ms 
iter 3636: loss 2.7245, time 5049.23ms 
iter 3637: loss 2.5353, time 5229.47ms 
iter 3638: loss 2.5660, time 5063.15ms 
iter 3639: loss 2.6492, time 5063.46ms 
iter 3640: loss 2.7215, time 5246.18ms 
iter 3641: loss 2.5724, time 5034.69ms 
iter 3642: loss 2.8061, time 4993.91ms 
iter 3643: loss 2.8100, time 5077.69ms 
iter 3644: loss 2.7032, time 5284.01ms 
iter 3645: loss 2.6123, time 5314.67ms 
iter 3646: loss 2.6543, time 5306.87ms 
iter 3647: loss 2.6073, time 5270.82ms 
iter 3648: loss 2.6678, time 5308.33ms 
iter 3649: loss 2.6419, time 5124.03ms 
step 3650: train loss 2.6693, val loss 2.8655
iter 3650: loss 2.6171, time 20201.20ms 
iter 3651: loss 2.7300, time 5290.15ms 
iter 3652: loss 2.6153, time 5286.06ms 
iter 3653: loss 2.7100, time 5280.28ms 
iter 3654: loss 2.6575, time 5110.32ms 
iter 3655: loss 2.6368, time 5220.49ms 
iter 3656: loss 2.6124, time 5311.43ms 
iter 3657: loss 2.4540, time 5319.13ms 
iter 3658: loss 2.7506, time 5315.93ms 
iter 3659: loss 2.6768, time 5218.39ms 
iter 3660: loss 2.7211, time 5228.47ms 
iter 3661: loss 2.5180, time 5298.39ms 
iter 3662: loss 2.8519, time 5243.51ms 
iter 3663: loss 2.4330, time 5303.60ms 
iter 3664: loss 2.6984, time 5088.73ms 
iter 3665: loss 2.7073, time 5282.27ms 
iter 3666: loss 2.8493, time 5280.01ms 
iter 3667: loss 2.7909, time 5281.56ms 
iter 3668: loss 2.5231, time 5290.04ms 
iter 3669: loss 2.6265, time 5149.73ms 
iter 3670: loss 2.5954, time 5105.53ms 
iter 3671: loss 2.8308, time 5083.25ms 
iter 3672: loss 2.6368, time 5050.38ms 
iter 3673: loss 2.5519, time 5112.34ms 
iter 3674: loss 2.6627, time 5185.23ms 
iter 3675: loss 2.5156, time 5144.35ms 
iter 3676: loss 2.6501, time 5128.88ms 
iter 3677: loss 2.6371, time 5099.15ms 
iter 3678: loss 2.6675, time 5069.05ms 
iter 3679: loss 2.7354, time 5241.30ms 
iter 3680: loss 2.5043, time 5304.31ms 
iter 3681: loss 2.6480, time 5273.95ms 
iter 3682: loss 2.9394, time 5312.07ms 
iter 3683: loss 2.6378, time 5308.31ms 
iter 3684: loss 2.5597, time 5288.09ms 
iter 3685: loss 2.8330, time 5259.07ms 
iter 3686: loss 2.7184, time 5111.84ms 
iter 3687: loss 2.5714, time 5134.06ms 
iter 3688: loss 2.5667, time 5146.54ms 
iter 3689: loss 2.5477, time 5094.67ms 
iter 3690: loss 2.6803, time 5174.03ms 
iter 3691: loss 2.8464, time 5267.30ms 
iter 3692: loss 2.8646, time 5280.14ms 
iter 3693: loss 2.6728, time 5283.37ms 
iter 3694: loss 2.5256, time 5290.28ms 
iter 3695: loss 2.7010, time 5289.77ms 
iter 3696: loss 2.6270, time 5297.06ms 
iter 3697: loss 2.6775, time 5113.15ms 
iter 3698: loss 2.6687, time 5193.10ms 
iter 3699: loss 2.5306, time 5295.80ms 
step 3700: train loss 2.6683, val loss 2.8597
iter 3700: loss 2.6192, time 20161.87ms 
iter 3701: loss 2.7518, time 5309.14ms 
iter 3702: loss 2.6805, time 5154.58ms 
iter 3703: loss 2.6531, time 5127.71ms 
iter 3704: loss 2.7698, time 5074.76ms 
iter 3705: loss 2.6304, time 5089.71ms 
iter 3706: loss 2.9597, time 5299.79ms 
iter 3707: loss 2.5993, time 5315.22ms 
iter 3708: loss 2.7585, time 5304.19ms 
iter 3709: loss 2.6454, time 5285.02ms 
iter 3710: loss 2.8485, time 5287.60ms 
iter 3711: loss 2.6484, time 5294.06ms 
iter 3712: loss 2.6964, time 5155.11ms 
iter 3713: loss 2.8177, time 5013.83ms 
iter 3714: loss 2.6832, time 5010.05ms 
iter 3715: loss 2.5928, time 5254.68ms 
iter 3716: loss 2.7405, time 5303.69ms 
iter 3717: loss 2.7429, time 5294.59ms 
iter 3718: loss 2.6973, time 5312.62ms 
iter 3719: loss 2.6365, time 5291.85ms 
iter 3720: loss 2.7219, time 5305.78ms 
iter 3721: loss 2.6764, time 5247.16ms 
iter 3722: loss 2.9034, time 5154.70ms 
iter 3723: loss 2.6483, time 5297.67ms 
iter 3724: loss 2.5302, time 5294.46ms 
iter 3725: loss 2.6439, time 5290.67ms 
iter 3726: loss 2.6891, time 5293.45ms 
iter 3727: loss 2.6242, time 5304.51ms 
iter 3728: loss 2.7343, time 5238.90ms 
iter 3729: loss 2.6907, time 5029.03ms 
iter 3730: loss 2.6775, time 5034.43ms 
iter 3731: loss 2.5698, time 5035.32ms 
iter 3732: loss 2.7347, time 5062.83ms 
iter 3733: loss 2.6248, time 5069.69ms 
iter 3734: loss 2.7111, time 5184.89ms 
iter 3735: loss 2.6897, time 5303.62ms 
iter 3736: loss 2.7063, time 5299.62ms 
iter 3737: loss 2.6154, time 5083.16ms 
iter 3738: loss 2.8153, time 5219.47ms 
iter 3739: loss 2.6641, time 5308.26ms 
iter 3740: loss 2.5137, time 5310.94ms 
iter 3741: loss 2.5938, time 5198.39ms 
iter 3742: loss 2.9267, time 5244.90ms 
iter 3743: loss 2.7805, time 5287.86ms 
iter 3744: loss 2.5740, time 5298.23ms 
iter 3745: loss 2.6409, time 5293.09ms 
iter 3746: loss 2.7308, time 5275.56ms 
iter 3747: loss 2.9058, time 5286.66ms 
iter 3748: loss 2.6724, time 5298.95ms 
iter 3749: loss 2.8983, time 5169.03ms 
step 3750: train loss 2.6768, val loss 2.8792
iter 3750: loss 2.5807, time 19899.97ms 
iter 3751: loss 2.5708, time 5277.43ms 
iter 3752: loss 2.6538, time 5289.79ms 
iter 3753: loss 2.8158, time 5282.80ms 
iter 3754: loss 2.7035, time 5293.24ms 
iter 3755: loss 2.8666, time 5286.53ms 
iter 3756: loss 2.6740, time 5287.96ms 
iter 3757: loss 2.5640, time 5297.88ms 
iter 3758: loss 2.6000, time 5102.66ms 
iter 3759: loss 2.6819, time 5064.26ms 
iter 3760: loss 2.6037, time 5162.32ms 
iter 3761: loss 2.9052, time 5132.40ms 
iter 3762: loss 2.5614, time 5262.10ms 
iter 3763: loss 2.5468, time 5282.66ms 
iter 3764: loss 2.4240, time 5285.87ms 
iter 3765: loss 2.7780, time 5290.60ms 
iter 3766: loss 2.6701, time 5287.38ms 
iter 3767: loss 2.4525, time 5294.43ms 
iter 3768: loss 2.5425, time 5308.47ms 
iter 3769: loss 2.6437, time 5169.65ms 
iter 3770: loss 2.6535, time 5311.54ms 
iter 3771: loss 2.7253, time 5298.57ms 
iter 3772: loss 2.6663, time 5307.30ms 
iter 3773: loss 2.6472, time 5295.62ms 
iter 3774: loss 2.6037, time 5298.81ms 
iter 3775: loss 2.6138, time 5300.64ms 
iter 3776: loss 2.6958, time 5157.62ms 
iter 3777: loss 2.5255, time 5097.29ms 
iter 3778: loss 2.6347, time 5070.76ms 
iter 3779: loss 2.6989, time 5121.22ms 
iter 3780: loss 2.9187, time 5165.51ms 
iter 3781: loss 2.6474, time 5117.25ms 
iter 3782: loss 2.7180, time 5074.91ms 
iter 3783: loss 2.6611, time 5072.12ms 
iter 3784: loss 2.5379, time 5065.00ms 
iter 3785: loss 2.7885, time 5074.27ms 
iter 3786: loss 2.6345, time 5244.22ms 
iter 3787: loss 2.6455, time 5110.70ms 
iter 3788: loss 2.5670, time 5067.31ms 
iter 3789: loss 2.6355, time 5183.21ms 
iter 3790: loss 2.6240, time 5303.21ms 
iter 3791: loss 2.8613, time 5301.44ms 
iter 3792: loss 2.7372, time 5295.64ms 
iter 3793: loss 2.7484, time 5189.68ms 
iter 3794: loss 2.5119, time 5292.53ms 
iter 3795: loss 2.6514, time 5298.99ms 
iter 3796: loss 2.7101, time 5286.04ms 
iter 3797: loss 2.4277, time 5277.15ms 
iter 3798: loss 2.7931, time 5275.34ms 
iter 3799: loss 2.5304, time 5251.90ms 
step 3800: train loss 2.6488, val loss 2.8437
iter 3800: loss 2.7629, time 20151.71ms 
iter 3801: loss 2.4753, time 5295.95ms 
iter 3802: loss 2.7569, time 5298.07ms 
iter 3803: loss 2.7266, time 5289.62ms 
iter 3804: loss 2.5477, time 5291.56ms 
iter 3805: loss 2.7889, time 5283.99ms 
iter 3806: loss 2.6206, time 5297.68ms 
iter 3807: loss 2.5688, time 5289.54ms 
iter 3808: loss 2.8112, time 5301.63ms 
iter 3809: loss 2.7030, time 5296.97ms 
iter 3810: loss 2.5263, time 5312.36ms 
iter 3811: loss 2.9229, time 5308.30ms 
iter 3812: loss 2.6586, time 5306.41ms 
iter 3813: loss 2.5169, time 5291.01ms 
iter 3814: loss 2.7449, time 5271.90ms 
iter 3815: loss 2.6921, time 5304.78ms 
iter 3816: loss 2.7207, time 5296.31ms 
iter 3817: loss 2.7282, time 5309.25ms 
iter 3818: loss 2.7955, time 5299.37ms 
iter 3819: loss 2.7072, time 5301.94ms 
iter 3820: loss 2.8276, time 5310.82ms 
iter 3821: loss 2.8848, time 5311.52ms 
iter 3822: loss 2.6923, time 5274.40ms 
iter 3823: loss 2.7341, time 5288.38ms 
iter 3824: loss 2.7974, time 5308.49ms 
iter 3825: loss 2.5222, time 5217.37ms 
iter 3826: loss 2.5775, time 5304.00ms 
iter 3827: loss 2.8036, time 5301.57ms 
iter 3828: loss 2.6280, time 5288.98ms 
iter 3829: loss 2.6973, time 5288.94ms 
iter 3830: loss 2.6749, time 5289.50ms 
iter 3831: loss 2.5895, time 5296.53ms 
iter 3832: loss 2.6118, time 5287.28ms 
iter 3833: loss 2.6070, time 5293.44ms 
iter 3834: loss 2.6992, time 5297.36ms 
iter 3835: loss 2.6619, time 5280.60ms 
iter 3836: loss 2.5919, time 5290.71ms 
iter 3837: loss 2.4804, time 5295.55ms 
iter 3838: loss 2.6467, time 5302.17ms 
iter 3839: loss 2.7966, time 5291.87ms 
iter 3840: loss 2.4672, time 5282.59ms 
iter 3841: loss 2.7605, time 5293.51ms 
iter 3842: loss 2.6808, time 5202.07ms 
iter 3843: loss 2.8435, time 5291.97ms 
iter 3844: loss 2.6604, time 5310.27ms 
iter 3845: loss 2.5174, time 5278.03ms 
iter 3846: loss 2.8083, time 5190.10ms 
iter 3847: loss 2.4552, time 5181.26ms 
iter 3848: loss 2.7853, time 5255.95ms 
iter 3849: loss 2.5949, time 5302.38ms 
step 3850: train loss 2.6566, val loss 2.8539
iter 3850: loss 2.6194, time 20005.95ms 
iter 3851: loss 2.6211, time 5232.35ms 
iter 3852: loss 2.5435, time 5246.46ms 
iter 3853: loss 2.7122, time 5267.99ms 
iter 3854: loss 2.7133, time 5183.40ms 
iter 3855: loss 2.7595, time 5242.69ms 
iter 3856: loss 2.4415, time 5284.84ms 
iter 3857: loss 2.8371, time 5114.76ms 
iter 3858: loss 2.7265, time 5249.95ms 
iter 3859: loss 2.5225, time 5294.30ms 
iter 3860: loss 2.8454, time 5320.09ms 
iter 3861: loss 2.9084, time 5308.11ms 
iter 3862: loss 2.4053, time 5310.52ms 
iter 3863: loss 2.6074, time 5274.57ms 
iter 3864: loss 2.6939, time 5264.92ms 
iter 3865: loss 2.6099, time 5224.40ms 
iter 3866: loss 2.6825, time 5303.56ms 
iter 3867: loss 2.7170, time 5289.53ms 
iter 3868: loss 2.8393, time 5263.24ms 
iter 3869: loss 2.6587, time 5175.53ms 
iter 3870: loss 2.7251, time 5211.51ms 
iter 3871: loss 2.7440, time 5273.09ms 
iter 3872: loss 2.9051, time 5324.91ms 
iter 3873: loss 2.5193, time 5336.40ms 
iter 3874: loss 2.7472, time 5325.54ms 
iter 3875: loss 2.9515, time 5259.73ms 
iter 3876: loss 2.6393, time 5341.20ms 
iter 3877: loss 2.6292, time 5310.83ms 
iter 3878: loss 2.6641, time 5288.57ms 
iter 3879: loss 2.6643, time 5031.98ms 
iter 3880: loss 2.7882, time 5171.05ms 
iter 3881: loss 2.7557, time 5074.48ms 
iter 3882: loss 2.7752, time 5239.65ms 
iter 3883: loss 2.7631, time 5265.97ms 
iter 3884: loss 2.8337, time 5282.91ms 
iter 3885: loss 2.7099, time 5136.05ms 
iter 3886: loss 2.8333, time 5274.02ms 
iter 3887: loss 2.6287, time 5307.55ms 
iter 3888: loss 2.7583, time 5251.47ms 
iter 3889: loss 2.9752, time 5309.03ms 
iter 3890: loss 2.8048, time 5241.71ms 
iter 3891: loss 2.6625, time 5082.12ms 
iter 3892: loss 2.8075, time 5066.88ms 
iter 3893: loss 2.7589, time 5084.48ms 
iter 3894: loss 2.6446, time 5060.69ms 
iter 3895: loss 2.7154, time 5076.36ms 
iter 3896: loss 2.7390, time 5054.66ms 
iter 3897: loss 2.5560, time 5069.83ms 
iter 3898: loss 2.6528, time 5065.05ms 
iter 3899: loss 2.6002, time 5077.85ms 
step 3900: train loss 2.6521, val loss 2.8460
iter 3900: loss 2.6432, time 20027.49ms 
iter 3901: loss 2.6080, time 5233.29ms 
iter 3902: loss 2.5957, time 5165.32ms 
iter 3903: loss 2.6199, time 5289.76ms 
iter 3904: loss 2.7061, time 5297.24ms 
iter 3905: loss 2.6207, time 5295.06ms 
iter 3906: loss 2.7466, time 5173.49ms 
iter 3907: loss 2.7566, time 5239.53ms 
iter 3908: loss 2.7682, time 5228.12ms 
iter 3909: loss 2.6238, time 5298.26ms 
iter 3910: loss 2.5547, time 5296.04ms 
iter 3911: loss 2.6926, time 5239.20ms 
iter 3912: loss 2.6200, time 5330.82ms 
iter 3913: loss 2.6768, time 5264.73ms 
iter 3914: loss 2.9237, time 5287.26ms 
iter 3915: loss 2.5929, time 5177.77ms 
iter 3916: loss 2.7215, time 5261.42ms 
iter 3917: loss 2.6495, time 5291.95ms 
iter 3918: loss 2.6432, time 5298.24ms 
iter 3919: loss 2.5571, time 5303.46ms 
iter 3920: loss 2.4279, time 5276.29ms 
iter 3921: loss 2.4949, time 5192.67ms 
iter 3922: loss 2.6156, time 5089.16ms 
iter 3923: loss 2.6880, time 5017.67ms 
iter 3924: loss 2.5530, time 5156.58ms 
iter 3925: loss 2.7494, time 5225.68ms 
iter 3926: loss 2.6100, time 5276.34ms 
iter 3927: loss 2.6963, time 5224.64ms 
iter 3928: loss 2.8891, time 5194.49ms 
iter 3929: loss 2.6663, time 5086.95ms 
iter 3930: loss 2.6646, time 5296.32ms 
iter 3931: loss 2.7246, time 5272.38ms 
iter 3932: loss 2.5897, time 5252.94ms 
iter 3933: loss 2.6799, time 5295.02ms 
iter 3934: loss 2.6720, time 5298.56ms 
iter 3935: loss 2.8075, time 5250.53ms 
iter 3936: loss 2.7336, time 5245.92ms 
iter 3937: loss 2.5185, time 5290.67ms 
iter 3938: loss 2.4937, time 5309.04ms 
iter 3939: loss 2.8362, time 5294.09ms 
iter 3940: loss 2.5921, time 5334.39ms 
iter 3941: loss 2.5883, time 5297.91ms 
iter 3942: loss 2.6439, time 5279.79ms 
iter 3943: loss 2.5827, time 5293.25ms 
iter 3944: loss 2.6343, time 5314.20ms 
iter 3945: loss 2.4779, time 5301.96ms 
iter 3946: loss 2.7358, time 5297.74ms 
iter 3947: loss 2.4042, time 5295.44ms 
iter 3948: loss 2.5345, time 5298.44ms 
iter 3949: loss 2.6369, time 5299.80ms 
step 3950: train loss 2.6455, val loss 2.8678
iter 3950: loss 2.6261, time 20228.72ms 
iter 3951: loss 2.6791, time 5308.02ms 
iter 3952: loss 2.6578, time 5210.82ms 
iter 3953: loss 2.4572, time 5304.19ms 
iter 3954: loss 2.6509, time 5306.22ms 
iter 3955: loss 2.5231, time 5317.83ms 
iter 3956: loss 2.3972, time 5295.16ms 
iter 3957: loss 2.7836, time 5271.98ms 
iter 3958: loss 2.6122, time 5084.04ms 
iter 3959: loss 2.7465, time 5049.65ms 
iter 3960: loss 2.4652, time 5173.86ms 
iter 3961: loss 2.7349, time 5347.62ms 
iter 3962: loss 2.5809, time 5330.43ms 
iter 3963: loss 2.6525, time 5307.54ms 
iter 3964: loss 2.7761, time 5335.58ms 
iter 3965: loss 2.5929, time 5317.92ms 
iter 3966: loss 2.7225, time 5253.15ms 
iter 3967: loss 2.5995, time 5308.26ms 
iter 3968: loss 2.8012, time 5345.85ms 
iter 3969: loss 2.7938, time 5258.14ms 
iter 3970: loss 2.6606, time 5299.46ms 
iter 3971: loss 2.5724, time 5275.01ms 
iter 3972: loss 2.6755, time 5214.10ms 
iter 3973: loss 2.7995, time 5244.47ms 
iter 3974: loss 2.5577, time 5239.42ms 
iter 3975: loss 2.5253, time 5237.36ms 
iter 3976: loss 2.6261, time 5250.85ms 
iter 3977: loss 2.6771, time 5234.28ms 
iter 3978: loss 2.5950, time 5250.27ms 
iter 3979: loss 2.6404, time 5105.91ms 
iter 3980: loss 2.6011, time 5226.44ms 
iter 3981: loss 2.7617, time 5226.60ms 
iter 3982: loss 2.7319, time 5236.16ms 
iter 3983: loss 2.8007, time 5304.65ms 
iter 3984: loss 2.6175, time 5304.14ms 
iter 3985: loss 2.5792, time 5238.48ms 
iter 3986: loss 2.4996, time 5221.66ms 
iter 3987: loss 2.5052, time 5224.10ms 
iter 3988: loss 2.4841, time 5330.15ms 
iter 3989: loss 2.7767, time 5291.67ms 
iter 3990: loss 2.7161, time 5312.93ms 
iter 3991: loss 2.5828, time 5289.99ms 
iter 3992: loss 2.7139, time 5314.33ms 
iter 3993: loss 2.5168, time 5291.08ms 
iter 3994: loss 2.4780, time 5311.73ms 
iter 3995: loss 2.5115, time 5287.23ms 
iter 3996: loss 2.5655, time 5298.83ms 
iter 3997: loss 2.6545, time 5270.07ms 
iter 3998: loss 2.5080, time 5298.40ms 
iter 3999: loss 2.7755, time 5290.20ms 
step 4000: train loss 2.6642, val loss 2.8418
iter 4000: loss 2.6923, time 20000.11ms 
iter 4001: loss 2.5623, time 5298.36ms 
iter 4002: loss 2.7271, time 5301.67ms 
iter 4003: loss 2.6959, time 5302.61ms 
iter 4004: loss 2.4395, time 5294.35ms 
iter 4005: loss 2.5464, time 5294.26ms 
iter 4006: loss 2.9142, time 5301.85ms 
iter 4007: loss 2.8676, time 5289.28ms 
iter 4008: loss 2.4936, time 5311.81ms 
iter 4009: loss 2.7276, time 5049.95ms 
iter 4010: loss 2.6710, time 5053.65ms 
iter 4011: loss 2.4631, time 5039.31ms 
iter 4012: loss 2.7259, time 5037.66ms 
iter 4013: loss 2.5611, time 5087.70ms 
iter 4014: loss 2.5448, time 5089.55ms 
iter 4015: loss 2.6409, time 5078.65ms 
iter 4016: loss 2.6608, time 5069.28ms 
iter 4017: loss 2.7080, time 5062.03ms 
iter 4018: loss 2.6923, time 5060.83ms 
iter 4019: loss 2.6808, time 4992.66ms 
iter 4020: loss 2.4729, time 5070.66ms 
iter 4021: loss 2.5303, time 5086.27ms 
iter 4022: loss 2.7592, time 5153.42ms 
iter 4023: loss 2.7008, time 5231.02ms 
iter 4024: loss 2.6237, time 5229.37ms 
iter 4025: loss 2.7999, time 5250.01ms 
iter 4026: loss 2.5810, time 5269.18ms 
iter 4027: loss 2.6172, time 5217.23ms 
iter 4028: loss 2.6181, time 5104.48ms 
iter 4029: loss 2.6861, time 5061.81ms 
iter 4030: loss 2.4017, time 4968.08ms 
iter 4031: loss 2.8925, time 5036.17ms 
iter 4032: loss 2.6281, time 5200.95ms 
iter 4033: loss 2.9562, time 5220.59ms 
iter 4034: loss 2.8488, time 5234.99ms 
iter 4035: loss 2.7097, time 5276.47ms 
iter 4036: loss 2.4594, time 5275.61ms 
iter 4037: loss 2.5850, time 5268.44ms 
iter 4038: loss 2.3741, time 5198.22ms 
iter 4039: loss 2.9093, time 5205.95ms 
iter 4040: loss 2.8469, time 5264.39ms 
iter 4041: loss 2.6027, time 5258.82ms 
iter 4042: loss 2.7335, time 5275.46ms 
iter 4043: loss 2.7950, time 5281.38ms 
iter 4044: loss 2.6305, time 5262.01ms 
iter 4045: loss 2.7116, time 5272.15ms 
iter 4046: loss 2.6559, time 5288.29ms 
iter 4047: loss 2.6832, time 5294.43ms 
iter 4048: loss 2.7331, time 5305.10ms 
iter 4049: loss 2.4200, time 5273.10ms 
step 4050: train loss 2.6313, val loss 2.8467
iter 4050: loss 2.5378, time 20063.34ms 
iter 4051: loss 2.7851, time 5277.42ms 
iter 4052: loss 2.4979, time 5287.03ms 
iter 4053: loss 3.0439, time 5282.90ms 
iter 4054: loss 2.7033, time 5285.34ms 
iter 4055: loss 2.6436, time 5282.09ms 
iter 4056: loss 2.5972, time 5273.87ms 
iter 4057: loss 2.8965, time 5274.04ms 
iter 4058: loss 2.6230, time 5264.96ms 
iter 4059: loss 2.6567, time 5305.86ms 
iter 4060: loss 2.6265, time 5242.89ms 
iter 4061: loss 2.5734, time 5307.85ms 
iter 4062: loss 2.4571, time 5291.92ms 
iter 4063: loss 2.6219, time 5284.85ms 
iter 4064: loss 2.7214, time 5305.34ms 
iter 4065: loss 2.6881, time 5282.17ms 
iter 4066: loss 2.6455, time 5294.52ms 
iter 4067: loss 2.7172, time 5266.78ms 
iter 4068: loss 2.7670, time 5298.74ms 
iter 4069: loss 2.5987, time 5271.04ms 
iter 4070: loss 2.7586, time 5275.73ms 
iter 4071: loss 2.5218, time 5306.41ms 
iter 4072: loss 2.5331, time 5317.62ms 
iter 4073: loss 2.6975, time 5298.95ms 
iter 4074: loss 2.6857, time 5245.14ms 
iter 4075: loss 2.7264, time 5292.49ms 
iter 4076: loss 2.5606, time 5296.20ms 
iter 4077: loss 2.5545, time 5283.14ms 
iter 4078: loss 2.6292, time 5287.35ms 
iter 4079: loss 2.6276, time 5304.61ms 
iter 4080: loss 2.6247, time 5306.64ms 
iter 4081: loss 2.6532, time 5300.95ms 
iter 4082: loss 2.7744, time 5312.11ms 
iter 4083: loss 2.5287, time 5316.76ms 
iter 4084: loss 2.6682, time 5268.53ms 
iter 4085: loss 2.7595, time 5297.26ms 
iter 4086: loss 2.6533, time 5288.34ms 
iter 4087: loss 2.6751, time 5293.28ms 
iter 4088: loss 2.6830, time 5289.35ms 
iter 4089: loss 2.5650, time 5046.10ms 
iter 4090: loss 2.6862, time 5190.24ms 
iter 4091: loss 2.7129, time 5249.65ms 
iter 4092: loss 2.4443, time 5283.08ms 
iter 4093: loss 2.6472, time 5291.30ms 
iter 4094: loss 2.7620, time 5277.54ms 
iter 4095: loss 2.5817, time 5291.40ms 
iter 4096: loss 2.7485, time 5297.45ms 
iter 4097: loss 2.6652, time 5317.74ms 
iter 4098: loss 2.7234, time 5258.10ms 
iter 4099: loss 2.6801, time 5292.66ms 
step 4100: train loss 2.6381, val loss 2.8345
iter 4100: loss 2.6674, time 20010.65ms 
iter 4101: loss 2.7269, time 5299.24ms 
iter 4102: loss 2.5886, time 5292.95ms 
iter 4103: loss 2.6476, time 5298.55ms 
iter 4104: loss 2.5884, time 5308.55ms 
iter 4105: loss 2.8661, time 5298.18ms 
iter 4106: loss 2.7064, time 5319.61ms 
iter 4107: loss 2.5971, time 5270.74ms 
iter 4108: loss 2.5390, time 5306.33ms 
iter 4109: loss 2.5451, time 5247.68ms 
iter 4110: loss 2.7053, time 5282.64ms 
iter 4111: loss 2.5947, time 5297.25ms 
iter 4112: loss 2.7766, time 5289.86ms 
iter 4113: loss 2.6495, time 5297.66ms 
iter 4114: loss 2.6785, time 5305.70ms 
iter 4115: loss 2.5603, time 5283.91ms 
iter 4116: loss 2.7521, time 5304.55ms 
iter 4117: loss 2.4479, time 5305.20ms 
iter 4118: loss 2.6439, time 5223.90ms 
iter 4119: loss 2.5540, time 5299.32ms 
iter 4120: loss 2.7442, time 5305.86ms 
iter 4121: loss 2.5310, time 5298.53ms 
iter 4122: loss 2.6625, time 5315.39ms 
iter 4123: loss 2.5125, time 5315.15ms 
iter 4124: loss 2.7551, time 5304.52ms 
iter 4125: loss 2.7765, time 5308.29ms 
iter 4126: loss 2.4896, time 5308.27ms 
iter 4127: loss 2.7146, time 5307.24ms 
iter 4128: loss 2.7062, time 5293.76ms 
iter 4129: loss 2.6714, time 5317.41ms 
iter 4130: loss 2.7479, time 5269.39ms 
iter 4131: loss 2.4182, time 5289.53ms 
iter 4132: loss 2.7540, time 5238.17ms 
iter 4133: loss 2.7455, time 5313.23ms 
iter 4134: loss 2.5770, time 5308.55ms 
iter 4135: loss 2.6185, time 5296.65ms 
iter 4136: loss 2.4477, time 5293.98ms 
iter 4137: loss 2.5806, time 5292.93ms 
iter 4138: loss 2.5322, time 5261.26ms 
iter 4139: loss 2.6627, time 5282.03ms 
iter 4140: loss 2.6201, time 5283.52ms 
iter 4141: loss 2.5973, time 5315.74ms 
iter 4142: loss 2.6448, time 5255.18ms 
iter 4143: loss 2.6482, time 5273.12ms 
iter 4144: loss 2.6520, time 5302.50ms 
iter 4145: loss 2.5725, time 5298.92ms 
iter 4146: loss 2.5493, time 5310.29ms 
iter 4147: loss 2.8445, time 5253.91ms 
iter 4148: loss 2.8130, time 5297.98ms 
iter 4149: loss 2.6899, time 5300.14ms 
step 4150: train loss 2.6371, val loss 2.8650
iter 4150: loss 2.9070, time 20135.44ms 
iter 4151: loss 2.5779, time 5307.35ms 
iter 4152: loss 2.7651, time 5300.72ms 
iter 4153: loss 2.6638, time 5295.73ms 
iter 4154: loss 2.8402, time 5295.69ms 
iter 4155: loss 2.8897, time 5311.34ms 
iter 4156: loss 2.5981, time 5314.07ms 
iter 4157: loss 2.5989, time 5284.89ms 
iter 4158: loss 2.7285, time 5301.64ms 
iter 4159: loss 2.6832, time 5255.28ms 
iter 4160: loss 2.5408, time 5286.99ms 
iter 4161: loss 2.6626, time 5303.83ms 
iter 4162: loss 2.5745, time 5310.98ms 
iter 4163: loss 2.4059, time 5303.10ms 
iter 4164: loss 2.6218, time 5310.27ms 
iter 4165: loss 2.5831, time 5314.44ms 
iter 4166: loss 2.3988, time 5309.07ms 
iter 4167: loss 2.7590, time 5320.30ms 
iter 4168: loss 2.6441, time 5300.75ms 
iter 4169: loss 2.6821, time 5303.09ms 
iter 4170: loss 2.5662, time 5296.23ms 
iter 4171: loss 2.7208, time 5300.66ms 
iter 4172: loss 2.6172, time 5301.14ms 
iter 4173: loss 2.7320, time 5302.05ms 
iter 4174: loss 2.6568, time 5311.62ms 
iter 4175: loss 2.5269, time 5298.80ms 
iter 4176: loss 2.7504, time 5311.61ms 
iter 4177: loss 2.5071, time 5311.08ms 
iter 4178: loss 2.5664, time 5310.01ms 
iter 4179: loss 2.6456, time 5292.55ms 
iter 4180: loss 2.6060, time 5303.84ms 
iter 4181: loss 2.6116, time 5289.25ms 
iter 4182: loss 2.7626, time 5300.90ms 
iter 4183: loss 2.5585, time 5308.04ms 
iter 4184: loss 2.6711, time 5299.63ms 
iter 4185: loss 2.7022, time 5317.12ms 
iter 4186: loss 2.5555, time 5377.98ms 
iter 4187: loss 2.8866, time 5356.53ms 
iter 4188: loss 2.8877, time 5380.57ms 
iter 4189: loss 2.6983, time 5393.05ms 
iter 4190: loss 2.4888, time 5325.39ms 
iter 4191: loss 2.7894, time 5347.64ms 
iter 4192: loss 2.6321, time 5423.50ms 
iter 4193: loss 2.7465, time 5420.94ms 
iter 4194: loss 2.5234, time 5402.56ms 
iter 4195: loss 2.6652, time 5320.45ms 
iter 4196: loss 2.6431, time 5311.47ms 
iter 4197: loss 2.5157, time 5330.78ms 
iter 4198: loss 2.7628, time 5423.46ms 
iter 4199: loss 2.5613, time 5328.12ms 
step 4200: train loss 2.6319, val loss 2.8396
iter 4200: loss 2.6323, time 20171.78ms 
iter 4201: loss 2.7654, time 5297.40ms 
iter 4202: loss 2.5909, time 5313.34ms 
iter 4203: loss 2.7588, time 5315.64ms 
iter 4204: loss 2.6874, time 5304.53ms 
iter 4205: loss 2.5158, time 5302.59ms 
iter 4206: loss 2.7506, time 5309.23ms 
iter 4207: loss 2.7464, time 5312.96ms 
iter 4208: loss 2.6573, time 5295.68ms 
iter 4209: loss 2.4901, time 5304.89ms 
iter 4210: loss 2.8168, time 5294.64ms 
iter 4211: loss 2.6577, time 5311.70ms 
iter 4212: loss 2.4125, time 5298.02ms 
iter 4213: loss 2.9014, time 5274.08ms 
iter 4214: loss 2.5170, time 5297.37ms 
iter 4215: loss 2.6921, time 5291.48ms 
iter 4216: loss 2.6999, time 5180.61ms 
iter 4217: loss 2.7350, time 5315.42ms 
iter 4218: loss 2.4401, time 5313.84ms 
iter 4219: loss 2.6593, time 5287.38ms 
iter 4220: loss 2.6779, time 5261.25ms 
iter 4221: loss 2.5547, time 5311.83ms 
iter 4222: loss 2.6780, time 5303.34ms 
iter 4223: loss 2.5558, time 5280.57ms 
iter 4224: loss 2.8917, time 5305.17ms 
iter 4225: loss 2.7218, time 5297.07ms 
iter 4226: loss 2.5037, time 5312.68ms 
iter 4227: loss 2.8625, time 5309.35ms 
iter 4228: loss 2.7568, time 5305.94ms 
iter 4229: loss 2.3705, time 5306.22ms 
iter 4230: loss 2.7064, time 5264.05ms 
iter 4231: loss 2.6260, time 5292.64ms 
iter 4232: loss 2.6280, time 5301.88ms 
iter 4233: loss 2.2947, time 5294.95ms 
iter 4234: loss 2.5360, time 5314.59ms 
iter 4235: loss 2.6587, time 5263.68ms 
iter 4236: loss 2.5211, time 5293.43ms 
iter 4237: loss 2.6537, time 5269.86ms 
iter 4238: loss 2.6109, time 5291.67ms 
iter 4239: loss 2.7174, time 5308.13ms 
iter 4240: loss 2.6254, time 5316.82ms 
iter 4241: loss 2.6010, time 5318.11ms 
iter 4242: loss 2.8733, time 5314.89ms 
iter 4243: loss 2.5244, time 5310.91ms 
iter 4244: loss 2.7446, time 5296.43ms 
iter 4245: loss 2.7338, time 5099.64ms 
iter 4246: loss 2.5732, time 5073.88ms 
iter 4247: loss 2.5548, time 5065.22ms 
iter 4248: loss 2.9047, time 5204.77ms 
iter 4249: loss 2.6687, time 5305.24ms 
step 4250: train loss 2.6324, val loss 2.8417
iter 4250: loss 2.6222, time 20189.89ms 
iter 4251: loss 2.6571, time 5303.01ms 
iter 4252: loss 2.5993, time 5313.35ms 
iter 4253: loss 2.4502, time 5308.06ms 
iter 4254: loss 2.8127, time 5297.11ms 
iter 4255: loss 2.6853, time 5296.79ms 
iter 4256: loss 2.7155, time 5301.09ms 
iter 4257: loss 2.6292, time 5298.85ms 
iter 4258: loss 2.6448, time 5290.69ms 
iter 4259: loss 2.6226, time 5302.54ms 
iter 4260: loss 2.5841, time 5306.64ms 
iter 4261: loss 2.6873, time 5213.69ms 
iter 4262: loss 2.6788, time 5071.36ms 
iter 4263: loss 2.7567, time 5068.37ms 
iter 4264: loss 2.4987, time 5073.85ms 
iter 4265: loss 2.7110, time 5089.36ms 
iter 4266: loss 2.6398, time 5271.46ms 
iter 4267: loss 2.5375, time 5307.42ms 
iter 4268: loss 2.6510, time 5313.77ms 
iter 4269: loss 2.8697, time 5072.83ms 
iter 4270: loss 2.7582, time 5063.27ms 
iter 4271: loss 2.5177, time 5086.86ms 
iter 4272: loss 2.6216, time 5069.95ms 
iter 4273: loss 2.7351, time 5062.97ms 
iter 4274: loss 2.8562, time 5062.42ms 
iter 4275: loss 2.7894, time 5221.33ms 
iter 4276: loss 2.6387, time 5316.32ms 
iter 4277: loss 2.7538, time 5130.71ms 
iter 4278: loss 2.5035, time 5076.01ms 
iter 4279: loss 3.0547, time 5108.96ms 
iter 4280: loss 2.6994, time 5327.28ms 
iter 4281: loss 2.6524, time 5306.65ms 
iter 4282: loss 2.4668, time 5318.77ms 
iter 4283: loss 2.5657, time 5166.19ms 
iter 4284: loss 2.5445, time 5068.16ms 
iter 4285: loss 2.7830, time 5072.11ms 
iter 4286: loss 2.6612, time 5197.17ms 
iter 4287: loss 2.6553, time 5324.93ms 
iter 4288: loss 2.5710, time 5313.16ms 
iter 4289: loss 2.3538, time 5299.00ms 
iter 4290: loss 2.7848, time 5305.52ms 
iter 4291: loss 2.6670, time 5309.49ms 
iter 4292: loss 2.6725, time 5300.02ms 
iter 4293: loss 2.5532, time 5263.82ms 
iter 4294: loss 2.5312, time 5271.04ms 
iter 4295: loss 2.3439, time 5263.44ms 
iter 4296: loss 2.7644, time 5306.26ms 
iter 4297: loss 2.6293, time 5310.23ms 
iter 4298: loss 2.7156, time 5302.41ms 
iter 4299: loss 2.5308, time 5320.57ms 
step 4300: train loss 2.6238, val loss 2.8431
iter 4300: loss 2.7672, time 20236.66ms 
iter 4301: loss 2.7316, time 5305.16ms 
iter 4302: loss 2.6978, time 5304.46ms 
iter 4303: loss 2.5848, time 5299.81ms 
iter 4304: loss 2.6242, time 5306.25ms 
iter 4305: loss 2.8043, time 5305.00ms 
iter 4306: loss 2.5439, time 5327.49ms 
iter 4307: loss 2.6419, time 5305.95ms 
iter 4308: loss 2.9124, time 5298.18ms 
iter 4309: loss 2.5905, time 5302.74ms 
iter 4310: loss 2.7814, time 5346.92ms 
iter 4311: loss 2.6794, time 5304.75ms 
iter 4312: loss 2.6470, time 5309.11ms 
iter 4313: loss 2.4980, time 5298.94ms 
iter 4314: loss 2.6831, time 5318.70ms 
iter 4315: loss 2.4855, time 5320.34ms 
iter 4316: loss 2.5843, time 5228.47ms 
iter 4317: loss 2.6850, time 5069.78ms 
iter 4318: loss 2.5604, time 5047.23ms 
iter 4319: loss 2.7658, time 5047.33ms 
iter 4320: loss 2.5574, time 5047.57ms 
iter 4321: loss 2.7398, time 5057.49ms 
iter 4322: loss 2.5612, time 5193.14ms 
iter 4323: loss 2.6430, time 5201.18ms 
iter 4324: loss 2.5677, time 5298.92ms 
iter 4325: loss 2.6338, time 5287.50ms 
iter 4326: loss 2.6436, time 5200.71ms 
iter 4327: loss 2.4690, time 5158.77ms 
iter 4328: loss 2.7751, time 5281.58ms 
iter 4329: loss 2.6109, time 5260.38ms 
iter 4330: loss 2.6191, time 5299.83ms 
iter 4331: loss 2.5917, time 5255.56ms 
iter 4332: loss 2.5022, time 5096.67ms 
iter 4333: loss 2.5520, time 5002.44ms 
iter 4334: loss 2.7250, time 5052.67ms 
iter 4335: loss 2.5726, time 5143.01ms 
iter 4336: loss 2.9161, time 5023.71ms 
iter 4337: loss 2.7328, time 5117.27ms 
iter 4338: loss 2.5436, time 5302.60ms 
iter 4339: loss 2.6832, time 5297.66ms 
iter 4340: loss 2.4421, time 5304.34ms 
iter 4341: loss 2.7206, time 5316.06ms 
iter 4342: loss 2.7668, time 5311.82ms 
iter 4343: loss 2.6188, time 5312.05ms 
iter 4344: loss 2.9153, time 5167.48ms 
iter 4345: loss 2.8653, time 5122.27ms 
iter 4346: loss 2.6601, time 5090.80ms 
iter 4347: loss 2.5413, time 5027.19ms 
iter 4348: loss 2.5721, time 4978.41ms 
iter 4349: loss 2.6454, time 4975.88ms 
step 4350: train loss 2.6300, val loss 2.8402
iter 4350: loss 2.6456, time 20068.54ms 
iter 4351: loss 2.6478, time 5307.76ms 
iter 4352: loss 2.6886, time 5308.37ms 
iter 4353: loss 2.7416, time 5308.64ms 
iter 4354: loss 2.7401, time 5304.34ms 
iter 4355: loss 2.6005, time 5306.36ms 
iter 4356: loss 2.6118, time 5322.16ms 
iter 4357: loss 2.7505, time 5319.16ms 
iter 4358: loss 2.7120, time 5097.72ms 
iter 4359: loss 2.6238, time 5199.49ms 
iter 4360: loss 2.5791, time 5309.72ms 
iter 4361: loss 2.8932, time 5270.78ms 
iter 4362: loss 2.4572, time 5270.20ms 
iter 4363: loss 2.5215, time 5311.41ms 
iter 4364: loss 2.7908, time 5307.22ms 
iter 4365: loss 2.4803, time 5315.37ms 
iter 4366: loss 2.6428, time 5321.01ms 
iter 4367: loss 2.6308, time 5310.89ms 
iter 4368: loss 2.4692, time 5339.44ms 
iter 4369: loss 2.5636, time 5296.80ms 
iter 4370: loss 2.4404, time 5302.98ms 
iter 4371: loss 2.5957, time 5316.25ms 
iter 4372: loss 2.6325, time 5309.77ms 
iter 4373: loss 2.6865, time 5296.63ms 
iter 4374: loss 2.5356, time 5304.46ms 
iter 4375: loss 2.5151, time 5299.65ms 
iter 4376: loss 2.6579, time 5301.56ms 
iter 4377: loss 2.5657, time 5271.15ms 
iter 4378: loss 2.6487, time 5305.58ms 
iter 4379: loss 2.5944, time 5290.34ms 
iter 4380: loss 2.6392, time 5293.09ms 
iter 4381: loss 2.4413, time 5289.49ms 
iter 4382: loss 2.5329, time 5296.28ms 
iter 4383: loss 2.7562, time 5300.28ms 
iter 4384: loss 2.6411, time 5282.93ms 
iter 4385: loss 2.6482, time 5235.48ms 
iter 4386: loss 2.5892, time 5186.41ms 
iter 4387: loss 2.6048, time 5279.34ms 
iter 4388: loss 2.7084, time 5313.83ms 
iter 4389: loss 2.4769, time 5307.96ms 
iter 4390: loss 2.5925, time 5303.16ms 
iter 4391: loss 2.6987, time 5299.22ms 
iter 4392: loss 2.5693, time 5304.58ms 
iter 4393: loss 2.6801, time 5313.62ms 
iter 4394: loss 2.6949, time 5318.44ms 
iter 4395: loss 2.7140, time 5308.68ms 
iter 4396: loss 2.7304, time 5319.45ms 
iter 4397: loss 2.6787, time 5312.37ms 
iter 4398: loss 2.4579, time 5313.13ms 
iter 4399: loss 2.7263, time 5350.89ms 
step 4400: train loss 2.6164, val loss 2.8428
iter 4400: loss 2.5661, time 20031.61ms 
iter 4401: loss 2.4394, time 5308.59ms 
iter 4402: loss 2.7870, time 5302.65ms 
iter 4403: loss 2.8543, time 5305.94ms 
iter 4404: loss 2.7306, time 5306.22ms 
iter 4405: loss 2.9033, time 5224.62ms 
iter 4406: loss 2.5112, time 5307.85ms 
iter 4407: loss 2.7236, time 5308.83ms 
iter 4408: loss 2.6102, time 5298.26ms 
iter 4409: loss 2.5054, time 5297.83ms 
iter 4410: loss 2.7607, time 5299.54ms 
iter 4411: loss 2.7423, time 5298.00ms 
iter 4412: loss 2.4747, time 5237.10ms 
iter 4413: loss 2.5745, time 5082.80ms 
iter 4414: loss 2.3356, time 5265.90ms 
iter 4415: loss 2.5343, time 5192.91ms 
iter 4416: loss 2.7170, time 5132.33ms 
iter 4417: loss 2.6722, time 5104.48ms 
iter 4418: loss 2.7462, time 5109.44ms 
iter 4419: loss 2.5477, time 5098.42ms 
iter 4420: loss 2.6331, time 5120.92ms 
iter 4421: loss 2.5638, time 5304.77ms 
iter 4422: loss 2.6194, time 5311.92ms 
iter 4423: loss 2.7556, time 5320.60ms 
iter 4424: loss 2.4874, time 5315.04ms 
iter 4425: loss 2.5646, time 5267.12ms 
iter 4426: loss 2.7204, time 5305.18ms 
iter 4427: loss 2.8219, time 5301.78ms 
iter 4428: loss 2.4587, time 5295.22ms 
iter 4429: loss 2.6203, time 5308.68ms 
iter 4430: loss 2.7778, time 5305.87ms 
iter 4431: loss 2.5988, time 5302.00ms 
iter 4432: loss 2.4533, time 5297.03ms 
iter 4433: loss 2.5788, time 5304.77ms 
iter 4434: loss 2.5754, time 5321.75ms 
iter 4435: loss 2.5910, time 5313.92ms 
iter 4436: loss 2.4930, time 5290.96ms 
iter 4437: loss 2.5935, time 5305.35ms 
iter 4438: loss 2.6565, time 5307.92ms 
iter 4439: loss 2.5086, time 5309.25ms 
iter 4440: loss 2.6515, time 5316.52ms 
iter 4441: loss 2.6190, time 5315.77ms 
iter 4442: loss 2.6168, time 5302.47ms 
iter 4443: loss 2.5000, time 5298.06ms 
iter 4444: loss 2.6426, time 5298.48ms 
iter 4445: loss 2.6462, time 5307.90ms 
iter 4446: loss 2.7935, time 5313.64ms 
iter 4447: loss 2.5798, time 5314.30ms 
iter 4448: loss 2.7419, time 5310.58ms 
iter 4449: loss 2.7245, time 5306.22ms 
step 4450: train loss 2.6220, val loss 2.8451
iter 4450: loss 2.6147, time 20175.88ms 
iter 4451: loss 2.5245, time 5300.82ms 
iter 4452: loss 2.5787, time 5292.80ms 
iter 4453: loss 2.4387, time 5289.14ms 
iter 4454: loss 2.8833, time 5308.09ms 
iter 4455: loss 2.6696, time 5299.64ms 
iter 4456: loss 2.6632, time 5233.82ms 
iter 4457: loss 2.5721, time 5208.46ms 
iter 4458: loss 2.3497, time 5218.64ms 
iter 4459: loss 2.6704, time 5243.72ms 
iter 4460: loss 2.5954, time 5295.99ms 
iter 4461: loss 2.6101, time 5292.30ms 
iter 4462: loss 2.4757, time 5303.96ms 
iter 4463: loss 2.7496, time 5303.04ms 
iter 4464: loss 2.6577, time 5277.74ms 
iter 4465: loss 2.4814, time 5282.56ms 
iter 4466: loss 2.5096, time 5288.66ms 
iter 4467: loss 2.7150, time 5289.18ms 
iter 4468: loss 2.5688, time 5303.35ms 
iter 4469: loss 2.5799, time 5313.37ms 
iter 4470: loss 2.4861, time 5285.91ms 
iter 4471: loss 2.4320, time 5296.64ms 
iter 4472: loss 2.5189, time 5292.20ms 
iter 4473: loss 2.6737, time 5303.44ms 
iter 4474: loss 2.4730, time 5309.45ms 
iter 4475: loss 2.4070, time 5309.52ms 
iter 4476: loss 2.6035, time 5284.10ms 
iter 4477: loss 2.5992, time 5287.07ms 
iter 4478: loss 2.6237, time 5284.77ms 
iter 4479: loss 2.7819, time 5295.30ms 
iter 4480: loss 2.6261, time 5298.02ms 
iter 4481: loss 2.6543, time 5283.54ms 
iter 4482: loss 2.7253, time 5158.22ms 
iter 4483: loss 2.6143, time 5257.27ms 
iter 4484: loss 2.7445, time 5264.49ms 
iter 4485: loss 2.2249, time 5274.57ms 
iter 4486: loss 2.7404, time 5295.09ms 
iter 4487: loss 2.6074, time 5304.88ms 
iter 4488: loss 2.4255, time 5300.31ms 
iter 4489: loss 2.7503, time 5302.43ms 
iter 4490: loss 2.7423, time 5297.78ms 
iter 4491: loss 2.4850, time 5308.15ms 
iter 4492: loss 2.6493, time 5300.66ms 
iter 4493: loss 2.6568, time 5292.11ms 
iter 4494: loss 2.6098, time 5301.15ms 
iter 4495: loss 2.6023, time 5296.91ms 
iter 4496: loss 2.5971, time 5298.95ms 
iter 4497: loss 2.5500, time 5291.12ms 
iter 4498: loss 2.2996, time 5301.98ms 
iter 4499: loss 2.4130, time 5298.84ms 
step 4500: train loss 2.6067, val loss 2.8288
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 4500: loss 2.6944, time 21308.90ms 
iter 4501: loss 2.5846, time 5287.62ms 
iter 4502: loss 2.4813, time 5293.93ms 
iter 4503: loss 2.4929, time 5294.21ms 
iter 4504: loss 2.6850, time 5289.01ms 
iter 4505: loss 2.7829, time 5297.34ms 
iter 4506: loss 2.6541, time 5299.60ms 
iter 4507: loss 2.5423, time 5291.49ms 
iter 4508: loss 2.5113, time 5302.32ms 
iter 4509: loss 2.6225, time 5249.33ms 
iter 4510: loss 2.5399, time 5252.56ms 
iter 4511: loss 2.4848, time 5268.00ms 
iter 4512: loss 2.7187, time 5294.39ms 
iter 4513: loss 2.5298, time 5289.16ms 
iter 4514: loss 2.2750, time 5285.02ms 
iter 4515: loss 2.6105, time 5289.99ms 
iter 4516: loss 2.5165, time 5288.37ms 
iter 4517: loss 2.5286, time 5304.75ms 
iter 4518: loss 2.5523, time 5281.36ms 
iter 4519: loss 2.6374, time 5287.78ms 
iter 4520: loss 2.5741, time 5290.49ms 
iter 4521: loss 2.6238, time 5302.12ms 
iter 4522: loss 2.4980, time 5296.76ms 
iter 4523: loss 2.5828, time 5307.91ms 
iter 4524: loss 2.5989, time 5307.34ms 
iter 4525: loss 2.7401, time 5289.96ms 
iter 4526: loss 2.5667, time 5269.16ms 
iter 4527: loss 2.6498, time 5306.89ms 
iter 4528: loss 2.5404, time 5301.24ms 
iter 4529: loss 2.6222, time 5299.66ms 
iter 4530: loss 2.6517, time 5314.91ms 
iter 4531: loss 2.6310, time 5286.47ms 
iter 4532: loss 2.6379, time 5285.22ms 
iter 4533: loss 2.6891, time 5292.58ms 
iter 4534: loss 2.5913, time 5311.57ms 
iter 4535: loss 2.5323, time 5299.80ms 
iter 4536: loss 2.6686, time 5289.82ms 
iter 4537: loss 2.7804, time 5274.64ms 
iter 4538: loss 2.6309, time 5309.26ms 
iter 4539: loss 2.5682, time 5295.51ms 
iter 4540: loss 2.6978, time 5290.85ms 
iter 4541: loss 2.8464, time 5266.47ms 
iter 4542: loss 2.7415, time 5289.75ms 
iter 4543: loss 2.3942, time 5295.50ms 
iter 4544: loss 2.6228, time 5289.85ms 
iter 4545: loss 2.7163, time 5298.08ms 
iter 4546: loss 2.6907, time 5231.02ms 
iter 4547: loss 2.5467, time 5274.06ms 
iter 4548: loss 2.7075, time 5089.95ms 
iter 4549: loss 2.5612, time 5072.16ms 
step 4550: train loss 2.5993, val loss 2.8310
iter 4550: loss 2.5836, time 20048.28ms 
iter 4551: loss 2.6079, time 5299.86ms 
iter 4552: loss 2.5793, time 5271.76ms 
iter 4553: loss 2.6316, time 5297.97ms 
iter 4554: loss 2.7476, time 5301.81ms 
iter 4555: loss 2.5656, time 5310.42ms 
iter 4556: loss 2.4303, time 5313.08ms 
iter 4557: loss 2.5618, time 5299.83ms 
iter 4558: loss 2.7122, time 5298.24ms 
iter 4559: loss 2.6732, time 5303.82ms 
iter 4560: loss 2.6948, time 5302.45ms 
iter 4561: loss 2.6854, time 5302.18ms 
iter 4562: loss 2.5478, time 5293.07ms 
iter 4563: loss 2.6687, time 5282.09ms 
iter 4564: loss 2.5627, time 5294.95ms 
iter 4565: loss 2.5466, time 5284.33ms 
iter 4566: loss 2.3949, time 5284.95ms 
iter 4567: loss 2.6823, time 5208.80ms 
iter 4568: loss 2.8472, time 5274.25ms 
iter 4569: loss 2.4939, time 5282.21ms 
iter 4570: loss 2.4990, time 5278.45ms 
iter 4571: loss 2.7074, time 5296.04ms 
iter 4572: loss 2.7488, time 5296.34ms 
iter 4573: loss 2.6146, time 5280.63ms 
iter 4574: loss 2.6070, time 5239.04ms 
iter 4575: loss 2.7182, time 5228.52ms 
iter 4576: loss 2.4270, time 5228.66ms 
iter 4577: loss 2.5111, time 5240.94ms 
iter 4578: loss 2.7305, time 5244.64ms 
iter 4579: loss 2.7819, time 5238.09ms 
iter 4580: loss 2.6457, time 5278.47ms 
iter 4581: loss 2.6393, time 5277.61ms 
iter 4582: loss 2.6500, time 5281.49ms 
iter 4583: loss 2.5151, time 5286.31ms 
iter 4584: loss 2.5456, time 5276.47ms 
iter 4585: loss 2.5059, time 5280.31ms 
iter 4586: loss 2.5992, time 5300.78ms 
iter 4587: loss 2.6687, time 5286.49ms 
iter 4588: loss 2.6018, time 5279.53ms 
iter 4589: loss 2.5302, time 5269.54ms 
iter 4590: loss 2.5665, time 5280.96ms 
iter 4591: loss 2.7592, time 5292.45ms 
iter 4592: loss 2.5165, time 5286.18ms 
iter 4593: loss 2.6293, time 5283.50ms 
iter 4594: loss 2.5166, time 5289.22ms 
iter 4595: loss 2.7767, time 5302.76ms 
iter 4596: loss 2.7943, time 5276.04ms 
iter 4597: loss 2.8133, time 5283.18ms 
iter 4598: loss 2.6821, time 5288.20ms 
iter 4599: loss 2.5899, time 5285.64ms 
step 4600: train loss 2.6100, val loss 2.8346
iter 4600: loss 2.5585, time 20110.05ms 
iter 4601: loss 2.5487, time 5293.08ms 
iter 4602: loss 2.5634, time 5285.57ms 
iter 4603: loss 2.4949, time 5286.61ms 
iter 4604: loss 2.6860, time 5276.90ms 
iter 4605: loss 2.7280, time 5280.18ms 
iter 4606: loss 2.7427, time 5282.20ms 
iter 4607: loss 2.5606, time 5289.26ms 
iter 4608: loss 2.6593, time 5292.44ms 
iter 4609: loss 2.7907, time 5284.51ms 
iter 4610: loss 2.4961, time 5283.23ms 
iter 4611: loss 2.4635, time 5279.26ms 
iter 4612: loss 2.6307, time 5289.01ms 
iter 4613: loss 2.8362, time 5288.17ms 
iter 4614: loss 2.5873, time 5278.61ms 
iter 4615: loss 2.5018, time 5278.58ms 
iter 4616: loss 2.7046, time 5280.34ms 
iter 4617: loss 2.9580, time 5284.39ms 
iter 4618: loss 2.7413, time 5285.75ms 
iter 4619: loss 2.5092, time 5290.80ms 
iter 4620: loss 2.6455, time 5284.00ms 
iter 4621: loss 2.4553, time 5294.22ms 
iter 4622: loss 2.5579, time 5291.36ms 
iter 4623: loss 2.8592, time 5298.31ms 
iter 4624: loss 2.5281, time 5261.98ms 
iter 4625: loss 2.6822, time 5289.34ms 
iter 4626: loss 2.7466, time 5282.51ms 
iter 4627: loss 2.5506, time 5281.53ms 
iter 4628: loss 2.2741, time 5277.66ms 
iter 4629: loss 2.7078, time 5287.26ms 
iter 4630: loss 2.6143, time 5288.56ms 
iter 4631: loss 2.6959, time 5283.98ms 
iter 4632: loss 2.5829, time 5278.65ms 
iter 4633: loss 2.5929, time 5286.13ms 
iter 4634: loss 2.6748, time 5290.69ms 
iter 4635: loss 2.4337, time 5282.59ms 
iter 4636: loss 2.9056, time 5287.98ms 
iter 4637: loss 2.5909, time 5274.66ms 
iter 4638: loss 2.3719, time 5281.04ms 
iter 4639: loss 2.7356, time 5287.56ms 
iter 4640: loss 2.7740, time 5294.14ms 
iter 4641: loss 2.3937, time 5278.85ms 
iter 4642: loss 2.4543, time 5284.17ms 
iter 4643: loss 2.6034, time 5285.63ms 
iter 4644: loss 2.6044, time 5278.59ms 
iter 4645: loss 2.6025, time 5292.12ms 
iter 4646: loss 2.4874, time 5294.52ms 
iter 4647: loss 2.4863, time 5295.74ms 
iter 4648: loss 2.5258, time 5279.68ms 
iter 4649: loss 2.6956, time 5285.75ms 
step 4650: train loss 2.6053, val loss 2.8276
iter 4650: loss 2.4580, time 20039.13ms 
iter 4651: loss 2.6088, time 5281.05ms 
iter 4652: loss 2.3086, time 5281.85ms 
iter 4653: loss 2.6835, time 5287.47ms 
iter 4654: loss 2.6388, time 5283.88ms 
iter 4655: loss 2.6469, time 5279.41ms 
iter 4656: loss 2.6199, time 5277.33ms 
iter 4657: loss 2.6376, time 5290.23ms 
iter 4658: loss 2.7061, time 5284.63ms 
iter 4659: loss 2.7455, time 5279.75ms 
iter 4660: loss 2.6473, time 5285.50ms 
iter 4661: loss 2.5806, time 5282.76ms 
iter 4662: loss 2.6769, time 5292.15ms 
iter 4663: loss 2.7604, time 5289.65ms 
iter 4664: loss 2.5562, time 5211.75ms 
iter 4665: loss 2.8039, time 5248.36ms 
iter 4666: loss 2.6351, time 5245.29ms 
iter 4667: loss 2.8299, time 5263.61ms 
iter 4668: loss 2.7661, time 4993.15ms 
iter 4669: loss 2.5218, time 5018.43ms 
iter 4670: loss 2.6504, time 5063.72ms 
iter 4671: loss 2.6891, time 5061.82ms 
iter 4672: loss 2.4863, time 5070.07ms 
iter 4673: loss 2.5974, time 5091.86ms 
iter 4674: loss 2.3062, time 5237.00ms 
iter 4675: loss 2.5045, time 5134.62ms 
iter 4676: loss 2.4268, time 5245.10ms 
iter 4677: loss 2.4505, time 5251.97ms 
iter 4678: loss 2.6409, time 5156.99ms 
iter 4679: loss 2.6427, time 5224.73ms 
iter 4680: loss 2.5206, time 5237.10ms 
iter 4681: loss 2.7662, time 5228.04ms 
iter 4682: loss 2.5039, time 5142.93ms 
iter 4683: loss 2.6619, time 5253.55ms 
iter 4684: loss 2.7982, time 5192.98ms 
iter 4685: loss 2.6555, time 5188.74ms 
iter 4686: loss 2.8253, time 5213.10ms 
iter 4687: loss 2.6706, time 5227.55ms 
iter 4688: loss 2.6928, time 5241.25ms 
iter 4689: loss 2.6774, time 5244.94ms 
iter 4690: loss 2.6670, time 5134.66ms 
iter 4691: loss 2.6807, time 5220.10ms 
iter 4692: loss 2.3663, time 5231.30ms 
iter 4693: loss 2.5783, time 5219.88ms 
iter 4694: loss 2.6004, time 5068.08ms 
iter 4695: loss 2.5437, time 5235.20ms 
iter 4696: loss 2.7365, time 5176.48ms 
iter 4697: loss 2.5823, time 5081.45ms 
iter 4698: loss 2.5512, time 5238.45ms 
iter 4699: loss 2.3831, time 5041.96ms 
step 4700: train loss 2.6089, val loss 2.8426
iter 4700: loss 2.6809, time 19754.01ms 
iter 4701: loss 2.7571, time 5199.99ms 
iter 4702: loss 2.6266, time 5267.79ms 
iter 4703: loss 2.6733, time 5266.37ms 
iter 4704: loss 2.6137, time 5248.94ms 
iter 4705: loss 2.5087, time 5178.66ms 
iter 4706: loss 2.5672, time 5254.30ms 
iter 4707: loss 2.6561, time 5209.82ms 
iter 4708: loss 2.5763, time 5172.12ms 
iter 4709: loss 2.7167, time 5226.08ms 
iter 4710: loss 2.5189, time 5245.11ms 
iter 4711: loss 2.6411, time 5249.64ms 
iter 4712: loss 2.4590, time 5254.41ms 
iter 4713: loss 2.4769, time 5236.64ms 
iter 4714: loss 2.6026, time 5191.96ms 
iter 4715: loss 2.6646, time 5258.10ms 
iter 4716: loss 2.6565, time 5266.72ms 
iter 4717: loss 2.8495, time 5256.49ms 
iter 4718: loss 2.4419, time 5156.76ms 
iter 4719: loss 2.6905, time 5140.14ms 
iter 4720: loss 2.4250, time 5079.92ms 
iter 4721: loss 2.5264, time 5210.77ms 
iter 4722: loss 2.6018, time 5255.12ms 
iter 4723: loss 2.6816, time 5233.24ms 
iter 4724: loss 2.7873, time 5249.27ms 
iter 4725: loss 2.6050, time 5234.99ms 
iter 4726: loss 2.4287, time 5252.70ms 
iter 4727: loss 2.4784, time 5228.45ms 
iter 4728: loss 2.8801, time 5213.01ms 
iter 4729: loss 2.5452, time 5227.20ms 
iter 4730: loss 2.3200, time 5250.60ms 
iter 4731: loss 2.4514, time 5173.64ms 
iter 4732: loss 2.5483, time 5180.74ms 
iter 4733: loss 2.5722, time 5118.10ms 
iter 4734: loss 2.6105, time 5213.51ms 
iter 4735: loss 2.7328, time 5257.46ms 
iter 4736: loss 2.5412, time 5238.80ms 
iter 4737: loss 2.4988, time 5197.58ms 
iter 4738: loss 2.6104, time 5229.29ms 
iter 4739: loss 2.6769, time 5084.21ms 
iter 4740: loss 2.3321, time 5103.98ms 
iter 4741: loss 2.5328, time 5241.33ms 
iter 4742: loss 2.4363, time 5118.81ms 
iter 4743: loss 2.6268, time 5072.01ms 
iter 4744: loss 2.7106, time 5049.91ms 
iter 4745: loss 2.6680, time 5215.68ms 
iter 4746: loss 2.5666, time 5122.68ms 
iter 4747: loss 2.5603, time 5117.47ms 
iter 4748: loss 2.6685, time 5203.46ms 
iter 4749: loss 2.6677, time 5199.36ms 
step 4750: train loss 2.6031, val loss 2.8386
iter 4750: loss 2.5336, time 20040.62ms 
iter 4751: loss 2.5274, time 5268.08ms 
iter 4752: loss 2.4426, time 5258.33ms 
iter 4753: loss 2.6951, time 5256.41ms 
iter 4754: loss 2.5212, time 5251.47ms 
iter 4755: loss 2.6493, time 5266.02ms 
iter 4756: loss 2.5546, time 5260.63ms 
iter 4757: loss 2.6437, time 5261.23ms 
iter 4758: loss 2.4932, time 5262.53ms 
iter 4759: loss 2.4195, time 5249.49ms 
iter 4760: loss 2.7433, time 5245.36ms 
iter 4761: loss 2.5609, time 5260.88ms 
iter 4762: loss 2.8175, time 5254.64ms 
iter 4763: loss 2.6530, time 5252.48ms 
iter 4764: loss 2.4940, time 5257.77ms 
iter 4765: loss 2.5666, time 5266.98ms 
iter 4766: loss 2.5280, time 5266.48ms 
iter 4767: loss 2.4100, time 5270.77ms 
iter 4768: loss 2.8025, time 5265.39ms 
iter 4769: loss 2.5334, time 5277.66ms 
iter 4770: loss 2.6939, time 5259.60ms 
iter 4771: loss 2.4835, time 5286.32ms 
iter 4772: loss 2.5887, time 5284.42ms 
iter 4773: loss 2.7646, time 5262.12ms 
iter 4774: loss 2.5835, time 5270.09ms 
iter 4775: loss 2.6780, time 5261.91ms 
iter 4776: loss 2.7688, time 5278.91ms 
iter 4777: loss 2.4931, time 5232.59ms 
iter 4778: loss 2.5195, time 5255.24ms 
iter 4779: loss 2.7061, time 5262.13ms 
iter 4780: loss 2.6232, time 5239.75ms 
iter 4781: loss 2.6578, time 5228.75ms 
iter 4782: loss 2.4348, time 5272.70ms 
iter 4783: loss 2.5421, time 5258.79ms 
iter 4784: loss 2.5131, time 5263.59ms 
iter 4785: loss 2.6040, time 5267.52ms 
iter 4786: loss 2.5721, time 5274.23ms 
iter 4787: loss 2.7100, time 5268.30ms 
iter 4788: loss 2.6438, time 5296.95ms 
iter 4789: loss 2.6000, time 5285.04ms 
iter 4790: loss 2.8238, time 5286.00ms 
iter 4791: loss 2.7951, time 5271.62ms 
iter 4792: loss 2.6456, time 5264.03ms 
iter 4793: loss 2.5335, time 5273.35ms 
iter 4794: loss 2.6376, time 5267.04ms 
iter 4795: loss 2.6733, time 5255.14ms 
iter 4796: loss 2.5882, time 5276.77ms 
iter 4797: loss 2.6068, time 5267.44ms 
iter 4798: loss 2.5585, time 5266.01ms 
iter 4799: loss 2.5918, time 5269.68ms 
step 4800: train loss 2.6052, val loss 2.8372
iter 4800: loss 2.6395, time 19971.81ms 
iter 4801: loss 2.7283, time 5264.42ms 
iter 4802: loss 2.6945, time 5103.73ms 
iter 4803: loss 2.5967, time 5016.53ms 
iter 4804: loss 2.5430, time 5283.11ms 
iter 4805: loss 2.6538, time 5298.70ms 
iter 4806: loss 2.6972, time 5259.76ms 
iter 4807: loss 2.7321, time 5259.38ms 
iter 4808: loss 2.5748, time 5272.21ms 
iter 4809: loss 2.5949, time 5289.09ms 
iter 4810: loss 2.6437, time 5137.48ms 
iter 4811: loss 2.7460, time 5039.60ms 
iter 4812: loss 2.4890, time 5105.44ms 
iter 4813: loss 2.5084, time 5018.24ms 
iter 4814: loss 2.5798, time 5024.37ms 
iter 4815: loss 2.4928, time 5024.08ms 
iter 4816: loss 2.8210, time 5008.96ms 
iter 4817: loss 2.8257, time 5175.97ms 
iter 4818: loss 2.4084, time 5201.12ms 
iter 4819: loss 2.4954, time 5064.66ms 
iter 4820: loss 2.6674, time 5032.23ms 
iter 4821: loss 2.7625, time 5065.28ms 
iter 4822: loss 2.6811, time 5081.23ms 
iter 4823: loss 2.6275, time 5267.40ms 
iter 4824: loss 2.3139, time 5078.46ms 
iter 4825: loss 2.5263, time 5160.60ms 
iter 4826: loss 2.6274, time 5056.01ms 
iter 4827: loss 2.6382, time 5268.64ms 
iter 4828: loss 2.5797, time 5275.17ms 
iter 4829: loss 2.5252, time 5137.14ms 
iter 4830: loss 2.3711, time 5236.90ms 
iter 4831: loss 2.6074, time 5127.92ms 
iter 4832: loss 2.4329, time 5257.82ms 
iter 4833: loss 2.6380, time 5134.67ms 
iter 4834: loss 2.6146, time 5287.17ms 
iter 4835: loss 2.7208, time 5274.80ms 
iter 4836: loss 2.6549, time 5263.46ms 
iter 4837: loss 2.6016, time 5265.44ms 
iter 4838: loss 2.6555, time 5165.07ms 
iter 4839: loss 2.5316, time 5248.06ms 
iter 4840: loss 2.5335, time 5302.42ms 
iter 4841: loss 2.7273, time 5299.89ms 
iter 4842: loss 2.7662, time 5288.89ms 
iter 4843: loss 2.6111, time 5311.36ms 
iter 4844: loss 2.6942, time 5314.68ms 
iter 4845: loss 2.5197, time 5274.91ms 
iter 4846: loss 2.6097, time 5265.85ms 
iter 4847: loss 2.6686, time 5281.08ms 
iter 4848: loss 2.6308, time 5271.42ms 
iter 4849: loss 2.7110, time 5192.15ms 
step 4850: train loss 2.5930, val loss 2.8458
iter 4850: loss 2.5322, time 19867.96ms 
iter 4851: loss 2.4864, time 5039.02ms 
iter 4852: loss 2.5970, time 5092.80ms 
iter 4853: loss 2.8092, time 5243.22ms 
iter 4854: loss 2.5885, time 5262.92ms 
iter 4855: loss 2.5021, time 5293.79ms 
iter 4856: loss 2.5009, time 5282.92ms 
iter 4857: loss 2.7582, time 5231.43ms 
iter 4858: loss 2.7080, time 5275.65ms 
iter 4859: loss 2.8317, time 5245.38ms 
iter 4860: loss 2.7291, time 5213.94ms 
iter 4861: loss 2.6272, time 5254.43ms 
iter 4862: loss 2.5876, time 5236.51ms 
iter 4863: loss 2.8357, time 5257.02ms 
iter 4864: loss 2.6366, time 5269.18ms 
iter 4865: loss 2.5509, time 5271.59ms 
iter 4866: loss 2.5798, time 5265.67ms 
iter 4867: loss 2.7204, time 5313.66ms 
iter 4868: loss 2.5940, time 5268.68ms 
iter 4869: loss 2.5323, time 5296.32ms 
iter 4870: loss 2.5386, time 5264.11ms 
iter 4871: loss 2.5247, time 5296.86ms 
iter 4872: loss 2.4944, time 5287.04ms 
iter 4873: loss 2.3680, time 5255.88ms 
iter 4874: loss 2.6327, time 5260.24ms 
iter 4875: loss 2.5426, time 5259.31ms 
iter 4876: loss 2.6484, time 5272.43ms 
iter 4877: loss 2.6231, time 5279.83ms 
iter 4878: loss 2.7356, time 5237.81ms 
iter 4879: loss 2.3932, time 5040.94ms 
iter 4880: loss 2.4352, time 5253.44ms 
iter 4881: loss 2.5583, time 5268.03ms 
iter 4882: loss 2.6325, time 5256.50ms 
iter 4883: loss 2.5781, time 5249.85ms 
iter 4884: loss 2.5776, time 5232.10ms 
iter 4885: loss 2.5090, time 5220.52ms 
iter 4886: loss 2.5935, time 5257.94ms 
iter 4887: loss 2.5602, time 5267.12ms 
iter 4888: loss 2.5430, time 5262.26ms 
iter 4889: loss 2.5015, time 5203.29ms 
iter 4890: loss 2.6023, time 5103.27ms 
iter 4891: loss 2.5019, time 5021.52ms 
iter 4892: loss 2.8764, time 5174.78ms 
iter 4893: loss 2.6164, time 5278.98ms 
iter 4894: loss 2.5713, time 5229.17ms 
iter 4895: loss 2.6399, time 5191.41ms 
iter 4896: loss 2.5347, time 5099.05ms 
iter 4897: loss 2.6865, time 5253.51ms 
iter 4898: loss 2.3102, time 5258.01ms 
iter 4899: loss 2.6376, time 5193.97ms 
step 4900: train loss 2.5862, val loss 2.8358
iter 4900: loss 2.6168, time 20002.13ms 
iter 4901: loss 2.7851, time 5231.40ms 
iter 4902: loss 2.6202, time 5189.16ms 
iter 4903: loss 2.4876, time 5181.78ms 
iter 4904: loss 2.4941, time 5226.11ms 
iter 4905: loss 2.5699, time 5231.13ms 
iter 4906: loss 2.7613, time 5245.19ms 
iter 4907: loss 2.6448, time 5261.50ms 
iter 4908: loss 2.5695, time 5249.64ms 
iter 4909: loss 2.5000, time 5277.84ms 
iter 4910: loss 2.6692, time 5106.97ms 
iter 4911: loss 2.6450, time 5041.44ms 
iter 4912: loss 2.7529, time 5035.17ms 
iter 4913: loss 2.5039, time 5131.42ms 
iter 4914: loss 2.4161, time 5048.77ms 
iter 4915: loss 2.7428, time 5251.51ms 
iter 4916: loss 2.4504, time 5249.08ms 
iter 4917: loss 2.7294, time 5256.04ms 
iter 4918: loss 2.9313, time 5242.99ms 
iter 4919: loss 2.8567, time 5275.02ms 
iter 4920: loss 2.5155, time 5100.74ms 
iter 4921: loss 2.6334, time 5050.68ms 
iter 4922: loss 2.7047, time 5053.95ms 
iter 4923: loss 2.5866, time 5123.34ms 
iter 4924: loss 2.4958, time 5036.95ms 
iter 4925: loss 2.7187, time 5049.57ms 
iter 4926: loss 2.5034, time 5091.20ms 
iter 4927: loss 2.4042, time 5274.82ms 
iter 4928: loss 2.7129, time 5193.62ms 
iter 4929: loss 2.5538, time 5264.74ms 
iter 4930: loss 2.2989, time 5253.99ms 
iter 4931: loss 2.5863, time 5120.31ms 
iter 4932: loss 2.5860, time 5188.44ms 
iter 4933: loss 2.5579, time 5034.31ms 
iter 4934: loss 2.5981, time 5049.03ms 
iter 4935: loss 2.8080, time 5047.03ms 
iter 4936: loss 2.5261, time 5040.59ms 
iter 4937: loss 2.4644, time 5040.37ms 
iter 4938: loss 2.7102, time 5019.17ms 
iter 4939: loss 2.6484, time 5013.70ms 
iter 4940: loss 2.8033, time 5017.51ms 
iter 4941: loss 2.5728, time 5044.05ms 
iter 4942: loss 2.5362, time 5013.91ms 
iter 4943: loss 2.5998, time 5063.53ms 
iter 4944: loss 2.5094, time 5039.63ms 
iter 4945: loss 2.6640, time 5018.70ms 
iter 4946: loss 2.8488, time 5039.41ms 
iter 4947: loss 2.5860, time 5019.86ms 
iter 4948: loss 2.7342, time 5042.92ms 
iter 4949: loss 2.6728, time 4988.06ms 
step 4950: train loss 2.5973, val loss 2.8420
iter 4950: loss 2.5867, time 19633.11ms 
iter 4951: loss 2.6999, time 5255.91ms 
iter 4952: loss 2.6139, time 5168.39ms 
iter 4953: loss 2.3311, time 5165.97ms 
iter 4954: loss 2.7450, time 5217.94ms 
iter 4955: loss 2.5544, time 5269.38ms 
iter 4956: loss 2.6148, time 5242.60ms 
iter 4957: loss 2.6755, time 5079.29ms 
iter 4958: loss 2.4728, time 5187.47ms 
iter 4959: loss 2.5372, time 5262.66ms 
iter 4960: loss 2.6251, time 5122.86ms 
iter 4961: loss 2.4161, time 5241.68ms 
iter 4962: loss 2.4927, time 5263.89ms 
iter 4963: loss 2.4909, time 5129.21ms 
iter 4964: loss 2.6614, time 5029.68ms 
iter 4965: loss 2.5003, time 5166.98ms 
iter 4966: loss 2.6245, time 5198.48ms 
iter 4967: loss 2.3779, time 5217.98ms 
iter 4968: loss 2.5572, time 5263.92ms 
iter 4969: loss 2.5105, time 5234.58ms 
iter 4970: loss 2.4770, time 5269.60ms 
iter 4971: loss 2.6986, time 5171.95ms 
iter 4972: loss 2.5221, time 5013.48ms 
iter 4973: loss 2.5477, time 5208.48ms 
iter 4974: loss 2.5440, time 5015.24ms 
iter 4975: loss 2.5342, time 5235.42ms 
iter 4976: loss 2.3975, time 5265.90ms 
iter 4977: loss 2.5248, time 5106.95ms 
iter 4978: loss 2.4441, time 5137.22ms 
iter 4979: loss 2.7412, time 5185.95ms 
iter 4980: loss 2.8425, time 5230.57ms 
iter 4981: loss 2.7220, time 5083.10ms 
iter 4982: loss 2.4314, time 5229.51ms 
iter 4983: loss 2.7466, time 5239.86ms 
iter 4984: loss 2.5293, time 5259.32ms 
iter 4985: loss 2.5530, time 5207.56ms 
iter 4986: loss 2.6359, time 5095.25ms 
iter 4987: loss 2.5508, time 5275.27ms 
iter 4988: loss 2.7087, time 5174.71ms 
iter 4989: loss 2.7442, time 5200.60ms 
iter 4990: loss 2.5915, time 5190.10ms 
iter 4991: loss 2.6969, time 5003.72ms 
iter 4992: loss 2.6262, time 5050.17ms 
iter 4993: loss 2.5680, time 5028.57ms 
iter 4994: loss 2.6178, time 5257.39ms 
iter 4995: loss 2.4798, time 5209.33ms 
iter 4996: loss 2.6611, time 5276.77ms 
iter 4997: loss 2.5646, time 5259.68ms 
iter 4998: loss 2.5764, time 4997.02ms 
iter 4999: loss 2.6654, time 5166.30ms 
step 5000: train loss 2.5749, val loss 2.8512
iter 5000: loss 2.5973, time 20035.46ms 
iter 5001: loss 2.7184, time 5200.73ms 
iter 5002: loss 2.6706, time 4964.34ms 
iter 5003: loss 2.5630, time 5056.46ms 
iter 5004: loss 2.5479, time 5154.32ms 
iter 5005: loss 2.5212, time 5097.37ms 
iter 5006: loss 2.4717, time 5109.80ms 
iter 5007: loss 2.4007, time 5256.34ms 
iter 5008: loss 2.5690, time 5203.26ms 
iter 5009: loss 2.5210, time 4968.18ms 
iter 5010: loss 2.3423, time 5020.95ms 
iter 5011: loss 2.5761, time 5099.42ms 
iter 5012: loss 2.4040, time 5258.37ms 
iter 5013: loss 2.4827, time 5080.62ms 
iter 5014: loss 2.4814, time 5039.82ms 
iter 5015: loss 2.7439, time 5033.38ms 
iter 5016: loss 2.8300, time 5038.03ms 
iter 5017: loss 2.6019, time 5033.75ms 
iter 5018: loss 2.2974, time 5090.73ms 
iter 5019: loss 2.6200, time 5090.28ms 
iter 5020: loss 2.7758, time 5135.48ms 
iter 5021: loss 2.5648, time 5258.04ms 
iter 5022: loss 2.6258, time 5242.23ms 
iter 5023: loss 2.6831, time 4986.95ms 
iter 5024: loss 2.7437, time 5222.37ms 
iter 5025: loss 2.5347, time 5189.62ms 
iter 5026: loss 2.4239, time 5248.48ms 
iter 5027: loss 2.4606, time 5218.10ms 
iter 5028: loss 2.5972, time 5251.72ms 
iter 5029: loss 2.3541, time 5250.93ms 
iter 5030: loss 2.5422, time 5176.88ms 
iter 5031: loss 2.5358, time 5190.12ms 
iter 5032: loss 2.4274, time 5203.28ms 
iter 5033: loss 2.6169, time 5193.64ms 
iter 5034: loss 2.6094, time 5244.78ms 
iter 5035: loss 2.6901, time 5270.65ms 
iter 5036: loss 2.6175, time 5244.93ms 
iter 5037: loss 2.5609, time 5149.06ms 
iter 5038: loss 2.6192, time 5237.80ms 
iter 5039: loss 2.4651, time 5250.10ms 
iter 5040: loss 2.4663, time 5156.63ms 
iter 5041: loss 2.7313, time 5198.91ms 
iter 5042: loss 2.4634, time 5235.51ms 
iter 5043: loss 2.5932, time 5213.36ms 
iter 5044: loss 2.5379, time 5010.48ms 
iter 5045: loss 2.7201, time 5158.94ms 
iter 5046: loss 2.5115, time 5238.58ms 
iter 5047: loss 2.5116, time 5136.43ms 
iter 5048: loss 2.3458, time 5251.04ms 
iter 5049: loss 2.6418, time 5285.81ms 
step 5050: train loss 2.5845, val loss 2.8500
iter 5050: loss 2.5805, time 19926.43ms 
iter 5051: loss 2.4356, time 5128.80ms 
iter 5052: loss 2.6021, time 5196.56ms 
iter 5053: loss 2.6504, time 5266.23ms 
iter 5054: loss 2.7211, time 5285.47ms 
iter 5055: loss 2.5219, time 5121.22ms 
iter 5056: loss 2.3441, time 5123.15ms 
iter 5057: loss 2.6037, time 5235.15ms 
iter 5058: loss 2.5338, time 5184.01ms 
iter 5059: loss 2.4171, time 5276.81ms 
iter 5060: loss 2.4580, time 5256.03ms 
iter 5061: loss 2.7611, time 5260.14ms 
iter 5062: loss 2.6825, time 5153.51ms 
iter 5063: loss 2.5330, time 5187.51ms 
iter 5064: loss 2.7589, time 5214.15ms 
iter 5065: loss 2.6833, time 5276.22ms 
iter 5066: loss 2.3652, time 5251.02ms 
iter 5067: loss 2.7469, time 5257.57ms 
iter 5068: loss 2.6626, time 5275.23ms 
iter 5069: loss 2.5819, time 5183.64ms 
iter 5070: loss 2.4279, time 5135.56ms 
iter 5071: loss 2.3295, time 5256.14ms 
iter 5072: loss 2.4757, time 5138.82ms 
iter 5073: loss 2.5265, time 5239.96ms 
iter 5074: loss 2.5530, time 5260.90ms 
iter 5075: loss 2.5369, time 5226.33ms 
iter 5076: loss 2.5623, time 5186.92ms 
iter 5077: loss 2.6614, time 5092.49ms 
iter 5078: loss 2.5279, time 5293.83ms 
iter 5079: loss 2.4581, time 5153.70ms 
iter 5080: loss 2.4103, time 5243.40ms 
iter 5081: loss 2.4106, time 5250.01ms 
iter 5082: loss 2.5869, time 5077.30ms 
iter 5083: loss 2.6840, time 5236.29ms 
iter 5084: loss 2.5257, time 5195.97ms 
iter 5085: loss 2.5327, time 5106.75ms 
iter 5086: loss 2.5255, time 5138.44ms 
iter 5087: loss 2.6913, time 5279.07ms 
iter 5088: loss 2.5091, time 5269.47ms 
iter 5089: loss 2.5742, time 5075.65ms 
iter 5090: loss 2.5791, time 5253.58ms 
iter 5091: loss 2.5879, time 5182.20ms 
iter 5092: loss 2.5565, time 5187.20ms 
iter 5093: loss 2.5943, time 5225.05ms 
iter 5094: loss 2.5903, time 5281.48ms 
iter 5095: loss 2.5443, time 5034.59ms 
iter 5096: loss 2.5556, time 4974.64ms 
iter 5097: loss 2.4478, time 5187.17ms 
iter 5098: loss 2.6730, time 5206.43ms 
iter 5099: loss 2.5337, time 5108.88ms 
step 5100: train loss 2.5838, val loss 2.8396
iter 5100: loss 2.7208, time 19731.17ms 
iter 5101: loss 2.6526, time 5144.22ms 
iter 5102: loss 2.5262, time 5167.70ms 
iter 5103: loss 2.5543, time 5236.57ms 
iter 5104: loss 2.5428, time 5267.43ms 
iter 5105: loss 2.6186, time 5279.29ms 
iter 5106: loss 2.6035, time 5268.30ms 
iter 5107: loss 2.5632, time 5210.37ms 
iter 5108: loss 2.5063, time 5120.12ms 
iter 5109: loss 2.4200, time 5243.70ms 
iter 5110: loss 2.4073, time 5136.14ms 
iter 5111: loss 2.5493, time 5229.15ms 
iter 5112: loss 2.8369, time 5235.03ms 
iter 5113: loss 2.6119, time 5258.29ms 
iter 5114: loss 2.7032, time 5150.07ms 
iter 5115: loss 2.4135, time 5046.66ms 
iter 5116: loss 2.7386, time 5173.68ms 
iter 5117: loss 2.6606, time 5159.22ms 
iter 5118: loss 2.6942, time 5253.95ms 
iter 5119: loss 2.6794, time 5268.82ms 
iter 5120: loss 2.7178, time 5272.55ms 
iter 5121: loss 2.5509, time 5234.61ms 
iter 5122: loss 2.5547, time 5149.01ms 
iter 5123: loss 2.7394, time 5277.69ms 
iter 5124: loss 2.5675, time 5179.31ms 
iter 5125: loss 2.5443, time 5270.20ms 
iter 5126: loss 2.6686, time 5264.78ms 
iter 5127: loss 2.5916, time 5130.06ms 
iter 5128: loss 2.4485, time 5259.01ms 
iter 5129: loss 2.6597, time 5236.87ms 
iter 5130: loss 2.5582, time 5307.31ms 
iter 5131: loss 2.4716, time 5252.75ms 
iter 5132: loss 2.5607, time 5287.16ms 
iter 5133: loss 2.8020, time 5300.65ms 
iter 5134: loss 2.3822, time 5125.28ms 
iter 5135: loss 2.2065, time 5192.15ms 
iter 5136: loss 2.7239, time 5214.94ms 
iter 5137: loss 2.6569, time 5158.42ms 
iter 5138: loss 2.4731, time 5217.83ms 
iter 5139: loss 2.3077, time 5262.92ms 
iter 5140: loss 2.7652, time 5271.31ms 
iter 5141: loss 2.6885, time 5099.16ms 
iter 5142: loss 2.5925, time 5000.98ms 
iter 5143: loss 2.6491, time 5116.85ms 
iter 5144: loss 2.6360, time 5097.39ms 
iter 5145: loss 2.6559, time 5200.37ms 
iter 5146: loss 2.7256, time 5254.75ms 
iter 5147: loss 2.5949, time 5178.62ms 
iter 5148: loss 2.5600, time 5071.33ms 
iter 5149: loss 2.5341, time 5272.83ms 
step 5150: train loss 2.5686, val loss 2.8430
iter 5150: loss 2.4300, time 19893.50ms 
iter 5151: loss 2.5918, time 5288.34ms 
iter 5152: loss 2.5943, time 5170.25ms 
iter 5153: loss 2.3383, time 5256.71ms 
iter 5154: loss 2.4646, time 5248.39ms 
iter 5155: loss 2.6954, time 5132.85ms 
iter 5156: loss 2.5360, time 5259.84ms 
iter 5157: loss 2.8255, time 5254.68ms 
iter 5158: loss 2.5611, time 5263.65ms 
iter 5159: loss 2.4294, time 5184.52ms 
iter 5160: loss 2.3929, time 5137.28ms 
iter 5161: loss 2.3801, time 5271.33ms 
iter 5162: loss 2.3693, time 5147.27ms 
iter 5163: loss 2.4751, time 5274.19ms 
iter 5164: loss 2.5190, time 5272.09ms 
iter 5165: loss 2.5379, time 5092.30ms 
iter 5166: loss 2.5516, time 5169.58ms 
iter 5167: loss 2.6132, time 5090.57ms 
iter 5168: loss 2.6543, time 5250.10ms 
iter 5169: loss 2.6519, time 5126.28ms 
iter 5170: loss 2.6893, time 5221.82ms 
iter 5171: loss 2.8452, time 5259.84ms 
iter 5172: loss 2.7327, time 5048.74ms 
iter 5173: loss 2.6515, time 5088.55ms 
iter 5174: loss 2.6908, time 5079.99ms 
iter 5175: loss 2.6755, time 5231.73ms 
iter 5176: loss 2.3999, time 5110.97ms 
iter 5177: loss 2.7177, time 5266.88ms 
iter 5178: loss 2.6083, time 5263.14ms 
iter 5179: loss 2.7256, time 5260.04ms 
iter 5180: loss 2.6125, time 5181.45ms 
iter 5181: loss 2.5987, time 5126.61ms 
iter 5182: loss 2.5959, time 5233.48ms 
iter 5183: loss 2.3814, time 5076.93ms 
iter 5184: loss 2.7395, time 5261.07ms 
iter 5185: loss 2.5135, time 5264.03ms 
iter 5186: loss 2.5981, time 5143.92ms 
iter 5187: loss 2.4010, time 5105.97ms 
iter 5188: loss 2.6114, time 5030.44ms 
iter 5189: loss 2.5923, time 5244.74ms 
iter 5190: loss 2.6259, time 5138.42ms 
iter 5191: loss 2.7596, time 5274.74ms 
iter 5192: loss 2.5974, time 5261.77ms 
iter 5193: loss 2.3681, time 5169.90ms 
iter 5194: loss 2.6011, time 5167.83ms 
iter 5195: loss 2.5569, time 5131.09ms 
iter 5196: loss 2.8395, time 5269.23ms 
iter 5197: loss 2.5529, time 5141.79ms 
iter 5198: loss 2.5685, time 5237.01ms 
iter 5199: loss 2.5025, time 5251.85ms 
step 5200: train loss 2.5794, val loss 2.8215
iter 5200: loss 2.5901, time 19983.38ms 
iter 5201: loss 2.6848, time 5182.86ms 
iter 5202: loss 2.6981, time 5147.34ms 
iter 5203: loss 2.7044, time 5286.64ms 
iter 5204: loss 2.3253, time 5263.87ms 
iter 5205: loss 2.4521, time 5229.27ms 
iter 5206: loss 2.4982, time 5264.42ms 
iter 5207: loss 2.5292, time 5189.94ms 
iter 5208: loss 2.5434, time 5263.58ms 
iter 5209: loss 2.6794, time 5205.82ms 
iter 5210: loss 2.7480, time 5256.22ms 
iter 5211: loss 2.4632, time 5192.33ms 
iter 5212: loss 2.8204, time 5079.79ms 
iter 5213: loss 2.4105, time 5086.11ms 
iter 5214: loss 2.5422, time 5082.98ms 
iter 5215: loss 2.3753, time 5266.05ms 
iter 5216: loss 2.5404, time 5188.44ms 
iter 5217: loss 2.4311, time 5262.76ms 
iter 5218: loss 2.6192, time 5282.03ms 
iter 5219: loss 2.4095, time 5150.95ms 
iter 5220: loss 2.6422, time 5299.48ms 
iter 5221: loss 2.7689, time 5225.32ms 
iter 5222: loss 2.7892, time 5285.00ms 
iter 5223: loss 2.5001, time 5214.79ms 
iter 5224: loss 2.6830, time 5263.15ms 
iter 5225: loss 2.6425, time 5275.81ms 
iter 5226: loss 2.5031, time 5123.69ms 
iter 5227: loss 2.6443, time 5293.32ms 
iter 5228: loss 2.8132, time 5166.17ms 
iter 5229: loss 2.5189, time 5279.70ms 
iter 5230: loss 2.5641, time 5218.61ms 
iter 5231: loss 2.6097, time 5264.54ms 
iter 5232: loss 2.6557, time 5277.46ms 
iter 5233: loss 2.5113, time 5135.60ms 
iter 5234: loss 2.6062, time 5258.60ms 
iter 5235: loss 2.2881, time 5227.43ms 
iter 5236: loss 2.6585, time 5236.24ms 
iter 5237: loss 2.6103, time 5217.04ms 
iter 5238: loss 2.5499, time 5312.35ms 
iter 5239: loss 2.6960, time 5351.23ms 
iter 5240: loss 2.5713, time 5153.75ms 
iter 5241: loss 2.6004, time 5206.18ms 
iter 5242: loss 2.5973, time 5189.35ms 
iter 5243: loss 2.6834, time 5264.29ms 
iter 5244: loss 2.3451, time 5188.80ms 
iter 5245: loss 2.5780, time 5042.82ms 
iter 5246: loss 2.5837, time 5041.68ms 
iter 5247: loss 2.5927, time 5012.72ms 
iter 5248: loss 2.6318, time 5033.94ms 
iter 5249: loss 2.4456, time 5024.95ms 
step 5250: train loss 2.5737, val loss 2.8323
iter 5250: loss 2.7579, time 19952.95ms 
iter 5251: loss 2.5967, time 5071.06ms 
iter 5252: loss 2.5572, time 5174.27ms 
iter 5253: loss 2.5684, time 5098.34ms 
iter 5254: loss 2.7830, time 5272.93ms 
iter 5255: loss 2.4640, time 5173.28ms 
iter 5256: loss 2.4319, time 5293.43ms 
iter 5257: loss 2.5811, time 5296.71ms 
iter 5258: loss 2.8876, time 5308.25ms 
iter 5259: loss 2.5736, time 5261.78ms 
iter 5260: loss 2.5776, time 5162.51ms 
iter 5261: loss 2.6026, time 5298.71ms 
iter 5262: loss 2.4677, time 5193.02ms 
iter 5263: loss 2.6547, time 5251.73ms 
iter 5264: loss 2.6542, time 5243.45ms 
iter 5265: loss 2.5874, time 5203.83ms 
iter 5266: loss 2.5952, time 5225.52ms 
iter 5267: loss 2.6191, time 5252.28ms 
iter 5268: loss 2.3759, time 5270.67ms 
iter 5269: loss 2.6240, time 5164.69ms 
iter 5270: loss 2.4115, time 5245.97ms 
iter 5271: loss 2.4442, time 5254.08ms 
iter 5272: loss 2.5593, time 5044.69ms 
iter 5273: loss 2.4711, time 5214.92ms 
iter 5274: loss 2.9295, time 5192.83ms 
iter 5275: loss 2.5662, time 5260.58ms 
iter 5276: loss 2.6008, time 5187.30ms 
iter 5277: loss 2.4855, time 5259.45ms 
iter 5278: loss 2.5399, time 5260.15ms 
iter 5279: loss 2.4597, time 5260.56ms 
iter 5280: loss 2.7416, time 5189.19ms 
iter 5281: loss 2.5706, time 5084.93ms 
iter 5282: loss 2.6197, time 5289.52ms 
iter 5283: loss 2.6177, time 5223.09ms 
iter 5284: loss 2.6423, time 5244.31ms 
iter 5285: loss 2.6433, time 5259.93ms 
iter 5286: loss 2.8212, time 5004.31ms 
iter 5287: loss 2.5102, time 5042.60ms 
iter 5288: loss 2.7260, time 4978.62ms 
iter 5289: loss 2.6252, time 5175.80ms 
iter 5290: loss 2.7982, time 5026.51ms 
iter 5291: loss 2.6107, time 5273.50ms 
iter 5292: loss 2.4975, time 5265.19ms 
iter 5293: loss 2.6009, time 5277.53ms 
iter 5294: loss 2.6118, time 5202.82ms 
iter 5295: loss 2.6300, time 5046.55ms 
iter 5296: loss 2.6658, time 5237.75ms 
iter 5297: loss 2.4545, time 5138.58ms 
iter 5298: loss 2.4303, time 5243.02ms 
iter 5299: loss 2.7326, time 5256.71ms 
step 5300: train loss 2.5801, val loss 2.8535
iter 5300: loss 2.4875, time 19999.84ms 
iter 5301: loss 2.6131, time 5251.41ms 
iter 5302: loss 2.5398, time 5267.91ms 
iter 5303: loss 2.5763, time 5163.28ms 
iter 5304: loss 2.6294, time 5262.41ms 
iter 5305: loss 2.6867, time 5254.18ms 
iter 5306: loss 2.2851, time 5273.58ms 
iter 5307: loss 2.3325, time 5191.74ms 
iter 5308: loss 2.6424, time 5152.30ms 
iter 5309: loss 2.6475, time 5044.67ms 
iter 5310: loss 2.4096, time 4974.23ms 
iter 5311: loss 2.6238, time 5262.52ms 
iter 5312: loss 2.6498, time 5179.72ms 
iter 5313: loss 2.5900, time 5274.69ms 
iter 5314: loss 2.7428, time 5189.40ms 
iter 5315: loss 2.4661, time 5141.32ms 
iter 5316: loss 2.5781, time 5273.03ms 
iter 5317: loss 2.6686, time 5183.57ms 
iter 5318: loss 2.7825, time 5203.42ms 
iter 5319: loss 2.4492, time 5213.25ms 
iter 5320: loss 2.4948, time 5188.06ms 
iter 5321: loss 2.5268, time 5270.85ms 
iter 5322: loss 2.5325, time 5200.92ms 
iter 5323: loss 2.4559, time 5158.46ms 
iter 5324: loss 2.5740, time 5153.97ms 
iter 5325: loss 2.5524, time 5163.35ms 
iter 5326: loss 2.4937, time 5273.17ms 
iter 5327: loss 2.4031, time 5219.97ms 
iter 5328: loss 2.6877, time 5202.19ms 
iter 5329: loss 2.5558, time 5126.92ms 
iter 5330: loss 2.5728, time 5117.29ms 
iter 5331: loss 2.4709, time 5279.44ms 
iter 5332: loss 2.4782, time 5235.64ms 
iter 5333: loss 2.5465, time 5285.62ms 
iter 5334: loss 2.5204, time 5096.06ms 
iter 5335: loss 2.4047, time 5217.08ms 
iter 5336: loss 2.6225, time 5296.49ms 
iter 5337: loss 2.5973, time 5186.62ms 
iter 5338: loss 2.6817, time 5044.08ms 
iter 5339: loss 2.2420, time 5100.98ms 
iter 5340: loss 2.6597, time 5202.58ms 
iter 5341: loss 2.7675, time 5193.95ms 
iter 5342: loss 2.5927, time 5250.61ms 
iter 5343: loss 2.6835, time 5131.05ms 
iter 5344: loss 2.4456, time 5178.17ms 
iter 5345: loss 2.7638, time 5197.32ms 
iter 5346: loss 2.6932, time 5200.45ms 
iter 5347: loss 2.6379, time 5253.88ms 
iter 5348: loss 2.5646, time 5090.17ms 
iter 5349: loss 2.5137, time 5121.96ms 
step 5350: train loss 2.5723, val loss 2.8401
iter 5350: loss 2.4412, time 19810.14ms 
iter 5351: loss 2.6344, time 4997.10ms 
iter 5352: loss 2.7218, time 5072.71ms 
iter 5353: loss 2.5933, time 5137.14ms 
iter 5354: loss 2.5548, time 5181.72ms 
iter 5355: loss 2.7886, time 5271.00ms 
iter 5356: loss 2.5770, time 5254.63ms 
iter 5357: loss 2.7382, time 5184.01ms 
iter 5358: loss 2.6388, time 5261.35ms 
iter 5359: loss 2.6075, time 5205.86ms 
iter 5360: loss 2.8459, time 5277.32ms 
iter 5361: loss 2.7523, time 5254.82ms 
iter 5362: loss 2.5015, time 5200.05ms 
iter 5363: loss 2.7847, time 5125.86ms 
iter 5364: loss 2.7213, time 5228.97ms 
iter 5365: loss 2.6753, time 5226.61ms 
iter 5366: loss 2.6025, time 5260.91ms 
iter 5367: loss 2.5540, time 5233.27ms 
iter 5368: loss 2.5397, time 5207.00ms 
iter 5369: loss 2.6382, time 5256.07ms 
iter 5370: loss 2.5483, time 5122.84ms 
iter 5371: loss 2.6364, time 5125.44ms 
iter 5372: loss 2.2554, time 5248.81ms 
iter 5373: loss 2.4956, time 5183.50ms 
iter 5374: loss 2.5316, time 5233.55ms 
iter 5375: loss 2.4645, time 5042.90ms 
iter 5376: loss 2.6879, time 5256.14ms 
iter 5377: loss 2.7167, time 5223.85ms 
iter 5378: loss 2.7097, time 5201.25ms 
iter 5379: loss 2.5800, time 5219.91ms 
iter 5380: loss 2.4168, time 5024.88ms 
iter 5381: loss 2.5101, time 5097.57ms 
iter 5382: loss 2.6732, time 5213.49ms 
iter 5383: loss 2.5596, time 5212.23ms 
iter 5384: loss 2.4530, time 5239.35ms 
iter 5385: loss 2.6638, time 5025.97ms 
iter 5386: loss 2.4269, time 5297.95ms 
iter 5387: loss 2.5798, time 5241.59ms 
iter 5388: loss 2.6356, time 5243.90ms 
iter 5389: loss 2.3116, time 5249.12ms 
iter 5390: loss 2.5322, time 5216.88ms 
iter 5391: loss 2.7463, time 5267.80ms 
iter 5392: loss 2.6655, time 5237.59ms 
iter 5393: loss 2.7756, time 5246.28ms 
iter 5394: loss 2.3454, time 5277.57ms 
iter 5395: loss 2.7892, time 5150.20ms 
iter 5396: loss 2.7459, time 5091.83ms 
iter 5397: loss 2.6312, time 5197.68ms 
iter 5398: loss 2.5933, time 5045.26ms 
iter 5399: loss 2.2901, time 5250.21ms 
step 5400: train loss 2.5872, val loss 2.8405
iter 5400: loss 2.6211, time 19839.31ms 
iter 5401: loss 2.6110, time 5135.36ms 
iter 5402: loss 2.6134, time 4971.77ms 
iter 5403: loss 2.4587, time 4998.43ms 
iter 5404: loss 2.6431, time 5077.50ms 
iter 5405: loss 2.5640, time 5056.42ms 
iter 5406: loss 2.6323, time 5121.11ms 
iter 5407: loss 2.5726, time 4992.56ms 
iter 5408: loss 2.6414, time 5022.43ms 
iter 5409: loss 2.6301, time 5106.82ms 
iter 5410: loss 2.7161, time 5256.77ms 
iter 5411: loss 2.5431, time 5190.43ms 
iter 5412: loss 2.6471, time 5258.38ms 
iter 5413: loss 2.6133, time 5123.02ms 
iter 5414: loss 2.4336, time 5137.07ms 
iter 5415: loss 2.3452, time 5097.64ms 
iter 5416: loss 2.5965, time 5119.56ms 
iter 5417: loss 2.5968, time 5260.96ms 
iter 5418: loss 2.3113, time 5129.27ms 
iter 5419: loss 2.5404, time 5113.37ms 
iter 5420: loss 2.6583, time 5297.24ms 
iter 5421: loss 2.4318, time 5175.40ms 
iter 5422: loss 2.7281, time 5270.28ms 
iter 5423: loss 2.6413, time 5185.72ms 
iter 5424: loss 2.7400, time 5166.40ms 
iter 5425: loss 2.5270, time 5288.17ms 
iter 5426: loss 2.5079, time 5216.37ms 
iter 5427: loss 2.4289, time 5151.43ms 
iter 5428: loss 2.7681, time 4998.45ms 
iter 5429: loss 2.5376, time 5049.97ms 
iter 5430: loss 2.8017, time 5067.47ms 
iter 5431: loss 2.7877, time 5162.43ms 
iter 5432: loss 2.5096, time 5063.29ms 
iter 5433: loss 2.5321, time 5041.07ms 
iter 5434: loss 2.5673, time 5072.55ms 
iter 5435: loss 2.6055, time 5073.20ms 
iter 5436: loss 2.4935, time 5089.70ms 
iter 5437: loss 2.5808, time 5038.46ms 
iter 5438: loss 2.5722, time 5094.85ms 
iter 5439: loss 2.5387, time 5168.80ms 
iter 5440: loss 2.5567, time 5269.34ms 
iter 5441: loss 2.5976, time 5250.55ms 
iter 5442: loss 2.5780, time 5176.09ms 
iter 5443: loss 2.3224, time 5227.27ms 
iter 5444: loss 2.6812, time 5192.34ms 
iter 5445: loss 2.6097, time 5282.77ms 
iter 5446: loss 2.5242, time 5219.61ms 
iter 5447: loss 2.4811, time 5134.70ms 
iter 5448: loss 2.6102, time 5063.43ms 
iter 5449: loss 2.5671, time 5064.28ms 
step 5450: train loss 2.5727, val loss 2.8237
iter 5450: loss 2.5695, time 19703.39ms 
iter 5451: loss 2.5708, time 5008.87ms 
iter 5452: loss 2.4690, time 5289.82ms 
iter 5453: loss 2.7382, time 5192.64ms 
iter 5454: loss 2.5272, time 5236.41ms 
iter 5455: loss 2.4300, time 5162.49ms 
iter 5456: loss 2.4644, time 5169.67ms 
iter 5457: loss 2.7151, time 5274.63ms 
iter 5458: loss 2.7656, time 5196.23ms 
iter 5459: loss 2.6188, time 5284.21ms 
iter 5460: loss 2.6238, time 5142.12ms 
iter 5461: loss 2.5067, time 5079.92ms 
iter 5462: loss 2.4778, time 5104.26ms 
iter 5463: loss 2.7212, time 5104.08ms 
iter 5464: loss 2.4165, time 5093.44ms 
iter 5465: loss 2.4617, time 5013.63ms 
iter 5466: loss 2.6984, time 5026.64ms 
iter 5467: loss 2.4711, time 5286.05ms 
iter 5468: loss 2.6548, time 5237.92ms 
iter 5469: loss 2.4934, time 5312.84ms 
iter 5470: loss 2.5170, time 5089.19ms 
iter 5471: loss 2.6094, time 5027.19ms 
iter 5472: loss 2.2404, time 5133.88ms 
iter 5473: loss 2.6668, time 5180.49ms 
iter 5474: loss 2.4340, time 5166.45ms 
iter 5475: loss 2.3494, time 5144.50ms 
iter 5476: loss 2.6120, time 5137.62ms 
iter 5477: loss 2.5992, time 5232.02ms 
iter 5478: loss 2.5924, time 5170.31ms 
iter 5479: loss 2.5776, time 5253.18ms 
iter 5480: loss 2.5396, time 5089.34ms 
iter 5481: loss 2.7176, time 5076.66ms 
iter 5482: loss 2.5390, time 5295.01ms 
iter 5483: loss 2.6212, time 5192.26ms 
iter 5484: loss 2.7739, time 5129.60ms 
iter 5485: loss 2.6319, time 5141.60ms 
iter 5486: loss 2.6068, time 5144.57ms 
iter 5487: loss 2.6152, time 5237.19ms 
iter 5488: loss 2.7632, time 5173.47ms 
iter 5489: loss 2.5637, time 5250.98ms 
iter 5490: loss 2.6936, time 5192.58ms 
iter 5491: loss 2.5300, time 5164.52ms 
iter 5492: loss 2.7517, time 5234.72ms 
iter 5493: loss 2.6244, time 5183.38ms 
iter 5494: loss 2.5170, time 5271.33ms 
iter 5495: loss 2.7285, time 5236.67ms 
iter 5496: loss 2.4069, time 5188.24ms 
iter 5497: loss 2.5158, time 5244.19ms 
iter 5498: loss 2.7141, time 5223.40ms 
iter 5499: loss 2.5474, time 5275.72ms 
step 5500: train loss 2.5715, val loss 2.8457
iter 5500: loss 2.6776, time 19951.18ms 
iter 5501: loss 2.5878, time 5265.86ms 
iter 5502: loss 2.5485, time 5233.68ms 
iter 5503: loss 2.5459, time 5185.98ms 
iter 5504: loss 2.6419, time 5289.00ms 
iter 5505: loss 2.5592, time 5203.98ms 
iter 5506: loss 2.3955, time 5267.31ms 
iter 5507: loss 2.4711, time 5123.14ms 
iter 5508: loss 2.4040, time 5197.42ms 
iter 5509: loss 2.7071, time 5198.64ms 
iter 5510: loss 2.4506, time 5215.05ms 
iter 5511: loss 2.5945, time 5120.15ms 
iter 5512: loss 2.5394, time 5238.17ms 
iter 5513: loss 2.5561, time 5226.72ms 
iter 5514: loss 2.5842, time 5230.06ms 
iter 5515: loss 2.6985, time 5204.63ms 
iter 5516: loss 2.5628, time 5181.62ms 
iter 5517: loss 2.8501, time 5099.35ms 
iter 5518: loss 2.6786, time 5220.74ms 
iter 5519: loss 2.4199, time 5249.33ms 
iter 5520: loss 2.3139, time 5223.48ms 
iter 5521: loss 2.2885, time 5090.36ms 
iter 5522: loss 2.4998, time 5251.53ms 
iter 5523: loss 2.5945, time 5274.54ms 
iter 5524: loss 2.5234, time 5143.70ms 
iter 5525: loss 2.7275, time 5259.68ms 
iter 5526: loss 2.5520, time 5055.02ms 
iter 5527: loss 2.5127, time 5078.42ms 
iter 5528: loss 2.7897, time 5069.88ms 
iter 5529: loss 2.5929, time 5183.49ms 
iter 5530: loss 2.5251, time 5222.16ms 
iter 5531: loss 2.6189, time 4980.32ms 
iter 5532: loss 2.5915, time 5002.49ms 
iter 5533: loss 2.5828, time 5055.82ms 
iter 5534: loss 2.5535, time 5028.27ms 
iter 5535: loss 2.5598, time 4943.64ms 
iter 5536: loss 2.6208, time 5039.17ms 
iter 5537: loss 2.5659, time 5079.67ms 
iter 5538: loss 2.4529, time 4942.25ms 
iter 5539: loss 2.7768, time 4945.34ms 
iter 5540: loss 2.7881, time 4952.14ms 
iter 5541: loss 2.3824, time 4943.64ms 
iter 5542: loss 2.5542, time 4941.44ms 
iter 5543: loss 2.5389, time 4940.91ms 
iter 5544: loss 2.1562, time 4942.97ms 
iter 5545: loss 2.5043, time 4941.86ms 
iter 5546: loss 2.4729, time 4945.77ms 
iter 5547: loss 2.5239, time 4957.09ms 
iter 5548: loss 2.8422, time 5041.66ms 
iter 5549: loss 2.5574, time 5047.98ms 
step 5550: train loss 2.5798, val loss 2.8424
iter 5550: loss 2.6498, time 19895.22ms 
iter 5551: loss 2.6995, time 5053.81ms 
iter 5552: loss 2.4367, time 5076.85ms 
iter 5553: loss 2.4499, time 5127.41ms 
iter 5554: loss 2.6789, time 5213.58ms 
iter 5555: loss 2.6249, time 5088.64ms 
iter 5556: loss 2.8123, time 5054.88ms 
iter 5557: loss 2.6872, time 5045.60ms 
iter 5558: loss 2.6187, time 5099.32ms 
iter 5559: loss 2.4677, time 5210.23ms 
iter 5560: loss 2.6810, time 5131.73ms 
iter 5561: loss 2.6724, time 5077.00ms 
iter 5562: loss 2.5177, time 5055.25ms 
iter 5563: loss 2.2930, time 5161.64ms 
iter 5564: loss 2.7299, time 5164.38ms 
iter 5565: loss 2.5662, time 5258.81ms 
iter 5566: loss 2.7927, time 5118.42ms 
iter 5567: loss 2.6487, time 5243.50ms 
iter 5568: loss 2.5189, time 5128.39ms 
iter 5569: loss 2.5314, time 5011.48ms 
iter 5570: loss 2.3381, time 5114.26ms 
iter 5571: loss 2.5735, time 5113.03ms 
iter 5572: loss 2.6423, time 5226.64ms 
iter 5573: loss 2.4467, time 5078.05ms 
iter 5574: loss 2.5940, time 5033.25ms 
iter 5575: loss 2.3918, time 5162.58ms 
iter 5576: loss 2.7535, time 5222.16ms 
iter 5577: loss 2.9123, time 5019.73ms 
iter 5578: loss 2.5737, time 5037.35ms 
iter 5579: loss 2.5065, time 5032.94ms 
iter 5580: loss 2.5082, time 5020.93ms 
iter 5581: loss 2.5252, time 5038.59ms 
iter 5582: loss 2.5925, time 5101.59ms 
iter 5583: loss 2.8101, time 5170.73ms 
iter 5584: loss 2.4337, time 4988.46ms 
iter 5585: loss 2.5087, time 4986.70ms 
iter 5586: loss 2.3875, time 5095.63ms 
iter 5587: loss 2.6290, time 5044.88ms 
iter 5588: loss 2.5078, time 5027.78ms 
iter 5589: loss 2.4412, time 4993.84ms 
iter 5590: loss 2.5860, time 4972.82ms 
iter 5591: loss 2.4869, time 4983.54ms 
iter 5592: loss 2.5757, time 5165.05ms 
iter 5593: loss 2.6232, time 5089.72ms 
iter 5594: loss 2.4017, time 5170.34ms 
iter 5595: loss 2.5210, time 5055.02ms 
iter 5596: loss 2.6270, time 5226.72ms 
iter 5597: loss 2.5649, time 5259.07ms 
iter 5598: loss 2.4248, time 5217.08ms 
iter 5599: loss 2.4621, time 5063.98ms 
step 5600: train loss 2.5619, val loss 2.8415
iter 5600: loss 2.4408, time 19894.06ms 
iter 5601: loss 2.5750, time 5152.22ms 
iter 5602: loss 2.5515, time 5276.20ms 
iter 5603: loss 2.4601, time 5159.01ms 
iter 5604: loss 2.4542, time 5080.72ms 
iter 5605: loss 2.5001, time 5252.00ms 
iter 5606: loss 2.5323, time 5165.80ms 
iter 5607: loss 2.5144, time 5164.98ms 
iter 5608: loss 2.7788, time 5074.50ms 
iter 5609: loss 2.5346, time 4990.77ms 
iter 5610: loss 2.4807, time 5001.27ms 
iter 5611: loss 2.5379, time 4991.97ms 
iter 5612: loss 2.6812, time 5207.93ms 
iter 5613: loss 2.5891, time 5134.34ms 
iter 5614: loss 2.5600, time 5251.27ms 
iter 5615: loss 2.6296, time 5198.45ms 
iter 5616: loss 2.4575, time 5274.43ms 
iter 5617: loss 2.5965, time 5083.86ms 
iter 5618: loss 2.6007, time 5052.98ms 
iter 5619: loss 2.6343, time 5046.14ms 
iter 5620: loss 2.7989, time 5003.25ms 
iter 5621: loss 2.6814, time 5033.55ms 
iter 5622: loss 2.5453, time 5073.56ms 
iter 5623: loss 2.4752, time 5260.02ms 
iter 5624: loss 2.7061, time 5089.31ms 
iter 5625: loss 2.5836, time 5055.99ms 
iter 5626: loss 2.7076, time 5174.61ms 
iter 5627: loss 2.4502, time 5300.64ms 
iter 5628: loss 2.5407, time 5237.99ms 
iter 5629: loss 2.6124, time 5296.18ms 
iter 5630: loss 2.5240, time 5239.23ms 
iter 5631: loss 2.7306, time 5213.05ms 
iter 5632: loss 2.5232, time 5103.56ms 
iter 5633: loss 2.8476, time 5057.26ms 
iter 5634: loss 2.5609, time 5190.28ms 
iter 5635: loss 2.6345, time 5090.81ms 
iter 5636: loss 2.3644, time 5062.36ms 
iter 5637: loss 2.4492, time 5249.74ms 
iter 5638: loss 2.5290, time 5190.55ms 
iter 5639: loss 2.9029, time 5208.96ms 
iter 5640: loss 2.6762, time 5018.80ms 
iter 5641: loss 2.8238, time 5101.13ms 
iter 5642: loss 2.5260, time 5145.31ms 
iter 5643: loss 2.7359, time 5300.22ms 
iter 5644: loss 2.5715, time 5163.24ms 
iter 5645: loss 2.6248, time 5092.33ms 
iter 5646: loss 2.4298, time 5069.26ms 
iter 5647: loss 2.5048, time 5058.54ms 
iter 5648: loss 2.5638, time 5097.90ms 
iter 5649: loss 2.5180, time 5019.25ms 
step 5650: train loss 2.5574, val loss 2.8494
iter 5650: loss 2.5365, time 19872.87ms 
iter 5651: loss 2.4957, time 5271.99ms 
iter 5652: loss 2.5778, time 5076.81ms 
iter 5653: loss 2.4637, time 5091.32ms 
iter 5654: loss 2.7027, time 5081.72ms 
iter 5655: loss 2.6301, time 5104.70ms 
iter 5656: loss 2.4656, time 5164.36ms 
iter 5657: loss 2.7198, time 5100.75ms 
iter 5658: loss 2.6031, time 5064.49ms 
iter 5659: loss 2.7522, time 5065.75ms 
iter 5660: loss 2.5197, time 5053.11ms 
iter 5661: loss 2.6804, time 5135.21ms 
iter 5662: loss 2.6609, time 5186.01ms 
iter 5663: loss 2.7755, time 5216.17ms 
iter 5664: loss 2.6500, time 5162.13ms 
iter 5665: loss 2.3528, time 5213.16ms 
iter 5666: loss 2.6318, time 5096.67ms 
iter 5667: loss 2.6423, time 4968.06ms 
iter 5668: loss 2.4739, time 5276.28ms 
iter 5669: loss 2.6511, time 5141.11ms 
iter 5670: loss 2.6245, time 5256.45ms 
iter 5671: loss 2.5761, time 5121.07ms 
iter 5672: loss 2.6279, time 5237.05ms 
iter 5673: loss 2.6306, time 5176.97ms 
iter 5674: loss 2.7011, time 5223.04ms 
iter 5675: loss 2.7413, time 5173.37ms 
iter 5676: loss 2.5070, time 5260.00ms 
iter 5677: loss 2.6797, time 5253.17ms 
iter 5678: loss 2.6370, time 5265.29ms 
iter 5679: loss 2.5384, time 5268.18ms 
iter 5680: loss 2.7146, time 5260.99ms 
iter 5681: loss 2.6882, time 5263.01ms 
iter 5682: loss 2.6226, time 5258.93ms 
iter 5683: loss 2.6187, time 5268.00ms 
iter 5684: loss 2.6689, time 5279.99ms 
iter 5685: loss 2.5652, time 5269.29ms 
iter 5686: loss 2.3458, time 5207.42ms 
iter 5687: loss 2.5504, time 5212.35ms 
iter 5688: loss 2.5634, time 5237.43ms 
iter 5689: loss 2.3617, time 5263.06ms 
iter 5690: loss 2.5450, time 5277.42ms 
iter 5691: loss 2.5758, time 5275.89ms 
iter 5692: loss 2.4091, time 5271.42ms 
iter 5693: loss 2.7559, time 5270.35ms 
iter 5694: loss 2.6385, time 5268.19ms 
iter 5695: loss 2.5455, time 5235.24ms 
iter 5696: loss 2.6006, time 5246.93ms 
iter 5697: loss 2.5265, time 5262.05ms 
iter 5698: loss 2.6467, time 5267.18ms 
iter 5699: loss 2.6376, time 5196.84ms 
step 5700: train loss 2.5636, val loss 2.8337
iter 5700: loss 2.5660, time 20090.33ms 
iter 5701: loss 2.5391, time 5248.37ms 
iter 5702: loss 2.5583, time 5248.70ms 
iter 5703: loss 2.5975, time 5262.30ms 
iter 5704: loss 2.5554, time 5259.70ms 
iter 5705: loss 2.4624, time 5256.51ms 
iter 5706: loss 2.4334, time 5257.06ms 
iter 5707: loss 2.4752, time 5274.69ms 
iter 5708: loss 2.3461, time 5280.39ms 
iter 5709: loss 2.5720, time 5272.25ms 
iter 5710: loss 2.4832, time 5252.15ms 
iter 5711: loss 2.5815, time 5254.81ms 
iter 5712: loss 2.3764, time 5256.72ms 
iter 5713: loss 2.8091, time 5271.24ms 
iter 5714: loss 2.3250, time 5262.75ms 
iter 5715: loss 2.6863, time 5246.79ms 
iter 5716: loss 2.6469, time 5260.57ms 
iter 5717: loss 2.4204, time 5263.39ms 
iter 5718: loss 2.4805, time 5258.36ms 
iter 5719: loss 2.7005, time 5255.90ms 
iter 5720: loss 2.5329, time 5249.60ms 
iter 5721: loss 2.6245, time 5245.79ms 
iter 5722: loss 2.5908, time 5087.78ms 
iter 5723: loss 2.5128, time 5049.09ms 
iter 5724: loss 2.5544, time 5045.46ms 
iter 5725: loss 2.5031, time 5178.53ms 
iter 5726: loss 2.4952, time 5233.14ms 
iter 5727: loss 2.6973, time 5036.05ms 
iter 5728: loss 2.5994, time 5027.47ms 
iter 5729: loss 2.3255, time 5024.78ms 
iter 5730: loss 2.3301, time 5033.09ms 
iter 5731: loss 2.6733, time 5037.55ms 
iter 5732: loss 2.7233, time 5116.47ms 
iter 5733: loss 2.6233, time 5264.93ms 
iter 5734: loss 2.5965, time 5259.67ms 
iter 5735: loss 2.4000, time 5254.46ms 
iter 5736: loss 2.2448, time 5261.99ms 
iter 5737: loss 2.6190, time 5275.36ms 
iter 5738: loss 2.5941, time 5237.11ms 
iter 5739: loss 2.5476, time 5001.46ms 
iter 5740: loss 2.6997, time 5208.87ms 
iter 5741: loss 2.6912, time 5275.06ms 
iter 5742: loss 2.2986, time 5244.88ms 
iter 5743: loss 2.7315, time 5044.04ms 
iter 5744: loss 2.4770, time 5217.35ms 
iter 5745: loss 2.6012, time 5275.38ms 
iter 5746: loss 2.2322, time 5264.23ms 
iter 5747: loss 2.6263, time 5190.50ms 
iter 5748: loss 2.7154, time 5169.54ms 
iter 5749: loss 2.5657, time 5245.56ms 
step 5750: train loss 2.5681, val loss 2.8343
iter 5750: loss 2.3982, time 19856.76ms 
iter 5751: loss 2.5486, time 5176.71ms 
iter 5752: loss 2.6929, time 5287.08ms 
iter 5753: loss 2.5269, time 5154.42ms 
iter 5754: loss 2.4525, time 5130.05ms 
iter 5755: loss 2.7298, time 5193.49ms 
iter 5756: loss 2.6324, time 5289.10ms 
iter 5757: loss 2.4732, time 5211.72ms 
iter 5758: loss 2.4833, time 5278.65ms 
iter 5759: loss 2.4791, time 5133.09ms 
iter 5760: loss 2.7230, time 5094.34ms 
iter 5761: loss 2.5977, time 5202.89ms 
iter 5762: loss 2.5772, time 5270.93ms 
iter 5763: loss 2.6390, time 5236.34ms 
iter 5764: loss 2.5195, time 5282.12ms 
iter 5765: loss 2.7208, time 5103.13ms 
iter 5766: loss 2.2982, time 5126.37ms 
iter 5767: loss 2.4652, time 5191.08ms 
iter 5768: loss 2.5419, time 5269.78ms 
iter 5769: loss 2.6956, time 5222.59ms 
iter 5770: loss 2.5917, time 5141.13ms 
iter 5771: loss 2.3933, time 5028.09ms 
iter 5772: loss 2.4672, time 5190.19ms 
iter 5773: loss 2.6189, time 5194.10ms 
iter 5774: loss 2.5263, time 5281.22ms 
iter 5775: loss 2.6099, time 5196.23ms 
iter 5776: loss 2.4522, time 5183.34ms 
iter 5777: loss 2.5049, time 4988.55ms 
iter 5778: loss 2.6675, time 5020.68ms 
iter 5779: loss 2.4346, time 5048.88ms 
iter 5780: loss 2.6570, time 5236.47ms 
iter 5781: loss 2.6848, time 5153.80ms 
iter 5782: loss 2.5595, time 5268.61ms 
iter 5783: loss 2.5536, time 5131.55ms 
iter 5784: loss 2.5628, time 5057.54ms 
iter 5785: loss 2.5176, time 5129.01ms 
iter 5786: loss 2.6578, time 5275.93ms 
iter 5787: loss 2.7517, time 5121.83ms 
iter 5788: loss 2.4782, time 5248.39ms 
iter 5789: loss 2.5911, time 5111.61ms 
iter 5790: loss 2.3344, time 5064.31ms 
iter 5791: loss 2.4773, time 5125.84ms 
iter 5792: loss 2.6613, time 5237.48ms 
iter 5793: loss 2.6623, time 5215.00ms 
iter 5794: loss 2.4946, time 5242.92ms 
iter 5795: loss 2.3991, time 5113.25ms 
iter 5796: loss 2.5915, time 5154.75ms 
iter 5797: loss 2.4392, time 5080.27ms 
iter 5798: loss 2.4128, time 5274.19ms 
iter 5799: loss 2.3337, time 5145.71ms 
step 5800: train loss 2.5565, val loss 2.8475
iter 5800: loss 2.6129, time 19766.74ms 
iter 5801: loss 2.6265, time 5215.45ms 
iter 5802: loss 2.7275, time 5126.68ms 
iter 5803: loss 2.7004, time 5249.16ms 
iter 5804: loss 2.3981, time 5263.15ms 
iter 5805: loss 2.5593, time 5179.05ms 
iter 5806: loss 2.5695, time 5112.60ms 
iter 5807: loss 2.3209, time 5224.17ms 
iter 5808: loss 2.6139, time 5052.66ms 
iter 5809: loss 2.5930, time 5207.57ms 
iter 5810: loss 2.3976, time 5263.58ms 
iter 5811: loss 2.6616, time 5104.58ms 
iter 5812: loss 2.4558, time 4994.64ms 
iter 5813: loss 2.4952, time 5176.07ms 
iter 5814: loss 2.5730, time 5283.77ms 
iter 5815: loss 2.4326, time 5196.86ms 
iter 5816: loss 2.3759, time 5280.39ms 
iter 5817: loss 2.4180, time 5159.29ms 
iter 5818: loss 2.7639, time 5134.49ms 
iter 5819: loss 2.4940, time 5132.01ms 
iter 5820: loss 2.7388, time 5177.45ms 
iter 5821: loss 2.4401, time 5209.83ms 
iter 5822: loss 2.3624, time 5258.76ms 
iter 5823: loss 2.5627, time 5132.52ms 
iter 5824: loss 2.5941, time 5076.41ms 
iter 5825: loss 2.5373, time 5171.02ms 
iter 5826: loss 2.6596, time 5073.43ms 
iter 5827: loss 2.5869, time 5200.63ms 
iter 5828: loss 2.5121, time 5242.90ms 
iter 5829: loss 2.5117, time 5080.14ms 
iter 5830: loss 2.7405, time 5116.60ms 
iter 5831: loss 2.5555, time 5119.57ms 
iter 5832: loss 2.4096, time 5104.03ms 
iter 5833: loss 2.4564, time 5117.67ms 
iter 5834: loss 2.5242, time 5161.62ms 
iter 5835: loss 2.5870, time 5023.33ms 
iter 5836: loss 2.5014, time 5197.83ms 
iter 5837: loss 2.7148, time 5195.01ms 
iter 5838: loss 2.6173, time 5275.79ms 
iter 5839: loss 2.4895, time 5227.72ms 
iter 5840: loss 2.2594, time 5222.08ms 
iter 5841: loss 2.5055, time 5078.18ms 
iter 5842: loss 2.5922, time 5172.86ms 
iter 5843: loss 2.4009, time 5171.24ms 
iter 5844: loss 2.4968, time 5211.16ms 
iter 5845: loss 2.7346, time 5148.83ms 
iter 5846: loss 2.4672, time 5253.64ms 
iter 5847: loss 2.2718, time 5057.51ms 
iter 5848: loss 2.6591, time 5085.51ms 
iter 5849: loss 2.6735, time 5123.03ms 
step 5850: train loss 2.5442, val loss 2.8360
iter 5850: loss 2.4741, time 19806.55ms 
iter 5851: loss 2.6912, time 5146.81ms 
iter 5852: loss 2.5523, time 5259.66ms 
iter 5853: loss 2.4907, time 5268.96ms 
iter 5854: loss 2.5723, time 5246.94ms 
iter 5855: loss 2.5169, time 5269.35ms 
iter 5856: loss 2.6189, time 5104.04ms 
iter 5857: loss 2.5847, time 5159.37ms 
iter 5858: loss 2.4168, time 5289.61ms 
iter 5859: loss 2.7277, time 5268.78ms 
iter 5860: loss 2.7146, time 5211.32ms 
iter 5861: loss 2.6057, time 5163.74ms 
iter 5862: loss 2.5778, time 5269.62ms 
iter 5863: loss 2.5741, time 5259.92ms 
iter 5864: loss 2.3951, time 5258.26ms 
iter 5865: loss 2.6060, time 5130.50ms 
iter 5866: loss 2.6121, time 5239.63ms 
iter 5867: loss 2.5376, time 5128.69ms 
iter 5868: loss 2.4537, time 5076.36ms 
iter 5869: loss 2.4544, time 5038.44ms 
iter 5870: loss 2.4600, time 5248.59ms 
iter 5871: loss 2.4832, time 5139.72ms 
iter 5872: loss 2.6362, time 5169.71ms 
iter 5873: loss 2.6816, time 5073.81ms 
iter 5874: loss 2.7305, time 5050.83ms 
iter 5875: loss 2.5914, time 5085.55ms 
iter 5876: loss 2.4020, time 5156.77ms 
iter 5877: loss 2.4739, time 5112.22ms 
iter 5878: loss 2.5737, time 5170.91ms 
iter 5879: loss 2.5090, time 5126.53ms 
iter 5880: loss 2.7179, time 5065.19ms 
iter 5881: loss 2.4804, time 5035.81ms 
iter 5882: loss 2.4035, time 5068.41ms 
iter 5883: loss 2.5848, time 5218.13ms 
iter 5884: loss 2.6677, time 5215.37ms 
iter 5885: loss 2.5929, time 5240.28ms 
iter 5886: loss 2.4697, time 5078.91ms 
iter 5887: loss 2.6540, time 5225.39ms 
iter 5888: loss 2.5944, time 5184.09ms 
iter 5889: loss 2.7737, time 5274.39ms 
iter 5890: loss 2.5221, time 5159.34ms 
iter 5891: loss 2.5182, time 5270.97ms 
iter 5892: loss 2.3852, time 5189.50ms 
iter 5893: loss 2.4910, time 5090.24ms 
iter 5894: loss 2.5350, time 5164.62ms 
iter 5895: loss 2.6087, time 5277.39ms 
iter 5896: loss 2.6106, time 5145.25ms 
iter 5897: loss 2.6019, time 5247.98ms 
iter 5898: loss 2.4271, time 5023.15ms 
iter 5899: loss 2.7013, time 5174.95ms 
step 5900: train loss 2.5681, val loss 2.8464
iter 5900: loss 2.6213, time 20051.25ms 
iter 5901: loss 2.6142, time 5250.81ms 
iter 5902: loss 2.4437, time 5219.93ms 
iter 5903: loss 2.5254, time 5117.09ms 
iter 5904: loss 2.5196, time 5207.22ms 
iter 5905: loss 2.6473, time 5196.49ms 
iter 5906: loss 2.4731, time 5210.60ms 
iter 5907: loss 2.6490, time 5259.93ms 
iter 5908: loss 2.4747, time 5131.13ms 
iter 5909: loss 2.5724, time 5128.27ms 
iter 5910: loss 2.4922, time 5165.47ms 
iter 5911: loss 2.5617, time 5290.63ms 
iter 5912: loss 2.4518, time 5161.07ms 
iter 5913: loss 2.4008, time 5255.50ms 
iter 5914: loss 2.7216, time 5116.82ms 
iter 5915: loss 2.6649, time 5058.25ms 
iter 5916: loss 2.6945, time 5136.94ms 
iter 5917: loss 2.6085, time 5292.11ms 
iter 5918: loss 2.5500, time 5191.83ms 
iter 5919: loss 2.6071, time 5277.10ms 
iter 5920: loss 2.6179, time 5127.83ms 
iter 5921: loss 2.6382, time 5159.27ms 
iter 5922: loss 2.5499, time 5165.94ms 
iter 5923: loss 2.8187, time 5275.33ms 
iter 5924: loss 2.4013, time 5146.22ms 
iter 5925: loss 2.4212, time 5279.00ms 
iter 5926: loss 2.7799, time 5173.80ms 
iter 5927: loss 2.7423, time 5230.68ms 
iter 5928: loss 2.5805, time 5120.14ms 
iter 5929: loss 2.3334, time 5275.78ms 
iter 5930: loss 2.6621, time 5085.15ms 
iter 5931: loss 2.7837, time 5246.76ms 
iter 5932: loss 2.3930, time 5183.06ms 
iter 5933: loss 2.6636, time 5171.68ms 
iter 5934: loss 2.4302, time 5128.81ms 
iter 5935: loss 2.5609, time 5251.21ms 
iter 5936: loss 2.5820, time 5233.87ms 
iter 5937: loss 2.6036, time 5183.39ms 
iter 5938: loss 2.5232, time 5265.21ms 
iter 5939: loss 2.5990, time 5108.16ms 
iter 5940: loss 2.7153, time 5213.99ms 
iter 5941: loss 2.7550, time 5217.48ms 
iter 5942: loss 2.5707, time 5277.84ms 
iter 5943: loss 2.5402, time 5192.12ms 
iter 5944: loss 2.5850, time 5282.11ms 
iter 5945: loss 2.4982, time 5130.67ms 
iter 5946: loss 2.5728, time 5263.97ms 
iter 5947: loss 2.6875, time 5166.20ms 
iter 5948: loss 2.4819, time 5279.03ms 
iter 5949: loss 2.5309, time 5132.05ms 
step 5950: train loss 2.5561, val loss 2.8410
iter 5950: loss 2.5794, time 19816.14ms 
iter 5951: loss 2.7472, time 5176.66ms 
iter 5952: loss 2.4176, time 5219.88ms 
iter 5953: loss 2.5683, time 5241.00ms 
iter 5954: loss 2.4663, time 5256.79ms 
iter 5955: loss 2.6653, time 5226.27ms 
iter 5956: loss 2.4456, time 5118.98ms 
iter 5957: loss 2.4784, time 5183.28ms 
iter 5958: loss 2.4509, time 5189.56ms 
iter 5959: loss 2.5535, time 5190.99ms 
iter 5960: loss 2.4952, time 5261.66ms 
iter 5961: loss 2.6894, time 5251.40ms 
iter 5962: loss 2.6658, time 5180.86ms 
iter 5963: loss 2.4432, time 5142.02ms 
iter 5964: loss 2.6565, time 5274.64ms 
iter 5965: loss 2.6184, time 5192.67ms 
iter 5966: loss 2.4801, time 5260.02ms 
iter 5967: loss 2.7790, time 5128.75ms 
iter 5968: loss 2.4074, time 4997.30ms 
iter 5969: loss 2.5717, time 5019.55ms 
iter 5970: loss 2.4646, time 5269.77ms 
iter 5971: loss 2.7404, time 5134.01ms 
iter 5972: loss 2.6288, time 5236.63ms 
iter 5973: loss 2.5049, time 5125.25ms 
iter 5974: loss 2.6128, time 5043.70ms 
iter 5975: loss 2.4089, time 5048.68ms 
iter 5976: loss 2.7107, time 5270.57ms 
iter 5977: loss 2.4193, time 5120.18ms 
iter 5978: loss 2.6358, time 5236.39ms 
iter 5979: loss 2.3912, time 5257.23ms 
iter 5980: loss 2.5856, time 5174.30ms 
iter 5981: loss 2.3457, time 5077.85ms 
iter 5982: loss 2.4655, time 5229.36ms 
iter 5983: loss 2.5852, time 5123.46ms 
iter 5984: loss 2.7568, time 5217.89ms 
iter 5985: loss 2.4218, time 5211.64ms 
iter 5986: loss 2.6279, time 5135.71ms 
iter 5987: loss 2.8179, time 5120.01ms 
iter 5988: loss 2.4547, time 5238.27ms 
iter 5989: loss 2.5067, time 5131.43ms 
iter 5990: loss 2.5483, time 5193.77ms 
iter 5991: loss 2.6146, time 5120.48ms 
iter 5992: loss 2.3300, time 5053.44ms 
iter 5993: loss 2.3264, time 5023.39ms 
iter 5994: loss 2.5975, time 5166.11ms 
iter 5995: loss 2.6651, time 5273.52ms 
iter 5996: loss 2.3974, time 5182.10ms 
iter 5997: loss 2.5446, time 5177.22ms 
iter 5998: loss 2.3540, time 5078.90ms 
iter 5999: loss 2.5700, time 5051.88ms 
step 6000: train loss 2.5458, val loss 2.8421
iter 6000: loss 2.5961, time 19763.43ms 
iter 6001: loss 2.3618, time 5142.15ms 
iter 6002: loss 2.4679, time 5263.62ms 
iter 6003: loss 2.4421, time 5260.00ms 
iter 6004: loss 2.6170, time 5257.52ms 
iter 6005: loss 2.3837, time 5255.92ms 
iter 6006: loss 2.6826, time 5264.85ms 
iter 6007: loss 2.7646, time 5277.43ms 
iter 6008: loss 2.5930, time 5270.90ms 
iter 6009: loss 2.7427, time 5098.16ms 
iter 6010: loss 2.3751, time 5207.69ms 
iter 6011: loss 2.6353, time 5274.58ms 
iter 6012: loss 2.5508, time 5018.48ms 
iter 6013: loss 2.8708, time 5118.61ms 
iter 6014: loss 2.3511, time 5172.78ms 
iter 6015: loss 2.5184, time 5266.88ms 
iter 6016: loss 2.4277, time 5200.50ms 
iter 6017: loss 2.4276, time 5238.69ms 
iter 6018: loss 2.4840, time 5267.15ms 
iter 6019: loss 2.6982, time 5064.19ms 
iter 6020: loss 2.7508, time 5212.10ms 
iter 6021: loss 2.3598, time 5202.14ms 
iter 6022: loss 2.7951, time 5183.55ms 
iter 6023: loss 2.6768, time 5198.87ms 
iter 6024: loss 2.5624, time 5256.83ms 
iter 6025: loss 2.4470, time 5072.89ms 
iter 6026: loss 2.6149, time 5156.63ms 
iter 6027: loss 2.5192, time 5089.55ms 
iter 6028: loss 2.6751, time 5235.40ms 
iter 6029: loss 2.5605, time 5154.18ms 
iter 6030: loss 2.5670, time 5269.62ms 
iter 6031: loss 2.4657, time 5270.63ms 
iter 6032: loss 2.5398, time 5121.64ms 
iter 6033: loss 2.7103, time 5121.71ms 
iter 6034: loss 2.4622, time 5194.53ms 
iter 6035: loss 2.3410, time 5215.76ms 
iter 6036: loss 2.4264, time 5153.07ms 
iter 6037: loss 2.5702, time 5223.06ms 
iter 6038: loss 2.6765, time 5029.61ms 
iter 6039: loss 2.6898, time 5242.75ms 
iter 6040: loss 2.5783, time 5217.57ms 
iter 6041: loss 2.5769, time 5159.24ms 
iter 6042: loss 2.5610, time 5111.36ms 
iter 6043: loss 2.6466, time 5268.76ms 
iter 6044: loss 2.5039, time 5073.33ms 
iter 6045: loss 2.5857, time 5058.92ms 
iter 6046: loss 2.4554, time 5133.79ms 
iter 6047: loss 2.4558, time 5149.70ms 
iter 6048: loss 2.5088, time 5175.34ms 
iter 6049: loss 2.3534, time 5084.97ms 
step 6050: train loss 2.5564, val loss 2.8451
iter 6050: loss 2.7025, time 20115.37ms 
iter 6051: loss 2.6150, time 5114.36ms 
iter 6052: loss 2.8407, time 5092.97ms 
iter 6053: loss 2.6891, time 5103.84ms 
iter 6054: loss 2.3885, time 5151.92ms 
iter 6055: loss 2.5980, time 5200.53ms 
iter 6056: loss 2.4657, time 5134.21ms 
iter 6057: loss 2.7339, time 5305.81ms 
iter 6058: loss 2.5682, time 5091.88ms 
iter 6059: loss 2.3597, time 5165.15ms 
iter 6060: loss 2.4580, time 5120.25ms 
iter 6061: loss 2.4765, time 5149.62ms 
iter 6062: loss 2.4408, time 5138.86ms 
iter 6063: loss 2.3294, time 5198.29ms 
iter 6064: loss 2.6679, time 5118.24ms 
iter 6065: loss 2.4955, time 5194.38ms 
iter 6066: loss 2.7188, time 5157.84ms 
iter 6067: loss 2.5284, time 5136.69ms 
iter 6068: loss 2.4155, time 5227.98ms 
iter 6069: loss 2.4897, time 5140.94ms 
iter 6070: loss 2.5159, time 5233.34ms 
iter 6071: loss 2.2168, time 5154.56ms 
iter 6072: loss 2.3720, time 5234.94ms 
iter 6073: loss 2.6162, time 5149.62ms 
iter 6074: loss 2.5089, time 5195.76ms 
iter 6075: loss 2.4790, time 5129.30ms 
iter 6076: loss 2.5880, time 5076.47ms 
iter 6077: loss 2.4810, time 5030.86ms 
iter 6078: loss 2.5364, time 5259.27ms 
iter 6079: loss 2.2538, time 5074.94ms 
iter 6080: loss 2.6945, time 5202.27ms 
iter 6081: loss 2.5188, time 5163.43ms 
iter 6082: loss 2.5621, time 5082.34ms 
iter 6083: loss 2.4448, time 5073.40ms 
iter 6084: loss 2.5709, time 5194.92ms 
iter 6085: loss 2.5227, time 5236.42ms 
iter 6086: loss 2.6677, time 5172.12ms 
iter 6087: loss 2.6149, time 5162.08ms 
iter 6088: loss 2.5296, time 5218.91ms 
iter 6089: loss 2.4818, time 5164.38ms 
iter 6090: loss 2.6192, time 5117.77ms 
iter 6091: loss 2.6217, time 5203.29ms 
iter 6092: loss 2.6678, time 5184.54ms 
iter 6093: loss 2.5036, time 5213.58ms 
iter 6094: loss 2.4929, time 5263.75ms 
iter 6095: loss 2.6280, time 5221.68ms 
iter 6096: loss 2.4814, time 5167.38ms 
iter 6097: loss 2.7074, time 5114.69ms 
iter 6098: loss 2.5071, time 5079.75ms 
iter 6099: loss 2.5826, time 5074.87ms 
step 6100: train loss 2.5479, val loss 2.8459
iter 6100: loss 2.7348, time 20091.74ms 
iter 6101: loss 2.5444, time 5129.49ms 
iter 6102: loss 2.4202, time 5133.53ms 
iter 6103: loss 2.6469, time 5109.93ms 
iter 6104: loss 2.6897, time 5248.36ms 
iter 6105: loss 2.4850, time 5304.39ms 
iter 6106: loss 2.7444, time 5256.62ms 
iter 6107: loss 2.6694, time 5308.74ms 
iter 6108: loss 2.5538, time 5154.05ms 
iter 6109: loss 2.6514, time 5276.82ms 
iter 6110: loss 2.6716, time 5185.37ms 
iter 6111: loss 2.5134, time 5303.28ms 
iter 6112: loss 2.5308, time 5164.00ms 
iter 6113: loss 2.5178, time 5192.06ms 
iter 6114: loss 2.3213, time 5281.17ms 
iter 6115: loss 2.4530, time 5193.68ms 
iter 6116: loss 2.5656, time 5074.16ms 
iter 6117: loss 2.4212, time 5134.63ms 
iter 6118: loss 2.4635, time 5155.90ms 
iter 6119: loss 2.3286, time 5108.46ms 
iter 6120: loss 2.4843, time 5075.07ms 
iter 6121: loss 2.7148, time 5022.24ms 
iter 6122: loss 2.4409, time 5026.75ms 
iter 6123: loss 2.5631, time 5022.54ms 
iter 6124: loss 2.4959, time 5110.38ms 
iter 6125: loss 2.5569, time 5214.01ms 
iter 6126: loss 2.7319, time 5307.32ms 
iter 6127: loss 2.7466, time 5155.65ms 
iter 6128: loss 2.7461, time 5116.47ms 
iter 6129: loss 2.6657, time 5074.97ms 
iter 6130: loss 2.6075, time 5130.73ms 
iter 6131: loss 2.5040, time 5202.73ms 
iter 6132: loss 2.4558, time 5294.33ms 
iter 6133: loss 2.3938, time 5090.65ms 
iter 6134: loss 2.7107, time 5067.51ms 
iter 6135: loss 2.5604, time 5087.84ms 
iter 6136: loss 2.4428, time 5205.20ms 
iter 6137: loss 2.4337, time 5168.18ms 
iter 6138: loss 2.4019, time 5204.28ms 
iter 6139: loss 2.4756, time 5295.19ms 
iter 6140: loss 2.5711, time 5164.31ms 
iter 6141: loss 2.5177, time 5192.36ms 
iter 6142: loss 2.5344, time 5167.52ms 
iter 6143: loss 2.3975, time 5237.70ms 
iter 6144: loss 2.5674, time 5147.91ms 
iter 6145: loss 2.4625, time 5253.98ms 
iter 6146: loss 2.4583, time 5298.18ms 
iter 6147: loss 2.4409, time 5211.87ms 
iter 6148: loss 2.6355, time 5152.90ms 
iter 6149: loss 2.3934, time 5099.08ms 
step 6150: train loss 2.5350, val loss 2.8374
iter 6150: loss 2.6335, time 20196.52ms 
iter 6151: loss 2.8278, time 5162.95ms 
iter 6152: loss 2.5939, time 5032.01ms 
iter 6153: loss 2.5201, time 5026.50ms 
iter 6154: loss 2.6117, time 5204.91ms 
iter 6155: loss 2.6567, time 5115.18ms 
iter 6156: loss 2.6020, time 5244.16ms 
iter 6157: loss 2.6692, time 5301.87ms 
iter 6158: loss 2.7786, time 5174.95ms 
iter 6159: loss 2.4984, time 5118.81ms 
iter 6160: loss 2.4685, time 5083.24ms 
iter 6161: loss 2.5919, time 5239.27ms 
iter 6162: loss 2.5324, time 5113.56ms 
iter 6163: loss 2.5009, time 5296.85ms 
iter 6164: loss 2.3155, time 5178.05ms 
iter 6165: loss 2.4872, time 5128.96ms 
iter 6166: loss 2.6252, time 5028.74ms 
iter 6167: loss 2.7460, time 4959.21ms 
iter 6168: loss 2.4303, time 5080.55ms 
iter 6169: loss 2.6631, time 5054.27ms 
iter 6170: loss 2.6208, time 5206.13ms 
iter 6171: loss 2.5774, time 5114.94ms 
iter 6172: loss 2.4735, time 5082.20ms 
iter 6173: loss 2.5974, time 5159.08ms 
iter 6174: loss 2.3381, time 5109.53ms 
iter 6175: loss 2.4977, time 5087.54ms 
iter 6176: loss 2.4648, time 5081.96ms 
iter 6177: loss 2.5602, time 5110.00ms 
iter 6178: loss 2.6221, time 5146.84ms 
iter 6179: loss 2.6702, time 5083.30ms 
iter 6180: loss 2.7009, time 5271.66ms 
iter 6181: loss 2.4973, time 5109.08ms 
iter 6182: loss 2.5337, time 5242.30ms 
iter 6183: loss 2.5582, time 5163.67ms 
iter 6184: loss 2.6838, time 5185.00ms 
iter 6185: loss 2.7525, time 5190.43ms 
iter 6186: loss 2.5655, time 5181.49ms 
iter 6187: loss 2.3477, time 5153.78ms 
iter 6188: loss 2.6146, time 5269.26ms 
iter 6189: loss 2.4866, time 5119.15ms 
iter 6190: loss 2.2153, time 5092.87ms 
iter 6191: loss 2.5333, time 5225.94ms 
iter 6192: loss 2.6099, time 5119.33ms 
iter 6193: loss 2.6936, time 5094.86ms 
iter 6194: loss 2.3118, time 5070.54ms 
iter 6195: loss 2.4323, time 5289.51ms 
iter 6196: loss 2.4510, time 5166.10ms 
iter 6197: loss 2.4261, time 5180.05ms 
iter 6198: loss 2.6100, time 5062.12ms 
iter 6199: loss 2.7641, time 5133.09ms 
step 6200: train loss 2.5358, val loss 2.8423
iter 6200: loss 2.6825, time 19926.63ms 
iter 6201: loss 2.7120, time 5199.81ms 
iter 6202: loss 2.4522, time 5132.27ms 
iter 6203: loss 2.5333, time 5134.44ms 
iter 6204: loss 2.5608, time 5040.79ms 
iter 6205: loss 2.4417, time 5087.41ms 
iter 6206: loss 2.7438, time 5037.45ms 
iter 6207: loss 2.5902, time 5036.79ms 
iter 6208: loss 2.6371, time 5034.54ms 
iter 6209: loss 2.6186, time 4994.27ms 
iter 6210: loss 2.5828, time 4991.29ms 
iter 6211: loss 2.6363, time 4992.19ms 
iter 6212: loss 2.5645, time 5130.81ms 
iter 6213: loss 2.6254, time 5164.15ms 
iter 6214: loss 2.1532, time 5269.24ms 
iter 6215: loss 2.7374, time 5119.14ms 
iter 6216: loss 2.6021, time 5109.92ms 
iter 6217: loss 2.6560, time 5285.58ms 
iter 6218: loss 2.4767, time 5168.14ms 
iter 6219: loss 2.5862, time 5249.20ms 
iter 6220: loss 2.6719, time 5125.07ms 
iter 6221: loss 2.4022, time 5236.27ms 
iter 6222: loss 2.5412, time 5294.49ms 
iter 6223: loss 2.4451, time 5165.08ms 
iter 6224: loss 2.4818, time 5079.31ms 
iter 6225: loss 2.4048, time 5083.35ms 
iter 6226: loss 2.5556, time 5242.71ms 
iter 6227: loss 2.2895, time 5228.50ms 
iter 6228: loss 2.6470, time 5146.83ms 
iter 6229: loss 2.5732, time 5309.85ms 
iter 6230: loss 2.2961, time 5229.86ms 
iter 6231: loss 2.6002, time 5218.04ms 
iter 6232: loss 2.4555, time 5091.06ms 
iter 6233: loss 2.4379, time 5187.46ms 
iter 6234: loss 2.4888, time 5008.10ms 
iter 6235: loss 2.5900, time 5305.72ms 
iter 6236: loss 2.5958, time 5259.94ms 
iter 6237: loss 2.5394, time 5152.40ms 
iter 6238: loss 2.5341, time 5059.63ms 
iter 6239: loss 2.5865, time 5032.46ms 
iter 6240: loss 2.6604, time 5193.45ms 
iter 6241: loss 2.5901, time 5076.13ms 
iter 6242: loss 2.4533, time 5331.48ms 
iter 6243: loss 2.5546, time 5336.20ms 
iter 6244: loss 2.5057, time 5183.08ms 
iter 6245: loss 2.6536, time 5219.44ms 
iter 6246: loss 2.6060, time 5163.75ms 
iter 6247: loss 2.5944, time 5242.58ms 
iter 6248: loss 2.5309, time 5129.32ms 
iter 6249: loss 2.3595, time 5268.27ms 
step 6250: train loss 2.5182, val loss 2.8421
iter 6250: loss 2.4046, time 19977.45ms 
iter 6251: loss 2.4241, time 5135.21ms 
iter 6252: loss 2.6367, time 5148.96ms 
iter 6253: loss 2.6572, time 5193.61ms 
iter 6254: loss 2.3147, time 5301.75ms 
iter 6255: loss 2.6135, time 5115.23ms 
iter 6256: loss 2.4173, time 5196.46ms 
iter 6257: loss 2.4309, time 5162.08ms 
iter 6258: loss 2.7113, time 5279.87ms 
iter 6259: loss 2.5970, time 5153.20ms 
iter 6260: loss 2.4561, time 5289.44ms 
iter 6261: loss 2.4920, time 5298.43ms 
iter 6262: loss 2.5470, time 5129.65ms 
iter 6263: loss 2.4630, time 5099.72ms 
iter 6264: loss 2.5334, time 5132.63ms 
iter 6265: loss 2.5449, time 5072.69ms 
iter 6266: loss 2.7173, time 5105.84ms 
iter 6267: loss 2.5294, time 5141.72ms 
iter 6268: loss 2.6737, time 5245.71ms 
iter 6269: loss 2.5156, time 5095.45ms 
iter 6270: loss 2.4977, time 5241.41ms 
iter 6271: loss 2.6778, time 5217.65ms 
iter 6272: loss 2.3513, time 5301.70ms 
iter 6273: loss 2.5837, time 5178.37ms 
iter 6274: loss 2.7601, time 5152.68ms 
iter 6275: loss 2.2956, time 5283.58ms 
iter 6276: loss 2.5725, time 5130.00ms 
iter 6277: loss 2.5685, time 5221.52ms 
iter 6278: loss 2.7393, time 5136.86ms 
iter 6279: loss 2.3913, time 5117.43ms 
iter 6280: loss 2.6422, time 5219.82ms 
iter 6281: loss 2.6703, time 5297.77ms 
iter 6282: loss 2.6075, time 5149.98ms 
iter 6283: loss 2.6852, time 5213.21ms 
iter 6284: loss 2.4625, time 5219.16ms 
iter 6285: loss 2.5748, time 5202.12ms 
iter 6286: loss 2.5905, time 5102.38ms 
iter 6287: loss 2.5110, time 5151.87ms 
iter 6288: loss 2.4918, time 5282.87ms 
iter 6289: loss 2.4079, time 5170.77ms 
iter 6290: loss 2.5760, time 5160.43ms 
iter 6291: loss 2.6039, time 5131.05ms 
iter 6292: loss 2.8076, time 5300.77ms 
iter 6293: loss 2.4080, time 5164.42ms 
iter 6294: loss 2.7125, time 5307.36ms 
iter 6295: loss 2.4944, time 5306.79ms 
iter 6296: loss 2.5290, time 5118.99ms 
iter 6297: loss 2.4826, time 4998.11ms 
iter 6298: loss 2.4092, time 4997.63ms 
iter 6299: loss 2.5121, time 4973.21ms 
step 6300: train loss 2.5394, val loss 2.8292
iter 6300: loss 2.4413, time 19698.34ms 
iter 6301: loss 2.5761, time 5156.55ms 
iter 6302: loss 2.5674, time 5129.46ms 
iter 6303: loss 2.4666, time 5073.76ms 
iter 6304: loss 2.5089, time 5044.80ms 
iter 6305: loss 2.6368, time 5042.23ms 
iter 6306: loss 2.3836, time 5149.86ms 
iter 6307: loss 2.5714, time 5097.22ms 
iter 6308: loss 2.4552, time 5173.27ms 
iter 6309: loss 2.6955, time 5076.48ms 
iter 6310: loss 2.5966, time 5044.02ms 
iter 6311: loss 2.5916, time 5030.70ms 
iter 6312: loss 2.5502, time 5228.34ms 
iter 6313: loss 2.5938, time 5215.78ms 
iter 6314: loss 2.5075, time 5081.38ms 
iter 6315: loss 2.5816, time 5266.87ms 
iter 6316: loss 2.4436, time 5191.96ms 
iter 6317: loss 2.4074, time 5139.61ms 
iter 6318: loss 2.4755, time 5180.85ms 
iter 6319: loss 2.4453, time 5169.75ms 
iter 6320: loss 2.5427, time 5268.74ms 
iter 6321: loss 2.5263, time 5131.29ms 
iter 6322: loss 2.4423, time 5088.98ms 
iter 6323: loss 2.5141, time 5154.46ms 
iter 6324: loss 2.6081, time 5242.03ms 
iter 6325: loss 2.5491, time 5144.06ms 
iter 6326: loss 2.3635, time 5285.59ms 
iter 6327: loss 2.4519, time 5186.30ms 
iter 6328: loss 2.3278, time 5053.01ms 
iter 6329: loss 2.3992, time 5022.69ms 
iter 6330: loss 2.5325, time 4982.35ms 
iter 6331: loss 2.5372, time 5044.45ms 
iter 6332: loss 2.6558, time 4995.93ms 
iter 6333: loss 2.5807, time 4994.64ms 
iter 6334: loss 2.7798, time 5136.62ms 
iter 6335: loss 2.6222, time 5083.39ms 
iter 6336: loss 2.5633, time 5044.96ms 
iter 6337: loss 2.5796, time 5058.51ms 
iter 6338: loss 2.3307, time 5230.00ms 
iter 6339: loss 2.4689, time 5012.87ms 
iter 6340: loss 2.4821, time 5024.94ms 
iter 6341: loss 2.6272, time 5229.21ms 
iter 6342: loss 2.3323, time 5028.73ms 
iter 6343: loss 2.6101, time 5159.80ms 
iter 6344: loss 2.4767, time 4980.26ms 
iter 6345: loss 2.8376, time 5015.50ms 
iter 6346: loss 2.4226, time 5077.12ms 
iter 6347: loss 2.5979, time 5238.44ms 
iter 6348: loss 2.6306, time 5148.46ms 
iter 6349: loss 2.5263, time 5056.36ms 
step 6350: train loss 2.5289, val loss 2.8407
iter 6350: loss 2.5296, time 19705.24ms 
iter 6351: loss 2.4709, time 4975.82ms 
iter 6352: loss 2.7956, time 5258.76ms 
iter 6353: loss 2.6859, time 5279.20ms 
iter 6354: loss 2.6052, time 5139.92ms 
iter 6355: loss 2.6765, time 5005.64ms 
iter 6356: loss 2.5329, time 5080.00ms 
iter 6357: loss 2.6422, time 5138.73ms 
iter 6358: loss 2.6388, time 5120.46ms 
iter 6359: loss 2.4417, time 5232.04ms 
iter 6360: loss 2.4727, time 5017.45ms 
iter 6361: loss 2.4863, time 4979.77ms 
iter 6362: loss 2.4963, time 5170.71ms 
iter 6363: loss 2.5430, time 5085.29ms 
iter 6364: loss 2.5364, time 5249.45ms 
iter 6365: loss 2.5553, time 5086.35ms 
iter 6366: loss 2.6085, time 5233.31ms 
iter 6367: loss 2.5250, time 5088.83ms 
iter 6368: loss 2.6304, time 5171.00ms 
iter 6369: loss 2.4936, time 5184.19ms 
iter 6370: loss 2.8264, time 5179.18ms 
iter 6371: loss 2.7315, time 5160.57ms 
iter 6372: loss 2.5977, time 5096.71ms 
iter 6373: loss 2.3326, time 5252.67ms 
iter 6374: loss 2.4725, time 5144.11ms 
iter 6375: loss 2.4557, time 5096.02ms 
iter 6376: loss 2.4210, time 5210.82ms 
iter 6377: loss 2.3125, time 5198.61ms 
iter 6378: loss 2.3835, time 5118.95ms 
iter 6379: loss 2.5800, time 5175.65ms 
iter 6380: loss 2.4560, time 5174.95ms 
iter 6381: loss 2.3674, time 5277.26ms 
iter 6382: loss 2.6426, time 5222.99ms 
iter 6383: loss 2.6712, time 5126.57ms 
iter 6384: loss 2.5398, time 5198.28ms 
iter 6385: loss 2.4958, time 5271.51ms 
iter 6386: loss 2.6207, time 5204.22ms 
iter 6387: loss 2.5164, time 5256.69ms 
iter 6388: loss 2.6852, time 5122.90ms 
iter 6389: loss 2.6924, time 5062.01ms 
iter 6390: loss 2.5665, time 5012.39ms 
iter 6391: loss 2.4326, time 4985.47ms 
iter 6392: loss 2.6124, time 5002.93ms 
iter 6393: loss 2.3328, time 4991.86ms 
iter 6394: loss 2.4521, time 4991.23ms 
iter 6395: loss 2.3359, time 5179.02ms 
iter 6396: loss 2.4797, time 5157.96ms 
iter 6397: loss 2.5512, time 5302.55ms 
iter 6398: loss 2.5309, time 5156.85ms 
iter 6399: loss 2.3271, time 5293.29ms 
step 6400: train loss 2.5545, val loss 2.8314
iter 6400: loss 2.7354, time 19849.65ms 
iter 6401: loss 2.6286, time 4993.31ms 
iter 6402: loss 2.5258, time 5050.26ms 
iter 6403: loss 2.5050, time 5056.33ms 
iter 6404: loss 2.6380, time 5051.17ms 
iter 6405: loss 2.4709, time 5258.69ms 
iter 6406: loss 2.6463, time 5272.62ms 
iter 6407: loss 2.5921, time 5123.43ms 
iter 6408: loss 2.4265, time 5232.71ms 
iter 6409: loss 2.1476, time 5071.69ms 
iter 6410: loss 2.6145, time 5053.30ms 
iter 6411: loss 2.6022, time 5046.94ms 
iter 6412: loss 2.6877, time 5092.97ms 
iter 6413: loss 2.5335, time 5277.89ms 
iter 6414: loss 2.7206, time 5091.81ms 
iter 6415: loss 2.3067, time 5084.93ms 
iter 6416: loss 2.6358, time 5051.79ms 
iter 6417: loss 2.5145, time 5262.46ms 
iter 6418: loss 2.7544, time 5172.10ms 
iter 6419: loss 2.7871, time 5264.04ms 
iter 6420: loss 2.5384, time 5123.11ms 
iter 6421: loss 2.5884, time 4996.77ms 
iter 6422: loss 2.6151, time 5186.55ms 
iter 6423: loss 2.3851, time 5129.51ms 
iter 6424: loss 2.3996, time 5237.75ms 
iter 6425: loss 2.6338, time 5130.27ms 
iter 6426: loss 2.5919, time 5178.50ms 
iter 6427: loss 2.4905, time 5132.76ms 
iter 6428: loss 2.5629, time 5074.67ms 
iter 6429: loss 2.3384, time 5049.83ms 
iter 6430: loss 2.4648, time 5162.60ms 
iter 6431: loss 2.7232, time 5041.86ms 
iter 6432: loss 2.7517, time 5030.19ms 
iter 6433: loss 2.6142, time 5209.21ms 
iter 6434: loss 2.5960, time 5280.34ms 
iter 6435: loss 2.6537, time 5110.65ms 
iter 6436: loss 2.6363, time 5276.67ms 
iter 6437: loss 2.3304, time 5201.52ms 
iter 6438: loss 2.4191, time 5057.22ms 
iter 6439: loss 2.6328, time 5156.10ms 
iter 6440: loss 2.5903, time 5187.85ms 
iter 6441: loss 2.6233, time 5258.80ms 
iter 6442: loss 2.5816, time 5135.33ms 
iter 6443: loss 2.6414, time 5093.28ms 
iter 6444: loss 2.5696, time 5190.37ms 
iter 6445: loss 2.4385, time 5197.70ms 
iter 6446: loss 2.4468, time 5108.56ms 
iter 6447: loss 2.3849, time 5277.49ms 
iter 6448: loss 2.7096, time 5271.48ms 
iter 6449: loss 2.5778, time 5048.03ms 
step 6450: train loss 2.5187, val loss 2.8301
iter 6450: loss 2.7104, time 19821.80ms 
iter 6451: loss 2.4492, time 5210.27ms 
iter 6452: loss 2.4873, time 5286.60ms 
iter 6453: loss 2.6868, time 5134.56ms 
iter 6454: loss 2.5090, time 5254.59ms 
iter 6455: loss 2.4526, time 5091.52ms 
iter 6456: loss 2.4518, time 5150.56ms 
iter 6457: loss 2.6338, time 5086.59ms 
iter 6458: loss 2.6583, time 5252.33ms 
iter 6459: loss 2.6093, time 5256.96ms 
iter 6460: loss 2.6140, time 5135.44ms 
iter 6461: loss 2.7128, time 5275.29ms 
iter 6462: loss 2.4002, time 5170.36ms 
iter 6463: loss 2.4110, time 5166.35ms 
iter 6464: loss 2.4813, time 5051.98ms 
iter 6465: loss 2.4917, time 5210.97ms 
iter 6466: loss 2.3253, time 5236.36ms 
iter 6467: loss 2.5038, time 5117.17ms 
iter 6468: loss 2.4344, time 5173.01ms 
iter 6469: loss 2.6240, time 5129.34ms 
iter 6470: loss 2.1500, time 5267.89ms 
iter 6471: loss 2.4442, time 5140.70ms 
iter 6472: loss 2.4292, time 5241.56ms 
iter 6473: loss 2.5437, time 5274.82ms 
iter 6474: loss 2.7471, time 5123.52ms 
iter 6475: loss 2.5023, time 5181.51ms 
iter 6476: loss 2.4916, time 5143.22ms 
iter 6477: loss 2.5837, time 5278.22ms 
iter 6478: loss 2.5558, time 5146.01ms 
iter 6479: loss 2.4917, time 5220.63ms 
iter 6480: loss 2.5040, time 5281.45ms 
iter 6481: loss 2.6058, time 5126.08ms 
iter 6482: loss 2.7346, time 5175.08ms 
iter 6483: loss 2.4738, time 5091.41ms 
iter 6484: loss 2.5026, time 5258.20ms 
iter 6485: loss 2.5275, time 5114.60ms 
iter 6486: loss 2.5374, time 5243.58ms 
iter 6487: loss 2.5708, time 5275.45ms 
iter 6488: loss 2.6039, time 5076.59ms 
iter 6489: loss 2.7623, time 5044.86ms 
iter 6490: loss 2.5064, time 5055.85ms 
iter 6491: loss 2.5495, time 5150.87ms 
iter 6492: loss 2.4872, time 5133.59ms 
iter 6493: loss 2.6219, time 5207.79ms 
iter 6494: loss 2.7282, time 5280.96ms 
iter 6495: loss 2.5904, time 5258.44ms 
iter 6496: loss 2.5626, time 5138.29ms 
iter 6497: loss 2.5021, time 5133.95ms 
iter 6498: loss 2.5570, time 5146.57ms 
iter 6499: loss 2.5166, time 5203.41ms 
step 6500: train loss 2.5370, val loss 2.8372
iter 6500: loss 2.5180, time 19843.03ms 
iter 6501: loss 2.5641, time 4944.38ms 
iter 6502: loss 2.4377, time 4967.53ms 
iter 6503: loss 2.3789, time 5234.07ms 
iter 6504: loss 2.4931, time 5126.11ms 
iter 6505: loss 2.5490, time 5273.57ms 
iter 6506: loss 2.7324, time 5271.94ms 
iter 6507: loss 2.4657, time 5078.58ms 
iter 6508: loss 2.5102, time 5049.13ms 
iter 6509: loss 2.3451, time 5111.23ms 
iter 6510: loss 2.5244, time 5207.42ms 
iter 6511: loss 2.6327, time 5180.74ms 
iter 6512: loss 2.8665, time 5278.89ms 
iter 6513: loss 2.8114, time 5281.94ms 
iter 6514: loss 2.5040, time 5113.44ms 
iter 6515: loss 2.4319, time 5145.79ms 
iter 6516: loss 2.4931, time 5094.22ms 
iter 6517: loss 2.1370, time 5264.79ms 
iter 6518: loss 2.4339, time 5160.75ms 
iter 6519: loss 2.4669, time 5277.21ms 
iter 6520: loss 2.4966, time 5273.54ms 
iter 6521: loss 2.5388, time 5122.83ms 
iter 6522: loss 2.5623, time 5077.90ms 
iter 6523: loss 2.3232, time 5075.17ms 
iter 6524: loss 2.5340, time 5258.04ms 
iter 6525: loss 2.5198, time 5137.05ms 
iter 6526: loss 2.6494, time 5262.21ms 
iter 6527: loss 2.6802, time 5269.44ms 
iter 6528: loss 2.5593, time 5140.57ms 
iter 6529: loss 2.3109, time 5083.31ms 
iter 6530: loss 2.6052, time 5048.74ms 
iter 6531: loss 2.6791, time 4970.88ms 
iter 6532: loss 2.4173, time 5017.02ms 
iter 6533: loss 2.4217, time 5208.79ms 
iter 6534: loss 2.4992, time 5129.02ms 
iter 6535: loss 2.5047, time 5070.82ms 
iter 6536: loss 2.6088, time 5117.13ms 
iter 6537: loss 2.6224, time 5079.28ms 
iter 6538: loss 2.4255, time 5047.83ms 
iter 6539: loss 2.7869, time 5053.24ms 
iter 6540: loss 2.3644, time 5238.82ms 
iter 6541: loss 2.4939, time 5132.88ms 
iter 6542: loss 2.4597, time 5174.60ms 
iter 6543: loss 2.5610, time 5108.23ms 
iter 6544: loss 2.7010, time 5091.65ms 
iter 6545: loss 2.5394, time 5051.60ms 
iter 6546: loss 2.6628, time 5045.50ms 
iter 6547: loss 2.7087, time 5056.26ms 
iter 6548: loss 2.6615, time 5202.55ms 
iter 6549: loss 2.6575, time 5060.42ms 
step 6550: train loss 2.5280, val loss 2.8552
iter 6550: loss 2.5564, time 19829.07ms 
iter 6551: loss 2.3886, time 4995.29ms 
iter 6552: loss 2.7263, time 5194.59ms 
iter 6553: loss 2.5558, time 5074.58ms 
iter 6554: loss 2.2641, time 5173.65ms 
iter 6555: loss 2.5642, time 5087.38ms 
iter 6556: loss 2.5246, time 5210.89ms 
iter 6557: loss 2.5974, time 5133.37ms 
iter 6558: loss 2.4692, time 5127.14ms 
iter 6559: loss 2.5603, time 5182.25ms 
iter 6560: loss 2.5909, time 5094.43ms 
iter 6561: loss 2.5939, time 5208.82ms 
iter 6562: loss 2.5042, time 5226.72ms 
iter 6563: loss 2.5330, time 5203.34ms 
iter 6564: loss 2.3991, time 5057.25ms 
iter 6565: loss 2.5184, time 5215.17ms 
iter 6566: loss 2.5427, time 5092.63ms 
iter 6567: loss 2.5164, time 5090.13ms 
iter 6568: loss 2.3900, time 5090.20ms 
iter 6569: loss 2.5895, time 5166.03ms 
iter 6570: loss 2.2926, time 5196.57ms 
iter 6571: loss 2.3328, time 5187.49ms 
iter 6572: loss 2.7332, time 5283.26ms 
iter 6573: loss 2.4143, time 5185.83ms 
iter 6574: loss 2.3436, time 5149.88ms 
iter 6575: loss 2.4910, time 5269.40ms 
iter 6576: loss 2.5552, time 5228.47ms 
iter 6577: loss 2.5071, time 5229.60ms 
iter 6578: loss 2.6440, time 5095.31ms 
iter 6579: loss 2.6517, time 5267.25ms 
iter 6580: loss 2.4873, time 5279.34ms 
iter 6581: loss 2.4672, time 5097.12ms 
iter 6582: loss 2.6813, time 5140.26ms 
iter 6583: loss 2.5656, time 5033.89ms 
iter 6584: loss 2.4799, time 5148.49ms 
iter 6585: loss 2.4002, time 5148.15ms 
iter 6586: loss 2.5050, time 5111.76ms 
iter 6587: loss 2.5153, time 5265.18ms 
iter 6588: loss 2.6497, time 5264.45ms 
iter 6589: loss 2.3619, time 5137.43ms 
iter 6590: loss 2.7226, time 5260.64ms 
iter 6591: loss 2.5438, time 5134.35ms 
iter 6592: loss 2.4590, time 5184.04ms 
iter 6593: loss 2.6131, time 5132.00ms 
iter 6594: loss 2.8012, time 5238.52ms 
iter 6595: loss 2.8295, time 5264.90ms 
iter 6596: loss 2.2732, time 5129.02ms 
iter 6597: loss 2.6806, time 5092.67ms 
iter 6598: loss 2.4232, time 5083.54ms 
iter 6599: loss 2.6084, time 5247.67ms 
step 6600: train loss 2.5218, val loss 2.8195
iter 6600: loss 2.6084, time 19852.26ms 
iter 6601: loss 2.4041, time 5116.83ms 
iter 6602: loss 2.6145, time 5150.36ms 
iter 6603: loss 2.6837, time 5105.73ms 
iter 6604: loss 2.3273, time 5063.89ms 
iter 6605: loss 2.6393, time 5242.00ms 
iter 6606: loss 2.5227, time 5057.88ms 
iter 6607: loss 2.4598, time 5156.30ms 
iter 6608: loss 2.4023, time 5016.55ms 
iter 6609: loss 2.6242, time 5192.82ms 
iter 6610: loss 2.4461, time 5220.11ms 
iter 6611: loss 2.7003, time 5197.30ms 
iter 6612: loss 2.6157, time 5256.92ms 
iter 6613: loss 2.4373, time 5145.94ms 
iter 6614: loss 2.5416, time 5165.61ms 
iter 6615: loss 2.6804, time 5158.22ms 
iter 6616: loss 2.5870, time 5176.15ms 
iter 6617: loss 2.6086, time 5157.53ms 
iter 6618: loss 2.3487, time 5276.05ms 
iter 6619: loss 2.4456, time 5144.23ms 
iter 6620: loss 2.6921, time 5248.28ms 
iter 6621: loss 2.2308, time 5238.37ms 
iter 6622: loss 2.5553, time 5278.88ms 
iter 6623: loss 2.5153, time 5133.56ms 
iter 6624: loss 2.3569, time 5262.52ms 
iter 6625: loss 2.4205, time 5176.02ms 
iter 6626: loss 2.3316, time 5027.34ms 
iter 6627: loss 2.6841, time 5264.64ms 
iter 6628: loss 2.4958, time 5280.40ms 
iter 6629: loss 2.3714, time 5268.92ms 
iter 6630: loss 2.3723, time 5275.28ms 
iter 6631: loss 2.5038, time 5277.07ms 
iter 6632: loss 2.7441, time 5239.79ms 
iter 6633: loss 2.5545, time 5258.78ms 
iter 6634: loss 2.6324, time 5269.62ms 
iter 6635: loss 2.3746, time 5284.58ms 
iter 6636: loss 2.5498, time 5276.86ms 
iter 6637: loss 2.4869, time 5263.91ms 
iter 6638: loss 2.3218, time 5268.74ms 
iter 6639: loss 2.5349, time 5263.47ms 
iter 6640: loss 2.5151, time 5265.52ms 
iter 6641: loss 2.5859, time 5275.75ms 
iter 6642: loss 2.3952, time 5268.81ms 
iter 6643: loss 2.4780, time 5276.31ms 
iter 6644: loss 2.3831, time 5263.91ms 
iter 6645: loss 2.5661, time 5190.55ms 
iter 6646: loss 2.7136, time 5154.50ms 
iter 6647: loss 2.4362, time 5289.14ms 
iter 6648: loss 2.5065, time 5282.31ms 
iter 6649: loss 2.6124, time 5297.80ms 
step 6650: train loss 2.5094, val loss 2.8352
iter 6650: loss 2.4932, time 20117.69ms 
iter 6651: loss 2.6463, time 5093.59ms 
iter 6652: loss 2.4965, time 5109.76ms 
iter 6653: loss 2.5877, time 5254.78ms 
iter 6654: loss 2.6827, time 5132.07ms 
iter 6655: loss 2.5507, time 5193.60ms 
iter 6656: loss 2.1844, time 5208.04ms 
iter 6657: loss 2.4645, time 5112.69ms 
iter 6658: loss 2.5383, time 5246.31ms 
iter 6659: loss 2.6256, time 5041.32ms 
iter 6660: loss 2.5235, time 5017.54ms 
iter 6661: loss 2.4660, time 5203.42ms 
iter 6662: loss 2.5177, time 5082.62ms 
iter 6663: loss 2.5221, time 5065.01ms 
iter 6664: loss 2.4565, time 5049.89ms 
iter 6665: loss 2.7245, time 5197.20ms 
iter 6666: loss 2.5411, time 5257.56ms 
iter 6667: loss 2.5813, time 5152.85ms 
iter 6668: loss 2.7202, time 5125.28ms 
iter 6669: loss 2.3302, time 5273.75ms 
iter 6670: loss 2.4683, time 5106.16ms 
iter 6671: loss 2.3231, time 5114.93ms 
iter 6672: loss 2.6038, time 5080.60ms 
iter 6673: loss 2.5160, time 5238.72ms 
iter 6674: loss 2.4474, time 5168.58ms 
iter 6675: loss 2.4380, time 5135.26ms 
iter 6676: loss 2.6156, time 5230.41ms 
iter 6677: loss 2.5354, time 5283.95ms 
iter 6678: loss 2.6012, time 5124.00ms 
iter 6679: loss 2.4832, time 5122.16ms 
iter 6680: loss 2.4812, time 5132.22ms 
iter 6681: loss 2.6612, time 5186.68ms 
iter 6682: loss 2.6464, time 5088.28ms 
iter 6683: loss 2.5498, time 5238.27ms 
iter 6684: loss 2.4148, time 5221.35ms 
iter 6685: loss 2.2378, time 5137.92ms 
iter 6686: loss 2.4757, time 5181.52ms 
iter 6687: loss 2.8029, time 5127.84ms 
iter 6688: loss 2.5584, time 5209.74ms 
iter 6689: loss 2.5435, time 5269.13ms 
iter 6690: loss 2.2208, time 5139.02ms 
iter 6691: loss 2.4364, time 5249.10ms 
iter 6692: loss 2.6344, time 5137.77ms 
iter 6693: loss 2.5319, time 5084.86ms 
iter 6694: loss 2.7711, time 5228.28ms 
iter 6695: loss 2.4115, time 5117.84ms 
iter 6696: loss 2.5430, time 5108.41ms 
iter 6697: loss 2.5564, time 5124.67ms 
iter 6698: loss 2.7571, time 5198.52ms 
iter 6699: loss 2.2476, time 5086.22ms 
step 6700: train loss 2.5246, val loss 2.8475
iter 6700: loss 2.4100, time 19896.33ms 
iter 6701: loss 2.6394, time 5238.70ms 
iter 6702: loss 2.6028, time 5147.40ms 
iter 6703: loss 2.6630, time 5275.63ms 
iter 6704: loss 2.4075, time 5257.25ms 
iter 6705: loss 2.5797, time 5083.02ms 
iter 6706: loss 2.3811, time 5013.79ms 
iter 6707: loss 2.4155, time 4989.75ms 
iter 6708: loss 2.3824, time 5043.90ms 
iter 6709: loss 2.7746, time 5266.36ms 
iter 6710: loss 2.4043, time 5131.61ms 
iter 6711: loss 2.6559, time 5145.20ms 
iter 6712: loss 2.3407, time 5260.81ms 
iter 6713: loss 2.3713, time 5155.67ms 
iter 6714: loss 2.6498, time 5131.80ms 
iter 6715: loss 2.6672, time 5077.69ms 
iter 6716: loss 2.6374, time 5247.30ms 
iter 6717: loss 2.4925, time 5098.29ms 
iter 6718: loss 2.5887, time 5058.28ms 
iter 6719: loss 2.4212, time 5185.22ms 
iter 6720: loss 2.4182, time 5043.12ms 
iter 6721: loss 2.4725, time 5010.69ms 
iter 6722: loss 2.5865, time 5120.37ms 
iter 6723: loss 2.4484, time 5011.42ms 
iter 6724: loss 2.4675, time 5230.94ms 
iter 6725: loss 2.4784, time 5133.39ms 
iter 6726: loss 2.5044, time 5014.25ms 
iter 6727: loss 2.4060, time 5197.63ms 
iter 6728: loss 2.8681, time 5053.77ms 
iter 6729: loss 2.4289, time 4974.68ms 
iter 6730: loss 2.4231, time 4966.39ms 
iter 6731: loss 2.5689, time 4965.19ms 
iter 6732: loss 2.4002, time 5225.69ms 
iter 6733: loss 2.5432, time 5233.26ms 
iter 6734: loss 2.4124, time 5242.04ms 
iter 6735: loss 2.6786, time 5287.78ms 
iter 6736: loss 2.6602, time 5133.87ms 
iter 6737: loss 2.5010, time 5142.98ms 
iter 6738: loss 2.5437, time 5088.92ms 
iter 6739: loss 2.8508, time 5202.02ms 
iter 6740: loss 2.7781, time 5252.65ms 
iter 6741: loss 2.6057, time 5129.09ms 
iter 6742: loss 2.3452, time 5183.16ms 
iter 6743: loss 2.6337, time 5089.59ms 
iter 6744: loss 2.2946, time 5052.37ms 
iter 6745: loss 2.6028, time 5044.06ms 
iter 6746: loss 2.6272, time 5056.16ms 
iter 6747: loss 2.5848, time 5272.96ms 
iter 6748: loss 2.4376, time 5130.98ms 
iter 6749: loss 2.4059, time 5141.43ms 
step 6750: train loss 2.5204, val loss 2.8346
iter 6750: loss 2.5533, time 19943.82ms 
iter 6751: loss 2.6071, time 5179.28ms 
iter 6752: loss 2.3248, time 5130.55ms 
iter 6753: loss 2.7712, time 5115.15ms 
iter 6754: loss 2.7768, time 5160.96ms 
iter 6755: loss 2.6986, time 5253.68ms 
iter 6756: loss 2.5860, time 5149.33ms 
iter 6757: loss 2.3155, time 5050.97ms 
iter 6758: loss 2.3352, time 5095.48ms 
iter 6759: loss 2.5293, time 5270.75ms 
iter 6760: loss 2.2821, time 5207.97ms 
iter 6761: loss 2.7132, time 5270.49ms 
iter 6762: loss 2.6358, time 5272.21ms 
iter 6763: loss 2.3667, time 5121.68ms 
iter 6764: loss 2.2661, time 5146.81ms 
iter 6765: loss 2.3988, time 5230.04ms 
iter 6766: loss 2.6261, time 5129.44ms 
iter 6767: loss 2.4005, time 5225.67ms 
iter 6768: loss 2.3054, time 5121.33ms 
iter 6769: loss 2.3532, time 5116.13ms 
iter 6770: loss 2.3906, time 5125.93ms 
iter 6771: loss 2.6008, time 5144.19ms 
iter 6772: loss 2.4223, time 5110.19ms 
iter 6773: loss 2.3959, time 5121.78ms 
iter 6774: loss 2.6485, time 5122.50ms 
iter 6775: loss 2.5631, time 5085.51ms 
iter 6776: loss 2.4428, time 5061.29ms 
iter 6777: loss 2.8188, time 5248.05ms 
iter 6778: loss 2.5438, time 5139.06ms 
iter 6779: loss 2.3603, time 5147.74ms 
iter 6780: loss 2.5404, time 5265.32ms 
iter 6781: loss 2.5561, time 5078.41ms 
iter 6782: loss 2.3803, time 5012.16ms 
iter 6783: loss 2.3263, time 5012.66ms 
iter 6784: loss 2.4755, time 5107.40ms 
iter 6785: loss 2.4334, time 5258.92ms 
iter 6786: loss 2.4147, time 5202.10ms 
iter 6787: loss 2.7207, time 5078.69ms 
iter 6788: loss 2.4636, time 5260.03ms 
iter 6789: loss 2.5326, time 5027.20ms 
iter 6790: loss 2.6706, time 4975.06ms 
iter 6791: loss 2.2473, time 4967.21ms 
iter 6792: loss 2.5608, time 5109.97ms 
iter 6793: loss 2.4742, time 5123.13ms 
iter 6794: loss 2.5292, time 5079.41ms 
iter 6795: loss 2.6138, time 5178.77ms 
iter 6796: loss 2.4506, time 5264.76ms 
iter 6797: loss 2.4175, time 5075.35ms 
iter 6798: loss 2.5387, time 5191.48ms 
iter 6799: loss 2.5390, time 5124.69ms 
step 6800: train loss 2.5213, val loss 2.8578
iter 6800: loss 2.3228, time 20049.78ms 
iter 6801: loss 2.5533, time 5269.71ms 
iter 6802: loss 2.4008, time 5130.80ms 
iter 6803: loss 2.5337, time 5146.92ms 
iter 6804: loss 2.4071, time 5126.07ms 
iter 6805: loss 2.5503, time 5160.50ms 
iter 6806: loss 2.5345, time 5138.35ms 
iter 6807: loss 2.6291, time 5087.61ms 
iter 6808: loss 2.4530, time 5267.71ms 
iter 6809: loss 2.3897, time 5194.51ms 
iter 6810: loss 2.6423, time 5144.13ms 
iter 6811: loss 2.7780, time 5105.95ms 
iter 6812: loss 2.6395, time 5047.25ms 
iter 6813: loss 2.5676, time 5227.25ms 
iter 6814: loss 2.7539, time 5258.91ms 
iter 6815: loss 2.4738, time 5140.77ms 
iter 6816: loss 2.6834, time 5279.24ms 
iter 6817: loss 2.4923, time 5212.70ms 
iter 6818: loss 2.4996, time 5121.96ms 
iter 6819: loss 2.8161, time 5123.42ms 
iter 6820: loss 2.5920, time 5166.21ms 
iter 6821: loss 2.5030, time 5132.77ms 
iter 6822: loss 2.8145, time 5015.43ms 
iter 6823: loss 2.3469, time 5159.86ms 
iter 6824: loss 2.6667, time 5188.38ms 
iter 6825: loss 2.5259, time 5078.14ms 
iter 6826: loss 2.6022, time 5184.09ms 
iter 6827: loss 2.4463, time 5119.25ms 
iter 6828: loss 2.1659, time 5208.41ms 
iter 6829: loss 2.6105, time 5126.10ms 
iter 6830: loss 2.6504, time 5082.56ms 
iter 6831: loss 2.4267, time 5213.50ms 
iter 6832: loss 2.7112, time 5239.89ms 
iter 6833: loss 2.9018, time 5118.28ms 
iter 6834: loss 2.1917, time 5035.20ms 
iter 6835: loss 2.6477, time 5030.14ms 
iter 6836: loss 2.5644, time 5203.66ms 
iter 6837: loss 2.7282, time 5123.41ms 
iter 6838: loss 2.5986, time 5117.43ms 
iter 6839: loss 2.5180, time 5268.14ms 
iter 6840: loss 2.5757, time 5129.95ms 
iter 6841: loss 2.6827, time 5031.23ms 
iter 6842: loss 2.5033, time 5007.33ms 
iter 6843: loss 2.5319, time 4981.83ms 
iter 6844: loss 2.4846, time 5250.61ms 
iter 6845: loss 2.5276, time 5139.98ms 
iter 6846: loss 2.5809, time 5170.73ms 
iter 6847: loss 2.4320, time 5257.23ms 
iter 6848: loss 2.4674, time 5120.48ms 
iter 6849: loss 2.3233, time 5078.40ms 
step 6850: train loss 2.5155, val loss 2.8363
iter 6850: loss 2.6546, time 20107.70ms 
iter 6851: loss 2.6589, time 5196.31ms 
iter 6852: loss 2.6985, time 5257.71ms 
iter 6853: loss 2.5322, time 5228.66ms 
iter 6854: loss 2.4115, time 5272.30ms 
iter 6855: loss 2.4343, time 5143.81ms 
iter 6856: loss 2.5073, time 5266.30ms 
iter 6857: loss 2.4882, time 5221.17ms 
iter 6858: loss 2.3238, time 5215.08ms 
iter 6859: loss 2.5120, time 5218.27ms 
iter 6860: loss 2.5256, time 5269.98ms 
iter 6861: loss 2.5895, time 5153.50ms 
iter 6862: loss 2.3830, time 5291.70ms 
iter 6863: loss 2.5795, time 5216.60ms 
iter 6864: loss 2.3252, time 5121.59ms 
iter 6865: loss 2.4200, time 5257.12ms 
iter 6866: loss 2.6809, time 5271.72ms 
iter 6867: loss 2.4623, time 5204.29ms 
iter 6868: loss 2.5526, time 5222.66ms 
iter 6869: loss 2.2802, time 5131.34ms 
iter 6870: loss 2.6000, time 5125.38ms 
iter 6871: loss 2.7229, time 5186.46ms 
iter 6872: loss 2.3132, time 5273.98ms 
iter 6873: loss 2.5928, time 5258.64ms 
iter 6874: loss 2.8133, time 5130.61ms 
iter 6875: loss 2.4594, time 5092.42ms 
iter 6876: loss 2.4114, time 5254.36ms 
iter 6877: loss 2.4656, time 5175.70ms 
iter 6878: loss 2.4864, time 5298.42ms 
iter 6879: loss 2.2878, time 5143.75ms 
iter 6880: loss 2.4812, time 5084.33ms 
iter 6881: loss 2.4685, time 5251.32ms 
iter 6882: loss 2.6582, time 5139.69ms 
iter 6883: loss 2.6572, time 5092.49ms 
iter 6884: loss 2.7048, time 5063.02ms 
iter 6885: loss 2.4227, time 5164.48ms 
iter 6886: loss 2.4485, time 5305.99ms 
iter 6887: loss 2.3923, time 5144.55ms 
iter 6888: loss 2.4998, time 5008.94ms 
iter 6889: loss 2.4711, time 5269.30ms 
iter 6890: loss 2.6174, time 5158.11ms 
iter 6891: loss 2.1735, time 5230.98ms 
iter 6892: loss 2.5247, time 5135.60ms 
iter 6893: loss 2.5506, time 5031.11ms 
iter 6894: loss 2.4728, time 5126.28ms 
iter 6895: loss 2.4252, time 5115.11ms 
iter 6896: loss 2.6011, time 5096.23ms 
iter 6897: loss 2.4586, time 5054.25ms 
iter 6898: loss 2.5196, time 5105.06ms 
iter 6899: loss 2.3888, time 5121.06ms 
step 6900: train loss 2.5261, val loss 2.8472
iter 6900: loss 2.5313, time 19999.37ms 
iter 6901: loss 2.4091, time 5268.94ms 
iter 6902: loss 2.4237, time 5133.72ms 
iter 6903: loss 2.8734, time 5139.40ms 
iter 6904: loss 2.4994, time 5134.80ms 
iter 6905: loss 2.6298, time 5151.22ms 
iter 6906: loss 2.6897, time 5144.11ms 
iter 6907: loss 2.6844, time 5083.00ms 
iter 6908: loss 2.4345, time 5223.13ms 
iter 6909: loss 2.3580, time 5137.24ms 
iter 6910: loss 2.6062, time 5099.91ms 
iter 6911: loss 2.4121, time 5276.97ms 
iter 6912: loss 2.3961, time 5257.31ms 
iter 6913: loss 2.1659, time 5193.55ms 
iter 6914: loss 2.3885, time 5303.92ms 
iter 6915: loss 2.5847, time 5146.27ms 
iter 6916: loss 2.4596, time 5210.42ms 
iter 6917: loss 2.4867, time 5125.04ms 
iter 6918: loss 2.3871, time 5091.23ms 
iter 6919: loss 2.6313, time 5094.61ms 
iter 6920: loss 2.6050, time 5118.90ms 
iter 6921: loss 2.4098, time 5259.26ms 
iter 6922: loss 2.3217, time 5135.19ms 
iter 6923: loss 2.6497, time 5064.24ms 
iter 6924: loss 2.5986, time 5212.65ms 
iter 6925: loss 2.3878, time 5136.97ms 
iter 6926: loss 2.5974, time 5224.28ms 
iter 6927: loss 2.5039, time 5138.25ms 
iter 6928: loss 2.6063, time 5085.66ms 
iter 6929: loss 2.3751, time 5202.03ms 
iter 6930: loss 2.5589, time 5128.60ms 
iter 6931: loss 2.5063, time 5280.40ms 
iter 6932: loss 2.6187, time 5074.12ms 
iter 6933: loss 2.4536, time 5092.75ms 
iter 6934: loss 2.7701, time 5171.31ms 
iter 6935: loss 2.7643, time 5179.35ms 
iter 6936: loss 2.5247, time 5090.35ms 
iter 6937: loss 2.4442, time 4990.76ms 
iter 6938: loss 2.7197, time 5081.15ms 
iter 6939: loss 2.4113, time 5227.27ms 
iter 6940: loss 2.6872, time 5085.66ms 
iter 6941: loss 2.5602, time 5199.00ms 
iter 6942: loss 2.3714, time 5132.51ms 
iter 6943: loss 2.5356, time 5202.41ms 
iter 6944: loss 2.7486, time 5079.72ms 
iter 6945: loss 2.5881, time 5209.47ms 
iter 6946: loss 2.5913, time 5262.96ms 
iter 6947: loss 2.6013, time 5140.74ms 
iter 6948: loss 2.4623, time 5045.81ms 
iter 6949: loss 2.5211, time 5053.53ms 
step 6950: train loss 2.5179, val loss 2.8437
iter 6950: loss 2.5529, time 19905.91ms 
iter 6951: loss 2.3339, time 5062.53ms 
iter 6952: loss 2.6198, time 5073.43ms 
iter 6953: loss 2.4201, time 5126.83ms 
iter 6954: loss 2.5344, time 5128.03ms 
iter 6955: loss 2.1711, time 5268.26ms 
iter 6956: loss 2.5551, time 5135.47ms 
iter 6957: loss 2.4287, time 5129.93ms 
iter 6958: loss 2.5950, time 5227.04ms 
iter 6959: loss 2.6342, time 5133.20ms 
iter 6960: loss 2.2666, time 5071.09ms 
iter 6961: loss 2.5522, time 5015.20ms 
iter 6962: loss 2.6743, time 5168.71ms 
iter 6963: loss 2.6635, time 5132.24ms 
iter 6964: loss 2.4697, time 5083.26ms 
iter 6965: loss 2.3192, time 5064.91ms 
iter 6966: loss 2.3237, time 5274.04ms 
iter 6967: loss 2.7439, time 5144.29ms 
iter 6968: loss 2.6294, time 5242.19ms 
iter 6969: loss 2.5082, time 5091.98ms 
iter 6970: loss 2.4884, time 5142.80ms 
iter 6971: loss 2.5330, time 5132.85ms 
iter 6972: loss 2.4705, time 5081.34ms 
iter 6973: loss 2.5532, time 5149.11ms 
iter 6974: loss 2.3594, time 5042.04ms 
iter 6975: loss 2.5403, time 5084.72ms 
iter 6976: loss 2.5982, time 5055.07ms 
iter 6977: loss 2.4486, time 5141.62ms 
iter 6978: loss 2.6211, time 5259.20ms 
iter 6979: loss 2.5574, time 5130.90ms 
iter 6980: loss 2.4204, time 5268.99ms 
iter 6981: loss 2.4756, time 5282.48ms 
iter 6982: loss 2.5669, time 5128.30ms 
iter 6983: loss 2.5179, time 5256.56ms 
iter 6984: loss 2.3209, time 5166.29ms 
iter 6985: loss 2.4755, time 5226.57ms 
iter 6986: loss 2.6623, time 5217.51ms 
iter 6987: loss 2.6754, time 5257.16ms 
iter 6988: loss 2.4013, time 5146.98ms 
iter 6989: loss 2.5898, time 5099.90ms 
iter 6990: loss 2.3444, time 5148.19ms 
iter 6991: loss 2.3845, time 5193.91ms 
iter 6992: loss 2.5506, time 5143.29ms 
iter 6993: loss 2.3650, time 5153.99ms 
iter 6994: loss 2.7945, time 5197.43ms 
iter 6995: loss 2.5877, time 5124.66ms 
iter 6996: loss 2.6035, time 5223.96ms 
iter 6997: loss 2.7546, time 5177.69ms 
iter 6998: loss 2.5507, time 5093.46ms 
iter 6999: loss 2.4344, time 5160.48ms 
step 7000: train loss 2.5193, val loss 2.8316
iter 7000: loss 2.5067, time 19982.13ms 
iter 7001: loss 2.5052, time 5055.58ms 
iter 7002: loss 2.3246, time 5050.97ms 
iter 7003: loss 2.5677, time 5103.10ms 
iter 7004: loss 2.0021, time 5201.06ms 
iter 7005: loss 2.4409, time 5145.91ms 
iter 7006: loss 2.5840, time 5087.47ms 
iter 7007: loss 2.6278, time 5130.06ms 
iter 7008: loss 2.6155, time 5145.36ms 
iter 7009: loss 2.4935, time 5148.19ms 
iter 7010: loss 2.6964, time 5287.14ms 
iter 7011: loss 2.4174, time 5120.93ms 
iter 7012: loss 2.4826, time 5209.14ms 
iter 7013: loss 2.6509, time 5165.61ms 
iter 7014: loss 2.5507, time 5091.57ms 
iter 7015: loss 2.1912, time 5056.64ms 
iter 7016: loss 2.4024, time 5054.36ms 
iter 7017: loss 2.6618, time 5177.05ms 
iter 7018: loss 2.3147, time 5127.49ms 
iter 7019: loss 2.7672, time 5087.35ms 
iter 7020: loss 2.5216, time 5218.59ms 
iter 7021: loss 2.5024, time 5208.99ms 
iter 7022: loss 2.5055, time 5023.28ms 
iter 7023: loss 2.4596, time 5184.87ms 
iter 7024: loss 2.6446, time 5083.51ms 
iter 7025: loss 2.4856, time 5210.51ms 
iter 7026: loss 2.3532, time 5202.20ms 
iter 7027: loss 2.3768, time 5212.61ms 
iter 7028: loss 2.5640, time 5206.62ms 
iter 7029: loss 2.7548, time 5138.82ms 
iter 7030: loss 2.5077, time 5090.04ms 
iter 7031: loss 2.5888, time 5075.04ms 
iter 7032: loss 2.5947, time 5050.29ms 
iter 7033: loss 2.5931, time 5265.14ms 
iter 7034: loss 2.5274, time 5104.98ms 
iter 7035: loss 2.5585, time 5281.87ms 
iter 7036: loss 2.6851, time 5271.58ms 
iter 7037: loss 2.7011, time 5162.03ms 
iter 7038: loss 2.5982, time 5105.32ms 
iter 7039: loss 2.2943, time 5065.44ms 
iter 7040: loss 2.4557, time 5063.94ms 
iter 7041: loss 2.3607, time 5094.88ms 
iter 7042: loss 2.6979, time 5187.90ms 
iter 7043: loss 2.5713, time 5074.81ms 
iter 7044: loss 2.3200, time 5078.65ms 
iter 7045: loss 2.5989, time 5082.02ms 
iter 7046: loss 2.4789, time 5237.76ms 
iter 7047: loss 2.2156, time 5079.30ms 
iter 7048: loss 2.3575, time 5242.89ms 
iter 7049: loss 2.5732, time 5261.24ms 
step 7050: train loss 2.5127, val loss 2.8474
iter 7050: loss 2.4005, time 19867.09ms 
iter 7051: loss 2.5519, time 4950.11ms 
iter 7052: loss 2.6289, time 5014.91ms 
iter 7053: loss 2.3822, time 5094.78ms 
iter 7054: loss 2.6408, time 5181.55ms 
iter 7055: loss 2.5052, time 5172.54ms 
iter 7056: loss 2.5847, time 5274.53ms 
iter 7057: loss 2.3070, time 5136.10ms 
iter 7058: loss 2.5853, time 5197.10ms 
iter 7059: loss 2.5585, time 5154.90ms 
iter 7060: loss 2.5154, time 5203.14ms 
iter 7061: loss 2.4563, time 5145.26ms 
iter 7062: loss 2.4691, time 5085.49ms 
iter 7063: loss 2.4904, time 5154.64ms 
iter 7064: loss 2.4052, time 5107.80ms 
iter 7065: loss 2.5536, time 5045.42ms 
iter 7066: loss 2.4848, time 5051.76ms 
iter 7067: loss 2.2999, time 5060.26ms 
iter 7068: loss 2.5946, time 5214.53ms 
iter 7069: loss 2.6803, time 5135.80ms 
iter 7070: loss 2.5441, time 5142.60ms 
iter 7071: loss 2.6049, time 5280.80ms 
iter 7072: loss 2.4997, time 5278.94ms 
iter 7073: loss 2.6935, time 5132.03ms 
iter 7074: loss 2.4577, time 5080.68ms 
iter 7075: loss 2.7435, time 5065.17ms 
iter 7076: loss 2.6111, time 5071.60ms 
iter 7077: loss 2.6841, time 5259.19ms 
iter 7078: loss 2.3852, time 5078.83ms 
iter 7079: loss 2.3184, time 5094.48ms 
iter 7080: loss 2.4023, time 5033.29ms 
iter 7081: loss 2.4940, time 5011.25ms 
iter 7082: loss 2.3404, time 5042.25ms 
iter 7083: loss 2.7097, time 5164.80ms 
iter 7084: loss 2.6114, time 5248.93ms 
iter 7085: loss 2.5089, time 5079.81ms 
iter 7086: loss 2.4379, time 5203.73ms 
iter 7087: loss 2.5329, time 5170.86ms 
iter 7088: loss 2.3187, time 5099.98ms 
iter 7089: loss 2.3432, time 5080.64ms 
iter 7090: loss 2.6080, time 5051.06ms 
iter 7091: loss 2.5158, time 5276.11ms 
iter 7092: loss 2.5802, time 5164.01ms 
iter 7093: loss 2.6856, time 5001.34ms 
iter 7094: loss 2.4760, time 5205.28ms 
iter 7095: loss 2.5270, time 5130.26ms 
iter 7096: loss 2.4706, time 5179.28ms 
iter 7097: loss 2.5546, time 5147.18ms 
iter 7098: loss 2.4717, time 5079.64ms 
iter 7099: loss 2.4465, time 5237.97ms 
step 7100: train loss 2.5096, val loss 2.8419
iter 7100: loss 2.5914, time 19966.82ms 
iter 7101: loss 2.6011, time 5250.16ms 
iter 7102: loss 2.5748, time 5203.97ms 
iter 7103: loss 2.4518, time 5131.77ms 
iter 7104: loss 2.6965, time 5231.78ms 
iter 7105: loss 2.5462, time 5061.10ms 
iter 7106: loss 2.6333, time 5227.91ms 
iter 7107: loss 2.4662, time 5279.07ms 
iter 7108: loss 2.3832, time 5143.11ms 
iter 7109: loss 2.7268, time 5230.75ms 
iter 7110: loss 2.6052, time 5149.67ms 
iter 7111: loss 2.5181, time 5268.13ms 
iter 7112: loss 2.7240, time 5219.98ms 
iter 7113: loss 2.4251, time 5286.04ms 
iter 7114: loss 2.5927, time 5196.36ms 
iter 7115: loss 2.4590, time 5225.37ms 
iter 7116: loss 2.6818, time 5161.39ms 
iter 7117: loss 2.5094, time 5229.57ms 
iter 7118: loss 2.2114, time 5159.94ms 
iter 7119: loss 2.4703, time 5242.88ms 
iter 7120: loss 2.4016, time 5266.25ms 
iter 7121: loss 2.4037, time 5210.91ms 
iter 7122: loss 2.5977, time 5218.99ms 
iter 7123: loss 2.4181, time 5271.01ms 
iter 7124: loss 2.5308, time 5129.11ms 
iter 7125: loss 2.4772, time 5391.28ms 
iter 7126: loss 2.5639, time 5270.09ms 
iter 7127: loss 2.4469, time 5156.54ms 
iter 7128: loss 2.6516, time 5126.83ms 
iter 7129: loss 2.4325, time 5242.38ms 
iter 7130: loss 2.3993, time 5168.64ms 
iter 7131: loss 2.5326, time 5261.46ms 
iter 7132: loss 2.2095, time 5277.15ms 
iter 7133: loss 2.3856, time 5147.27ms 
iter 7134: loss 2.5116, time 5077.94ms 
iter 7135: loss 2.4190, time 5135.49ms 
iter 7136: loss 2.4830, time 5200.20ms 
iter 7137: loss 2.4973, time 5116.56ms 
iter 7138: loss 2.4709, time 5135.13ms 
iter 7139: loss 2.5002, time 5265.13ms 
iter 7140: loss 2.5141, time 5124.97ms 
iter 7141: loss 2.8199, time 5265.53ms 
iter 7142: loss 2.3946, time 5234.75ms 
iter 7143: loss 2.4083, time 5221.59ms 
iter 7144: loss 2.6248, time 5273.96ms 
iter 7145: loss 2.5428, time 5277.98ms 
iter 7146: loss 2.5860, time 5135.00ms 
iter 7147: loss 2.1641, time 5260.27ms 
iter 7148: loss 2.4245, time 5195.82ms 
iter 7149: loss 2.2966, time 5255.88ms 
step 7150: train loss 2.5083, val loss 2.8335
iter 7150: loss 2.3448, time 19974.83ms 
iter 7151: loss 2.5745, time 5078.00ms 
iter 7152: loss 2.5647, time 5239.89ms 
iter 7153: loss 2.7388, time 5139.32ms 
iter 7154: loss 2.6643, time 5240.37ms 
iter 7155: loss 2.6267, time 5135.95ms 
iter 7156: loss 2.5795, time 4978.87ms 
iter 7157: loss 2.7003, time 5005.90ms 
iter 7158: loss 2.5602, time 5109.67ms 
iter 7159: loss 2.6422, time 4989.93ms 
iter 7160: loss 2.4672, time 5145.84ms 
iter 7161: loss 2.6265, time 5210.45ms 
iter 7162: loss 2.4990, time 4995.29ms 
iter 7163: loss 2.4343, time 5032.39ms 
iter 7164: loss 2.4893, time 4990.35ms 
iter 7165: loss 2.4454, time 5025.56ms 
iter 7166: loss 2.6604, time 5091.49ms 
iter 7167: loss 2.3828, time 5201.11ms 
iter 7168: loss 2.2633, time 5273.81ms 
iter 7169: loss 2.5572, time 5125.48ms 
iter 7170: loss 2.5740, time 5243.30ms 
iter 7171: loss 2.0281, time 5172.68ms 
iter 7172: loss 2.5827, time 5210.35ms 
iter 7173: loss 2.7009, time 5212.40ms 
iter 7174: loss 2.3495, time 5265.10ms 
iter 7175: loss 2.5531, time 5267.66ms 
iter 7176: loss 2.6237, time 5219.74ms 
iter 7177: loss 2.6337, time 5125.67ms 
iter 7178: loss 2.1811, time 5260.57ms 
iter 7179: loss 2.4595, time 5145.57ms 
iter 7180: loss 2.8127, time 5268.90ms 
iter 7181: loss 2.3049, time 5268.94ms 
iter 7182: loss 2.6146, time 5142.59ms 
iter 7183: loss 2.3887, time 5129.05ms 
iter 7184: loss 2.5297, time 5204.70ms 
iter 7185: loss 2.4557, time 5154.16ms 
iter 7186: loss 2.2112, time 5261.19ms 
iter 7187: loss 2.5780, time 5278.48ms 
iter 7188: loss 2.3701, time 5133.78ms 
iter 7189: loss 2.5029, time 5250.50ms 
iter 7190: loss 2.6123, time 5193.93ms 
iter 7191: loss 2.5168, time 5274.42ms 
iter 7192: loss 2.5540, time 5217.13ms 
iter 7193: loss 2.5399, time 5261.34ms 
iter 7194: loss 2.4934, time 5199.21ms 
iter 7195: loss 2.5308, time 5201.38ms 
iter 7196: loss 2.5433, time 5129.61ms 
iter 7197: loss 2.4802, time 5267.76ms 
iter 7198: loss 2.4455, time 5205.58ms 
iter 7199: loss 2.6657, time 5190.33ms 
step 7200: train loss 2.5047, val loss 2.8427
iter 7200: loss 2.5076, time 19812.26ms 
iter 7201: loss 2.5575, time 5228.34ms 
iter 7202: loss 2.5948, time 5149.54ms 
iter 7203: loss 2.4568, time 5285.62ms 
iter 7204: loss 2.5089, time 5286.15ms 
iter 7205: loss 2.4201, time 5181.90ms 
iter 7206: loss 2.6092, time 5131.95ms 
iter 7207: loss 2.6368, time 5236.93ms 
iter 7208: loss 2.4254, time 5149.06ms 
iter 7209: loss 2.7724, time 5247.41ms 
iter 7210: loss 2.6059, time 5249.68ms 
iter 7211: loss 2.4649, time 5146.93ms 
iter 7212: loss 2.3855, time 5101.70ms 
iter 7213: loss 2.6055, time 5098.01ms 
iter 7214: loss 2.3570, time 5196.57ms 
iter 7215: loss 2.6207, time 5129.37ms 
iter 7216: loss 2.4356, time 5279.64ms 
iter 7217: loss 2.6294, time 5145.18ms 
iter 7218: loss 2.5977, time 5116.30ms 
iter 7219: loss 2.3825, time 5269.75ms 
iter 7220: loss 2.5915, time 5246.18ms 
iter 7221: loss 2.4791, time 5225.45ms 
iter 7222: loss 2.3468, time 5082.64ms 
iter 7223: loss 2.2326, time 5205.32ms 
iter 7224: loss 2.4351, time 5180.14ms 
iter 7225: loss 2.3785, time 5062.40ms 
iter 7226: loss 2.6172, time 5032.43ms 
iter 7227: loss 2.6055, time 4976.81ms 
iter 7228: loss 2.5590, time 5020.48ms 
iter 7229: loss 2.6404, time 5207.89ms 
iter 7230: loss 2.6314, time 5128.31ms 
iter 7231: loss 2.5314, time 5128.30ms 
iter 7232: loss 2.4448, time 5198.80ms 
iter 7233: loss 2.5529, time 5117.64ms 
iter 7234: loss 2.6828, time 5046.42ms 
iter 7235: loss 2.6975, time 5035.94ms 
iter 7236: loss 2.7211, time 5022.74ms 
iter 7237: loss 2.4349, time 5096.88ms 
iter 7238: loss 2.5721, time 4975.96ms 
iter 7239: loss 2.6785, time 4977.86ms 
iter 7240: loss 2.6466, time 5090.41ms 
iter 7241: loss 2.6518, time 5204.09ms 
iter 7242: loss 2.6530, time 5114.26ms 
iter 7243: loss 2.4403, time 5224.46ms 
iter 7244: loss 2.4519, time 5140.19ms 
iter 7245: loss 2.7032, time 5082.21ms 
iter 7246: loss 2.2065, time 5050.14ms 
iter 7247: loss 2.5008, time 5051.65ms 
iter 7248: loss 2.3058, time 5057.05ms 
iter 7249: loss 2.5554, time 5388.83ms 
step 7250: train loss 2.5187, val loss 2.8428
iter 7250: loss 2.6039, time 19848.11ms 
iter 7251: loss 2.6267, time 4983.62ms 
iter 7252: loss 2.5218, time 5085.84ms 
iter 7253: loss 2.5798, time 5049.34ms 
iter 7254: loss 2.6239, time 5187.39ms 
iter 7255: loss 2.6737, time 5270.40ms 
iter 7256: loss 2.5799, time 5169.18ms 
iter 7257: loss 2.6135, time 5142.31ms 
iter 7258: loss 2.2738, time 5182.20ms 
iter 7259: loss 2.3564, time 5116.87ms 
iter 7260: loss 2.4017, time 5046.22ms 
iter 7261: loss 2.4041, time 5048.77ms 
iter 7262: loss 2.3926, time 5029.79ms 
iter 7263: loss 2.7411, time 5115.18ms 
iter 7264: loss 2.3021, time 5132.67ms 
iter 7265: loss 2.6000, time 5060.12ms 
iter 7266: loss 2.4280, time 5010.00ms 
iter 7267: loss 2.4664, time 5000.12ms 
iter 7268: loss 2.7071, time 5220.98ms 
iter 7269: loss 2.4758, time 5076.82ms 
iter 7270: loss 2.4535, time 5125.74ms 
iter 7271: loss 2.5032, time 5196.85ms 
iter 7272: loss 2.6692, time 5262.13ms 
iter 7273: loss 2.6271, time 5117.99ms 
iter 7274: loss 2.3412, time 5184.59ms 
iter 7275: loss 2.6190, time 5085.18ms 
iter 7276: loss 2.5038, time 5206.04ms 
iter 7277: loss 2.5508, time 5188.42ms 
iter 7278: loss 2.6437, time 5122.88ms 
iter 7279: loss 2.6640, time 5062.88ms 
iter 7280: loss 2.5289, time 5206.54ms 
iter 7281: loss 2.4329, time 5084.52ms 
iter 7282: loss 2.5145, time 5176.71ms 
iter 7283: loss 2.4674, time 5145.88ms 
iter 7284: loss 2.5716, time 5231.77ms 
iter 7285: loss 2.5355, time 5225.33ms 
iter 7286: loss 2.3319, time 5088.17ms 
iter 7287: loss 2.3023, time 5045.73ms 
iter 7288: loss 2.4468, time 5153.51ms 
iter 7289: loss 2.4085, time 5088.50ms 
iter 7290: loss 2.5787, time 5145.80ms 
iter 7291: loss 2.6553, time 5043.05ms 
iter 7292: loss 2.5959, time 5211.76ms 
iter 7293: loss 2.5400, time 5187.05ms 
iter 7294: loss 2.6641, time 5078.62ms 
iter 7295: loss 2.5704, time 4996.76ms 
iter 7296: loss 2.4216, time 5092.57ms 
iter 7297: loss 2.4010, time 5084.89ms 
iter 7298: loss 2.5708, time 5181.47ms 
iter 7299: loss 2.5120, time 5124.46ms 
step 7300: train loss 2.4996, val loss 2.8608
iter 7300: loss 2.4535, time 19837.82ms 
iter 7301: loss 2.6378, time 4952.59ms 
iter 7302: loss 2.4695, time 5085.30ms 
iter 7303: loss 2.4102, time 5088.30ms 
iter 7304: loss 2.3806, time 5116.49ms 
iter 7305: loss 2.6609, time 5234.44ms 
iter 7306: loss 2.5199, time 5270.92ms 
iter 7307: loss 2.5689, time 5086.49ms 
iter 7308: loss 2.5115, time 5082.56ms 
iter 7309: loss 2.1949, time 5263.49ms 
iter 7310: loss 2.3657, time 5140.48ms 
iter 7311: loss 2.7651, time 5084.00ms 
iter 7312: loss 2.5057, time 5111.80ms 
iter 7313: loss 2.7240, time 5060.19ms 
iter 7314: loss 2.4398, time 5085.79ms 
iter 7315: loss 2.5640, time 5272.13ms 
iter 7316: loss 2.4788, time 5148.67ms 
iter 7317: loss 2.4852, time 5058.92ms 
iter 7318: loss 2.4297, time 5121.35ms 
iter 7319: loss 2.4105, time 5027.90ms 
iter 7320: loss 2.4576, time 5051.09ms 
iter 7321: loss 2.4865, time 5057.23ms 
iter 7322: loss 2.4492, time 5029.91ms 
iter 7323: loss 2.3651, time 5134.80ms 
iter 7324: loss 2.6045, time 5122.36ms 
iter 7325: loss 2.4977, time 5214.14ms 
iter 7326: loss 2.4222, time 5133.62ms 
iter 7327: loss 2.3990, time 5126.30ms 
iter 7328: loss 2.4632, time 5056.38ms 
iter 7329: loss 2.7624, time 4973.85ms 
iter 7330: loss 2.5248, time 4975.78ms 
iter 7331: loss 2.6019, time 5121.05ms 
iter 7332: loss 2.7476, time 5124.15ms 
iter 7333: loss 2.4140, time 5175.99ms 
iter 7334: loss 2.5527, time 5275.42ms 
iter 7335: loss 2.6227, time 5109.05ms 
iter 7336: loss 2.5529, time 5060.90ms 
iter 7337: loss 2.4787, time 5079.51ms 
iter 7338: loss 2.4041, time 5046.15ms 
iter 7339: loss 2.2522, time 5036.39ms 
iter 7340: loss 2.3937, time 5113.80ms 
iter 7341: loss 2.7836, time 5046.73ms 
iter 7342: loss 2.4832, time 5078.92ms 
iter 7343: loss 2.5045, time 5082.18ms 
iter 7344: loss 2.5560, time 5035.57ms 
iter 7345: loss 2.6215, time 5053.82ms 
iter 7346: loss 2.6889, time 5092.20ms 
iter 7347: loss 2.2729, time 5055.61ms 
iter 7348: loss 2.4541, time 5053.31ms 
iter 7349: loss 2.2764, time 5089.27ms 
step 7350: train loss 2.4972, val loss 2.8458
iter 7350: loss 2.4242, time 19784.47ms 
iter 7351: loss 2.6021, time 4943.56ms 
iter 7352: loss 2.4970, time 4951.40ms 
iter 7353: loss 2.3487, time 4960.03ms 
iter 7354: loss 2.5221, time 5076.82ms 
iter 7355: loss 2.3943, time 5088.62ms 
iter 7356: loss 2.5903, time 5259.55ms 
iter 7357: loss 2.5652, time 5190.25ms 
iter 7358: loss 2.5673, time 5089.33ms 
iter 7359: loss 2.5825, time 5177.62ms 
iter 7360: loss 2.4637, time 5087.77ms 
iter 7361: loss 2.5815, time 5043.20ms 
iter 7362: loss 2.6848, time 5108.95ms 
iter 7363: loss 2.2543, time 5067.29ms 
iter 7364: loss 2.4452, time 5060.18ms 
iter 7365: loss 2.7307, time 5166.04ms 
iter 7366: loss 2.5368, time 5183.46ms 
iter 7367: loss 2.5301, time 5252.19ms 
iter 7368: loss 2.6720, time 5103.18ms 
iter 7369: loss 2.5857, time 5281.31ms 
iter 7370: loss 2.4659, time 5081.84ms 
iter 7371: loss 2.6105, time 5229.70ms 
iter 7372: loss 2.4936, time 5099.91ms 
iter 7373: loss 2.3435, time 5263.53ms 
iter 7374: loss 2.4377, time 5117.87ms 
iter 7375: loss 2.4945, time 5258.56ms 
iter 7376: loss 2.5048, time 5196.27ms 
iter 7377: loss 2.4523, time 5197.28ms 
iter 7378: loss 2.6400, time 5131.50ms 
iter 7379: loss 2.5534, time 5262.65ms 
iter 7380: loss 2.3226, time 5086.45ms 
iter 7381: loss 2.4749, time 5240.54ms 
iter 7382: loss 2.6503, time 5127.04ms 
iter 7383: loss 2.5750, time 5065.80ms 
iter 7384: loss 2.2699, time 4973.68ms 
iter 7385: loss 2.5192, time 5144.70ms 
iter 7386: loss 2.4441, time 5159.17ms 
iter 7387: loss 2.3512, time 5136.38ms 
iter 7388: loss 2.5245, time 5280.83ms 
iter 7389: loss 2.3667, time 5088.11ms 
iter 7390: loss 2.7651, time 5069.46ms 
iter 7391: loss 2.3956, time 5086.57ms 
iter 7392: loss 2.5413, time 5151.12ms 
iter 7393: loss 2.2547, time 5227.05ms 
iter 7394: loss 2.4918, time 5220.28ms 
iter 7395: loss 2.3948, time 5132.90ms 
iter 7396: loss 2.6222, time 5169.12ms 
iter 7397: loss 2.4904, time 5235.44ms 
iter 7398: loss 2.7176, time 5082.20ms 
iter 7399: loss 2.5653, time 5018.79ms 
step 7400: train loss 2.4757, val loss 2.8291
iter 7400: loss 2.4991, time 19831.03ms 
iter 7401: loss 2.5075, time 4963.27ms 
iter 7402: loss 2.6131, time 5259.83ms 
iter 7403: loss 2.4735, time 5135.67ms 
iter 7404: loss 2.4424, time 5054.16ms 
iter 7405: loss 2.5098, time 5109.52ms 
iter 7406: loss 2.5296, time 5089.42ms 
iter 7407: loss 2.3036, time 5116.86ms 
iter 7408: loss 2.5627, time 5017.50ms 
iter 7409: loss 2.6637, time 5131.23ms 
iter 7410: loss 2.7415, time 5232.46ms 
iter 7411: loss 2.3502, time 5160.75ms 
iter 7412: loss 2.5889, time 5070.84ms 
iter 7413: loss 2.4935, time 5107.71ms 
iter 7414: loss 2.3531, time 5089.79ms 
iter 7415: loss 2.5564, time 5041.58ms 
iter 7416: loss 2.5031, time 5045.39ms 
iter 7417: loss 2.4642, time 5039.30ms 
iter 7418: loss 2.6641, time 5051.85ms 
iter 7419: loss 2.3786, time 5038.90ms 
iter 7420: loss 2.4697, time 5030.36ms 
iter 7421: loss 2.4482, time 5036.27ms 
iter 7422: loss 2.5883, time 5040.84ms 
iter 7423: loss 2.3524, time 5039.71ms 
iter 7424: loss 2.4455, time 5047.81ms 
iter 7425: loss 2.3568, time 5020.76ms 
iter 7426: loss 2.4148, time 5020.86ms 
iter 7427: loss 2.4375, time 5176.38ms 
iter 7428: loss 2.4651, time 5027.94ms 
iter 7429: loss 2.3658, time 5065.86ms 
iter 7430: loss 2.3914, time 5099.00ms 
iter 7431: loss 2.6176, time 5067.28ms 
iter 7432: loss 2.2463, time 5179.97ms 
iter 7433: loss 2.3733, time 5099.63ms 
iter 7434: loss 2.3175, time 5061.55ms 
iter 7435: loss 2.7191, time 5159.76ms 
iter 7436: loss 2.5376, time 5099.72ms 
iter 7437: loss 2.6355, time 5059.98ms 
iter 7438: loss 2.4946, time 5151.35ms 
iter 7439: loss 2.5928, time 5062.84ms 
iter 7440: loss 2.4244, time 5165.95ms 
iter 7441: loss 2.4869, time 5226.48ms 
iter 7442: loss 2.5208, time 5108.44ms 
iter 7443: loss 2.7094, time 5206.79ms 
iter 7444: loss 2.5583, time 5099.14ms 
iter 7445: loss 2.3725, time 5058.04ms 
iter 7446: loss 2.4526, time 5040.53ms 
iter 7447: loss 2.3153, time 5037.12ms 
iter 7448: loss 2.5071, time 5160.69ms 
iter 7449: loss 2.3610, time 5067.76ms 
step 7450: train loss 2.4976, val loss 2.8385
iter 7450: loss 2.5023, time 19664.16ms 
iter 7451: loss 2.4301, time 4945.19ms 
iter 7452: loss 2.6675, time 4943.77ms 
iter 7453: loss 2.3354, time 4944.11ms 
iter 7454: loss 2.5197, time 4949.57ms 
iter 7455: loss 2.5649, time 4962.40ms 
iter 7456: loss 2.6160, time 5045.37ms 
iter 7457: loss 2.4627, time 5066.75ms 
iter 7458: loss 2.6126, time 5080.09ms 
iter 7459: loss 2.3624, time 5046.43ms 
iter 7460: loss 2.4765, time 5144.13ms 
iter 7461: loss 2.3722, time 5084.84ms 
iter 7462: loss 2.2205, time 5146.31ms 
iter 7463: loss 2.4712, time 5090.78ms 
iter 7464: loss 2.4495, time 5071.97ms 
iter 7465: loss 2.4759, time 5202.32ms 
iter 7466: loss 2.3229, time 5160.55ms 
iter 7467: loss 2.4022, time 5077.21ms 
iter 7468: loss 2.4782, time 5064.73ms 
iter 7469: loss 2.5488, time 5081.77ms 
iter 7470: loss 2.4187, time 5129.68ms 
iter 7471: loss 2.4312, time 5064.14ms 
iter 7472: loss 2.4640, time 5158.55ms 
iter 7473: loss 2.6291, time 5264.03ms 
iter 7474: loss 2.6840, time 5096.82ms 
iter 7475: loss 2.5294, time 5148.13ms 
iter 7476: loss 2.5025, time 5221.79ms 
iter 7477: loss 2.3840, time 5097.11ms 
iter 7478: loss 2.3140, time 5132.38ms 
iter 7479: loss 2.5523, time 5096.22ms 
iter 7480: loss 2.5159, time 5082.52ms 
iter 7481: loss 2.5821, time 5268.49ms 
iter 7482: loss 2.4885, time 5086.43ms 
iter 7483: loss 2.4622, time 5174.01ms 
iter 7484: loss 2.4372, time 5051.48ms 
iter 7485: loss 2.7340, time 5064.71ms 
iter 7486: loss 2.6322, time 5224.26ms 
iter 7487: loss 2.3039, time 5094.15ms 
iter 7488: loss 2.6772, time 5028.89ms 
iter 7489: loss 2.5684, time 5179.08ms 
iter 7490: loss 2.5239, time 5025.71ms 
iter 7491: loss 2.8212, time 5164.99ms 
iter 7492: loss 2.3264, time 5239.07ms 
iter 7493: loss 2.4965, time 5099.02ms 
iter 7494: loss 2.4258, time 5125.90ms 
iter 7495: loss 2.4318, time 4992.55ms 
iter 7496: loss 2.7267, time 5003.28ms 
iter 7497: loss 2.5637, time 5106.33ms 
iter 7498: loss 2.6429, time 5149.59ms 
iter 7499: loss 2.3093, time 5166.90ms 
step 7500: train loss 2.4935, val loss 2.8460
iter 7500: loss 2.4477, time 19918.96ms 
iter 7501: loss 2.5381, time 4949.10ms 
iter 7502: loss 2.6191, time 4993.26ms 
iter 7503: loss 2.5834, time 4947.37ms 
iter 7504: loss 2.4879, time 4946.07ms 
iter 7505: loss 2.6297, time 4990.33ms 
iter 7506: loss 2.4540, time 5034.49ms 
iter 7507: loss 2.1407, time 5040.58ms 
iter 7508: loss 2.5678, time 5042.31ms 
iter 7509: loss 2.4506, time 5079.32ms 
iter 7510: loss 2.5243, time 5047.37ms 
iter 7511: loss 2.4334, time 5043.99ms 
iter 7512: loss 2.6160, time 5035.47ms 
iter 7513: loss 2.3834, time 5025.07ms 
iter 7514: loss 2.4279, time 4987.88ms 
iter 7515: loss 2.4237, time 5150.36ms 
iter 7516: loss 2.2775, time 5030.19ms 
iter 7517: loss 2.4816, time 5041.25ms 
iter 7518: loss 2.5477, time 5054.23ms 
iter 7519: loss 2.4736, time 5115.83ms 
iter 7520: loss 2.2794, time 5245.57ms 
iter 7521: loss 2.4217, time 5153.74ms 
iter 7522: loss 2.5382, time 5093.17ms 
iter 7523: loss 2.4666, time 5090.63ms 
iter 7524: loss 2.5736, time 5043.16ms 
iter 7525: loss 2.5421, time 5037.50ms 
iter 7526: loss 2.4757, time 5005.77ms 
iter 7527: loss 2.4020, time 4993.97ms 
iter 7528: loss 2.4941, time 5055.67ms 
iter 7529: loss 2.6287, time 5070.75ms 
iter 7530: loss 2.1899, time 5049.57ms 
iter 7531: loss 2.4288, time 5077.68ms 
iter 7532: loss 2.4557, time 5046.46ms 
iter 7533: loss 2.5484, time 5033.43ms 
iter 7534: loss 2.5378, time 5045.85ms 
iter 7535: loss 2.4927, time 5169.73ms 
iter 7536: loss 2.5365, time 5045.57ms 
iter 7537: loss 2.4185, time 5029.68ms 
iter 7538: loss 2.5714, time 5051.06ms 
iter 7539: loss 2.4922, time 5074.57ms 
iter 7540: loss 2.5725, time 5081.75ms 
iter 7541: loss 2.5686, time 5142.64ms 
iter 7542: loss 2.4397, time 5027.36ms 
iter 7543: loss 2.3897, time 5244.24ms 
iter 7544: loss 2.3092, time 5123.96ms 
iter 7545: loss 2.3734, time 5029.93ms 
iter 7546: loss 2.4542, time 5015.74ms 
iter 7547: loss 2.5840, time 4997.52ms 
iter 7548: loss 2.4345, time 5277.98ms 
iter 7549: loss 2.5134, time 5134.21ms 
step 7550: train loss 2.5020, val loss 2.8602
iter 7550: loss 2.5662, time 19851.20ms 
iter 7551: loss 2.5602, time 5200.23ms 
iter 7552: loss 2.3656, time 5165.50ms 
iter 7553: loss 2.5338, time 5253.82ms 
iter 7554: loss 2.5298, time 5131.29ms 
iter 7555: loss 2.5618, time 5177.71ms 
iter 7556: loss 2.5500, time 5217.61ms 
iter 7557: loss 2.5796, time 5109.75ms 
iter 7558: loss 2.3787, time 5087.00ms 
iter 7559: loss 2.6262, time 5027.79ms 
iter 7560: loss 2.4999, time 5021.19ms 
iter 7561: loss 2.5569, time 5180.25ms 
iter 7562: loss 2.6582, time 5065.84ms 
iter 7563: loss 2.5376, time 5051.28ms 
iter 7564: loss 2.4826, time 5206.55ms 
iter 7565: loss 2.4681, time 5091.05ms 
iter 7566: loss 2.3469, time 5170.19ms 
iter 7567: loss 2.5580, time 4994.98ms 
iter 7568: loss 2.5347, time 4971.96ms 
iter 7569: loss 2.5347, time 4948.73ms 
iter 7570: loss 2.3831, time 5021.59ms 
iter 7571: loss 2.4070, time 5069.89ms 
iter 7572: loss 2.4775, time 5076.29ms 
iter 7573: loss 2.4519, time 5047.71ms 
iter 7574: loss 2.3490, time 5053.32ms 
iter 7575: loss 2.5300, time 5055.12ms 
iter 7576: loss 2.3826, time 5067.30ms 
iter 7577: loss 2.6789, time 4991.27ms 
iter 7578: loss 2.5192, time 4980.90ms 
iter 7579: loss 2.5617, time 5022.74ms 
iter 7580: loss 2.5422, time 5104.75ms 
iter 7581: loss 2.5079, time 5089.20ms 
iter 7582: loss 2.3393, time 5040.78ms 
iter 7583: loss 2.2653, time 5045.14ms 
iter 7584: loss 2.5370, time 4985.41ms 
iter 7585: loss 2.5659, time 4993.27ms 
iter 7586: loss 2.2295, time 4979.68ms 
iter 7587: loss 2.3724, time 4988.95ms 
iter 7588: loss 2.4642, time 4983.72ms 
iter 7589: loss 2.5332, time 4980.74ms 
iter 7590: loss 2.4486, time 4981.25ms 
iter 7591: loss 2.5508, time 4988.10ms 
iter 7592: loss 2.4946, time 4986.22ms 
iter 7593: loss 2.3787, time 4984.46ms 
iter 7594: loss 2.6723, time 5017.48ms 
iter 7595: loss 2.5545, time 5118.71ms 
iter 7596: loss 2.2907, time 5073.24ms 
iter 7597: loss 2.5215, time 5108.11ms 
iter 7598: loss 2.4190, time 5101.87ms 
iter 7599: loss 2.3324, time 5081.13ms 
step 7600: train loss 2.5085, val loss 2.8398
iter 7600: loss 2.5186, time 19839.11ms 
iter 7601: loss 2.4007, time 4953.51ms 
iter 7602: loss 2.5483, time 4949.47ms 
iter 7603: loss 2.4963, time 4994.45ms 
iter 7604: loss 2.7301, time 5089.67ms 
iter 7605: loss 2.6010, time 5062.82ms 
iter 7606: loss 2.6391, time 5000.71ms 
iter 7607: loss 2.4133, time 4992.40ms 
iter 7608: loss 2.7530, time 4987.19ms 
iter 7609: loss 2.6570, time 4997.25ms 
iter 7610: loss 2.3175, time 4991.16ms 
iter 7611: loss 2.5560, time 4986.95ms 
iter 7612: loss 2.3996, time 4989.46ms 
iter 7613: loss 2.2816, time 5001.81ms 
iter 7614: loss 2.3944, time 4994.49ms 
iter 7615: loss 2.4057, time 4987.11ms 
iter 7616: loss 2.8093, time 5001.43ms 
iter 7617: loss 2.1864, time 5056.11ms 
iter 7618: loss 2.3512, time 4997.93ms 
iter 7619: loss 2.4576, time 5042.36ms 
iter 7620: loss 2.4532, time 5052.86ms 
iter 7621: loss 2.4868, time 5066.90ms 
iter 7622: loss 2.5291, time 5153.39ms 
iter 7623: loss 2.4268, time 5251.28ms 
iter 7624: loss 2.3751, time 5089.01ms 
iter 7625: loss 2.5893, time 5069.09ms 
iter 7626: loss 2.3652, time 5031.13ms 
iter 7627: loss 2.8568, time 5183.22ms 
iter 7628: loss 2.4397, time 5217.01ms 
iter 7629: loss 2.6171, time 5099.77ms 
iter 7630: loss 2.5315, time 5289.67ms 
iter 7631: loss 2.6283, time 5222.06ms 
iter 7632: loss 2.4013, time 5092.29ms 
iter 7633: loss 2.5614, time 5116.61ms 
iter 7634: loss 2.8048, time 4976.54ms 
iter 7635: loss 2.4316, time 5023.11ms 
iter 7636: loss 2.4839, time 4983.41ms 
iter 7637: loss 2.6215, time 4984.26ms 
iter 7638: loss 2.5481, time 4999.66ms 
iter 7639: loss 2.2396, time 4981.22ms 
iter 7640: loss 2.3452, time 4951.60ms 
iter 7641: loss 2.4479, time 4951.11ms 
iter 7642: loss 2.4837, time 4952.59ms 
iter 7643: loss 2.3461, time 5048.24ms 
iter 7644: loss 2.6969, time 5086.42ms 
iter 7645: loss 2.4745, time 5042.43ms 
iter 7646: loss 2.5327, time 5034.52ms 
iter 7647: loss 2.4694, time 5030.79ms 
iter 7648: loss 2.2679, time 5255.97ms 
iter 7649: loss 2.6291, time 5076.10ms 
step 7650: train loss 2.4926, val loss 2.8499
iter 7650: loss 2.3675, time 19900.15ms 
iter 7651: loss 2.5350, time 5012.33ms 
iter 7652: loss 2.5271, time 5066.52ms 
iter 7653: loss 2.6119, time 4998.15ms 
iter 7654: loss 2.6664, time 5015.68ms 
iter 7655: loss 2.4315, time 5058.32ms 
iter 7656: loss 2.6296, time 5079.21ms 
iter 7657: loss 2.5563, time 5256.76ms 
iter 7658: loss 2.5321, time 5271.37ms 
iter 7659: loss 2.2520, time 5174.10ms 
iter 7660: loss 2.5424, time 5088.19ms 
iter 7661: loss 2.3229, time 4999.57ms 
iter 7662: loss 2.4298, time 4991.44ms 
iter 7663: loss 2.6092, time 5006.31ms 
iter 7664: loss 2.4701, time 5054.05ms 
iter 7665: loss 2.4720, time 5061.23ms 
iter 7666: loss 2.6325, time 5263.90ms 
iter 7667: loss 2.5688, time 5082.99ms 
iter 7668: loss 2.3287, time 5032.10ms 
iter 7669: loss 2.5255, time 5257.28ms 
iter 7670: loss 2.4555, time 5150.19ms 
iter 7671: loss 2.4436, time 5025.16ms 
iter 7672: loss 2.5087, time 5006.19ms 
iter 7673: loss 2.2910, time 5022.91ms 
iter 7674: loss 2.4076, time 5073.01ms 
iter 7675: loss 2.6632, time 5175.14ms 
iter 7676: loss 2.5821, time 5084.92ms 
iter 7677: loss 2.5669, time 5067.84ms 
iter 7678: loss 2.3023, time 5190.42ms 
iter 7679: loss 2.4029, time 5173.67ms 
iter 7680: loss 2.4359, time 4982.81ms 
iter 7681: loss 2.4468, time 4977.94ms 
iter 7682: loss 2.3428, time 5209.17ms 
iter 7683: loss 2.3112, time 4975.97ms 
iter 7684: loss 2.4146, time 5025.50ms 
iter 7685: loss 2.8007, time 5014.14ms 
iter 7686: loss 2.6902, time 5059.84ms 
iter 7687: loss 2.7305, time 5272.25ms 
iter 7688: loss 2.4951, time 5130.43ms 
iter 7689: loss 2.4281, time 5104.57ms 
iter 7690: loss 2.4247, time 5144.82ms 
iter 7691: loss 2.4449, time 5055.00ms 
iter 7692: loss 2.5087, time 5012.44ms 
iter 7693: loss 2.5288, time 5005.08ms 
iter 7694: loss 2.6319, time 4984.42ms 
iter 7695: loss 2.4864, time 4966.10ms 
iter 7696: loss 2.3950, time 4947.25ms 
iter 7697: loss 2.5585, time 5045.45ms 
iter 7698: loss 2.7761, time 5242.00ms 
iter 7699: loss 2.3381, time 5097.08ms 
step 7700: train loss 2.4889, val loss 2.8432
iter 7700: loss 2.6004, time 20001.51ms 
iter 7701: loss 2.4432, time 5097.99ms 
iter 7702: loss 2.5050, time 5125.31ms 
iter 7703: loss 2.3142, time 5089.33ms 
iter 7704: loss 2.3768, time 5093.87ms 
iter 7705: loss 2.5086, time 5133.79ms 
iter 7706: loss 2.5541, time 5249.40ms 
iter 7707: loss 2.4961, time 5141.17ms 
iter 7708: loss 2.8096, time 5086.25ms 
iter 7709: loss 2.4259, time 5240.98ms 
iter 7710: loss 2.4590, time 5113.85ms 
iter 7711: loss 2.4118, time 5033.25ms 
iter 7712: loss 2.4437, time 5016.46ms 
iter 7713: loss 2.4877, time 5027.55ms 
iter 7714: loss 2.4044, time 5082.85ms 
iter 7715: loss 2.3616, time 5081.61ms 
iter 7716: loss 2.4895, time 5089.51ms 
iter 7717: loss 2.3153, time 5172.83ms 
iter 7718: loss 2.4764, time 5290.24ms 
iter 7719: loss 2.4920, time 5089.90ms 
iter 7720: loss 2.0628, time 5205.68ms 
iter 7721: loss 2.6547, time 5091.54ms 
iter 7722: loss 2.5478, time 5173.84ms 
iter 7723: loss 2.6756, time 5084.42ms 
iter 7724: loss 2.2612, time 5068.41ms 
iter 7725: loss 2.4743, time 5175.39ms 
iter 7726: loss 2.5826, time 5086.02ms 
iter 7727: loss 2.5540, time 5008.84ms 
iter 7728: loss 2.3337, time 5062.92ms 
iter 7729: loss 2.5129, time 5026.08ms 
iter 7730: loss 2.6571, time 5041.28ms 
iter 7731: loss 2.5234, time 5122.45ms 
iter 7732: loss 2.5117, time 4982.44ms 
iter 7733: loss 2.5293, time 5052.23ms 
iter 7734: loss 2.4081, time 5272.15ms 
iter 7735: loss 2.7534, time 5085.78ms 
iter 7736: loss 2.4410, time 5008.09ms 
iter 7737: loss 2.4256, time 4976.27ms 
iter 7738: loss 2.6661, time 5058.45ms 
iter 7739: loss 2.5524, time 5033.55ms 
iter 7740: loss 2.4788, time 4975.19ms 
iter 7741: loss 2.6966, time 5108.52ms 
iter 7742: loss 2.4401, time 4988.62ms 
iter 7743: loss 2.6606, time 4981.63ms 
iter 7744: loss 2.5387, time 4947.45ms 
iter 7745: loss 2.2821, time 5012.81ms 
iter 7746: loss 2.6063, time 4975.68ms 
iter 7747: loss 2.1933, time 4956.22ms 
iter 7748: loss 2.4378, time 4951.72ms 
iter 7749: loss 2.5965, time 5098.45ms 
step 7750: train loss 2.4944, val loss 2.8468
iter 7750: loss 2.4047, time 19735.88ms 
iter 7751: loss 2.5382, time 4952.46ms 
iter 7752: loss 2.5268, time 5030.98ms 
iter 7753: loss 2.4916, time 4991.67ms 
iter 7754: loss 2.1816, time 5036.04ms 
iter 7755: loss 2.5573, time 5190.32ms 
iter 7756: loss 2.4879, time 5082.31ms 
iter 7757: loss 2.4872, time 4978.67ms 
iter 7758: loss 2.5096, time 4979.70ms 
iter 7759: loss 2.5750, time 4996.36ms 
iter 7760: loss 2.4443, time 4975.63ms 
iter 7761: loss 2.4529, time 4951.85ms 
iter 7762: loss 2.4358, time 4963.82ms 
iter 7763: loss 2.5354, time 5077.76ms 
iter 7764: loss 2.4813, time 5115.85ms 
iter 7765: loss 2.3742, time 5143.51ms 
iter 7766: loss 2.4848, time 5135.19ms 
iter 7767: loss 2.5881, time 5137.52ms 
iter 7768: loss 2.6054, time 5150.17ms 
iter 7769: loss 2.3890, time 5100.50ms 
iter 7770: loss 2.5231, time 5070.02ms 
iter 7771: loss 2.8218, time 5061.49ms 
iter 7772: loss 2.4705, time 5100.41ms 
iter 7773: loss 2.4841, time 5267.93ms 
iter 7774: loss 2.3161, time 5092.30ms 
iter 7775: loss 2.3174, time 5200.83ms 
iter 7776: loss 2.5251, time 5081.36ms 
iter 7777: loss 2.4741, time 5242.99ms 
iter 7778: loss 2.5604, time 5288.79ms 
iter 7779: loss 2.5049, time 5143.21ms 
iter 7780: loss 2.6977, time 5084.86ms 
iter 7781: loss 2.4194, time 5096.28ms 
iter 7782: loss 2.5988, time 5094.29ms 
iter 7783: loss 2.3680, time 5161.99ms 
iter 7784: loss 2.4046, time 5122.36ms 
iter 7785: loss 2.3676, time 5075.42ms 
iter 7786: loss 2.5141, time 5065.55ms 
iter 7787: loss 2.5852, time 5039.53ms 
iter 7788: loss 2.4250, time 5166.80ms 
iter 7789: loss 2.3642, time 5259.31ms 
iter 7790: loss 2.2690, time 5138.80ms 
iter 7791: loss 2.4033, time 5219.27ms 
iter 7792: loss 2.7177, time 5092.75ms 
iter 7793: loss 2.6756, time 5076.34ms 
iter 7794: loss 2.3945, time 5176.28ms 
iter 7795: loss 2.5222, time 5137.01ms 
iter 7796: loss 2.3945, time 5087.41ms 
iter 7797: loss 2.7083, time 5236.76ms 
iter 7798: loss 2.5155, time 5184.72ms 
iter 7799: loss 2.6501, time 5253.89ms 
step 7800: train loss 2.4875, val loss 2.8374
iter 7800: loss 2.6278, time 19842.62ms 
iter 7801: loss 2.7141, time 4987.27ms 
iter 7802: loss 2.5622, time 5263.80ms 
iter 7803: loss 2.5123, time 5270.19ms 
iter 7804: loss 2.4870, time 5250.82ms 
iter 7805: loss 2.3789, time 5138.92ms 
iter 7806: loss 2.5469, time 5086.92ms 
iter 7807: loss 2.4235, time 5186.75ms 
iter 7808: loss 2.6235, time 5259.08ms 
iter 7809: loss 2.3028, time 5218.27ms 
iter 7810: loss 2.2841, time 5083.39ms 
iter 7811: loss 2.3777, time 5217.44ms 
iter 7812: loss 2.5573, time 5123.71ms 
iter 7813: loss 2.5726, time 5145.63ms 
iter 7814: loss 2.4746, time 5188.55ms 
iter 7815: loss 2.4305, time 5122.64ms 
iter 7816: loss 2.7218, time 5283.19ms 
iter 7817: loss 2.4789, time 5168.27ms 
iter 7818: loss 2.4056, time 5153.97ms 
iter 7819: loss 2.6087, time 5087.99ms 
iter 7820: loss 2.4790, time 5047.20ms 
iter 7821: loss 2.4849, time 5062.17ms 
iter 7822: loss 2.4895, time 5234.52ms 
iter 7823: loss 2.5954, time 5098.97ms 
iter 7824: loss 2.5853, time 5001.20ms 
iter 7825: loss 2.2197, time 5036.97ms 
iter 7826: loss 2.5123, time 5126.49ms 
iter 7827: loss 2.5230, time 5298.55ms 
iter 7828: loss 2.5939, time 5064.09ms 
iter 7829: loss 2.3267, time 5005.12ms 
iter 7830: loss 2.4075, time 5193.70ms 
iter 7831: loss 2.1330, time 5082.13ms 
iter 7832: loss 2.6630, time 5040.10ms 
iter 7833: loss 2.4347, time 5005.64ms 
iter 7834: loss 2.6056, time 5052.95ms 
iter 7835: loss 2.4856, time 5118.44ms 
iter 7836: loss 2.4423, time 5263.28ms 
iter 7837: loss 2.2613, time 5191.07ms 
iter 7838: loss 2.5622, time 5085.30ms 
iter 7839: loss 2.6022, time 5006.59ms 
iter 7840: loss 2.4024, time 4984.93ms 
iter 7841: loss 2.4230, time 5201.52ms 
iter 7842: loss 2.6192, time 5180.61ms 
iter 7843: loss 2.5758, time 5094.80ms 
iter 7844: loss 2.5488, time 5265.06ms 
iter 7845: loss 2.5453, time 5125.65ms 
iter 7846: loss 2.2623, time 5084.18ms 
iter 7847: loss 2.4193, time 5178.77ms 
iter 7848: loss 2.5127, time 5082.08ms 
iter 7849: loss 2.5662, time 5134.01ms 
step 7850: train loss 2.4686, val loss 2.8434
iter 7850: loss 2.6605, time 19934.73ms 
iter 7851: loss 2.4731, time 5092.07ms 
iter 7852: loss 2.5943, time 5048.03ms 
iter 7853: loss 2.5810, time 5052.42ms 
iter 7854: loss 2.3357, time 5100.72ms 
iter 7855: loss 2.5056, time 5284.42ms 
iter 7856: loss 2.3703, time 5129.67ms 
iter 7857: loss 2.5864, time 5062.54ms 
iter 7858: loss 2.3279, time 5072.04ms 
iter 7859: loss 2.7420, time 5005.82ms 
iter 7860: loss 2.6489, time 5003.91ms 
iter 7861: loss 2.2850, time 5226.09ms 
iter 7862: loss 2.2993, time 5192.03ms 
iter 7863: loss 2.4500, time 5096.83ms 
iter 7864: loss 2.6646, time 5133.16ms 
iter 7865: loss 2.4494, time 5113.57ms 
iter 7866: loss 2.4874, time 5266.37ms 
iter 7867: loss 2.4295, time 5081.71ms 
iter 7868: loss 2.3849, time 5054.59ms 
iter 7869: loss 2.5388, time 5223.29ms 
iter 7870: loss 2.6136, time 5079.14ms 
iter 7871: loss 2.5955, time 5143.60ms 
iter 7872: loss 2.4784, time 5122.53ms 
iter 7873: loss 2.7787, time 5052.92ms 
iter 7874: loss 2.4876, time 5118.49ms 
iter 7875: loss 2.6312, time 5290.18ms 
iter 7876: loss 2.5032, time 5121.31ms 
iter 7877: loss 2.4574, time 5105.90ms 
iter 7878: loss 2.4063, time 5151.30ms 
iter 7879: loss 2.6639, time 5151.24ms 
iter 7880: loss 2.3173, time 5184.13ms 
iter 7881: loss 2.6256, time 5125.41ms 
iter 7882: loss 2.6508, time 5100.72ms 
iter 7883: loss 2.3306, time 5239.56ms 
iter 7884: loss 2.6559, time 5132.06ms 
iter 7885: loss 2.4869, time 5093.98ms 
iter 7886: loss 2.5834, time 5080.25ms 
iter 7887: loss 2.4417, time 5061.79ms 
iter 7888: loss 2.5551, time 5231.58ms 
iter 7889: loss 2.4244, time 5128.95ms 
iter 7890: loss 2.6273, time 5077.97ms 
iter 7891: loss 2.6662, time 5193.91ms 
iter 7892: loss 2.5329, time 5082.01ms 
iter 7893: loss 2.7579, time 5053.08ms 
iter 7894: loss 2.6258, time 5057.26ms 
iter 7895: loss 2.4434, time 5038.62ms 
iter 7896: loss 2.7139, time 5193.49ms 
iter 7897: loss 2.4022, time 5123.96ms 
iter 7898: loss 2.6815, time 5090.39ms 
iter 7899: loss 2.4121, time 5060.74ms 
step 7900: train loss 2.4764, val loss 2.8407
iter 7900: loss 2.6153, time 19827.55ms 
iter 7901: loss 2.4046, time 4951.13ms 
iter 7902: loss 2.3999, time 5124.77ms 
iter 7903: loss 2.3744, time 4975.03ms 
iter 7904: loss 2.5396, time 5063.06ms 
iter 7905: loss 2.5078, time 5096.08ms 
iter 7906: loss 2.2626, time 5053.20ms 
iter 7907: loss 2.3808, time 5106.30ms 
iter 7908: loss 2.6014, time 5188.28ms 
iter 7909: loss 2.2691, time 5080.35ms 
iter 7910: loss 2.5970, time 5060.23ms 
iter 7911: loss 2.4011, time 5058.05ms 
iter 7912: loss 2.3822, time 5094.77ms 
iter 7913: loss 2.4418, time 5209.34ms 
iter 7914: loss 2.4927, time 5170.14ms 
iter 7915: loss 2.4649, time 5236.11ms 
iter 7916: loss 2.5444, time 5085.81ms 
iter 7917: loss 2.6940, time 5055.86ms 
iter 7918: loss 2.4658, time 5202.29ms 
iter 7919: loss 2.5450, time 5143.98ms 
iter 7920: loss 2.3953, time 5101.30ms 
iter 7921: loss 2.4835, time 5260.39ms 
iter 7922: loss 2.4298, time 5271.07ms 
iter 7923: loss 2.6624, time 5140.43ms 
iter 7924: loss 2.4201, time 5086.58ms 
iter 7925: loss 2.2515, time 5306.36ms 
iter 7926: loss 2.2681, time 5098.97ms 
iter 7927: loss 2.3323, time 5062.93ms 
iter 7928: loss 2.3690, time 5043.22ms 
iter 7929: loss 2.2397, time 5046.23ms 
iter 7930: loss 2.3898, time 5055.42ms 
iter 7931: loss 2.4183, time 5065.50ms 
iter 7932: loss 2.4725, time 5110.56ms 
iter 7933: loss 2.3781, time 5128.70ms 
iter 7934: loss 2.5921, time 5092.58ms 
iter 7935: loss 2.4867, time 5285.85ms 
iter 7936: loss 2.5619, time 5129.80ms 
iter 7937: loss 2.5938, time 5092.87ms 
iter 7938: loss 2.3508, time 5225.30ms 
iter 7939: loss 2.5201, time 5080.24ms 
iter 7940: loss 2.6346, time 5038.86ms 
iter 7941: loss 2.4825, time 5103.86ms 
iter 7942: loss 2.4899, time 5095.00ms 
iter 7943: loss 2.4293, time 5224.13ms 
iter 7944: loss 2.6010, time 5139.49ms 
iter 7945: loss 2.3323, time 5092.68ms 
iter 7946: loss 2.5005, time 5125.80ms 
iter 7947: loss 2.2420, time 5083.17ms 
iter 7948: loss 2.3445, time 5145.16ms 
iter 7949: loss 2.5687, time 5093.04ms 
step 7950: train loss 2.4888, val loss 2.8529
iter 7950: loss 2.6364, time 19880.85ms 
iter 7951: loss 2.4249, time 5125.45ms 
iter 7952: loss 2.2800, time 5130.38ms 
iter 7953: loss 2.3076, time 5082.53ms 
iter 7954: loss 2.3599, time 5239.02ms 
iter 7955: loss 2.4350, time 5219.35ms 
iter 7956: loss 2.5316, time 5141.83ms 
iter 7957: loss 2.4111, time 5246.86ms 
iter 7958: loss 2.2137, time 5092.43ms 
iter 7959: loss 2.5383, time 5236.76ms 
iter 7960: loss 2.5227, time 5090.31ms 
iter 7961: loss 2.5815, time 5075.77ms 
iter 7962: loss 2.4835, time 5105.33ms 
iter 7963: loss 2.4060, time 5165.21ms 
iter 7964: loss 2.2892, time 5090.20ms 
iter 7965: loss 2.4961, time 5055.76ms 
iter 7966: loss 2.5065, time 5055.73ms 
iter 7967: loss 2.4960, time 5058.47ms 
iter 7968: loss 2.4767, time 5248.97ms 
iter 7969: loss 2.3387, time 5133.48ms 
iter 7970: loss 2.5306, time 5101.22ms 
iter 7971: loss 2.6264, time 5139.82ms 
iter 7972: loss 2.5158, time 5081.68ms 
iter 7973: loss 2.5979, time 5260.21ms 
iter 7974: loss 2.2808, time 5216.30ms 
iter 7975: loss 2.4682, time 5201.65ms 
iter 7976: loss 2.5433, time 5230.34ms 
iter 7977: loss 2.4766, time 5276.36ms 
iter 7978: loss 2.2513, time 5085.49ms 
iter 7979: loss 2.3296, time 5144.59ms 
iter 7980: loss 2.6729, time 5144.57ms 
iter 7981: loss 2.3279, time 5113.56ms 
iter 7982: loss 2.3636, time 5058.36ms 
iter 7983: loss 2.4560, time 5009.10ms 
iter 7984: loss 2.6204, time 5209.87ms 
iter 7985: loss 2.2953, time 5090.82ms 
iter 7986: loss 2.6987, time 5076.48ms 
iter 7987: loss 2.3286, time 5057.54ms 
iter 7988: loss 2.4501, time 5054.90ms 
iter 7989: loss 2.4909, time 5122.97ms 
iter 7990: loss 2.4716, time 5266.18ms 
iter 7991: loss 2.5173, time 5136.51ms 
iter 7992: loss 2.4891, time 5082.85ms 
iter 7993: loss 2.5402, time 5214.50ms 
iter 7994: loss 2.4926, time 5083.52ms 
iter 7995: loss 2.4996, time 5166.39ms 
iter 7996: loss 2.5306, time 5085.61ms 
iter 7997: loss 2.5581, time 5136.24ms 
iter 7998: loss 2.5955, time 5193.75ms 
iter 7999: loss 2.4578, time 5090.21ms 
step 8000: train loss 2.4770, val loss 2.8473
iter 8000: loss 2.5857, time 19905.83ms 
iter 8001: loss 2.6537, time 4995.89ms 
iter 8002: loss 2.5277, time 5014.28ms 
iter 8003: loss 2.5158, time 5260.55ms 
iter 8004: loss 2.7799, time 5139.36ms 
iter 8005: loss 2.2746, time 5139.29ms 
iter 8006: loss 2.5035, time 5248.09ms 
iter 8007: loss 2.5200, time 5122.93ms 
iter 8008: loss 2.3894, time 5153.82ms 
iter 8009: loss 2.3313, time 5051.48ms 
iter 8010: loss 2.6592, time 5014.65ms 
iter 8011: loss 2.2656, time 5071.27ms 
iter 8012: loss 2.5055, time 5120.78ms 
iter 8013: loss 2.2387, time 5048.48ms 
iter 8014: loss 2.4974, time 5273.87ms 
iter 8015: loss 2.5129, time 5132.91ms 
iter 8016: loss 2.3638, time 5099.21ms 
iter 8017: loss 2.4132, time 5267.98ms 
iter 8018: loss 2.5726, time 5127.93ms 
iter 8019: loss 2.4671, time 5078.65ms 
iter 8020: loss 2.5017, time 5060.75ms 
iter 8021: loss 2.4536, time 5017.43ms 
iter 8022: loss 2.2905, time 5075.31ms 
iter 8023: loss 2.6021, time 5263.50ms 
iter 8024: loss 2.6219, time 5194.47ms 
iter 8025: loss 2.4581, time 5088.41ms 
iter 8026: loss 2.5117, time 5049.18ms 
iter 8027: loss 2.5572, time 5035.89ms 
iter 8028: loss 2.6683, time 5189.46ms 
iter 8029: loss 2.4846, time 5082.91ms 
iter 8030: loss 2.4894, time 5070.40ms 
iter 8031: loss 2.4149, time 5272.97ms 
iter 8032: loss 2.5206, time 5286.51ms 
iter 8033: loss 2.4903, time 5103.19ms 
iter 8034: loss 2.3761, time 5127.62ms 
iter 8035: loss 2.4797, time 5103.13ms 
iter 8036: loss 2.4440, time 5087.41ms 
iter 8037: loss 2.4641, time 5175.92ms 
iter 8038: loss 2.4160, time 5101.88ms 
iter 8039: loss 2.7629, time 5049.44ms 
iter 8040: loss 2.6317, time 5155.02ms 
iter 8041: loss 2.6227, time 5115.03ms 
iter 8042: loss 2.5316, time 5090.56ms 
iter 8043: loss 2.1808, time 5193.98ms 
iter 8044: loss 2.3729, time 5031.67ms 
iter 8045: loss 2.3570, time 5242.08ms 
iter 8046: loss 2.4113, time 5134.19ms 
iter 8047: loss 2.7542, time 5119.61ms 
iter 8048: loss 2.6478, time 5236.99ms 
iter 8049: loss 2.4387, time 5084.89ms 
step 8050: train loss 2.4782, val loss 2.8536
iter 8050: loss 2.3993, time 20001.78ms 
iter 8051: loss 2.4681, time 5076.55ms 
iter 8052: loss 2.3804, time 5050.33ms 
iter 8053: loss 2.5212, time 5088.10ms 
iter 8054: loss 2.6072, time 5284.38ms 
iter 8055: loss 2.5446, time 5246.76ms 
iter 8056: loss 2.4917, time 5232.35ms 
iter 8057: loss 2.5666, time 5130.04ms 
iter 8058: loss 2.4766, time 5058.42ms 
iter 8059: loss 2.3861, time 5106.32ms 
iter 8060: loss 2.4787, time 4996.61ms 
iter 8061: loss 2.4255, time 5016.02ms 
iter 8062: loss 2.5882, time 5036.90ms 
iter 8063: loss 2.5264, time 5053.98ms 
iter 8064: loss 2.6096, time 5039.72ms 
iter 8065: loss 2.7906, time 5185.86ms 
iter 8066: loss 2.3623, time 5002.26ms 
iter 8067: loss 2.4038, time 5007.08ms 
iter 8068: loss 2.3968, time 5083.42ms 
iter 8069: loss 2.5310, time 5086.66ms 
iter 8070: loss 2.4682, time 5163.16ms 
iter 8071: loss 2.4981, time 5262.85ms 
iter 8072: loss 2.4019, time 5134.73ms 
iter 8073: loss 2.3492, time 5114.36ms 
iter 8074: loss 2.3628, time 5139.37ms 
iter 8075: loss 2.5420, time 5124.40ms 
iter 8076: loss 2.4028, time 5270.87ms 
iter 8077: loss 2.4227, time 5078.89ms 
iter 8078: loss 2.4118, time 5143.67ms 
iter 8079: loss 2.6215, time 5289.00ms 
iter 8080: loss 2.5798, time 5141.25ms 
iter 8081: loss 2.6590, time 5187.03ms 
iter 8082: loss 2.5127, time 5079.92ms 
iter 8083: loss 2.5843, time 5109.70ms 
iter 8084: loss 2.5847, time 5113.74ms 
iter 8085: loss 2.5157, time 5251.30ms 
iter 8086: loss 2.5713, time 5259.01ms 
iter 8087: loss 2.5061, time 5122.07ms 
iter 8088: loss 2.5283, time 5083.94ms 
iter 8089: loss 2.5233, time 5166.49ms 
iter 8090: loss 2.3624, time 5285.47ms 
iter 8091: loss 2.5180, time 5200.27ms 
iter 8092: loss 2.2753, time 5154.48ms 
iter 8093: loss 2.5362, time 5147.71ms 
iter 8094: loss 2.4254, time 5070.39ms 
iter 8095: loss 2.3572, time 5110.19ms 
iter 8096: loss 2.5852, time 5177.80ms 
iter 8097: loss 2.5389, time 5136.61ms 
iter 8098: loss 2.6661, time 5095.00ms 
iter 8099: loss 2.4989, time 5275.41ms 
step 8100: train loss 2.4956, val loss 2.8363
iter 8100: loss 2.5940, time 19838.78ms 
iter 8101: loss 2.5753, time 4947.33ms 
iter 8102: loss 2.4840, time 5021.47ms 
iter 8103: loss 2.3183, time 5036.12ms 
iter 8104: loss 2.3922, time 5126.57ms 
iter 8105: loss 2.4934, time 5188.99ms 
iter 8106: loss 2.5399, time 5084.91ms 
iter 8107: loss 2.3894, time 5104.76ms 
iter 8108: loss 2.1240, time 5269.67ms 
iter 8109: loss 2.4473, time 5130.09ms 
iter 8110: loss 2.3793, time 5104.74ms 
iter 8111: loss 2.5565, time 5178.54ms 
iter 8112: loss 2.5045, time 5058.58ms 
iter 8113: loss 2.3956, time 5079.52ms 
iter 8114: loss 2.4928, time 5123.02ms 
iter 8115: loss 2.4679, time 5090.12ms 
iter 8116: loss 2.7231, time 5354.62ms 
iter 8117: loss 2.3270, time 5246.41ms 
iter 8118: loss 2.6855, time 5058.57ms 
iter 8119: loss 2.3553, time 5160.52ms 
iter 8120: loss 2.1448, time 5231.01ms 
iter 8121: loss 2.6921, time 5157.81ms 
iter 8122: loss 2.3708, time 5178.06ms 
iter 8123: loss 2.2819, time 5046.49ms 
iter 8124: loss 2.6014, time 5225.41ms 
iter 8125: loss 2.5364, time 5276.52ms 
iter 8126: loss 2.5107, time 5189.05ms 
iter 8127: loss 2.6102, time 5088.48ms 
iter 8128: loss 2.3511, time 5091.99ms 
iter 8129: loss 2.5652, time 5046.98ms 
iter 8130: loss 2.6749, time 5031.62ms 
iter 8131: loss 2.4221, time 4994.81ms 
iter 8132: loss 2.2931, time 4952.00ms 
iter 8133: loss 2.5240, time 4997.97ms 
iter 8134: loss 2.4629, time 4973.71ms 
iter 8135: loss 2.2787, time 4947.42ms 
iter 8136: loss 2.3640, time 5065.96ms 
iter 8137: loss 2.4960, time 4980.02ms 
iter 8138: loss 2.6243, time 5003.48ms 
iter 8139: loss 2.2860, time 5198.44ms 
iter 8140: loss 2.5784, time 5081.45ms 
iter 8141: loss 2.6755, time 5085.77ms 
iter 8142: loss 2.5844, time 5215.38ms 
iter 8143: loss 2.4944, time 5067.77ms 
iter 8144: loss 2.5860, time 5060.01ms 
iter 8145: loss 2.4564, time 5048.32ms 
iter 8146: loss 2.5205, time 5156.70ms 
iter 8147: loss 2.5112, time 5066.76ms 
iter 8148: loss 2.5337, time 5047.06ms 
iter 8149: loss 2.5467, time 5091.22ms 
step 8150: train loss 2.4934, val loss 2.8493
iter 8150: loss 2.6165, time 19990.32ms 
iter 8151: loss 2.1712, time 5137.99ms 
iter 8152: loss 2.5695, time 5080.77ms 
iter 8153: loss 2.4705, time 5265.52ms 
iter 8154: loss 2.2892, time 5101.18ms 
iter 8155: loss 2.4038, time 5054.27ms 
iter 8156: loss 2.4774, time 5080.70ms 
iter 8157: loss 2.5247, time 5080.06ms 
iter 8158: loss 2.3479, time 5051.17ms 
iter 8159: loss 2.4004, time 5159.26ms 
iter 8160: loss 2.3391, time 5082.00ms 
iter 8161: loss 2.3289, time 5064.00ms 
iter 8162: loss 2.6524, time 5136.03ms 
iter 8163: loss 2.5265, time 5084.54ms 
iter 8164: loss 2.5179, time 5253.09ms 
iter 8165: loss 2.3956, time 5299.09ms 
iter 8166: loss 2.3062, time 5141.12ms 
iter 8167: loss 2.6816, time 5074.69ms 
iter 8168: loss 2.1436, time 5111.70ms 
iter 8169: loss 2.5875, time 5102.03ms 
iter 8170: loss 2.7081, time 5044.30ms 
iter 8171: loss 2.1766, time 5016.35ms 
iter 8172: loss 2.3597, time 5086.09ms 
iter 8173: loss 2.4527, time 5162.41ms 
iter 8174: loss 2.5198, time 5057.77ms 
iter 8175: loss 2.5767, time 5055.90ms 
iter 8176: loss 2.4200, time 5146.74ms 
iter 8177: loss 2.6280, time 5096.08ms 
iter 8178: loss 2.3751, time 5074.45ms 
iter 8179: loss 2.4906, time 5164.07ms 
iter 8180: loss 2.4147, time 5162.62ms 
iter 8181: loss 2.3677, time 5028.57ms 
iter 8182: loss 2.5256, time 5017.66ms 
iter 8183: loss 2.5651, time 5014.65ms 
iter 8184: loss 2.3377, time 5021.06ms 
iter 8185: loss 2.4684, time 5035.55ms 
iter 8186: loss 2.6303, time 5004.05ms 
iter 8187: loss 2.4406, time 5215.38ms 
iter 8188: loss 2.4665, time 5126.07ms 
iter 8189: loss 2.4949, time 5168.28ms 
iter 8190: loss 2.5585, time 5089.56ms 
iter 8191: loss 2.1420, time 4983.82ms 
iter 8192: loss 2.2878, time 4977.16ms 
iter 8193: loss 2.5022, time 5069.96ms 
iter 8194: loss 2.3448, time 5055.53ms 
iter 8195: loss 2.4971, time 5123.87ms 
iter 8196: loss 2.4417, time 5044.94ms 
iter 8197: loss 2.4215, time 5018.34ms 
iter 8198: loss 2.5354, time 5007.85ms 
iter 8199: loss 2.5107, time 5026.26ms 
step 8200: train loss 2.4772, val loss 2.8582
iter 8200: loss 2.4395, time 20095.35ms 
iter 8201: loss 2.5538, time 5204.43ms 
iter 8202: loss 2.4760, time 5149.92ms 
iter 8203: loss 2.6724, time 5183.04ms 
iter 8204: loss 2.6196, time 5234.74ms 
iter 8205: loss 2.5345, time 5074.14ms 
iter 8206: loss 2.4104, time 5134.40ms 
iter 8207: loss 2.4627, time 5091.01ms 
iter 8208: loss 2.3238, time 5207.52ms 
iter 8209: loss 2.6022, time 5245.82ms 
iter 8210: loss 2.4775, time 5097.71ms 
iter 8211: loss 2.6331, time 5187.69ms 
iter 8212: loss 2.2254, time 5154.34ms 
iter 8213: loss 2.4514, time 5155.83ms 
iter 8214: loss 2.5323, time 4990.24ms 
iter 8215: loss 2.5602, time 4988.38ms 
iter 8216: loss 2.4015, time 5109.69ms 
iter 8217: loss 2.5003, time 5265.21ms 
iter 8218: loss 2.4304, time 5126.59ms 
iter 8219: loss 2.5545, time 5036.48ms 
iter 8220: loss 2.5141, time 4988.92ms 
iter 8221: loss 2.3713, time 5020.33ms 
iter 8222: loss 2.6079, time 5037.92ms 
iter 8223: loss 2.6004, time 4952.69ms 
iter 8224: loss 2.4020, time 4970.34ms 
iter 8225: loss 2.5280, time 5043.93ms 
iter 8226: loss 2.6713, time 4974.38ms 
iter 8227: loss 2.5465, time 5000.14ms 
iter 8228: loss 2.3547, time 4950.49ms 
iter 8229: loss 2.3595, time 4960.55ms 
iter 8230: loss 2.4902, time 4946.33ms 
iter 8231: loss 2.4255, time 4968.81ms 
iter 8232: loss 2.4547, time 4950.25ms 
iter 8233: loss 2.6301, time 4952.90ms 
iter 8234: loss 2.2605, time 4953.93ms 
iter 8235: loss 2.5209, time 4957.42ms 
iter 8236: loss 2.3892, time 4949.50ms 
iter 8237: loss 2.6247, time 5076.70ms 
iter 8238: loss 2.3911, time 4980.27ms 
iter 8239: loss 2.5385, time 5025.24ms 
iter 8240: loss 2.5680, time 4995.09ms 
iter 8241: loss 2.5022, time 4979.84ms 
iter 8242: loss 2.5075, time 4946.92ms 
iter 8243: loss 2.6079, time 4961.16ms 
iter 8244: loss 2.4821, time 5000.69ms 
iter 8245: loss 2.3571, time 5199.23ms 
iter 8246: loss 2.3607, time 5227.64ms 
iter 8247: loss 2.5788, time 5133.83ms 
iter 8248: loss 2.5556, time 5095.56ms 
iter 8249: loss 2.5059, time 5093.77ms 
step 8250: train loss 2.4876, val loss 2.8403
iter 8250: loss 2.2439, time 19883.78ms 
iter 8251: loss 2.3676, time 5010.22ms 
iter 8252: loss 2.4355, time 5295.68ms 
iter 8253: loss 2.3823, time 5199.45ms 
iter 8254: loss 2.6623, time 5080.68ms 
iter 8255: loss 2.5241, time 5066.13ms 
iter 8256: loss 2.5041, time 5089.03ms 
iter 8257: loss 2.4083, time 5045.92ms 
iter 8258: loss 2.3706, time 5065.71ms 
iter 8259: loss 2.5037, time 5031.98ms 
iter 8260: loss 2.5619, time 5032.33ms 
iter 8261: loss 2.3146, time 5024.43ms 
iter 8262: loss 2.5762, time 5215.48ms 
iter 8263: loss 2.4781, time 5091.25ms 
iter 8264: loss 2.3980, time 5034.41ms 
iter 8265: loss 2.4320, time 5009.13ms 
iter 8266: loss 2.3175, time 5013.76ms 
iter 8267: loss 2.5487, time 5025.59ms 
iter 8268: loss 2.3501, time 5051.85ms 
iter 8269: loss 2.4893, time 5022.60ms 
iter 8270: loss 2.3151, time 5048.41ms 
iter 8271: loss 2.5269, time 5100.29ms 
iter 8272: loss 2.6675, time 5059.35ms 
iter 8273: loss 2.3390, time 5253.96ms 
iter 8274: loss 2.3783, time 5136.26ms 
iter 8275: loss 2.6633, time 5088.72ms 
iter 8276: loss 2.3913, time 5239.44ms 
iter 8277: loss 2.4263, time 5005.25ms 
iter 8278: loss 2.4680, time 4976.85ms 
iter 8279: loss 2.4653, time 5094.46ms 
iter 8280: loss 2.3220, time 5048.48ms 
iter 8281: loss 2.1359, time 5055.08ms 
iter 8282: loss 2.2992, time 5254.12ms 
iter 8283: loss 2.5420, time 5143.81ms 
iter 8284: loss 2.5119, time 5100.54ms 
iter 8285: loss 2.4227, time 5274.56ms 
iter 8286: loss 2.4344, time 5155.39ms 
iter 8287: loss 2.3923, time 5066.74ms 
iter 8288: loss 2.3821, time 5212.11ms 
iter 8289: loss 2.4449, time 5008.22ms 
iter 8290: loss 2.5997, time 5005.13ms 
iter 8291: loss 2.7003, time 5244.34ms 
iter 8292: loss 2.5578, time 5085.14ms 
iter 8293: loss 2.4880, time 5132.15ms 
iter 8294: loss 2.4290, time 5282.11ms 
iter 8295: loss 2.2479, time 5126.77ms 
iter 8296: loss 2.5444, time 5160.48ms 
iter 8297: loss 2.4942, time 5210.51ms 
iter 8298: loss 2.3558, time 5055.52ms 
iter 8299: loss 2.4295, time 5053.69ms 
step 8300: train loss 2.4703, val loss 2.8472
iter 8300: loss 2.5822, time 19890.46ms 
iter 8301: loss 2.3415, time 5045.52ms 
iter 8302: loss 2.4983, time 5142.67ms 
iter 8303: loss 2.6043, time 5247.55ms 
iter 8304: loss 2.1698, time 5189.72ms 
iter 8305: loss 2.6415, time 5078.17ms 
iter 8306: loss 2.5156, time 5023.15ms 
iter 8307: loss 2.5260, time 5052.36ms 
iter 8308: loss 2.3403, time 5228.09ms 
iter 8309: loss 2.5711, time 5099.37ms 
iter 8310: loss 2.5072, time 5044.95ms 
iter 8311: loss 2.4933, time 5208.11ms 
iter 8312: loss 2.5946, time 5092.84ms 
iter 8313: loss 2.6006, time 5138.43ms 
iter 8314: loss 2.3086, time 5231.89ms 
iter 8315: loss 2.7060, time 5059.49ms 
iter 8316: loss 2.7374, time 5139.82ms 
iter 8317: loss 2.5723, time 5202.30ms 
iter 8318: loss 2.4250, time 5079.27ms 
iter 8319: loss 2.5159, time 5061.24ms 
iter 8320: loss 2.4752, time 5043.93ms 
iter 8321: loss 2.3674, time 5023.01ms 
iter 8322: loss 2.6418, time 5007.00ms 
iter 8323: loss 2.6657, time 5070.39ms 
iter 8324: loss 2.4554, time 5099.94ms 
iter 8325: loss 2.5200, time 5113.38ms 
iter 8326: loss 2.4348, time 5157.20ms 
iter 8327: loss 2.2962, time 5093.69ms 
iter 8328: loss 2.4104, time 5077.65ms 
iter 8329: loss 2.4814, time 5048.52ms 
iter 8330: loss 2.5816, time 5164.96ms 
iter 8331: loss 2.4524, time 5096.15ms 
iter 8332: loss 2.5557, time 5063.00ms 
iter 8333: loss 2.5288, time 5046.30ms 
iter 8334: loss 2.4081, time 5053.46ms 
iter 8335: loss 2.3985, time 5131.77ms 
iter 8336: loss 2.6220, time 5149.07ms 
iter 8337: loss 2.4233, time 5063.25ms 
iter 8338: loss 2.2283, time 5257.05ms 
iter 8339: loss 2.4301, time 5145.27ms 
iter 8340: loss 2.4099, time 5096.59ms 
iter 8341: loss 2.6930, time 5043.56ms 
iter 8342: loss 2.5053, time 4974.61ms 
iter 8343: loss 2.6168, time 5014.24ms 
iter 8344: loss 2.5608, time 5055.84ms 
iter 8345: loss 2.4077, time 5037.30ms 
iter 8346: loss 2.5328, time 5135.64ms 
iter 8347: loss 2.5949, time 5072.27ms 
iter 8348: loss 2.6548, time 4971.10ms 
iter 8349: loss 2.5442, time 4957.90ms 
step 8350: train loss 2.4751, val loss 2.8327
iter 8350: loss 2.5201, time 19838.46ms 
iter 8351: loss 2.5150, time 4945.45ms 
iter 8352: loss 2.4892, time 5076.00ms 
iter 8353: loss 2.2739, time 5042.58ms 
iter 8354: loss 2.4566, time 5057.82ms 
iter 8355: loss 2.4821, time 5117.49ms 
iter 8356: loss 2.4137, time 5285.12ms 
iter 8357: loss 2.3794, time 5146.32ms 
iter 8358: loss 2.4260, time 5107.80ms 
iter 8359: loss 2.4266, time 4990.09ms 
iter 8360: loss 2.4212, time 5027.47ms 
iter 8361: loss 2.4699, time 5166.34ms 
iter 8362: loss 2.4070, time 5080.12ms 
iter 8363: loss 2.5556, time 5039.88ms 
iter 8364: loss 2.3156, time 5147.76ms 
iter 8365: loss 2.2591, time 5012.78ms 
iter 8366: loss 2.6092, time 4976.16ms 
iter 8367: loss 2.2839, time 4973.31ms 
iter 8368: loss 2.3746, time 4985.69ms 
iter 8369: loss 2.3800, time 4978.87ms 
iter 8370: loss 2.4511, time 5033.38ms 
iter 8371: loss 2.2579, time 5058.50ms 
iter 8372: loss 2.5310, time 5042.18ms 
iter 8373: loss 2.5956, time 5184.25ms 
iter 8374: loss 2.6210, time 5044.42ms 
iter 8375: loss 2.2766, time 5022.99ms 
iter 8376: loss 2.5767, time 5016.28ms 
iter 8377: loss 2.3716, time 5028.02ms 
iter 8378: loss 2.4213, time 5037.31ms 
iter 8379: loss 2.5985, time 5034.10ms 
iter 8380: loss 2.5658, time 5081.60ms 
iter 8381: loss 2.4936, time 5266.71ms 
iter 8382: loss 2.3784, time 5134.04ms 
iter 8383: loss 2.6235, time 5043.59ms 
iter 8384: loss 2.4843, time 5172.26ms 
iter 8385: loss 2.5641, time 5084.44ms 
iter 8386: loss 2.5660, time 5078.14ms 
iter 8387: loss 2.4218, time 5259.63ms 
iter 8388: loss 2.3847, time 5100.76ms 
iter 8389: loss 2.1452, time 5099.15ms 
iter 8390: loss 2.3898, time 5286.58ms 
iter 8391: loss 2.4715, time 5058.00ms 
iter 8392: loss 2.7221, time 5082.87ms 
iter 8393: loss 2.5059, time 5189.84ms 
iter 8394: loss 2.4656, time 5133.94ms 
iter 8395: loss 2.4241, time 5098.35ms 
iter 8396: loss 2.3859, time 5049.19ms 
iter 8397: loss 2.4447, time 5048.57ms 
iter 8398: loss 2.3782, time 5079.10ms 
iter 8399: loss 2.5875, time 5060.24ms 
step 8400: train loss 2.4719, val loss 2.8510
iter 8400: loss 2.2948, time 19873.50ms 
iter 8401: loss 2.2889, time 5058.51ms 
iter 8402: loss 2.5114, time 5055.07ms 
iter 8403: loss 2.4320, time 5062.16ms 
iter 8404: loss 2.2099, time 5058.18ms 
iter 8405: loss 2.6464, time 5055.27ms 
iter 8406: loss 2.5245, time 5110.26ms 
iter 8407: loss 2.2941, time 5163.06ms 
iter 8408: loss 2.4350, time 5089.42ms 
iter 8409: loss 2.5110, time 5112.30ms 
iter 8410: loss 2.4116, time 5224.90ms 
iter 8411: loss 2.3771, time 5159.56ms 
iter 8412: loss 2.3555, time 5194.87ms 
iter 8413: loss 2.4836, time 5321.00ms 
iter 8414: loss 2.7928, time 5151.25ms 
iter 8415: loss 2.4769, time 5125.44ms 
iter 8416: loss 2.3087, time 5270.08ms 
iter 8417: loss 2.4320, time 5107.94ms 
iter 8418: loss 2.4128, time 5095.13ms 
iter 8419: loss 2.3514, time 5257.85ms 
iter 8420: loss 2.5001, time 5099.57ms 
iter 8421: loss 2.5744, time 5125.67ms 
iter 8422: loss 2.3980, time 5282.89ms 
iter 8423: loss 2.3313, time 5143.71ms 
iter 8424: loss 2.2082, time 5134.22ms 
iter 8425: loss 2.3554, time 5194.74ms 
iter 8426: loss 2.6584, time 5087.83ms 
iter 8427: loss 2.4309, time 5061.97ms 
iter 8428: loss 2.5031, time 5064.14ms 
iter 8429: loss 2.5350, time 5063.02ms 
iter 8430: loss 2.3496, time 5063.56ms 
iter 8431: loss 2.1930, time 5059.91ms 
iter 8432: loss 2.4822, time 5087.09ms 
iter 8433: loss 2.3436, time 5063.24ms 
iter 8434: loss 2.4289, time 5030.16ms 
iter 8435: loss 2.3858, time 5191.32ms 
iter 8436: loss 2.5785, time 5142.37ms 
iter 8437: loss 2.3643, time 5081.60ms 
iter 8438: loss 2.4813, time 5187.46ms 
iter 8439: loss 2.3860, time 5178.40ms 
iter 8440: loss 2.5287, time 5087.96ms 
iter 8441: loss 2.6962, time 5033.32ms 
iter 8442: loss 2.5487, time 5040.21ms 
iter 8443: loss 2.4814, time 5031.95ms 
iter 8444: loss 2.2616, time 5243.01ms 
iter 8445: loss 2.2979, time 5132.65ms 
iter 8446: loss 2.3394, time 5094.57ms 
iter 8447: loss 2.6080, time 5169.31ms 
iter 8448: loss 2.5559, time 5138.84ms 
iter 8449: loss 2.4812, time 5081.58ms 
step 8450: train loss 2.4708, val loss 2.8347
iter 8450: loss 2.5704, time 20076.25ms 
iter 8451: loss 2.3582, time 5138.03ms 
iter 8452: loss 2.5569, time 5082.21ms 
iter 8453: loss 2.3368, time 5074.21ms 
iter 8454: loss 2.4901, time 5092.71ms 
iter 8455: loss 2.3596, time 5047.71ms 
iter 8456: loss 2.4490, time 5028.12ms 
iter 8457: loss 2.3488, time 5034.57ms 
iter 8458: loss 2.4961, time 5021.36ms 
iter 8459: loss 2.5435, time 4972.77ms 
iter 8460: loss 2.5130, time 4972.88ms 
iter 8461: loss 2.4144, time 4972.69ms 
iter 8462: loss 2.4802, time 4972.08ms 
iter 8463: loss 2.4614, time 4972.38ms 
iter 8464: loss 2.6262, time 4980.02ms 
iter 8465: loss 2.5846, time 4978.08ms 
iter 8466: loss 2.3744, time 5000.96ms 
iter 8467: loss 2.3817, time 5048.35ms 
iter 8468: loss 2.5009, time 5099.86ms 
iter 8469: loss 2.5144, time 5167.54ms 
iter 8470: loss 2.5559, time 5240.70ms 
iter 8471: loss 2.3414, time 5078.90ms 
iter 8472: loss 2.6061, time 5053.90ms 
iter 8473: loss 2.4891, time 5146.46ms 
iter 8474: loss 2.6602, time 5083.37ms 
iter 8475: loss 2.3588, time 5196.71ms 
iter 8476: loss 2.3031, time 5161.22ms 
iter 8477: loss 2.4598, time 5109.53ms 
iter 8478: loss 2.2726, time 5232.69ms 
iter 8479: loss 2.3542, time 5278.54ms 
iter 8480: loss 2.6955, time 5103.70ms 
iter 8481: loss 2.4922, time 5050.22ms 
iter 8482: loss 2.5396, time 5239.65ms 
iter 8483: loss 2.5388, time 5090.08ms 
iter 8484: loss 2.1965, time 5127.28ms 
iter 8485: loss 2.5160, time 5264.81ms 
iter 8486: loss 2.3767, time 5129.24ms 
iter 8487: loss 2.5553, time 5170.48ms 
iter 8488: loss 2.5756, time 5269.17ms 
iter 8489: loss 2.4972, time 5196.79ms 
iter 8490: loss 2.3058, time 5228.96ms 
iter 8491: loss 2.5457, time 5151.90ms 
iter 8492: loss 2.5338, time 5078.24ms 
iter 8493: loss 2.5561, time 5031.96ms 
iter 8494: loss 2.4509, time 5243.63ms 
iter 8495: loss 2.4812, time 5089.13ms 
iter 8496: loss 2.3662, time 5085.31ms 
iter 8497: loss 2.6803, time 5270.94ms 
iter 8498: loss 2.3458, time 5142.29ms 
iter 8499: loss 2.2753, time 5121.22ms 
step 8500: train loss 2.4771, val loss 2.8458
iter 8500: loss 2.4136, time 19899.82ms 
iter 8501: loss 2.4253, time 5063.67ms 
iter 8502: loss 2.2686, time 5065.41ms 
iter 8503: loss 2.5256, time 5166.78ms 
iter 8504: loss 2.3256, time 5128.17ms 
iter 8505: loss 2.3325, time 5090.78ms 
iter 8506: loss 2.4711, time 5097.04ms 
iter 8507: loss 2.3209, time 5087.15ms 
iter 8508: loss 2.4774, time 5014.03ms 
iter 8509: loss 2.4432, time 4981.79ms 
iter 8510: loss 2.6000, time 4979.28ms 
iter 8511: loss 2.3056, time 5011.08ms 
iter 8512: loss 2.3100, time 5104.52ms 
iter 8513: loss 2.4448, time 5116.02ms 
iter 8514: loss 2.7497, time 5003.77ms 
iter 8515: loss 2.3421, time 5051.88ms 
iter 8516: loss 2.3728, time 5073.46ms 
iter 8517: loss 2.1589, time 5063.12ms 
iter 8518: loss 2.4042, time 5152.71ms 
iter 8519: loss 2.3069, time 5130.58ms 
iter 8520: loss 2.6385, time 5131.37ms 
iter 8521: loss 2.3900, time 5083.05ms 
iter 8522: loss 2.5202, time 5152.07ms 
iter 8523: loss 2.3033, time 5144.24ms 
iter 8524: loss 2.4465, time 5088.65ms 
iter 8525: loss 2.4791, time 5131.18ms 
iter 8526: loss 2.4904, time 5130.08ms 
iter 8527: loss 2.5346, time 5063.44ms 
iter 8528: loss 2.5114, time 5278.87ms 
iter 8529: loss 2.5002, time 5088.00ms 
iter 8530: loss 2.5490, time 5053.09ms 
iter 8531: loss 2.6705, time 5209.34ms 
iter 8532: loss 2.4856, time 5088.54ms 
iter 8533: loss 2.3543, time 5039.01ms 
iter 8534: loss 2.4963, time 4973.16ms 
iter 8535: loss 2.4189, time 4953.13ms 
iter 8536: loss 2.4560, time 4946.35ms 
iter 8537: loss 2.4463, time 5137.06ms 
iter 8538: loss 2.4322, time 5079.59ms 
iter 8539: loss 2.2595, time 5059.17ms 
iter 8540: loss 2.5510, time 5083.43ms 
iter 8541: loss 2.2918, time 5079.55ms 
iter 8542: loss 2.2055, time 5052.39ms 
iter 8543: loss 2.5560, time 5064.33ms 
iter 8544: loss 2.4810, time 5048.77ms 
iter 8545: loss 2.4568, time 5054.52ms 
iter 8546: loss 2.4313, time 5131.87ms 
iter 8547: loss 2.3616, time 5085.87ms 
iter 8548: loss 2.4550, time 5056.57ms 
iter 8549: loss 2.2744, time 5066.25ms 
step 8550: train loss 2.4683, val loss 2.8537
iter 8550: loss 2.9452, time 19788.55ms 
iter 8551: loss 2.5382, time 4951.98ms 
iter 8552: loss 2.3913, time 4946.37ms 
iter 8553: loss 2.6981, time 4945.75ms 
iter 8554: loss 2.4890, time 4950.56ms 
iter 8555: loss 2.4510, time 4945.23ms 
iter 8556: loss 2.5384, time 5068.36ms 
iter 8557: loss 2.5154, time 5261.12ms 
iter 8558: loss 2.3051, time 5127.26ms 
iter 8559: loss 2.5025, time 5135.48ms 
iter 8560: loss 2.4044, time 5117.13ms 
iter 8561: loss 2.4358, time 5072.14ms 
iter 8562: loss 2.1363, time 4983.04ms 
iter 8563: loss 2.6456, time 5127.55ms 
iter 8564: loss 2.4580, time 5074.87ms 
iter 8565: loss 2.6784, time 5232.32ms 
iter 8566: loss 2.5268, time 5276.21ms 
iter 8567: loss 2.4685, time 5135.79ms 
iter 8568: loss 2.6520, time 5076.44ms 
iter 8569: loss 2.5717, time 5043.81ms 
iter 8570: loss 2.4214, time 5170.58ms 
iter 8571: loss 2.4585, time 5280.40ms 
iter 8572: loss 2.3769, time 5196.00ms 
iter 8573: loss 2.5192, time 5115.51ms 
iter 8574: loss 2.6854, time 5196.43ms 
iter 8575: loss 2.5063, time 5311.43ms 
iter 8576: loss 2.4713, time 5162.81ms 
iter 8577: loss 1.9900, time 5094.22ms 
iter 8578: loss 2.5391, time 5276.04ms 
iter 8579: loss 2.3823, time 5178.43ms 
iter 8580: loss 2.3204, time 5150.93ms 
iter 8581: loss 2.3846, time 5245.22ms 
iter 8582: loss 2.5384, time 5094.79ms 
iter 8583: loss 2.2533, time 5108.00ms 
iter 8584: loss 2.6395, time 5287.42ms 
iter 8585: loss 2.4114, time 5185.99ms 
iter 8586: loss 2.5955, time 5131.27ms 
iter 8587: loss 2.4858, time 5120.30ms 
iter 8588: loss 2.3552, time 5048.02ms 
iter 8589: loss 2.5978, time 5050.04ms 
iter 8590: loss 2.5771, time 5060.28ms 
iter 8591: loss 2.3270, time 5065.59ms 
iter 8592: loss 2.5393, time 5033.07ms 
iter 8593: loss 2.5521, time 5025.24ms 
iter 8594: loss 2.4910, time 5153.48ms 
iter 8595: loss 2.5687, time 5090.46ms 
iter 8596: loss 2.4644, time 5188.54ms 
iter 8597: loss 2.4747, time 5151.60ms 
iter 8598: loss 2.4891, time 5083.52ms 
iter 8599: loss 2.2052, time 5240.28ms 
step 8600: train loss 2.4656, val loss 2.8606
iter 8600: loss 2.7674, time 19942.77ms 
iter 8601: loss 2.6165, time 4985.58ms 
iter 8602: loss 2.5314, time 5062.64ms 
iter 8603: loss 2.3103, time 5030.85ms 
iter 8604: loss 2.4540, time 5031.45ms 
iter 8605: loss 2.4681, time 5031.64ms 
iter 8606: loss 2.3977, time 5114.02ms 
iter 8607: loss 2.4773, time 5045.54ms 
iter 8608: loss 2.3279, time 5020.79ms 
iter 8609: loss 2.2185, time 5222.70ms 
iter 8610: loss 2.4592, time 5086.20ms 
iter 8611: loss 2.4339, time 5044.00ms 
iter 8612: loss 2.6953, time 5230.21ms 
iter 8613: loss 2.5844, time 5090.48ms 
iter 8614: loss 2.3807, time 5068.83ms 
iter 8615: loss 2.4957, time 5150.38ms 
iter 8616: loss 2.3336, time 5077.69ms 
iter 8617: loss 2.3978, time 5167.99ms 
iter 8618: loss 2.3724, time 5168.06ms 
iter 8619: loss 2.5827, time 5029.39ms 
iter 8620: loss 2.6511, time 4972.59ms 
iter 8621: loss 2.4094, time 4970.36ms 
iter 8622: loss 2.4838, time 4951.34ms 
iter 8623: loss 2.7410, time 4953.79ms 
iter 8624: loss 2.3058, time 4949.69ms 
iter 8625: loss 2.7813, time 4947.00ms 
iter 8626: loss 2.5027, time 5043.61ms 
iter 8627: loss 2.3637, time 5132.83ms 
iter 8628: loss 2.4716, time 5286.95ms 
iter 8629: loss 2.4718, time 5151.31ms 
iter 8630: loss 2.3599, time 5075.50ms 
iter 8631: loss 2.5541, time 5202.97ms 
iter 8632: loss 2.5752, time 5027.58ms 
iter 8633: loss 2.4772, time 5081.29ms 
iter 8634: loss 2.7848, time 5258.65ms 
iter 8635: loss 2.6291, time 5128.99ms 
iter 8636: loss 2.3294, time 5200.29ms 
iter 8637: loss 2.3078, time 5099.29ms 
iter 8638: loss 2.6800, time 5011.30ms 
iter 8639: loss 2.5523, time 5020.40ms 
iter 8640: loss 2.5984, time 5094.80ms 
iter 8641: loss 2.4121, time 5082.30ms 
iter 8642: loss 2.6391, time 5091.49ms 
iter 8643: loss 2.4993, time 5062.16ms 
iter 8644: loss 2.4360, time 5218.72ms 
iter 8645: loss 2.4460, time 5271.19ms 
iter 8646: loss 2.5684, time 5160.57ms 
iter 8647: loss 2.3160, time 5062.56ms 
iter 8648: loss 2.3433, time 5037.24ms 
iter 8649: loss 2.3460, time 5201.53ms 
step 8650: train loss 2.4648, val loss 2.8581
iter 8650: loss 2.4179, time 20082.30ms 
iter 8651: loss 2.4516, time 5088.57ms 
iter 8652: loss 2.5082, time 5051.93ms 
iter 8653: loss 2.1410, time 5055.79ms 
iter 8654: loss 2.4827, time 5111.70ms 
iter 8655: loss 2.4770, time 5062.24ms 
iter 8656: loss 2.5470, time 5011.22ms 
iter 8657: loss 2.5906, time 5061.11ms 
iter 8658: loss 2.4888, time 5261.78ms 
iter 8659: loss 2.5831, time 5147.02ms 
iter 8660: loss 2.5660, time 5129.84ms 
iter 8661: loss 2.4948, time 5070.97ms 
iter 8662: loss 2.6076, time 5086.81ms 
iter 8663: loss 2.5291, time 5148.70ms 
iter 8664: loss 2.5095, time 5246.14ms 
iter 8665: loss 2.5195, time 5141.05ms 
iter 8666: loss 2.6374, time 5239.61ms 
iter 8667: loss 2.3618, time 5081.66ms 
iter 8668: loss 2.5154, time 5174.23ms 
iter 8669: loss 2.3560, time 5266.83ms 
iter 8670: loss 2.3830, time 5216.11ms 
iter 8671: loss 2.4722, time 5089.97ms 
iter 8672: loss 2.4603, time 5053.64ms 
iter 8673: loss 2.4224, time 5033.48ms 
iter 8674: loss 2.3452, time 5053.92ms 
iter 8675: loss 2.4871, time 5107.35ms 
iter 8676: loss 2.5090, time 5226.14ms 
iter 8677: loss 2.5304, time 5080.17ms 
iter 8678: loss 2.4465, time 5218.67ms 
iter 8679: loss 2.0474, time 5126.77ms 
iter 8680: loss 2.5973, time 5092.23ms 
iter 8681: loss 2.3749, time 5266.10ms 
iter 8682: loss 2.4825, time 5132.24ms 
iter 8683: loss 2.4021, time 5081.52ms 
iter 8684: loss 2.3991, time 5182.97ms 
iter 8685: loss 2.6292, time 5015.97ms 
iter 8686: loss 2.6955, time 5117.77ms 
iter 8687: loss 2.6253, time 5136.93ms 
iter 8688: loss 2.6551, time 5139.35ms 
iter 8689: loss 2.3675, time 5270.57ms 
iter 8690: loss 2.3074, time 5138.69ms 
iter 8691: loss 2.4631, time 5132.13ms 
iter 8692: loss 2.3110, time 5140.10ms 
iter 8693: loss 2.4712, time 5080.69ms 
iter 8694: loss 2.4451, time 5056.40ms 
iter 8695: loss 2.5191, time 5064.43ms 
iter 8696: loss 2.4847, time 5051.18ms 
iter 8697: loss 2.2506, time 5219.99ms 
iter 8698: loss 2.2755, time 5141.53ms 
iter 8699: loss 2.3971, time 5142.86ms 
step 8700: train loss 2.4811, val loss 2.8540
iter 8700: loss 2.5209, time 19853.81ms 
iter 8701: loss 2.7996, time 4950.67ms 
iter 8702: loss 2.6774, time 5129.85ms 
iter 8703: loss 2.4897, time 5132.34ms 
iter 8704: loss 2.3694, time 5079.99ms 
iter 8705: loss 2.6483, time 5095.86ms 
iter 8706: loss 2.4559, time 5044.06ms 
iter 8707: loss 2.2778, time 5044.13ms 
iter 8708: loss 2.4900, time 5044.14ms 
iter 8709: loss 2.3477, time 5094.06ms 
iter 8710: loss 2.4276, time 5174.12ms 
iter 8711: loss 2.4146, time 5084.49ms 
iter 8712: loss 2.5613, time 5053.81ms 
iter 8713: loss 2.6027, time 5060.87ms 
iter 8714: loss 2.3981, time 5074.88ms 
iter 8715: loss 2.4636, time 5247.51ms 
iter 8716: loss 2.2935, time 5089.27ms 
iter 8717: loss 2.4401, time 5066.03ms 
iter 8718: loss 2.6144, time 5160.10ms 
iter 8719: loss 2.4874, time 5067.71ms 
iter 8720: loss 2.4960, time 5041.96ms 
iter 8721: loss 2.4913, time 5035.72ms 
iter 8722: loss 2.1683, time 5047.41ms 
iter 8723: loss 2.5333, time 5277.09ms 
iter 8724: loss 2.5124, time 5107.22ms 
iter 8725: loss 2.4843, time 5045.32ms 
iter 8726: loss 2.5151, time 5249.62ms 
iter 8727: loss 2.3396, time 5149.02ms 
iter 8728: loss 2.4396, time 5136.99ms 
iter 8729: loss 2.5580, time 5136.48ms 
iter 8730: loss 2.3250, time 5095.96ms 
iter 8731: loss 2.6557, time 5250.61ms 
iter 8732: loss 2.3217, time 5120.44ms 
iter 8733: loss 2.4641, time 5095.96ms 
iter 8734: loss 2.3078, time 5198.81ms 
iter 8735: loss 2.5159, time 5087.95ms 
iter 8736: loss 2.4966, time 5154.12ms 
iter 8737: loss 2.4464, time 5130.78ms 
iter 8738: loss 2.5183, time 5094.51ms 
iter 8739: loss 2.5212, time 5242.49ms 
iter 8740: loss 2.4006, time 5163.27ms 
iter 8741: loss 2.4803, time 5126.26ms 
iter 8742: loss 2.4382, time 5087.00ms 
iter 8743: loss 2.3930, time 5054.49ms 
iter 8744: loss 2.6720, time 5015.62ms 
iter 8745: loss 2.5607, time 5013.30ms 
iter 8746: loss 2.4281, time 5019.71ms 
iter 8747: loss 2.4843, time 5018.94ms 
iter 8748: loss 2.4359, time 5016.94ms 
iter 8749: loss 2.3927, time 5015.61ms 
step 8750: train loss 2.4613, val loss 2.8423
iter 8750: loss 2.3208, time 19841.46ms 
iter 8751: loss 2.4399, time 4945.21ms 
iter 8752: loss 2.5966, time 4943.79ms 
iter 8753: loss 2.6201, time 4947.16ms 
iter 8754: loss 2.4361, time 5164.95ms 
iter 8755: loss 2.5436, time 5176.90ms 
iter 8756: loss 2.2028, time 5102.05ms 
iter 8757: loss 2.3829, time 5012.70ms 
iter 8758: loss 2.1864, time 5016.74ms 
iter 8759: loss 2.5057, time 5148.53ms 
iter 8760: loss 2.6340, time 5134.26ms 
iter 8761: loss 2.3405, time 5090.31ms 
iter 8762: loss 2.5809, time 5234.40ms 
iter 8763: loss 2.5336, time 5278.05ms 
iter 8764: loss 2.3331, time 5084.84ms 
iter 8765: loss 2.4864, time 5127.06ms 
iter 8766: loss 2.4862, time 5085.45ms 
iter 8767: loss 2.4546, time 5130.81ms 
iter 8768: loss 2.2588, time 5166.72ms 
iter 8769: loss 2.3878, time 5159.51ms 
iter 8770: loss 2.4548, time 5178.15ms 
iter 8771: loss 2.1250, time 5274.60ms 
iter 8772: loss 2.6631, time 5126.54ms 
iter 8773: loss 2.4671, time 5116.10ms 
iter 8774: loss 2.5172, time 5058.15ms 
iter 8775: loss 2.5610, time 4983.64ms 
iter 8776: loss 2.5806, time 5159.56ms 
iter 8777: loss 2.3715, time 5059.51ms 
iter 8778: loss 2.3411, time 4975.59ms 
iter 8779: loss 2.4603, time 5230.14ms 
iter 8780: loss 2.2865, time 5276.12ms 
iter 8781: loss 2.4608, time 5118.21ms 
iter 8782: loss 2.3804, time 5071.95ms 
iter 8783: loss 2.5021, time 5035.91ms 
iter 8784: loss 2.4740, time 5023.43ms 
iter 8785: loss 2.4546, time 5191.54ms 
iter 8786: loss 2.3232, time 5123.39ms 
iter 8787: loss 2.5681, time 5143.40ms 
iter 8788: loss 2.3386, time 5304.33ms 
iter 8789: loss 2.5727, time 5135.76ms 
iter 8790: loss 2.6220, time 5079.98ms 
iter 8791: loss 2.6120, time 5127.69ms 
iter 8792: loss 2.4741, time 5090.71ms 
iter 8793: loss 2.5121, time 5160.35ms 
iter 8794: loss 2.4404, time 5122.34ms 
iter 8795: loss 2.4467, time 5092.53ms 
iter 8796: loss 2.4737, time 5256.86ms 
iter 8797: loss 2.4596, time 5224.66ms 
iter 8798: loss 2.3554, time 5138.99ms 
iter 8799: loss 2.4504, time 5085.71ms 
step 8800: train loss 2.4567, val loss 2.8511
iter 8800: loss 2.6253, time 19872.56ms 
iter 8801: loss 2.5581, time 5035.90ms 
iter 8802: loss 2.0930, time 5084.96ms 
iter 8803: loss 2.6331, time 5006.39ms 
iter 8804: loss 2.1825, time 4962.06ms 
iter 8805: loss 2.3728, time 5010.13ms 
iter 8806: loss 2.6121, time 5009.61ms 
iter 8807: loss 2.6123, time 5134.24ms 
iter 8808: loss 2.6794, time 5104.17ms 
iter 8809: loss 2.5311, time 5055.73ms 
iter 8810: loss 2.5024, time 5083.17ms 
iter 8811: loss 2.3827, time 5088.48ms 
iter 8812: loss 2.4511, time 5264.83ms 
iter 8813: loss 2.5270, time 5142.43ms 
iter 8814: loss 2.6862, time 5266.41ms 
iter 8815: loss 2.3120, time 5141.68ms 
iter 8816: loss 2.7225, time 5095.02ms 
iter 8817: loss 2.3951, time 5228.67ms 
iter 8818: loss 2.5378, time 5053.96ms 
iter 8819: loss 2.5277, time 5092.11ms 
iter 8820: loss 2.4252, time 5132.54ms 
iter 8821: loss 2.5539, time 5104.40ms 
iter 8822: loss 2.3569, time 5175.60ms 
iter 8823: loss 2.3886, time 5152.39ms 
iter 8824: loss 2.3054, time 5076.96ms 
iter 8825: loss 2.4028, time 5087.99ms 
iter 8826: loss 2.7518, time 5069.69ms 
iter 8827: loss 2.6017, time 5056.98ms 
iter 8828: loss 2.4914, time 5104.33ms 
iter 8829: loss 2.4564, time 5093.03ms 
iter 8830: loss 2.6154, time 5062.00ms 
iter 8831: loss 2.5876, time 5043.26ms 
iter 8832: loss 2.5884, time 5039.72ms 
iter 8833: loss 2.5716, time 5210.86ms 
iter 8834: loss 2.4004, time 5186.20ms 
iter 8835: loss 2.4882, time 5093.84ms 
iter 8836: loss 2.5703, time 5263.29ms 
iter 8837: loss 2.5188, time 5298.65ms 
iter 8838: loss 2.4681, time 5226.46ms 
iter 8839: loss 2.3525, time 5087.04ms 
iter 8840: loss 2.4354, time 5108.11ms 
iter 8841: loss 2.3114, time 5057.08ms 
iter 8842: loss 2.3367, time 5065.12ms 
iter 8843: loss 2.3657, time 5077.35ms 
iter 8844: loss 2.5135, time 5058.40ms 
iter 8845: loss 2.4008, time 5133.28ms 
iter 8846: loss 2.4181, time 5140.57ms 
iter 8847: loss 2.4306, time 5044.03ms 
iter 8848: loss 2.4889, time 5083.27ms 
iter 8849: loss 2.3759, time 5094.60ms 
step 8850: train loss 2.4739, val loss 2.8498
iter 8850: loss 2.4840, time 20013.56ms 
iter 8851: loss 2.5952, time 5276.38ms 
iter 8852: loss 2.4184, time 5137.43ms 
iter 8853: loss 2.4807, time 5096.95ms 
iter 8854: loss 2.5201, time 5253.38ms 
iter 8855: loss 2.4863, time 5103.30ms 
iter 8856: loss 2.4161, time 5098.04ms 
iter 8857: loss 2.4378, time 5044.25ms 
iter 8858: loss 2.7793, time 5067.07ms 
iter 8859: loss 2.5633, time 5048.10ms 
iter 8860: loss 2.3527, time 5114.53ms 
iter 8861: loss 2.5761, time 5086.40ms 
iter 8862: loss 2.3747, time 5057.27ms 
iter 8863: loss 2.3741, time 5035.30ms 
iter 8864: loss 2.7453, time 5043.87ms 
iter 8865: loss 2.4275, time 5262.67ms 
iter 8866: loss 2.6919, time 5123.70ms 
iter 8867: loss 2.5502, time 5091.48ms 
iter 8868: loss 2.4359, time 5205.24ms 
iter 8869: loss 2.5841, time 5095.90ms 
iter 8870: loss 2.6836, time 5194.51ms 
iter 8871: loss 2.6442, time 5250.84ms 
iter 8872: loss 2.6637, time 5153.83ms 
iter 8873: loss 2.5239, time 5052.14ms 
iter 8874: loss 2.7743, time 5063.15ms 
iter 8875: loss 2.4862, time 5101.71ms 
iter 8876: loss 2.6673, time 5065.34ms 
iter 8877: loss 2.6305, time 5062.94ms 
iter 8878: loss 2.6250, time 5061.59ms 
iter 8879: loss 2.4887, time 5057.83ms 
iter 8880: loss 2.4722, time 5183.44ms 
iter 8881: loss 2.7485, time 5124.37ms 
iter 8882: loss 2.5734, time 5239.89ms 
iter 8883: loss 2.7184, time 5131.28ms 
iter 8884: loss 2.3698, time 5069.52ms 
iter 8885: loss 2.5975, time 5187.08ms 
iter 8886: loss 2.2823, time 5204.80ms 
iter 8887: loss 2.3829, time 5046.53ms 
iter 8888: loss 2.4455, time 5113.75ms 
iter 8889: loss 2.4001, time 5034.59ms 
iter 8890: loss 2.2843, time 4994.05ms 
iter 8891: loss 2.6335, time 5054.05ms 
iter 8892: loss 2.5096, time 5058.12ms 
iter 8893: loss 2.4639, time 5110.01ms 
iter 8894: loss 2.1218, time 5147.11ms 
iter 8895: loss 2.6842, time 5093.20ms 
iter 8896: loss 2.3981, time 5046.17ms 
iter 8897: loss 2.5087, time 5032.95ms 
iter 8898: loss 2.2930, time 5032.75ms 
iter 8899: loss 2.2303, time 5113.77ms 
step 8900: train loss 2.4651, val loss 2.8368
iter 8900: loss 2.6377, time 20041.14ms 
iter 8901: loss 2.5598, time 5095.14ms 
iter 8902: loss 2.4254, time 5086.34ms 
iter 8903: loss 2.4585, time 5128.33ms 
iter 8904: loss 2.7312, time 5083.65ms 
iter 8905: loss 2.5268, time 5057.73ms 
iter 8906: loss 2.5462, time 5051.43ms 
iter 8907: loss 2.3753, time 5028.60ms 
iter 8908: loss 2.4866, time 5099.06ms 
iter 8909: loss 2.2186, time 5084.07ms 
iter 8910: loss 2.5141, time 5077.21ms 
iter 8911: loss 2.4101, time 5156.86ms 
iter 8912: loss 2.4878, time 5076.88ms 
iter 8913: loss 2.4491, time 5092.30ms 
iter 8914: loss 2.3727, time 5294.58ms 
iter 8915: loss 2.3989, time 5093.65ms 
iter 8916: loss 2.4240, time 5051.24ms 
iter 8917: loss 2.5116, time 5122.59ms 
iter 8918: loss 2.3815, time 5110.37ms 
iter 8919: loss 2.4573, time 5101.47ms 
iter 8920: loss 2.5176, time 5089.89ms 
iter 8921: loss 2.2118, time 5075.07ms 
iter 8922: loss 2.5117, time 5061.30ms 
iter 8923: loss 2.3503, time 5058.44ms 
iter 8924: loss 2.4326, time 5060.37ms 
iter 8925: loss 2.5152, time 5065.80ms 
iter 8926: loss 2.6560, time 5047.85ms 
iter 8927: loss 2.4013, time 5048.74ms 
iter 8928: loss 2.4510, time 5017.34ms 
iter 8929: loss 2.4863, time 5021.97ms 
iter 8930: loss 2.4680, time 5150.38ms 
iter 8931: loss 2.4151, time 5143.25ms 
iter 8932: loss 2.4631, time 5050.34ms 
iter 8933: loss 2.5314, time 5068.79ms 
iter 8934: loss 2.3435, time 5189.76ms 
iter 8935: loss 2.5678, time 5266.89ms 
iter 8936: loss 2.4813, time 5125.05ms 
iter 8937: loss 2.4232, time 5085.01ms 
iter 8938: loss 2.4789, time 5104.94ms 
iter 8939: loss 2.5677, time 5083.59ms 
iter 8940: loss 2.6768, time 5053.08ms 
iter 8941: loss 2.4541, time 5062.86ms 
iter 8942: loss 2.6287, time 5132.00ms 
iter 8943: loss 2.4024, time 5010.95ms 
iter 8944: loss 2.4595, time 4974.13ms 
iter 8945: loss 2.5903, time 4951.32ms 
iter 8946: loss 2.5417, time 4946.24ms 
iter 8947: loss 2.2617, time 4946.30ms 
iter 8948: loss 2.2582, time 4949.19ms 
iter 8949: loss 2.5637, time 5216.03ms 
step 8950: train loss 2.4662, val loss 2.8725
iter 8950: loss 2.5017, time 20188.99ms 
iter 8951: loss 2.3111, time 4997.55ms 
iter 8952: loss 2.1095, time 5135.36ms 
iter 8953: loss 2.4639, time 5070.12ms 
iter 8954: loss 2.3153, time 5100.27ms 
iter 8955: loss 2.5473, time 5056.10ms 
iter 8956: loss 2.5002, time 5013.82ms 
iter 8957: loss 2.5799, time 4977.33ms 
iter 8958: loss 2.4402, time 4953.31ms 
iter 8959: loss 2.5537, time 4987.53ms 
iter 8960: loss 2.5083, time 5024.87ms 
iter 8961: loss 2.3958, time 5058.95ms 
iter 8962: loss 2.3958, time 5048.59ms 
iter 8963: loss 2.5140, time 5062.59ms 
iter 8964: loss 2.4450, time 5207.85ms 
iter 8965: loss 2.4677, time 5166.64ms 
iter 8966: loss 2.2789, time 5112.19ms 
iter 8967: loss 2.4122, time 5193.80ms 
iter 8968: loss 2.4201, time 5155.53ms 
iter 8969: loss 2.5523, time 5070.93ms 
iter 8970: loss 2.5396, time 5293.04ms 
iter 8971: loss 2.0998, time 5136.11ms 
iter 8972: loss 2.3998, time 5086.70ms 
iter 8973: loss 2.5560, time 5076.60ms 
iter 8974: loss 2.5867, time 5056.11ms 
iter 8975: loss 2.4947, time 5068.42ms 
iter 8976: loss 2.5809, time 5085.04ms 
iter 8977: loss 2.6105, time 5085.03ms 
iter 8978: loss 2.4147, time 5059.69ms 
iter 8979: loss 2.5260, time 5065.64ms 
iter 8980: loss 2.6004, time 5018.20ms 
iter 8981: loss 2.4646, time 5173.88ms 
iter 8982: loss 2.4153, time 5127.90ms 
iter 8983: loss 2.5622, time 5131.19ms 
iter 8984: loss 2.5241, time 5275.94ms 
iter 8985: loss 2.5730, time 5074.30ms 
iter 8986: loss 2.5044, time 5032.68ms 
iter 8987: loss 2.5348, time 4975.24ms 
iter 8988: loss 2.4461, time 5050.99ms 
iter 8989: loss 2.4241, time 5034.89ms 
iter 8990: loss 2.8139, time 5105.88ms 
iter 8991: loss 2.1843, time 5081.20ms 
iter 8992: loss 2.5185, time 5017.81ms 
iter 8993: loss 2.3584, time 5114.86ms 
iter 8994: loss 2.6214, time 5057.53ms 
iter 8995: loss 2.5982, time 5019.65ms 
iter 8996: loss 2.5103, time 5011.36ms 
iter 8997: loss 2.4781, time 5026.17ms 
iter 8998: loss 2.4574, time 5026.95ms 
iter 8999: loss 2.5898, time 5019.36ms 
step 9000: train loss 2.4745, val loss 2.8458
iter 9000: loss 2.5115, time 19875.26ms 
iter 9001: loss 2.3759, time 4945.59ms 
iter 9002: loss 2.4385, time 5149.57ms 
iter 9003: loss 2.4173, time 5106.52ms 
iter 9004: loss 2.4493, time 5111.47ms 
iter 9005: loss 2.2738, time 5147.55ms 
iter 9006: loss 2.3178, time 5102.43ms 
iter 9007: loss 2.5684, time 5057.92ms 
iter 9008: loss 2.6389, time 5102.32ms 
iter 9009: loss 2.4142, time 5112.44ms 
iter 9010: loss 2.3292, time 5063.57ms 
iter 9011: loss 2.6734, time 5108.62ms 
iter 9012: loss 2.3380, time 5094.18ms 
iter 9013: loss 2.6154, time 5110.96ms 
iter 9014: loss 2.5418, time 5045.32ms 
iter 9015: loss 2.4946, time 5021.22ms 
iter 9016: loss 2.5173, time 5102.63ms 
iter 9017: loss 2.3025, time 5086.23ms 
iter 9018: loss 2.3826, time 5061.06ms 
iter 9019: loss 2.5623, time 5209.88ms 
iter 9020: loss 2.2219, time 5222.22ms 
iter 9021: loss 2.6495, time 5133.14ms 
iter 9022: loss 2.6258, time 5065.96ms 
iter 9023: loss 2.5214, time 5104.85ms 
iter 9024: loss 2.3620, time 5061.47ms 
iter 9025: loss 2.5431, time 5070.53ms 
iter 9026: loss 2.5652, time 5015.83ms 
iter 9027: loss 2.4576, time 5014.64ms 
iter 9028: loss 2.3794, time 5116.49ms 
iter 9029: loss 2.6478, time 5239.75ms 
iter 9030: loss 2.5340, time 5276.99ms 
iter 9031: loss 2.5401, time 5166.31ms 
iter 9032: loss 2.4676, time 5000.16ms 
iter 9033: loss 2.6303, time 5280.01ms 
iter 9034: loss 2.6544, time 5270.84ms 
iter 9035: loss 2.6005, time 5159.90ms 
iter 9036: loss 2.5203, time 5139.22ms 
iter 9037: loss 2.4880, time 5105.00ms 
iter 9038: loss 2.5177, time 5052.17ms 
iter 9039: loss 2.4543, time 5150.17ms 
iter 9040: loss 2.6432, time 5068.65ms 
iter 9041: loss 2.5589, time 5052.83ms 
iter 9042: loss 2.6404, time 5062.32ms 
iter 9043: loss 2.3986, time 5094.17ms 
iter 9044: loss 2.3691, time 5036.75ms 
iter 9045: loss 2.2552, time 5190.45ms 
iter 9046: loss 2.3958, time 5197.98ms 
iter 9047: loss 2.4868, time 5171.76ms 
iter 9048: loss 2.1641, time 5100.70ms 
iter 9049: loss 2.4347, time 5068.02ms 
step 9050: train loss 2.4600, val loss 2.8614
iter 9050: loss 2.4208, time 19990.03ms 
iter 9051: loss 2.4500, time 5168.47ms 
iter 9052: loss 2.3838, time 5118.37ms 
iter 9053: loss 2.5617, time 5052.07ms 
iter 9054: loss 2.6733, time 5006.82ms 
iter 9055: loss 2.6845, time 4975.05ms 
iter 9056: loss 2.5531, time 4979.07ms 
iter 9057: loss 2.3202, time 5170.51ms 
iter 9058: loss 2.5519, time 5304.98ms 
iter 9059: loss 2.3822, time 5133.83ms 
iter 9060: loss 2.2635, time 5155.48ms 
iter 9061: loss 2.6124, time 5024.49ms 
iter 9062: loss 2.3631, time 5028.52ms 
iter 9063: loss 2.6489, time 5116.69ms 
iter 9064: loss 2.2476, time 5088.31ms 
iter 9065: loss 2.4641, time 5135.77ms 
iter 9066: loss 2.4166, time 5136.88ms 
iter 9067: loss 2.3646, time 5064.76ms 
iter 9068: loss 2.4388, time 5038.78ms 
iter 9069: loss 2.7148, time 4972.17ms 
iter 9070: loss 2.4733, time 4970.24ms 
iter 9071: loss 2.4317, time 4991.80ms 
iter 9072: loss 2.5626, time 5253.04ms 
iter 9073: loss 2.3538, time 5059.77ms 
iter 9074: loss 2.6847, time 5053.67ms 
iter 9075: loss 2.5204, time 5116.93ms 
iter 9076: loss 2.4111, time 5124.23ms 
iter 9077: loss 2.5363, time 5089.66ms 
iter 9078: loss 2.3173, time 5171.49ms 
iter 9079: loss 2.6775, time 5137.43ms 
iter 9080: loss 2.4897, time 5047.67ms 
iter 9081: loss 2.4051, time 5060.45ms 
iter 9082: loss 2.4644, time 5010.53ms 
iter 9083: loss 2.3586, time 5022.03ms 
iter 9084: loss 2.4411, time 5046.31ms 
iter 9085: loss 2.3820, time 5085.94ms 
iter 9086: loss 2.4675, time 5092.15ms 
iter 9087: loss 2.4558, time 5031.48ms 
iter 9088: loss 2.4609, time 5015.69ms 
iter 9089: loss 2.3002, time 5172.87ms 
iter 9090: loss 2.3428, time 5089.71ms 
iter 9091: loss 2.3103, time 5051.55ms 
iter 9092: loss 2.2908, time 5202.41ms 
iter 9093: loss 2.2386, time 5250.53ms 
iter 9094: loss 2.2727, time 5093.83ms 
iter 9095: loss 2.4386, time 5133.70ms 
iter 9096: loss 2.5412, time 5070.91ms 
iter 9097: loss 2.4224, time 4995.50ms 
iter 9098: loss 2.4839, time 5079.48ms 
iter 9099: loss 2.4257, time 5286.45ms 
step 9100: train loss 2.4788, val loss 2.8455
iter 9100: loss 2.6463, time 19860.86ms 
iter 9101: loss 2.2700, time 5025.47ms 
iter 9102: loss 2.2526, time 5230.80ms 
iter 9103: loss 2.4352, time 5104.03ms 
iter 9104: loss 2.5270, time 5046.29ms 
iter 9105: loss 2.5461, time 5030.46ms 
iter 9106: loss 2.3273, time 5036.40ms 
iter 9107: loss 2.3330, time 5178.73ms 
iter 9108: loss 2.5087, time 5099.24ms 
iter 9109: loss 2.3636, time 5050.07ms 
iter 9110: loss 2.4161, time 5148.95ms 
iter 9111: loss 2.4190, time 5131.27ms 
iter 9112: loss 2.5723, time 5058.67ms 
iter 9113: loss 2.5746, time 5117.35ms 
iter 9114: loss 2.4097, time 5105.05ms 
iter 9115: loss 2.3155, time 5064.41ms 
iter 9116: loss 2.6831, time 5170.28ms 
iter 9117: loss 2.4174, time 5281.21ms 
iter 9118: loss 2.6211, time 5132.50ms 
iter 9119: loss 2.5751, time 5095.31ms 
iter 9120: loss 2.5131, time 5062.10ms 
iter 9121: loss 2.3413, time 5043.29ms 
iter 9122: loss 2.4183, time 4954.36ms 
iter 9123: loss 2.4920, time 4947.71ms 
iter 9124: loss 2.3049, time 4946.79ms 
iter 9125: loss 2.5345, time 4986.91ms 
iter 9126: loss 2.2679, time 5119.59ms 
iter 9127: loss 2.4776, time 5066.65ms 
iter 9128: loss 2.5402, time 5090.67ms 
iter 9129: loss 2.5394, time 5051.17ms 
iter 9130: loss 2.5608, time 5074.74ms 
iter 9131: loss 2.3631, time 5220.42ms 
iter 9132: loss 2.5460, time 5129.36ms 
iter 9133: loss 2.2994, time 5134.58ms 
iter 9134: loss 2.4594, time 5174.31ms 
iter 9135: loss 2.5794, time 5099.78ms 
iter 9136: loss 2.4796, time 5056.64ms 
iter 9137: loss 2.5462, time 5107.52ms 
iter 9138: loss 2.2136, time 5304.10ms 
iter 9139: loss 2.4956, time 5141.56ms 
iter 9140: loss 2.3463, time 5095.76ms 
iter 9141: loss 2.4399, time 5070.48ms 
iter 9142: loss 2.2947, time 5053.34ms 
iter 9143: loss 2.2919, time 5047.70ms 
iter 9144: loss 2.2854, time 5105.13ms 
iter 9145: loss 2.4944, time 5007.26ms 
iter 9146: loss 2.8848, time 5004.93ms 
iter 9147: loss 2.2722, time 5003.33ms 
iter 9148: loss 2.5371, time 5038.07ms 
iter 9149: loss 2.3095, time 5020.42ms 
step 9150: train loss 2.4513, val loss 2.8414
iter 9150: loss 2.4397, time 19927.62ms 
iter 9151: loss 2.5971, time 5038.59ms 
iter 9152: loss 2.2567, time 4979.23ms 
iter 9153: loss 2.5602, time 4966.96ms 
iter 9154: loss 2.3634, time 4946.20ms 
iter 9155: loss 2.5065, time 5026.72ms 
iter 9156: loss 2.3670, time 5269.83ms 
iter 9157: loss 2.5334, time 5047.79ms 
iter 9158: loss 2.6562, time 4973.66ms 
iter 9159: loss 2.3242, time 5161.41ms 
iter 9160: loss 2.6080, time 5177.15ms 
iter 9161: loss 2.5880, time 5027.11ms 
iter 9162: loss 2.6780, time 5031.14ms 
iter 9163: loss 2.6873, time 5012.48ms 
iter 9164: loss 2.5034, time 4997.43ms 
iter 9165: loss 2.5482, time 5178.11ms 
iter 9166: loss 2.3072, time 5198.11ms 
iter 9167: loss 2.4144, time 5129.66ms 
iter 9168: loss 2.1433, time 5049.87ms 
iter 9169: loss 2.3482, time 5061.12ms 
iter 9170: loss 2.6246, time 5237.12ms 
iter 9171: loss 2.3736, time 5191.66ms 
iter 9172: loss 2.3709, time 5073.85ms 
iter 9173: loss 2.3098, time 5065.90ms 
iter 9174: loss 2.4908, time 5209.96ms 
iter 9175: loss 2.4693, time 5054.28ms 
iter 9176: loss 2.2833, time 5058.44ms 
iter 9177: loss 2.4050, time 5279.54ms 
iter 9178: loss 2.4476, time 5295.57ms 
iter 9179: loss 2.4879, time 5131.74ms 
iter 9180: loss 2.4519, time 5091.23ms 
iter 9181: loss 2.1404, time 5019.13ms 
iter 9182: loss 2.7367, time 5024.71ms 
iter 9183: loss 2.5382, time 5267.63ms 
iter 9184: loss 2.4527, time 5042.37ms 
iter 9185: loss 2.6772, time 4987.94ms 
iter 9186: loss 2.5077, time 4980.99ms 
iter 9187: loss 2.4863, time 5035.59ms 
iter 9188: loss 2.5297, time 5059.13ms 
iter 9189: loss 2.4232, time 5074.85ms 
iter 9190: loss 2.4022, time 5078.50ms 
iter 9191: loss 2.4542, time 5099.57ms 
iter 9192: loss 2.4712, time 5118.77ms 
iter 9193: loss 2.5020, time 5053.36ms 
iter 9194: loss 2.3548, time 5095.18ms 
iter 9195: loss 2.3216, time 5155.93ms 
iter 9196: loss 2.0984, time 5169.03ms 
iter 9197: loss 2.5786, time 5147.95ms 
iter 9198: loss 2.5659, time 5087.66ms 
iter 9199: loss 2.4295, time 5053.95ms 
step 9200: train loss 2.4796, val loss 2.8370
iter 9200: loss 2.2540, time 19842.36ms 
iter 9201: loss 2.4669, time 5024.41ms 
iter 9202: loss 2.3702, time 5098.08ms 
iter 9203: loss 2.3193, time 5147.28ms 
iter 9204: loss 2.4057, time 5088.38ms 
iter 9205: loss 2.5637, time 5006.24ms 
iter 9206: loss 2.6164, time 5003.88ms 
iter 9207: loss 2.2858, time 5000.23ms 
iter 9208: loss 2.5920, time 5005.53ms 
iter 9209: loss 2.4753, time 5021.47ms 
iter 9210: loss 2.5184, time 5008.01ms 
iter 9211: loss 2.4734, time 5025.76ms 
iter 9212: loss 2.5536, time 5097.49ms 
iter 9213: loss 2.3653, time 5052.49ms 
iter 9214: loss 2.2020, time 5065.29ms 
iter 9215: loss 2.3597, time 5066.43ms 
iter 9216: loss 2.3741, time 5147.35ms 
iter 9217: loss 2.4379, time 5049.35ms 
iter 9218: loss 2.2990, time 5015.73ms 
iter 9219: loss 2.6832, time 4984.36ms 
iter 9220: loss 2.5171, time 5001.95ms 
iter 9221: loss 2.3695, time 5054.59ms 
iter 9222: loss 2.3311, time 5052.10ms 
iter 9223: loss 2.5062, time 5127.42ms 
iter 9224: loss 2.4198, time 5173.74ms 
iter 9225: loss 2.3560, time 5030.93ms 
iter 9226: loss 2.4336, time 4991.94ms 
iter 9227: loss 2.1637, time 5034.49ms 
iter 9228: loss 2.5990, time 5040.39ms 
iter 9229: loss 2.5728, time 5057.62ms 
iter 9230: loss 2.4488, time 5109.33ms 
iter 9231: loss 2.3716, time 5135.63ms 
iter 9232: loss 2.4243, time 5151.74ms 
iter 9233: loss 2.3256, time 5057.53ms 
iter 9234: loss 2.2511, time 5047.89ms 
iter 9235: loss 2.4134, time 5016.38ms 
iter 9236: loss 2.3136, time 5001.83ms 
iter 9237: loss 2.4609, time 4999.90ms 
iter 9238: loss 2.5820, time 5000.76ms 
iter 9239: loss 2.4820, time 5042.03ms 
iter 9240: loss 2.2573, time 5052.49ms 
iter 9241: loss 2.4951, time 5038.91ms 
iter 9242: loss 2.3415, time 4990.37ms 
iter 9243: loss 2.4098, time 4980.48ms 
iter 9244: loss 2.4667, time 5003.28ms 
iter 9245: loss 2.3849, time 5135.37ms 
iter 9246: loss 2.3101, time 5100.75ms 
iter 9247: loss 2.6467, time 5064.45ms 
iter 9248: loss 2.4588, time 5070.85ms 
iter 9249: loss 2.4378, time 5077.83ms 
step 9250: train loss 2.4380, val loss 2.8545
iter 9250: loss 2.4147, time 19905.62ms 
iter 9251: loss 2.5477, time 5015.56ms 
iter 9252: loss 2.2049, time 5056.82ms 
iter 9253: loss 2.2961, time 5064.77ms 
iter 9254: loss 2.4608, time 5067.43ms 
iter 9255: loss 2.6208, time 5054.54ms 
iter 9256: loss 2.5622, time 5048.90ms 
iter 9257: loss 2.5994, time 5134.38ms 
iter 9258: loss 2.6321, time 5096.38ms 
iter 9259: loss 2.4638, time 5066.48ms 
iter 9260: loss 2.4572, time 5199.54ms 
iter 9261: loss 2.3595, time 5172.73ms 
iter 9262: loss 2.2954, time 5090.30ms 
iter 9263: loss 2.5374, time 5063.29ms 
iter 9264: loss 2.3276, time 5147.67ms 
iter 9265: loss 2.6011, time 5085.49ms 
iter 9266: loss 2.4496, time 5199.51ms 
iter 9267: loss 2.2465, time 5173.17ms 
iter 9268: loss 2.5742, time 5064.10ms 
iter 9269: loss 2.4883, time 5243.16ms 
iter 9270: loss 2.6071, time 5129.47ms 
iter 9271: loss 2.5973, time 5097.82ms 
iter 9272: loss 2.5199, time 5096.85ms 
iter 9273: loss 2.5766, time 5134.35ms 
iter 9274: loss 2.3384, time 5078.58ms 
iter 9275: loss 2.5562, time 5202.37ms 
iter 9276: loss 2.5616, time 5087.75ms 
iter 9277: loss 2.1879, time 5054.19ms 
iter 9278: loss 2.3531, time 5212.79ms 
iter 9279: loss 2.5772, time 5136.22ms 
iter 9280: loss 2.6050, time 5062.30ms 
iter 9281: loss 2.5043, time 5077.27ms 
iter 9282: loss 2.2265, time 5094.51ms 
iter 9283: loss 2.5464, time 5074.82ms 
iter 9284: loss 2.5211, time 5204.24ms 
iter 9285: loss 2.4111, time 5031.11ms 
iter 9286: loss 2.3506, time 5001.42ms 
iter 9287: loss 2.4350, time 5254.86ms 
iter 9288: loss 2.2431, time 5103.34ms 
iter 9289: loss 2.3016, time 5026.22ms 
iter 9290: loss 2.5789, time 5041.02ms 
iter 9291: loss 2.4698, time 5090.12ms 
iter 9292: loss 2.4502, time 5061.82ms 
iter 9293: loss 2.3414, time 5232.71ms 
iter 9294: loss 2.4087, time 5088.19ms 
iter 9295: loss 2.5308, time 5058.52ms 
iter 9296: loss 2.4561, time 5277.28ms 
iter 9297: loss 2.5722, time 5148.22ms 
iter 9298: loss 2.6782, time 5089.72ms 
iter 9299: loss 2.5160, time 5117.55ms 
step 9300: train loss 2.4542, val loss 2.8378
iter 9300: loss 2.4532, time 19843.97ms 
iter 9301: loss 2.3308, time 4949.65ms 
iter 9302: loss 2.2865, time 5143.20ms 
iter 9303: loss 2.4918, time 5133.62ms 
iter 9304: loss 2.4394, time 5092.29ms 
iter 9305: loss 2.4902, time 5070.86ms 
iter 9306: loss 2.3541, time 5265.54ms 
iter 9307: loss 2.3762, time 5134.18ms 
iter 9308: loss 2.0819, time 5166.23ms 
iter 9309: loss 2.6290, time 5080.09ms 
iter 9310: loss 2.3619, time 5046.26ms 
iter 9311: loss 2.1922, time 5267.64ms 
iter 9312: loss 2.5591, time 5143.66ms 
iter 9313: loss 2.4983, time 5096.11ms 
iter 9314: loss 2.3883, time 5092.08ms 
iter 9315: loss 2.4534, time 5244.27ms 
iter 9316: loss 2.5238, time 5090.27ms 
iter 9317: loss 2.6702, time 5140.42ms 
iter 9318: loss 2.5344, time 5144.80ms 
iter 9319: loss 2.4428, time 5086.45ms 
iter 9320: loss 2.1894, time 5217.58ms 
iter 9321: loss 2.4127, time 5265.76ms 
iter 9322: loss 2.3267, time 5190.22ms 
iter 9323: loss 2.2857, time 5069.84ms 
iter 9324: loss 2.8260, time 5433.53ms 
iter 9325: loss 2.4197, time 5276.63ms 
iter 9326: loss 2.3337, time 5122.59ms 
iter 9327: loss 2.5487, time 5114.05ms 
iter 9328: loss 2.3911, time 5052.16ms 
iter 9329: loss 2.4461, time 5266.36ms 
iter 9330: loss 2.4742, time 5220.02ms 
iter 9331: loss 2.1733, time 5077.37ms 
iter 9332: loss 2.2248, time 5051.06ms 
iter 9333: loss 2.1920, time 5250.14ms 
iter 9334: loss 2.5561, time 5130.10ms 
iter 9335: loss 2.4331, time 5145.46ms 
iter 9336: loss 2.5484, time 5138.12ms 
iter 9337: loss 2.4801, time 5098.57ms 
iter 9338: loss 2.4651, time 5261.10ms 
iter 9339: loss 2.5168, time 5246.58ms 
iter 9340: loss 2.4660, time 5094.68ms 
iter 9341: loss 2.5946, time 5211.95ms 
iter 9342: loss 2.4363, time 5278.59ms 
iter 9343: loss 2.2849, time 5177.84ms 
iter 9344: loss 2.5958, time 5220.74ms 
iter 9345: loss 2.4310, time 5108.28ms 
iter 9346: loss 2.4453, time 5055.04ms 
iter 9347: loss 2.5006, time 5220.50ms 
iter 9348: loss 2.6818, time 5230.28ms 
iter 9349: loss 2.3896, time 5093.27ms 
step 9350: train loss 2.4622, val loss 2.8726
iter 9350: loss 2.3702, time 19856.87ms 
iter 9351: loss 2.3128, time 5152.00ms 
iter 9352: loss 2.5903, time 5089.00ms 
iter 9353: loss 2.8270, time 5179.12ms 
iter 9354: loss 2.5293, time 5271.63ms 
iter 9355: loss 2.2181, time 5093.68ms 
iter 9356: loss 2.1752, time 5064.80ms 
iter 9357: loss 2.6247, time 5272.80ms 
iter 9358: loss 2.2983, time 5198.16ms 
iter 9359: loss 2.5361, time 5232.68ms 
iter 9360: loss 2.4850, time 5066.09ms 
iter 9361: loss 2.3676, time 5003.58ms 
iter 9362: loss 2.4095, time 5159.00ms 
iter 9363: loss 2.3590, time 5292.90ms 
iter 9364: loss 2.3356, time 5146.97ms 
iter 9365: loss 2.4139, time 5154.11ms 
iter 9366: loss 2.3630, time 5279.22ms 
iter 9367: loss 2.5215, time 5075.63ms 
iter 9368: loss 2.4474, time 5125.07ms 
iter 9369: loss 2.4763, time 5048.05ms 
iter 9370: loss 2.2980, time 5119.58ms 
iter 9371: loss 2.4470, time 5150.53ms 
iter 9372: loss 2.5739, time 5272.85ms 
iter 9373: loss 2.3334, time 5101.85ms 
iter 9374: loss 2.5420, time 5108.41ms 
iter 9375: loss 2.3947, time 5232.51ms 
iter 9376: loss 2.5180, time 5172.60ms 
iter 9377: loss 2.4912, time 5090.49ms 
iter 9378: loss 2.3375, time 5001.36ms 
iter 9379: loss 2.5210, time 4991.55ms 
iter 9380: loss 2.4836, time 5229.10ms 
iter 9381: loss 2.5468, time 5303.32ms 
iter 9382: loss 2.4895, time 5152.12ms 
iter 9383: loss 2.4685, time 5209.05ms 
iter 9384: loss 2.7429, time 5206.32ms 
iter 9385: loss 2.3760, time 5092.51ms 
iter 9386: loss 2.6019, time 5057.31ms 
iter 9387: loss 2.4538, time 5193.12ms 
iter 9388: loss 2.5396, time 5192.47ms 
iter 9389: loss 2.4205, time 5253.13ms 
iter 9390: loss 2.2497, time 5281.98ms 
iter 9391: loss 2.2951, time 5144.86ms 
iter 9392: loss 2.4912, time 5079.97ms 
iter 9393: loss 2.4325, time 5017.41ms 
iter 9394: loss 2.6492, time 5034.46ms 
iter 9395: loss 2.4742, time 5223.08ms 
iter 9396: loss 2.3914, time 5122.48ms 
iter 9397: loss 2.4258, time 5058.92ms 
iter 9398: loss 2.4294, time 5188.78ms 
iter 9399: loss 2.3354, time 5113.60ms 
step 9400: train loss 2.4564, val loss 2.8689
iter 9400: loss 2.6151, time 20110.00ms 
iter 9401: loss 2.4650, time 5022.10ms 
iter 9402: loss 2.4824, time 5362.33ms 
iter 9403: loss 2.1998, time 5144.74ms 
iter 9404: loss 2.3278, time 5185.00ms 
iter 9405: loss 2.3970, time 5268.95ms 
iter 9406: loss 2.5150, time 5077.14ms 
iter 9407: loss 2.3947, time 5056.55ms 
iter 9408: loss 2.5791, time 5236.62ms 
iter 9409: loss 2.5932, time 5110.88ms 
iter 9410: loss 2.3599, time 5087.04ms 
iter 9411: loss 2.5491, time 5206.76ms 
iter 9412: loss 2.3911, time 5073.84ms 
iter 9413: loss 2.3979, time 5061.18ms 
iter 9414: loss 2.3077, time 5278.47ms 
iter 9415: loss 2.6154, time 5171.87ms 
iter 9416: loss 2.3817, time 5081.67ms 
iter 9417: loss 2.3878, time 5216.52ms 
iter 9418: loss 2.2057, time 5039.71ms 
iter 9419: loss 2.3995, time 5003.91ms 
iter 9420: loss 2.4195, time 4987.97ms 
iter 9421: loss 2.2985, time 5006.49ms 
iter 9422: loss 2.6165, time 4995.83ms 
iter 9423: loss 2.3257, time 5062.04ms 
iter 9424: loss 2.6201, time 5239.78ms 
iter 9425: loss 2.5928, time 5004.39ms 
iter 9426: loss 2.2841, time 4999.02ms 
iter 9427: loss 2.4273, time 5041.98ms 
iter 9428: loss 2.2334, time 5042.14ms 
iter 9429: loss 2.3788, time 5034.61ms 
iter 9430: loss 2.3434, time 5108.87ms 
iter 9431: loss 2.2891, time 5055.26ms 
iter 9432: loss 2.3621, time 5097.17ms 
iter 9433: loss 2.3161, time 5150.93ms 
iter 9434: loss 2.5362, time 5035.05ms 
iter 9435: loss 2.2869, time 5041.70ms 
iter 9436: loss 2.4398, time 5070.25ms 
iter 9437: loss 2.4919, time 5065.48ms 
iter 9438: loss 2.5116, time 5085.90ms 
iter 9439: loss 2.3138, time 5054.81ms 
iter 9440: loss 2.5629, time 5053.64ms 
iter 9441: loss 2.6055, time 5053.63ms 
iter 9442: loss 2.1636, time 5070.92ms 
iter 9443: loss 2.5117, time 5138.86ms 
iter 9444: loss 2.3432, time 5095.08ms 
iter 9445: loss 2.2968, time 5056.23ms 
iter 9446: loss 2.6265, time 5045.84ms 
iter 9447: loss 2.5494, time 5258.30ms 
iter 9448: loss 2.4801, time 5066.45ms 
iter 9449: loss 2.4959, time 5001.77ms 
step 9450: train loss 2.4552, val loss 2.8356
iter 9450: loss 2.5824, time 19918.57ms 
iter 9451: loss 2.6519, time 5118.99ms 
iter 9452: loss 2.5191, time 5078.67ms 
iter 9453: loss 2.3889, time 5068.94ms 
iter 9454: loss 2.2534, time 5062.87ms 
iter 9455: loss 2.5065, time 5010.03ms 
iter 9456: loss 2.5031, time 5123.68ms 
iter 9457: loss 2.5505, time 5070.94ms 
iter 9458: loss 2.3528, time 5049.44ms 
iter 9459: loss 2.4559, time 5158.46ms 
iter 9460: loss 2.4618, time 5021.60ms 
iter 9461: loss 2.4035, time 5004.34ms 
iter 9462: loss 2.3231, time 5000.60ms 
iter 9463: loss 2.4207, time 5213.60ms 
iter 9464: loss 2.5215, time 5191.07ms 
iter 9465: loss 2.3455, time 5086.14ms 
iter 9466: loss 2.5418, time 5193.43ms 
iter 9467: loss 2.5826, time 5105.69ms 
iter 9468: loss 2.6066, time 5175.86ms 
iter 9469: loss 2.3993, time 5286.97ms 
iter 9470: loss 2.3357, time 5121.72ms 
iter 9471: loss 2.4046, time 5095.78ms 
iter 9472: loss 2.4779, time 5024.21ms 
iter 9473: loss 2.2888, time 5028.29ms 
iter 9474: loss 2.5204, time 5022.77ms 
iter 9475: loss 2.5443, time 5020.02ms 
iter 9476: loss 2.4920, time 5027.90ms 
iter 9477: loss 2.3645, time 5029.07ms 
iter 9478: loss 2.4712, time 4997.89ms 
iter 9479: loss 2.3081, time 4975.44ms 
iter 9480: loss 2.3356, time 4954.87ms 
iter 9481: loss 2.3462, time 4949.01ms 
iter 9482: loss 2.5571, time 5229.67ms 
iter 9483: loss 2.5069, time 5083.55ms 
iter 9484: loss 2.6311, time 5045.75ms 
iter 9485: loss 2.4712, time 5028.89ms 
iter 9486: loss 2.5831, time 4993.80ms 
iter 9487: loss 2.3069, time 4991.00ms 
iter 9488: loss 2.3918, time 5196.53ms 
iter 9489: loss 2.3260, time 5193.50ms 
iter 9490: loss 2.5734, time 5007.70ms 
iter 9491: loss 2.3396, time 5049.28ms 
iter 9492: loss 2.4584, time 5272.70ms 
iter 9493: loss 2.5834, time 5327.51ms 
iter 9494: loss 2.4187, time 5080.17ms 
iter 9495: loss 2.3705, time 5084.27ms 
iter 9496: loss 2.3940, time 4982.19ms 
iter 9497: loss 2.4234, time 4953.32ms 
iter 9498: loss 2.3807, time 4957.30ms 
iter 9499: loss 2.7667, time 4955.19ms 
step 9500: train loss 2.4550, val loss 2.8527
iter 9500: loss 2.4862, time 19666.52ms 
iter 9501: loss 2.4595, time 4950.97ms 
iter 9502: loss 2.4652, time 4947.49ms 
iter 9503: loss 2.4799, time 4960.19ms 
iter 9504: loss 2.5541, time 4947.73ms 
iter 9505: loss 2.3180, time 4948.75ms 
iter 9506: loss 2.4421, time 4951.42ms 
iter 9507: loss 2.4249, time 4948.38ms 
iter 9508: loss 2.5690, time 5006.92ms 
iter 9509: loss 2.4993, time 4951.35ms 
iter 9510: loss 2.3597, time 4952.30ms 
iter 9511: loss 2.4755, time 4951.79ms 
iter 9512: loss 2.6120, time 4953.51ms 
iter 9513: loss 2.3788, time 4950.60ms 
iter 9514: loss 2.4161, time 4948.38ms 
iter 9515: loss 2.3657, time 5037.20ms 
iter 9516: loss 2.5312, time 5011.77ms 
iter 9517: loss 2.5511, time 5166.86ms 
iter 9518: loss 2.4092, time 5025.32ms 
iter 9519: loss 2.5031, time 5058.78ms 
iter 9520: loss 2.6377, time 4975.88ms 
iter 9521: loss 2.4620, time 4983.12ms 
iter 9522: loss 2.4658, time 5019.34ms 
iter 9523: loss 2.4753, time 4975.18ms 
iter 9524: loss 2.5939, time 5108.72ms 
iter 9525: loss 2.2939, time 5245.47ms 
iter 9526: loss 2.5400, time 5269.80ms 
iter 9527: loss 2.3479, time 5081.54ms 
iter 9528: loss 2.5581, time 5144.67ms 
iter 9529: loss 2.3087, time 5133.00ms 
iter 9530: loss 2.3942, time 5060.38ms 
iter 9531: loss 2.5357, time 4993.71ms 
iter 9532: loss 2.4999, time 4988.74ms 
iter 9533: loss 2.4458, time 4968.77ms 
iter 9534: loss 2.2670, time 4969.94ms 
iter 9535: loss 2.3521, time 5120.08ms 
iter 9536: loss 2.4764, time 5041.27ms 
iter 9537: loss 2.4907, time 5037.80ms 
iter 9538: loss 2.4760, time 5040.13ms 
iter 9539: loss 2.5337, time 5035.57ms 
iter 9540: loss 2.6258, time 4986.00ms 
iter 9541: loss 2.5872, time 5036.96ms 
iter 9542: loss 2.4125, time 5059.40ms 
iter 9543: loss 2.6546, time 5082.96ms 
iter 9544: loss 2.4333, time 5074.40ms 
iter 9545: loss 2.6062, time 5018.33ms 
iter 9546: loss 2.5275, time 5005.82ms 
iter 9547: loss 2.5264, time 4999.70ms 
iter 9548: loss 2.5538, time 5014.81ms 
iter 9549: loss 2.3824, time 5122.77ms 
step 9550: train loss 2.4423, val loss 2.8301
iter 9550: loss 2.4509, time 19862.52ms 
iter 9551: loss 2.5973, time 4980.31ms 
iter 9552: loss 2.4404, time 5149.75ms 
iter 9553: loss 2.1801, time 5091.75ms 
iter 9554: loss 2.4618, time 5090.02ms 
iter 9555: loss 2.5781, time 5262.33ms 
iter 9556: loss 2.0681, time 5083.73ms 
iter 9557: loss 2.7055, time 5063.09ms 
iter 9558: loss 2.5969, time 5071.63ms 
iter 9559: loss 2.2335, time 4990.30ms 
iter 9560: loss 2.3436, time 4995.58ms 
iter 9561: loss 2.3887, time 5243.47ms 
iter 9562: loss 2.6886, time 5145.84ms 
iter 9563: loss 2.4512, time 5089.13ms 
iter 9564: loss 2.4294, time 5177.16ms 
iter 9565: loss 2.3691, time 5096.65ms 
iter 9566: loss 2.5620, time 5050.18ms 
iter 9567: loss 2.4443, time 5031.05ms 
iter 9568: loss 2.5613, time 5243.23ms 
iter 9569: loss 2.4134, time 5135.33ms 
iter 9570: loss 2.4060, time 5086.96ms 
iter 9571: loss 2.4161, time 5093.01ms 
iter 9572: loss 2.3324, time 5063.93ms 
iter 9573: loss 2.2854, time 5051.76ms 
iter 9574: loss 2.4482, time 5057.94ms 
iter 9575: loss 2.4797, time 5056.26ms 
iter 9576: loss 2.3260, time 5038.52ms 
iter 9577: loss 2.4066, time 5033.88ms 
iter 9578: loss 2.4770, time 5036.95ms 
iter 9579: loss 2.5528, time 5038.64ms 
iter 9580: loss 2.4571, time 5121.34ms 
iter 9581: loss 2.4794, time 5116.19ms 
iter 9582: loss 2.4073, time 5232.05ms 
iter 9583: loss 2.4921, time 5223.24ms 
iter 9584: loss 2.3816, time 5060.98ms 
iter 9585: loss 2.4912, time 5056.50ms 
iter 9586: loss 2.5873, time 5014.32ms 
iter 9587: loss 2.4037, time 5141.65ms 
iter 9588: loss 2.4811, time 5076.99ms 
iter 9589: loss 2.5837, time 5055.12ms 
iter 9590: loss 2.4103, time 5010.24ms 
iter 9591: loss 2.4766, time 5013.63ms 
iter 9592: loss 2.6012, time 4995.84ms 
iter 9593: loss 2.4611, time 4993.07ms 
iter 9594: loss 2.5784, time 5165.47ms 
iter 9595: loss 2.3811, time 5143.66ms 
iter 9596: loss 2.3708, time 5084.23ms 
iter 9597: loss 2.4284, time 5053.87ms 
iter 9598: loss 2.4243, time 5028.05ms 
iter 9599: loss 2.1133, time 4990.51ms 
step 9600: train loss 2.4470, val loss 2.8637
iter 9600: loss 2.3612, time 19978.69ms 
iter 9601: loss 2.3280, time 5164.10ms 
iter 9602: loss 2.4753, time 4984.80ms 
iter 9603: loss 2.2834, time 4976.86ms 
iter 9604: loss 2.5248, time 4997.65ms 
iter 9605: loss 2.5873, time 4990.70ms 
iter 9606: loss 2.5140, time 5016.36ms 
iter 9607: loss 2.4030, time 5301.66ms 
iter 9608: loss 2.2527, time 5101.03ms 
iter 9609: loss 2.4160, time 4978.59ms 
iter 9610: loss 2.4800, time 5216.41ms 
iter 9611: loss 2.3038, time 5279.90ms 
iter 9612: loss 2.3566, time 5135.27ms 
iter 9613: loss 2.5185, time 5080.72ms 
