tokens per iteration will be: 491,520
Initializing a new model from scratch
config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.75,
    2.0,
    2.25
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    4,
    4,
    4,
    4,
    5,
    5
  ],
  "num_query_heads": [
    10,
    12,
    12,
    14,
    16,
    18,
    18,
    20
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

config:OpenELMConfig {
  "_name_or_path": "./",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "attn_pdrop": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.75,
    1.0,
    1.25,
    1.5,
    1.5,
    1.75,
    2.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 1024,
  "model_dim": 954,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 2,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    5
  ],
  "num_query_heads": [
    6,
    6,
    6,
    6,
    8,
    8,
    8,
    10
  ],
  "num_transformer_layers": 8,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 1024,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

num decayed parameter tensors: 33, with 87,875,802 parameters
num non-decayed parameter tensors: 33, with 17,242 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
number of parameters: 87.89M
number of transformer parameters: 39.95M
step 0: train loss 11.2805, val loss 11.2734
iter 0: loss 11.2739, time 55851.87ms 
iter 1: loss 11.3555, time 4950.56ms 
iter 2: loss 11.1429, time 4908.12ms 
iter 3: loss 11.0328, time 4983.18ms 
iter 4: loss 10.8675, time 4994.46ms 
iter 5: loss 10.6005, time 4963.52ms 
iter 6: loss 10.2972, time 4982.50ms 
iter 7: loss 9.7395, time 5029.25ms 
iter 8: loss 9.4164, time 5028.44ms 
iter 9: loss 8.9855, time 5030.25ms 
iter 10: loss 8.6674, time 4977.00ms 
iter 11: loss 8.1846, time 5011.24ms 
iter 12: loss 8.0349, time 5026.05ms 
iter 13: loss 7.3283, time 5026.78ms 
iter 14: loss 7.2576, time 5020.02ms 
iter 15: loss 7.3757, time 5019.30ms 
iter 16: loss 7.2721, time 5021.57ms 
iter 17: loss 7.3341, time 5027.79ms 
iter 18: loss 6.8345, time 4975.40ms 
iter 19: loss 7.2202, time 5013.47ms 
iter 20: loss 6.8067, time 5023.23ms 
iter 21: loss 6.7134, time 5026.50ms 
iter 22: loss 6.9493, time 5026.98ms 
iter 23: loss 6.6870, time 5021.44ms 
iter 24: loss 6.6939, time 5024.95ms 
iter 25: loss 6.6815, time 5024.57ms 
iter 26: loss 6.4576, time 4979.30ms 
iter 27: loss 6.3417, time 5028.71ms 
iter 28: loss 6.3437, time 5035.29ms 
iter 29: loss 6.6005, time 5032.48ms 
iter 30: loss 6.2905, time 5025.67ms 
iter 31: loss 6.6676, time 5031.77ms 
iter 32: loss 6.6504, time 5029.42ms 
iter 33: loss 6.2395, time 5032.55ms 
iter 34: loss 6.0455, time 4980.48ms 
iter 35: loss 6.0282, time 5018.61ms 
iter 36: loss 6.1245, time 5037.62ms 
iter 37: loss 6.5704, time 5032.03ms 
iter 38: loss 6.1893, time 5030.32ms 
iter 39: loss 6.0792, time 5009.55ms 
iter 40: loss 5.8455, time 4995.62ms 
iter 41: loss 5.9076, time 5034.10ms 
iter 42: loss 6.0320, time 4984.88ms 
iter 43: loss 6.1835, time 5026.69ms 
iter 44: loss 6.0059, time 5028.60ms 
iter 45: loss 5.7381, time 5020.89ms 
iter 46: loss 5.9794, time 5017.25ms 
iter 47: loss 5.6060, time 5026.61ms 
iter 48: loss 6.0575, time 5026.66ms 
iter 49: loss 5.9313, time 5014.10ms 
step 50: train loss 5.8732, val loss 5.8235
iter 50: loss 6.0006, time 19692.23ms 
iter 51: loss 5.8812, time 5019.73ms 
iter 52: loss 6.1321, time 5019.15ms 
iter 53: loss 5.6756, time 5019.73ms 
iter 54: loss 5.7960, time 5016.92ms 
iter 55: loss 5.8511, time 4942.94ms 
iter 56: loss 5.9238, time 5015.11ms 
iter 57: loss 5.8660, time 5025.02ms 
iter 58: loss 5.7228, time 5026.20ms 
iter 59: loss 6.2671, time 5025.34ms 
iter 60: loss 5.5305, time 5016.79ms 
iter 61: loss 5.5561, time 5028.41ms 
iter 62: loss 5.7363, time 5054.99ms 
iter 63: loss 5.7678, time 4972.05ms 
iter 64: loss 5.7926, time 4943.89ms 
iter 65: loss 5.5129, time 5024.27ms 
iter 66: loss 5.5690, time 5023.83ms 
iter 67: loss 5.4182, time 5024.12ms 
iter 68: loss 5.4085, time 5011.42ms 
iter 69: loss 5.4926, time 5016.75ms 
iter 70: loss 5.5947, time 5016.13ms 
iter 71: loss 5.4368, time 5026.54ms 
iter 72: loss 5.8233, time 5017.78ms 
iter 73: loss 5.5242, time 5024.41ms 
iter 74: loss 5.9585, time 5024.73ms 
iter 75: loss 5.3209, time 5024.81ms 
iter 76: loss 5.6364, time 5026.03ms 
iter 77: loss 5.6042, time 5026.55ms 
iter 78: loss 5.7845, time 5027.71ms 
iter 79: loss 5.3308, time 5030.65ms 
iter 80: loss 5.4819, time 5023.91ms 
iter 81: loss 5.4546, time 5024.55ms 
iter 82: loss 5.3790, time 5025.05ms 
iter 83: loss 5.4136, time 5005.96ms 
iter 84: loss 5.5407, time 5003.38ms 
iter 85: loss 5.3550, time 5019.84ms 
iter 86: loss 5.4155, time 5031.32ms 
iter 87: loss 5.1109, time 5017.93ms 
iter 88: loss 5.2023, time 5023.62ms 
iter 89: loss 5.2002, time 5023.23ms 
iter 90: loss 5.3368, time 5024.60ms 
iter 91: loss 5.3087, time 5024.77ms 
iter 92: loss 5.5310, time 5018.56ms 
iter 93: loss 5.6082, time 5024.97ms 
iter 94: loss 5.0918, time 4990.04ms 
iter 95: loss 5.2496, time 4986.38ms 
iter 96: loss 5.1012, time 5027.29ms 
iter 97: loss 5.2786, time 5025.52ms 
iter 98: loss 4.9939, time 5023.38ms 
iter 99: loss 4.8960, time 5022.77ms 
step 100: train loss 5.1304, val loss 5.0623
iter 100: loss 5.3846, time 19633.96ms 
iter 101: loss 5.2220, time 5024.36ms 
iter 102: loss 4.8797, time 5022.65ms 
iter 103: loss 4.9402, time 5023.67ms 
iter 104: loss 5.0002, time 5003.43ms 
iter 105: loss 5.1197, time 5013.32ms 
iter 106: loss 5.1931, time 5021.93ms 
iter 107: loss 5.1550, time 5009.86ms 
iter 108: loss 4.9835, time 4954.02ms 
iter 109: loss 5.1124, time 5012.48ms 
iter 110: loss 4.8779, time 5023.23ms 
iter 111: loss 5.2215, time 5023.42ms 
iter 112: loss 4.8852, time 5023.19ms 
iter 113: loss 5.0048, time 5002.90ms 
iter 114: loss 4.9620, time 5010.92ms 
iter 115: loss 5.0174, time 5022.95ms 
iter 116: loss 4.8360, time 5022.74ms 
iter 117: loss 4.8088, time 4967.57ms 
iter 118: loss 4.9330, time 4987.99ms 
iter 119: loss 4.6938, time 5022.08ms 
iter 120: loss 4.6462, time 5004.94ms 
iter 121: loss 4.5842, time 5014.58ms 
iter 122: loss 4.7627, time 5026.05ms 
iter 123: loss 4.5791, time 5022.38ms 
iter 124: loss 4.9309, time 5027.11ms 
iter 125: loss 4.5514, time 4973.32ms 
iter 126: loss 4.6283, time 5010.69ms 
iter 127: loss 4.7776, time 5024.85ms 
iter 128: loss 4.7010, time 5024.02ms 
iter 129: loss 4.6515, time 5023.32ms 
iter 130: loss 4.5037, time 5015.80ms 
iter 131: loss 4.8322, time 5023.58ms 
iter 132: loss 4.6268, time 5023.84ms 
iter 133: loss 4.7479, time 4975.91ms 
iter 134: loss 4.5856, time 5023.52ms 
iter 135: loss 4.6309, time 5022.90ms 
iter 136: loss 4.4946, time 5022.48ms 
iter 137: loss 4.4324, time 5024.22ms 
iter 138: loss 4.5146, time 5024.56ms 
iter 139: loss 4.5339, time 5023.94ms 
iter 140: loss 4.4983, time 5025.07ms 
iter 141: loss 4.5841, time 5014.41ms 
iter 142: loss 4.4832, time 5023.12ms 
iter 143: loss 4.4666, time 5024.19ms 
iter 144: loss 4.4283, time 5002.51ms 
iter 145: loss 4.2218, time 5022.88ms 
iter 146: loss 4.3244, time 5022.73ms 
iter 147: loss 4.3598, time 5024.74ms 
iter 148: loss 4.2168, time 5027.11ms 
iter 149: loss 4.4505, time 4991.67ms 
step 150: train loss 4.3985, val loss 4.3359
iter 150: loss 4.2399, time 19671.02ms 
iter 151: loss 4.3256, time 5019.50ms 
iter 152: loss 4.3795, time 5024.05ms 
iter 153: loss 4.3732, time 5024.54ms 
iter 154: loss 4.3391, time 5007.80ms 
iter 155: loss 4.1229, time 5017.69ms 
iter 156: loss 4.3237, time 5019.27ms 
iter 157: loss 4.2098, time 5017.15ms 
iter 158: loss 4.4237, time 5016.26ms 
iter 159: loss 4.1503, time 5017.49ms 
iter 160: loss 4.4008, time 5015.00ms 
iter 161: loss 4.2523, time 4991.56ms 
iter 162: loss 4.1560, time 5016.85ms 
iter 163: loss 4.0681, time 5016.32ms 
iter 164: loss 4.4077, time 5015.81ms 
iter 165: loss 4.3711, time 5015.48ms 
iter 166: loss 4.3398, time 5018.60ms 
iter 167: loss 4.2788, time 5015.98ms 
iter 168: loss 4.1726, time 5024.72ms 
iter 169: loss 4.2224, time 5018.56ms 
iter 170: loss 4.1470, time 5017.39ms 
iter 171: loss 4.1530, time 5000.27ms 
iter 172: loss 4.0273, time 5024.33ms 
iter 173: loss 4.2258, time 5017.79ms 
iter 174: loss 4.1769, time 5016.01ms 
iter 175: loss 4.3412, time 5020.50ms 
iter 176: loss 3.9024, time 4982.86ms 
iter 177: loss 4.0613, time 5016.69ms 
iter 178: loss 4.1574, time 5017.78ms 
iter 179: loss 4.1053, time 5017.71ms 
iter 180: loss 4.2082, time 5017.50ms 
iter 181: loss 4.2384, time 5015.27ms 
iter 182: loss 4.1138, time 5016.29ms 
iter 183: loss 3.9111, time 4988.35ms 
iter 184: loss 4.0111, time 4923.52ms 
iter 185: loss 3.9183, time 4996.66ms 
iter 186: loss 4.2616, time 5018.38ms 
iter 187: loss 4.0296, time 4998.91ms 
iter 188: loss 3.9362, time 4952.30ms 
iter 189: loss 4.2421, time 4919.98ms 
iter 190: loss 4.3030, time 4918.59ms 
iter 191: loss 4.1464, time 4918.89ms 
iter 192: loss 4.2779, time 4920.94ms 
iter 193: loss 4.2223, time 4914.73ms 
iter 194: loss 4.1332, time 5012.73ms 
iter 195: loss 4.1242, time 5020.14ms 
iter 196: loss 4.1121, time 5021.88ms 
iter 197: loss 3.8952, time 5024.52ms 
iter 198: loss 4.2118, time 5019.61ms 
iter 199: loss 3.8929, time 5022.00ms 
step 200: train loss 4.0217, val loss 3.9881
iter 200: loss 3.9255, time 19684.17ms 
iter 201: loss 4.1158, time 5005.14ms 
iter 202: loss 3.8262, time 5016.60ms 
iter 203: loss 4.0043, time 5027.65ms 
iter 204: loss 4.0883, time 5027.23ms 
iter 205: loss 4.2614, time 4975.14ms 
iter 206: loss 3.9997, time 4943.10ms 
iter 207: loss 4.1266, time 5022.68ms 
iter 208: loss 3.9986, time 5025.71ms 
iter 209: loss 4.2806, time 5015.78ms 
iter 210: loss 3.8403, time 4995.80ms 
iter 211: loss 4.0749, time 5025.85ms 
iter 212: loss 4.1923, time 5026.99ms 
iter 213: loss 3.8981, time 5007.91ms 
iter 214: loss 4.0707, time 4967.74ms 
iter 215: loss 3.8219, time 4915.29ms 
iter 216: loss 3.8029, time 4975.05ms 
iter 217: loss 4.2061, time 4978.14ms 
iter 218: loss 3.7353, time 4981.07ms 
iter 219: loss 3.8396, time 5020.26ms 
iter 220: loss 3.9245, time 5008.00ms 
iter 221: loss 4.0442, time 5023.85ms 
iter 222: loss 3.9406, time 5014.85ms 
iter 223: loss 3.8006, time 4911.40ms 
iter 224: loss 3.9059, time 4910.92ms 
iter 225: loss 3.8125, time 4990.26ms 
iter 226: loss 3.8852, time 5018.50ms 
iter 227: loss 3.8263, time 5024.77ms 
iter 228: loss 3.7441, time 5025.64ms 
iter 229: loss 3.8229, time 5019.28ms 
iter 230: loss 3.9098, time 5024.90ms 
iter 231: loss 3.8710, time 5013.76ms 
iter 232: loss 3.6604, time 4940.27ms 
iter 233: loss 4.1222, time 5021.74ms 
iter 234: loss 4.0054, time 5025.18ms 
iter 235: loss 3.9598, time 5023.02ms 
iter 236: loss 4.2138, time 5022.21ms 
iter 237: loss 3.9598, time 5022.75ms 
iter 238: loss 4.1859, time 5020.30ms 
iter 239: loss 4.0794, time 5012.06ms 
iter 240: loss 3.9823, time 4912.72ms 
iter 241: loss 3.7551, time 4916.55ms 
iter 242: loss 4.0452, time 4990.76ms 
iter 243: loss 3.7956, time 5024.50ms 
iter 244: loss 3.8067, time 5025.58ms 
iter 245: loss 3.8019, time 5022.92ms 
iter 246: loss 3.7818, time 5015.38ms 
iter 247: loss 3.7180, time 5012.53ms 
iter 248: loss 3.7434, time 5000.08ms 
iter 249: loss 4.1207, time 4913.79ms 
step 250: train loss 3.8946, val loss 3.8258
iter 250: loss 3.8933, time 19734.21ms 
iter 251: loss 3.8581, time 5027.79ms 
iter 252: loss 3.8831, time 5022.20ms 
iter 253: loss 3.9433, time 5027.04ms 
iter 254: loss 3.9301, time 5022.42ms 
iter 255: loss 3.7727, time 4967.25ms 
iter 256: loss 3.8521, time 4912.48ms 
iter 257: loss 4.1710, time 4927.81ms 
iter 258: loss 3.8736, time 4924.42ms 
iter 259: loss 3.9770, time 4930.12ms 
iter 260: loss 3.9962, time 5019.23ms 
iter 261: loss 4.0617, time 5032.56ms 
iter 262: loss 3.8004, time 5032.85ms 
iter 263: loss 4.3804, time 5024.19ms 
iter 264: loss 3.7723, time 5019.55ms 
iter 265: loss 3.8153, time 4918.74ms 
iter 266: loss 3.7828, time 4915.25ms 
iter 267: loss 3.8073, time 5006.45ms 
iter 268: loss 3.8775, time 5034.42ms 
iter 269: loss 4.0199, time 5010.84ms 
iter 270: loss 4.1125, time 5016.51ms 
iter 271: loss 3.9354, time 5005.97ms 
iter 272: loss 3.8311, time 5015.05ms 
iter 273: loss 3.8574, time 4993.56ms 
iter 274: loss 3.7725, time 4911.83ms 
iter 275: loss 3.8094, time 4921.45ms 
iter 276: loss 3.7054, time 4927.16ms 
iter 277: loss 3.9812, time 4911.47ms 
iter 278: loss 3.9168, time 4920.62ms 
iter 279: loss 3.7516, time 5020.52ms 
iter 280: loss 3.7048, time 4994.32ms 
iter 281: loss 3.6782, time 5024.88ms 
iter 282: loss 4.1165, time 5023.47ms 
iter 283: loss 3.9471, time 5009.54ms 
iter 284: loss 4.1642, time 5027.75ms 
iter 285: loss 3.6751, time 5036.38ms 
iter 286: loss 3.7231, time 4983.26ms 
iter 287: loss 3.7637, time 4916.18ms 
iter 288: loss 3.7518, time 4956.34ms 
iter 289: loss 3.7676, time 5014.45ms 
iter 290: loss 3.8006, time 5021.40ms 
iter 291: loss 3.8327, time 5023.69ms 
iter 292: loss 3.9013, time 5014.39ms 
iter 293: loss 3.7403, time 5007.43ms 
iter 294: loss 3.6865, time 5017.49ms 
iter 295: loss 3.8826, time 5007.18ms 
iter 296: loss 3.8250, time 4977.98ms 
iter 297: loss 3.6988, time 4974.74ms 
iter 298: loss 3.6175, time 5021.08ms 
iter 299: loss 3.6881, time 5025.69ms 
step 300: train loss 3.7679, val loss 3.7535
iter 300: loss 3.7727, time 19686.07ms 
iter 301: loss 3.5385, time 4938.57ms 
iter 302: loss 3.6807, time 4914.05ms 
iter 303: loss 3.7593, time 4962.41ms 
iter 304: loss 3.7803, time 5023.84ms 
iter 305: loss 3.7453, time 5012.05ms 
iter 306: loss 3.7942, time 5021.51ms 
iter 307: loss 3.7484, time 5023.04ms 
iter 308: loss 3.8242, time 5013.00ms 
iter 309: loss 4.0144, time 5017.21ms 
iter 310: loss 3.8084, time 4968.66ms 
iter 311: loss 3.5726, time 4914.90ms 
iter 312: loss 3.6088, time 4965.65ms 
iter 313: loss 3.7639, time 5018.47ms 
iter 314: loss 3.8665, time 5020.47ms 
iter 315: loss 3.8740, time 5004.72ms 
iter 316: loss 3.9007, time 5023.30ms 
iter 317: loss 3.8643, time 5024.54ms 
iter 318: loss 3.7014, time 5021.76ms 
iter 319: loss 3.7504, time 5025.42ms 
iter 320: loss 3.6836, time 4979.24ms 
iter 321: loss 3.5258, time 5020.13ms 
iter 322: loss 3.6158, time 5020.41ms 
iter 323: loss 3.7311, time 5019.74ms 
iter 324: loss 3.7304, time 5028.26ms 
iter 325: loss 3.7300, time 5020.28ms 
iter 326: loss 3.7664, time 5028.44ms 
iter 327: loss 3.7119, time 5005.75ms 
iter 328: loss 3.7161, time 4939.70ms 
iter 329: loss 3.7792, time 4911.19ms 
iter 330: loss 3.7392, time 4987.91ms 
iter 331: loss 3.8173, time 5016.40ms 
iter 332: loss 3.8363, time 5017.15ms 
iter 333: loss 3.6099, time 5018.18ms 
iter 334: loss 3.6236, time 4994.32ms 
iter 335: loss 3.6735, time 5018.15ms 
iter 336: loss 3.6455, time 5014.33ms 
iter 337: loss 3.8642, time 4912.70ms 
iter 338: loss 3.8021, time 4912.02ms 
iter 339: loss 3.5811, time 4987.56ms 
iter 340: loss 3.7792, time 5020.13ms 
iter 341: loss 3.6856, time 5025.04ms 
iter 342: loss 3.7241, time 5008.42ms 
iter 343: loss 3.8005, time 5017.45ms 
iter 344: loss 3.7147, time 5001.06ms 
iter 345: loss 3.8025, time 5005.47ms 
iter 346: loss 3.8442, time 4929.67ms 
iter 347: loss 3.7365, time 4950.81ms 
iter 348: loss 3.6469, time 4991.09ms 
iter 349: loss 3.6682, time 5020.98ms 
step 350: train loss 3.6931, val loss 3.6778
iter 350: loss 3.7397, time 19741.27ms 
iter 351: loss 3.7886, time 4911.12ms 
iter 352: loss 3.7588, time 4993.53ms 
iter 353: loss 3.6328, time 5016.24ms 
iter 354: loss 3.7372, time 5027.54ms 
iter 355: loss 3.6510, time 5022.84ms 
iter 356: loss 3.5913, time 5004.31ms 
iter 357: loss 3.7019, time 5014.39ms 
iter 358: loss 3.6505, time 5022.74ms 
iter 359: loss 3.8379, time 4982.02ms 
iter 360: loss 3.6822, time 4964.16ms 
iter 361: loss 3.5510, time 5024.76ms 
iter 362: loss 3.7414, time 5015.57ms 
iter 363: loss 3.8021, time 5021.57ms 
iter 364: loss 3.5711, time 5024.45ms 
iter 365: loss 3.9833, time 5023.38ms 
iter 366: loss 3.6313, time 5021.44ms 
iter 367: loss 3.7779, time 4979.50ms 
iter 368: loss 3.5847, time 4917.04ms 
iter 369: loss 3.5140, time 4995.26ms 
iter 370: loss 3.7901, time 4991.77ms 
iter 371: loss 3.5967, time 5023.79ms 
iter 372: loss 3.6442, time 5023.02ms 
iter 373: loss 3.6429, time 5021.56ms 
iter 374: loss 3.6186, time 5023.79ms 
iter 375: loss 3.6776, time 5010.64ms 
iter 376: loss 3.7522, time 4913.15ms 
iter 377: loss 3.5107, time 4991.90ms 
iter 378: loss 3.6021, time 5022.77ms 
iter 379: loss 3.6263, time 5017.85ms 
iter 380: loss 3.5873, time 5022.76ms 
iter 381: loss 3.5935, time 5023.81ms 
iter 382: loss 3.5320, time 5019.55ms 
iter 383: loss 3.8216, time 5028.22ms 
iter 384: loss 3.5041, time 4975.36ms 
iter 385: loss 3.6079, time 5021.12ms 
iter 386: loss 3.7185, time 5034.46ms 
iter 387: loss 3.5910, time 4980.33ms 
iter 388: loss 3.6651, time 5028.99ms 
iter 389: loss 3.7259, time 5015.62ms 
iter 390: loss 3.7284, time 5028.58ms 
iter 391: loss 3.6278, time 5024.14ms 
iter 392: loss 3.5961, time 4958.31ms 
iter 393: loss 3.8652, time 5002.39ms 
iter 394: loss 3.7247, time 5017.39ms 
iter 395: loss 3.6574, time 5027.36ms 
iter 396: loss 3.6251, time 5029.11ms 
iter 397: loss 3.6743, time 5001.85ms 
iter 398: loss 3.6151, time 5024.03ms 
iter 399: loss 3.4106, time 5030.09ms 
step 400: train loss 3.6334, val loss 3.6091
iter 400: loss 3.6279, time 19689.94ms 
iter 401: loss 3.6349, time 4980.19ms 
iter 402: loss 3.6260, time 5012.18ms 
iter 403: loss 3.7490, time 5005.59ms 
iter 404: loss 3.4828, time 5025.24ms 
iter 405: loss 3.4196, time 5018.73ms 
iter 406: loss 3.5209, time 5002.27ms 
iter 407: loss 3.6217, time 5027.44ms 
iter 408: loss 3.6917, time 5029.30ms 
iter 409: loss 3.6909, time 5013.55ms 
iter 410: loss 3.6376, time 5024.15ms 
iter 411: loss 3.5456, time 5021.26ms 
iter 412: loss 3.7444, time 5003.51ms 
iter 413: loss 3.5984, time 5021.72ms 
iter 414: loss 3.5459, time 5023.20ms 
iter 415: loss 3.4704, time 5018.69ms 
iter 416: loss 3.6310, time 5020.82ms 
iter 417: loss 3.7551, time 5020.41ms 
iter 418: loss 3.8801, time 4986.09ms 
iter 419: loss 3.3983, time 5009.24ms 
iter 420: loss 3.6599, time 5018.82ms 
iter 421: loss 3.5019, time 5005.56ms 
iter 422: loss 3.5075, time 4992.55ms 
iter 423: loss 3.5470, time 5016.53ms 
iter 424: loss 3.5187, time 4983.77ms 
iter 425: loss 3.5438, time 4927.41ms 
iter 426: loss 3.5133, time 4931.54ms 
iter 427: loss 3.4741, time 4936.27ms 
iter 428: loss 3.6245, time 4943.14ms 
iter 429: loss 3.6150, time 4946.27ms 
iter 430: loss 3.5901, time 5001.71ms 
iter 431: loss 3.5278, time 5024.35ms 
iter 432: loss 3.6295, time 5031.76ms 
iter 433: loss 3.5703, time 5031.41ms 
iter 434: loss 3.7498, time 4990.77ms 
iter 435: loss 3.4956, time 5016.52ms 
iter 436: loss 3.7401, time 5014.70ms 
iter 437: loss 3.4980, time 5018.73ms 
iter 438: loss 3.5728, time 5020.91ms 
iter 439: loss 3.6792, time 5021.40ms 
iter 440: loss 3.6730, time 5023.63ms 
iter 441: loss 3.9481, time 4941.61ms 
iter 442: loss 3.4711, time 4935.28ms 
iter 443: loss 3.6125, time 5004.42ms 
iter 444: loss 3.7897, time 4997.44ms 
iter 445: loss 3.5623, time 5028.97ms 
iter 446: loss 3.6911, time 5018.62ms 
iter 447: loss 3.5557, time 4974.85ms 
iter 448: loss 3.4975, time 5003.67ms 
iter 449: loss 3.4848, time 5023.77ms 
step 450: train loss 3.5654, val loss 3.5717
iter 450: loss 3.7518, time 19782.42ms 
iter 451: loss 3.6056, time 5023.79ms 
iter 452: loss 3.5442, time 4977.98ms 
iter 453: loss 3.4151, time 5023.75ms 
iter 454: loss 3.4922, time 5021.19ms 
iter 455: loss 3.4792, time 5027.97ms 
iter 456: loss 3.4545, time 5026.12ms 
iter 457: loss 3.5196, time 4984.53ms 
iter 458: loss 3.4701, time 5028.11ms 
iter 459: loss 3.6523, time 5031.95ms 
iter 460: loss 3.7539, time 5014.08ms 
iter 461: loss 3.5896, time 5027.14ms 
iter 462: loss 3.6026, time 5031.30ms 
iter 463: loss 3.5583, time 5031.95ms 
iter 464: loss 3.7154, time 5031.43ms 
iter 465: loss 3.5274, time 5020.36ms 
iter 466: loss 3.5228, time 5011.97ms 
iter 467: loss 3.6208, time 4981.80ms 
iter 468: loss 3.4748, time 5008.43ms 
iter 469: loss 3.4459, time 5023.37ms 
iter 470: loss 3.5779, time 5030.48ms 
iter 471: loss 3.5291, time 5016.58ms 
iter 472: loss 3.5618, time 5017.26ms 
iter 473: loss 3.5514, time 5032.86ms 
iter 474: loss 3.5019, time 5027.83ms 
iter 475: loss 3.5150, time 4955.79ms 
iter 476: loss 3.4359, time 5017.07ms 
iter 477: loss 3.5524, time 5040.62ms 
iter 478: loss 3.3076, time 5043.57ms 
iter 479: loss 3.4794, time 5039.42ms 
iter 480: loss 3.4853, time 5032.90ms 
iter 481: loss 3.5523, time 5026.31ms 
iter 482: loss 3.5894, time 4999.06ms 
iter 483: loss 3.7204, time 5023.60ms 
iter 484: loss 3.4122, time 5013.66ms 
iter 485: loss 3.5357, time 5027.71ms 
iter 486: loss 3.4902, time 5022.78ms 
iter 487: loss 3.6473, time 5012.93ms 
iter 488: loss 3.4565, time 5010.08ms 
iter 489: loss 3.6562, time 4958.87ms 
iter 490: loss 3.5465, time 5017.29ms 
iter 491: loss 3.4129, time 4982.04ms 
iter 492: loss 3.5902, time 5022.51ms 
iter 493: loss 3.4190, time 5025.16ms 
iter 494: loss 3.4287, time 5014.74ms 
iter 495: loss 3.4604, time 5027.21ms 
iter 496: loss 3.5453, time 4929.84ms 
iter 497: loss 3.4064, time 4962.41ms 
iter 498: loss 3.5705, time 5014.87ms 
iter 499: loss 3.4710, time 5025.31ms 
step 500: train loss 3.5140, val loss 3.4842
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 500: loss 3.3601, time 20793.41ms 
iter 501: loss 3.6066, time 4986.50ms 
iter 502: loss 3.4870, time 4966.25ms 
iter 503: loss 3.3768, time 5008.31ms 
iter 504: loss 3.6215, time 4992.22ms 
iter 505: loss 3.7225, time 5015.73ms 
iter 506: loss 3.5732, time 5029.14ms 
iter 507: loss 3.3877, time 5027.75ms 
iter 508: loss 3.5746, time 5031.03ms 
iter 509: loss 3.6357, time 4980.54ms 
iter 510: loss 3.4537, time 4972.90ms 
iter 511: loss 3.6149, time 5031.47ms 
iter 512: loss 3.4526, time 4997.18ms 
iter 513: loss 3.7147, time 5003.82ms 
iter 514: loss 3.5076, time 5029.27ms 
iter 515: loss 3.6164, time 5025.53ms 
iter 516: loss 3.5300, time 5019.62ms 
iter 517: loss 3.5333, time 4916.13ms 
iter 518: loss 3.5513, time 5000.94ms 
iter 519: loss 3.3753, time 5022.97ms 
iter 520: loss 3.4645, time 5026.13ms 
iter 521: loss 3.4350, time 5021.70ms 
iter 522: loss 3.6350, time 5011.43ms 
iter 523: loss 3.3226, time 5025.05ms 
iter 524: loss 3.5822, time 5025.73ms 
iter 525: loss 3.4566, time 4999.59ms 
iter 526: loss 3.5165, time 5014.51ms 
iter 527: loss 3.3830, time 5021.43ms 
iter 528: loss 3.5195, time 5024.63ms 
iter 529: loss 3.2528, time 5023.40ms 
iter 530: loss 3.6051, time 5019.86ms 
iter 531: loss 3.2953, time 5020.81ms 
iter 532: loss 3.4100, time 5023.98ms 
iter 533: loss 3.4842, time 5007.91ms 
iter 534: loss 3.2909, time 5021.24ms 
iter 535: loss 3.5773, time 4988.45ms 
iter 536: loss 3.4849, time 4984.70ms 
iter 537: loss 3.4468, time 5019.75ms 
iter 538: loss 3.3597, time 5022.58ms 
iter 539: loss 3.5352, time 5020.73ms 
iter 540: loss 3.5108, time 4966.02ms 
iter 541: loss 3.4373, time 5001.65ms 
iter 542: loss 3.7657, time 5026.56ms 
iter 543: loss 3.3174, time 5000.58ms 
iter 544: loss 3.4230, time 5022.49ms 
iter 545: loss 3.5438, time 5022.83ms 
iter 546: loss 3.6259, time 5024.38ms 
iter 547: loss 3.5195, time 5023.95ms 
iter 548: loss 3.2102, time 4983.06ms 
iter 549: loss 3.4756, time 5026.72ms 
step 550: train loss 3.4745, val loss 3.4479
iter 550: loss 3.6566, time 19703.20ms 
iter 551: loss 3.5201, time 5028.97ms 
iter 552: loss 3.4095, time 5026.21ms 
iter 553: loss 3.3487, time 5024.65ms 
iter 554: loss 3.3292, time 5024.90ms 
iter 555: loss 3.3865, time 5026.18ms 
iter 556: loss 3.5255, time 5002.36ms 
iter 557: loss 3.4906, time 5026.95ms 
iter 558: loss 3.3873, time 5031.16ms 
iter 559: loss 3.5401, time 5023.64ms 
iter 560: loss 3.5266, time 5024.95ms 
iter 561: loss 3.3065, time 5027.28ms 
iter 562: loss 3.4843, time 5024.59ms 
iter 563: loss 3.3890, time 5029.24ms 
iter 564: loss 3.3922, time 5025.97ms 
iter 565: loss 3.3884, time 4972.91ms 
iter 566: loss 3.6227, time 4927.28ms 
iter 567: loss 3.4286, time 4928.67ms 
iter 568: loss 3.3570, time 4998.45ms 
iter 569: loss 3.4473, time 5026.63ms 
iter 570: loss 3.5661, time 5027.89ms 
iter 571: loss 3.6438, time 5025.66ms 
iter 572: loss 3.2257, time 5024.31ms 
iter 573: loss 3.4175, time 5010.06ms 
iter 574: loss 3.4583, time 4976.15ms 
iter 575: loss 3.5733, time 4968.15ms 
iter 576: loss 3.5801, time 5009.03ms 
iter 577: loss 3.3436, time 4983.49ms 
iter 578: loss 3.3011, time 5028.47ms 
iter 579: loss 3.7133, time 5033.72ms 
iter 580: loss 3.7654, time 5030.62ms 
iter 581: loss 3.4810, time 5032.94ms 
iter 582: loss 3.3405, time 5012.96ms 
iter 583: loss 3.6134, time 4971.31ms 
iter 584: loss 3.4102, time 5028.91ms 
iter 585: loss 3.4567, time 5031.96ms 
iter 586: loss 3.5349, time 5034.90ms 
iter 587: loss 3.2465, time 4997.09ms 
iter 588: loss 3.5716, time 5032.59ms 
iter 589: loss 3.4581, time 4994.55ms 
iter 590: loss 3.3761, time 4954.61ms 
iter 591: loss 3.4410, time 5027.40ms 
iter 592: loss 3.3875, time 5030.87ms 
iter 593: loss 3.5630, time 5030.37ms 
iter 594: loss 3.4485, time 5023.66ms 
iter 595: loss 3.5009, time 5017.94ms 
iter 596: loss 3.3314, time 5027.87ms 
iter 597: loss 3.5303, time 5031.20ms 
iter 598: loss 3.3412, time 5009.77ms 
iter 599: loss 3.4753, time 5024.83ms 
step 600: train loss 3.4186, val loss 3.4146
iter 600: loss 3.3561, time 19682.06ms 
iter 601: loss 3.3451, time 5019.58ms 
iter 602: loss 3.3894, time 5024.87ms 
iter 603: loss 3.4974, time 5004.68ms 
iter 604: loss 3.3304, time 5013.36ms 
iter 605: loss 3.2684, time 5014.48ms 
iter 606: loss 3.4038, time 5029.43ms 
iter 607: loss 3.4833, time 5019.97ms 
iter 608: loss 3.5003, time 5020.00ms 
iter 609: loss 3.5976, time 5023.99ms 
iter 610: loss 3.4987, time 4973.33ms 
iter 611: loss 3.3975, time 5019.20ms 
iter 612: loss 3.2670, time 5024.48ms 
iter 613: loss 3.1298, time 5024.55ms 
iter 614: loss 3.4539, time 5021.65ms 
iter 615: loss 3.4735, time 5021.08ms 
iter 616: loss 3.3454, time 5029.23ms 
iter 617: loss 3.5385, time 5025.47ms 
iter 618: loss 3.3298, time 4937.29ms 
iter 619: loss 3.5153, time 5022.68ms 
iter 620: loss 3.3807, time 5027.13ms 
iter 621: loss 3.2183, time 5032.40ms 
iter 622: loss 3.3377, time 5025.40ms 
iter 623: loss 3.4991, time 5018.24ms 
iter 624: loss 3.3011, time 5028.39ms 
iter 625: loss 3.3447, time 5001.62ms 
iter 626: loss 3.1473, time 4930.30ms 
iter 627: loss 3.4380, time 5027.31ms 
iter 628: loss 3.2765, time 5018.78ms 
iter 629: loss 3.4288, time 5032.01ms 
iter 630: loss 3.5060, time 5026.62ms 
iter 631: loss 3.3771, time 5033.41ms 
iter 632: loss 3.3325, time 5009.36ms 
iter 633: loss 3.3649, time 5033.96ms 
iter 634: loss 3.4047, time 5001.81ms 
iter 635: loss 3.2175, time 5028.94ms 
iter 636: loss 3.3894, time 5026.49ms 
iter 637: loss 3.4270, time 5028.90ms 
iter 638: loss 3.7616, time 4996.53ms 
iter 639: loss 3.3021, time 4996.13ms 
iter 640: loss 3.5393, time 4998.05ms 
iter 641: loss 3.6173, time 4939.98ms 
iter 642: loss 3.4548, time 4933.59ms 
iter 643: loss 3.3465, time 4967.73ms 
iter 644: loss 3.5654, time 5025.20ms 
iter 645: loss 3.2558, time 5013.12ms 
iter 646: loss 3.2764, time 4986.72ms 
iter 647: loss 3.3468, time 5016.12ms 
iter 648: loss 3.3528, time 5030.43ms 
iter 649: loss 3.5603, time 4983.96ms 
step 650: train loss 3.3832, val loss 3.3789
iter 650: loss 3.5392, time 19724.85ms 
iter 651: loss 3.4416, time 5026.65ms 
iter 652: loss 3.3676, time 5009.80ms 
iter 653: loss 3.2808, time 5020.09ms 
iter 654: loss 3.3282, time 4917.55ms 
iter 655: loss 3.3887, time 4931.29ms 
iter 656: loss 3.3469, time 5013.86ms 
iter 657: loss 3.5445, time 5008.66ms 
iter 658: loss 3.3898, time 5018.00ms 
iter 659: loss 3.2142, time 5031.20ms 
iter 660: loss 3.3310, time 5030.18ms 
iter 661: loss 3.2471, time 5038.24ms 
iter 662: loss 3.1327, time 4981.79ms 
iter 663: loss 3.2941, time 4981.81ms 
iter 664: loss 3.4442, time 4953.12ms 
iter 665: loss 3.3372, time 4942.56ms 
iter 666: loss 3.4612, time 4942.05ms 
iter 667: loss 3.3678, time 4993.11ms 
iter 668: loss 3.4234, time 4941.11ms 
iter 669: loss 3.4374, time 4943.71ms 
iter 670: loss 3.3786, time 4931.14ms 
iter 671: loss 3.5092, time 5018.47ms 
iter 672: loss 3.4942, time 5019.73ms 
iter 673: loss 3.4852, time 5019.66ms 
iter 674: loss 3.5941, time 5033.50ms 
iter 675: loss 3.3439, time 5033.71ms 
iter 676: loss 3.2456, time 5027.19ms 
iter 677: loss 3.3756, time 5033.97ms 
iter 678: loss 3.2717, time 4988.25ms 
iter 679: loss 3.2498, time 5044.51ms 
iter 680: loss 3.2245, time 5022.06ms 
iter 681: loss 3.3407, time 5031.09ms 
iter 682: loss 3.2766, time 5023.78ms 
iter 683: loss 3.5303, time 5031.13ms 
iter 684: loss 3.2761, time 5030.83ms 
iter 685: loss 3.4686, time 4980.80ms 
iter 686: loss 3.3917, time 4977.77ms 
iter 687: loss 3.3325, time 5011.59ms 
iter 688: loss 3.1945, time 4936.45ms 
iter 689: loss 3.2018, time 4933.95ms 
iter 690: loss 3.3068, time 4943.91ms 
iter 691: loss 3.3478, time 5024.86ms 
iter 692: loss 3.2908, time 5035.94ms 
iter 693: loss 3.3970, time 5031.39ms 
iter 694: loss 3.2168, time 5023.34ms 
iter 695: loss 3.4149, time 5017.45ms 
iter 696: loss 3.2445, time 4998.97ms 
iter 697: loss 3.3286, time 4963.55ms 
iter 698: loss 3.3356, time 5013.23ms 
iter 699: loss 3.2344, time 5005.83ms 
step 700: train loss 3.3480, val loss 3.3255
iter 700: loss 3.2894, time 19684.05ms 
iter 701: loss 3.3666, time 4984.40ms 
iter 702: loss 3.2894, time 4978.24ms 
iter 703: loss 3.0891, time 4941.32ms 
iter 704: loss 3.3320, time 4996.20ms 
iter 705: loss 3.2724, time 4952.15ms 
iter 706: loss 3.5480, time 4970.76ms 
iter 707: loss 3.2502, time 4930.51ms 
iter 708: loss 3.4362, time 4927.08ms 
iter 709: loss 3.2726, time 4986.93ms 
iter 710: loss 3.4721, time 4977.96ms 
iter 711: loss 3.3535, time 4920.37ms 
iter 712: loss 3.2710, time 4993.50ms 
iter 713: loss 3.3495, time 5031.50ms 
iter 714: loss 3.2494, time 5030.05ms 
iter 715: loss 3.4398, time 5026.37ms 
iter 716: loss 3.4607, time 5027.43ms 
iter 717: loss 3.4363, time 5029.24ms 
iter 718: loss 3.2510, time 5027.13ms 
iter 719: loss 3.3102, time 5001.05ms 
iter 720: loss 3.5558, time 4972.62ms 
iter 721: loss 3.3029, time 4964.14ms 
iter 722: loss 3.1512, time 5002.94ms 
iter 723: loss 3.2535, time 4999.87ms 
iter 724: loss 3.3787, time 5023.61ms 
iter 725: loss 3.3854, time 4918.53ms 
iter 726: loss 3.3265, time 4919.31ms 
iter 727: loss 3.2865, time 4972.28ms 
iter 728: loss 3.1882, time 5024.69ms 
iter 729: loss 3.4055, time 5024.31ms 
iter 730: loss 3.3463, time 5024.10ms 
iter 731: loss 3.1966, time 5024.39ms 
iter 732: loss 3.2905, time 5023.37ms 
iter 733: loss 3.3456, time 4973.05ms 
iter 734: loss 3.4483, time 4918.06ms 
iter 735: loss 3.3761, time 4916.60ms 
iter 736: loss 3.3102, time 4933.12ms 
iter 737: loss 3.2813, time 5006.15ms 
iter 738: loss 3.1921, time 5026.97ms 
iter 739: loss 3.4304, time 5011.39ms 
iter 740: loss 3.3652, time 4985.32ms 
iter 741: loss 3.3109, time 4957.90ms 
iter 742: loss 3.3181, time 5015.40ms 
iter 743: loss 3.2253, time 4983.86ms 
iter 744: loss 3.2853, time 4920.65ms 
iter 745: loss 3.4187, time 4916.34ms 
iter 746: loss 3.3220, time 4989.25ms 
iter 747: loss 3.1848, time 4982.88ms 
iter 748: loss 3.3309, time 4919.59ms 
iter 749: loss 3.3943, time 4964.91ms 
step 750: train loss 3.3026, val loss 3.3014
iter 750: loss 3.1101, time 19702.99ms 
iter 751: loss 3.2698, time 4947.70ms 
iter 752: loss 3.1526, time 5005.96ms 
iter 753: loss 3.2028, time 4921.25ms 
iter 754: loss 3.3969, time 4931.05ms 
iter 755: loss 3.3480, time 4981.09ms 
iter 756: loss 3.3286, time 5018.53ms 
iter 757: loss 3.2479, time 5027.82ms 
iter 758: loss 3.3159, time 5003.73ms 
iter 759: loss 3.3061, time 4948.93ms 
iter 760: loss 3.1495, time 4916.45ms 
iter 761: loss 3.3532, time 4919.90ms 
iter 762: loss 3.3403, time 4998.43ms 
iter 763: loss 3.1427, time 5021.60ms 
iter 764: loss 3.4528, time 5021.47ms 
iter 765: loss 3.1137, time 5011.57ms 
iter 766: loss 3.2066, time 5027.17ms 
iter 767: loss 3.2242, time 5020.82ms 
iter 768: loss 3.5526, time 5017.89ms 
iter 769: loss 3.1966, time 4973.53ms 
iter 770: loss 3.4039, time 4916.69ms 
iter 771: loss 3.3518, time 4994.79ms 
iter 772: loss 3.3255, time 5028.26ms 
iter 773: loss 3.2203, time 5027.57ms 
iter 774: loss 3.3632, time 5017.08ms 
iter 775: loss 3.3220, time 5029.57ms 
iter 776: loss 3.6257, time 5026.58ms 
iter 777: loss 3.2761, time 5032.15ms 
iter 778: loss 3.1116, time 4979.76ms 
iter 779: loss 3.2770, time 4943.91ms 
iter 780: loss 3.3508, time 5033.89ms 
iter 781: loss 3.1922, time 5032.52ms 
iter 782: loss 3.2746, time 5034.18ms 
iter 783: loss 3.1960, time 5036.09ms 
iter 784: loss 3.2530, time 5027.42ms 
iter 785: loss 3.3591, time 5034.67ms 
iter 786: loss 3.2070, time 5036.32ms 
iter 787: loss 3.2546, time 4959.47ms 
iter 788: loss 3.2666, time 4966.99ms 
iter 789: loss 3.3311, time 5034.08ms 
iter 790: loss 3.4867, time 5018.60ms 
iter 791: loss 3.3100, time 5030.25ms 
iter 792: loss 3.0879, time 5028.19ms 
iter 793: loss 3.3041, time 5026.60ms 
iter 794: loss 3.3040, time 5030.57ms 
iter 795: loss 3.1221, time 4925.91ms 
iter 796: loss 3.1932, time 4955.98ms 
iter 797: loss 3.2020, time 4938.62ms 
iter 798: loss 3.2620, time 5028.08ms 
iter 799: loss 3.2380, time 5027.18ms 
step 800: train loss 3.2652, val loss 3.2544
iter 800: loss 3.1375, time 19706.85ms 
iter 801: loss 3.2359, time 5030.53ms 
iter 802: loss 3.3617, time 4976.94ms 
iter 803: loss 3.1095, time 4920.02ms 
iter 804: loss 3.3757, time 5009.34ms 
iter 805: loss 3.4151, time 5025.99ms 
iter 806: loss 3.1430, time 5014.15ms 
iter 807: loss 3.2150, time 5018.45ms 
iter 808: loss 3.3958, time 5025.24ms 
iter 809: loss 3.2947, time 4997.83ms 
iter 810: loss 3.3740, time 5025.36ms 
iter 811: loss 3.2977, time 4970.58ms 
iter 812: loss 3.3124, time 4915.36ms 
iter 813: loss 3.2854, time 4986.94ms 
iter 814: loss 3.1895, time 5031.65ms 
iter 815: loss 3.2684, time 5024.54ms 
iter 816: loss 3.2330, time 5023.33ms 
iter 817: loss 3.1424, time 5025.28ms 
iter 818: loss 3.3385, time 5020.55ms 
iter 819: loss 3.1986, time 5022.14ms 
iter 820: loss 3.2538, time 4973.51ms 
iter 821: loss 3.3292, time 4923.45ms 
iter 822: loss 3.3394, time 4926.38ms 
iter 823: loss 3.3296, time 4971.90ms 
iter 824: loss 3.3348, time 5025.75ms 
iter 825: loss 3.2750, time 5018.12ms 
iter 826: loss 3.2816, time 5033.89ms 
iter 827: loss 3.2919, time 5012.31ms 
iter 828: loss 3.1426, time 4981.28ms 
iter 829: loss 3.3252, time 4973.70ms 
iter 830: loss 3.1796, time 4925.36ms 
iter 831: loss 3.2509, time 4971.32ms 
iter 832: loss 3.3880, time 5028.15ms 
iter 833: loss 3.2845, time 5052.36ms 
iter 834: loss 3.4131, time 5035.85ms 
iter 835: loss 3.2697, time 5004.35ms 
iter 836: loss 3.1784, time 5026.72ms 
iter 837: loss 3.2087, time 5024.14ms 
iter 838: loss 3.3227, time 4974.45ms 
iter 839: loss 3.1593, time 4956.52ms 
iter 840: loss 3.2969, time 5029.32ms 
iter 841: loss 3.0466, time 5019.76ms 
iter 842: loss 3.3700, time 5027.86ms 
iter 843: loss 3.1980, time 5027.96ms 
iter 844: loss 3.4739, time 5026.15ms 
iter 845: loss 3.2278, time 5028.15ms 
iter 846: loss 3.0516, time 5029.44ms 
iter 847: loss 3.2293, time 4997.66ms 
iter 848: loss 3.4687, time 4930.86ms 
iter 849: loss 3.2054, time 5029.45ms 
step 850: train loss 3.2355, val loss 3.2464
iter 850: loss 3.2394, time 19748.96ms 
iter 851: loss 3.2312, time 5027.10ms 
iter 852: loss 3.2814, time 5033.52ms 
iter 853: loss 3.2057, time 5034.76ms 
iter 854: loss 3.3376, time 5022.28ms 
iter 855: loss 3.2343, time 5049.57ms 
iter 856: loss 3.3781, time 5035.29ms 
iter 857: loss 3.4235, time 5034.07ms 
iter 858: loss 3.3144, time 5033.49ms 
iter 859: loss 3.1995, time 5048.85ms 
iter 860: loss 3.1765, time 5025.32ms 
iter 861: loss 3.2918, time 4935.83ms 
iter 862: loss 3.2092, time 4922.90ms 
iter 863: loss 3.1816, time 4978.54ms 
iter 864: loss 3.3365, time 5037.78ms 
iter 865: loss 3.0265, time 5029.75ms 
iter 866: loss 3.3085, time 5032.72ms 
iter 867: loss 3.3418, time 5032.98ms 
iter 868: loss 3.2980, time 5034.78ms 
iter 869: loss 3.2178, time 5020.43ms 
iter 870: loss 3.2872, time 4929.14ms 
iter 871: loss 3.1968, time 4918.35ms 
iter 872: loss 3.2683, time 5002.54ms 
iter 873: loss 3.1311, time 5030.83ms 
iter 874: loss 3.1593, time 5024.40ms 
iter 875: loss 3.2404, time 5028.53ms 
iter 876: loss 3.1338, time 5034.08ms 
iter 877: loss 3.0933, time 5037.46ms 
iter 878: loss 3.3999, time 5029.93ms 
iter 879: loss 3.2148, time 4960.98ms 
iter 880: loss 3.1247, time 4920.67ms 
iter 881: loss 3.4921, time 4974.49ms 
iter 882: loss 3.0821, time 5036.03ms 
iter 883: loss 3.3510, time 5029.53ms 
iter 884: loss 3.1167, time 5033.13ms 
iter 885: loss 3.2002, time 5034.85ms 
iter 886: loss 3.2461, time 5032.88ms 
iter 887: loss 3.3014, time 5038.67ms 
iter 888: loss 3.3061, time 5034.17ms 
iter 889: loss 3.0566, time 4985.51ms 
iter 890: loss 3.2459, time 4951.06ms 
iter 891: loss 3.0244, time 5022.29ms 
iter 892: loss 3.1482, time 5037.97ms 
iter 893: loss 3.1771, time 5042.32ms 
iter 894: loss 3.1447, time 5013.32ms 
iter 895: loss 3.1657, time 5024.34ms 
iter 896: loss 3.3292, time 5028.89ms 
iter 897: loss 3.1519, time 5028.46ms 
iter 898: loss 3.2701, time 4926.22ms 
iter 899: loss 3.2904, time 4918.53ms 
step 900: train loss 3.2089, val loss 3.1990
iter 900: loss 3.3295, time 19702.78ms 
iter 901: loss 3.3505, time 5019.29ms 
iter 902: loss 3.2091, time 5011.61ms 
iter 903: loss 3.2028, time 5022.72ms 
iter 904: loss 3.1661, time 4962.73ms 
iter 905: loss 3.1803, time 5004.13ms 
iter 906: loss 3.0573, time 5008.23ms 
iter 907: loss 3.0847, time 5024.77ms 
iter 908: loss 3.0335, time 5014.11ms 
iter 909: loss 3.0350, time 5018.38ms 
iter 910: loss 3.0110, time 5015.03ms 
iter 911: loss 3.3181, time 5014.07ms 
iter 912: loss 3.2612, time 5015.86ms 
iter 913: loss 3.1839, time 4929.66ms 
iter 914: loss 3.0138, time 4915.16ms 
iter 915: loss 3.0799, time 5003.22ms 
iter 916: loss 3.2087, time 4962.07ms 
iter 917: loss 3.0278, time 4950.18ms 
iter 918: loss 3.2313, time 5004.34ms 
iter 919: loss 3.1459, time 5016.65ms 
iter 920: loss 3.4039, time 5010.02ms 
iter 921: loss 3.2253, time 4988.29ms 
iter 922: loss 3.2735, time 4967.54ms 
iter 923: loss 3.1411, time 4924.31ms 
iter 924: loss 3.1260, time 5002.99ms 
iter 925: loss 3.5261, time 5030.21ms 
iter 926: loss 3.1792, time 5007.98ms 
iter 927: loss 3.3091, time 5026.05ms 
iter 928: loss 3.3941, time 5029.42ms 
iter 929: loss 3.0825, time 5025.53ms 
iter 930: loss 3.2399, time 5040.45ms 
iter 931: loss 3.2855, time 4950.81ms 
iter 932: loss 3.1687, time 4918.66ms 
iter 933: loss 3.0743, time 5005.71ms 
iter 934: loss 3.2706, time 5041.09ms 
iter 935: loss 3.4551, time 5015.45ms 
iter 936: loss 3.1704, time 5034.96ms 
iter 937: loss 3.1476, time 5037.18ms 
iter 938: loss 3.2511, time 5041.55ms 
iter 939: loss 3.3855, time 5045.42ms 
iter 940: loss 3.1979, time 4984.76ms 
iter 941: loss 3.1158, time 4938.38ms 
iter 942: loss 3.2979, time 5012.12ms 
iter 943: loss 3.2683, time 5031.81ms 
iter 944: loss 3.1512, time 5031.82ms 
iter 945: loss 3.1384, time 5022.79ms 
iter 946: loss 3.2353, time 5037.86ms 
iter 947: loss 3.1233, time 5030.66ms 
iter 948: loss 3.2963, time 5032.66ms 
iter 949: loss 3.3821, time 4976.15ms 
step 950: train loss 3.1683, val loss 3.1806
iter 950: loss 3.2123, time 19707.96ms 
iter 951: loss 3.2484, time 5009.61ms 
iter 952: loss 3.4527, time 5029.60ms 
iter 953: loss 3.3260, time 5015.13ms 
iter 954: loss 3.2143, time 4933.51ms 
iter 955: loss 3.3748, time 4932.40ms 
iter 956: loss 3.2530, time 4956.44ms 
iter 957: loss 3.1026, time 5030.69ms 
iter 958: loss 3.1427, time 5030.75ms 
iter 959: loss 3.1610, time 5028.73ms 
iter 960: loss 3.1311, time 5028.70ms 
iter 961: loss 3.2095, time 5024.75ms 
iter 962: loss 3.2897, time 5027.70ms 
iter 963: loss 3.1472, time 4996.25ms 
iter 964: loss 3.1525, time 4924.26ms 
iter 965: loss 3.1730, time 5017.91ms 
iter 966: loss 3.0809, time 4992.10ms 
iter 967: loss 3.2911, time 4931.42ms 
iter 968: loss 3.0741, time 4919.90ms 
iter 969: loss 3.1975, time 4999.60ms 
iter 970: loss 3.1773, time 5025.27ms 
iter 971: loss 3.2887, time 5028.50ms 
iter 972: loss 3.1666, time 4974.97ms 
iter 973: loss 3.3019, time 4987.92ms 
iter 974: loss 3.1390, time 5018.50ms 
iter 975: loss 3.1936, time 4979.91ms 
iter 976: loss 3.3280, time 5023.01ms 
iter 977: loss 2.9926, time 5022.63ms 
iter 978: loss 3.2949, time 4994.67ms 
iter 979: loss 3.2345, time 4971.17ms 
iter 980: loss 3.2695, time 5001.01ms 
iter 981: loss 3.1949, time 4978.78ms 
iter 982: loss 3.0882, time 4997.19ms 
iter 983: loss 3.1734, time 5019.74ms 
iter 984: loss 3.2709, time 5032.42ms 
iter 985: loss 3.1337, time 5033.79ms 
iter 986: loss 3.0731, time 5015.81ms 
iter 987: loss 3.0377, time 5003.86ms 
iter 988: loss 3.2523, time 4986.14ms 
iter 989: loss 3.3972, time 4918.76ms 
iter 990: loss 3.1018, time 4917.57ms 
iter 991: loss 3.1734, time 5007.07ms 
iter 992: loss 3.1116, time 5014.47ms 
iter 993: loss 3.1336, time 5039.64ms 
iter 994: loss 3.1412, time 5031.97ms 
iter 995: loss 3.1919, time 5033.64ms 
iter 996: loss 2.9800, time 5016.12ms 
iter 997: loss 3.1807, time 5032.07ms 
iter 998: loss 2.9727, time 4975.34ms 
iter 999: loss 3.2295, time 4995.39ms 
step 1000: train loss 3.1497, val loss 3.1525
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1000: loss 3.2294, time 20736.25ms 
iter 1001: loss 3.1614, time 5027.55ms 
iter 1002: loss 3.0874, time 5023.73ms 
iter 1003: loss 3.0153, time 4972.21ms 
iter 1004: loss 3.2598, time 4931.52ms 
iter 1005: loss 3.2811, time 5002.34ms 
iter 1006: loss 3.0862, time 4962.59ms 
iter 1007: loss 3.4379, time 4988.84ms 
iter 1008: loss 3.2030, time 5023.08ms 
iter 1009: loss 3.3987, time 5031.14ms 
iter 1010: loss 3.1472, time 5024.46ms 
iter 1011: loss 3.1330, time 5032.03ms 
iter 1012: loss 3.0362, time 4934.24ms 
iter 1013: loss 3.0757, time 4987.43ms 
iter 1014: loss 3.3611, time 5012.05ms 
iter 1015: loss 3.2673, time 5019.74ms 
iter 1016: loss 3.3543, time 5032.90ms 
iter 1017: loss 3.2654, time 5027.41ms 
iter 1018: loss 3.1719, time 5028.95ms 
iter 1019: loss 3.1616, time 5023.15ms 
iter 1020: loss 2.9261, time 5034.61ms 
iter 1021: loss 3.1680, time 4985.83ms 
iter 1022: loss 3.1186, time 5023.52ms 
iter 1023: loss 3.0338, time 5021.96ms 
iter 1024: loss 3.0378, time 4932.33ms 
iter 1025: loss 3.0299, time 5028.77ms 
iter 1026: loss 3.1653, time 5033.76ms 
iter 1027: loss 3.3571, time 5034.89ms 
iter 1028: loss 3.0343, time 5031.08ms 
iter 1029: loss 2.9965, time 4979.35ms 
iter 1030: loss 3.0265, time 4959.18ms 
iter 1031: loss 3.0628, time 5025.45ms 
iter 1032: loss 3.0282, time 4981.50ms 
iter 1033: loss 3.2275, time 4979.11ms 
iter 1034: loss 3.3626, time 5015.05ms 
iter 1035: loss 3.3161, time 5008.11ms 
iter 1036: loss 3.2864, time 5029.11ms 
iter 1037: loss 3.1198, time 5031.12ms 
iter 1038: loss 3.0280, time 5014.99ms 
iter 1039: loss 3.0060, time 4968.05ms 
iter 1040: loss 2.9786, time 5028.62ms 
iter 1041: loss 3.0851, time 5031.14ms 
iter 1042: loss 3.0290, time 5030.26ms 
iter 1043: loss 2.9784, time 5055.74ms 
iter 1044: loss 3.0990, time 5008.31ms 
iter 1045: loss 3.2020, time 4986.83ms 
iter 1046: loss 3.1388, time 4967.29ms 
iter 1047: loss 3.0443, time 4917.84ms 
iter 1048: loss 2.9635, time 5015.49ms 
iter 1049: loss 3.1654, time 5029.39ms 
step 1050: train loss 3.1399, val loss 3.1541
iter 1050: loss 3.0581, time 19721.59ms 
iter 1051: loss 3.1122, time 5028.18ms 
iter 1052: loss 3.1736, time 4976.16ms 
iter 1053: loss 3.0849, time 4958.17ms 
iter 1054: loss 3.2763, time 5033.96ms 
iter 1055: loss 3.1808, time 5035.43ms 
iter 1056: loss 3.0290, time 5022.39ms 
iter 1057: loss 3.0175, time 5031.86ms 
iter 1058: loss 3.1266, time 5011.19ms 
iter 1059: loss 3.1032, time 5016.90ms 
iter 1060: loss 3.1194, time 5038.33ms 
iter 1061: loss 3.0342, time 4921.42ms 
iter 1062: loss 3.0659, time 5007.93ms 
iter 1063: loss 3.1325, time 5034.88ms 
iter 1064: loss 2.9851, time 5020.33ms 
iter 1065: loss 2.9856, time 5023.89ms 
iter 1066: loss 3.0917, time 5033.46ms 
iter 1067: loss 3.2550, time 4989.77ms 
iter 1068: loss 3.1635, time 4985.32ms 
iter 1069: loss 3.1857, time 4977.51ms 
iter 1070: loss 3.0361, time 4996.09ms 
iter 1071: loss 3.0617, time 5028.25ms 
iter 1072: loss 3.2438, time 5034.22ms 
iter 1073: loss 3.1466, time 5064.89ms 
iter 1074: loss 3.0891, time 5024.04ms 
iter 1075: loss 3.0682, time 5042.03ms 
iter 1076: loss 3.0564, time 5029.49ms 
iter 1077: loss 3.2942, time 5032.35ms 
iter 1078: loss 3.1419, time 4976.65ms 
iter 1079: loss 3.0799, time 4979.50ms 
iter 1080: loss 3.2618, time 5029.95ms 
iter 1081: loss 3.1293, time 5030.35ms 
iter 1082: loss 3.1872, time 5029.53ms 
iter 1083: loss 3.2062, time 5025.61ms 
iter 1084: loss 3.0469, time 4945.24ms 
iter 1085: loss 3.2444, time 4911.79ms 
iter 1086: loss 3.0193, time 4912.55ms 
iter 1087: loss 3.1060, time 4913.35ms 
iter 1088: loss 3.2007, time 4914.90ms 
iter 1089: loss 3.1750, time 4912.68ms 
iter 1090: loss 3.1940, time 4912.32ms 
iter 1091: loss 3.0987, time 4913.04ms 
iter 1092: loss 2.9270, time 4912.16ms 
iter 1093: loss 3.0222, time 4912.64ms 
iter 1094: loss 3.2097, time 4996.80ms 
iter 1095: loss 3.3583, time 5025.21ms 
iter 1096: loss 3.0161, time 4970.20ms 
iter 1097: loss 3.0995, time 4989.26ms 
iter 1098: loss 3.1149, time 5025.13ms 
iter 1099: loss 3.2075, time 5000.96ms 
step 1100: train loss 3.1174, val loss 3.1455
iter 1100: loss 3.2364, time 19689.10ms 
iter 1101: loss 3.1077, time 5013.38ms 
iter 1102: loss 3.0301, time 4969.74ms 
iter 1103: loss 3.0609, time 5004.42ms 
iter 1104: loss 3.0395, time 5021.87ms 
iter 1105: loss 3.0558, time 5021.78ms 
iter 1106: loss 3.0500, time 5019.89ms 
iter 1107: loss 3.1722, time 5024.16ms 
iter 1108: loss 3.1801, time 5020.99ms 
iter 1109: loss 3.2220, time 5023.75ms 
iter 1110: loss 2.9573, time 4973.37ms 
iter 1111: loss 3.1392, time 4947.27ms 
iter 1112: loss 3.0939, time 5025.38ms 
iter 1113: loss 3.0250, time 5028.38ms 
iter 1114: loss 3.1183, time 5027.15ms 
iter 1115: loss 3.0329, time 5026.29ms 
iter 1116: loss 3.1197, time 5027.55ms 
iter 1117: loss 3.0244, time 5026.92ms 
iter 1118: loss 2.9943, time 5000.20ms 
iter 1119: loss 3.1456, time 4970.03ms 
iter 1120: loss 3.2093, time 5030.38ms 
iter 1121: loss 3.1065, time 5024.70ms 
iter 1122: loss 3.1052, time 5026.38ms 
iter 1123: loss 3.1258, time 5019.02ms 
iter 1124: loss 3.2824, time 5026.01ms 
iter 1125: loss 3.0361, time 5025.53ms 
iter 1126: loss 3.2697, time 4971.96ms 
iter 1127: loss 3.1792, time 4913.91ms 
iter 1128: loss 3.0486, time 4988.71ms 
iter 1129: loss 2.9960, time 5017.50ms 
iter 1130: loss 3.2168, time 5021.13ms 
iter 1131: loss 3.0545, time 5026.28ms 
iter 1132: loss 2.9737, time 5016.89ms 
iter 1133: loss 2.9263, time 5026.43ms 
iter 1134: loss 3.0828, time 5033.40ms 
iter 1135: loss 3.3295, time 4985.75ms 
iter 1136: loss 3.0542, time 4978.49ms 
iter 1137: loss 3.1711, time 5025.34ms 
iter 1138: loss 3.3245, time 5026.25ms 
iter 1139: loss 3.0636, time 5032.78ms 
iter 1140: loss 2.9927, time 5026.90ms 
iter 1141: loss 3.0758, time 5024.37ms 
iter 1142: loss 3.0372, time 5027.59ms 
iter 1143: loss 3.3246, time 5018.87ms 
iter 1144: loss 3.0795, time 4913.83ms 
iter 1145: loss 3.0848, time 4920.29ms 
iter 1146: loss 3.1056, time 5022.11ms 
iter 1147: loss 2.9252, time 5029.73ms 
iter 1148: loss 3.0793, time 5030.12ms 
iter 1149: loss 3.2355, time 5028.00ms 
step 1150: train loss 3.0934, val loss 3.1234
iter 1150: loss 3.1442, time 19617.08ms 
iter 1151: loss 3.1396, time 4928.81ms 
iter 1152: loss 3.0823, time 5022.17ms 
iter 1153: loss 3.0830, time 4989.87ms 
iter 1154: loss 2.9936, time 5018.28ms 
iter 1155: loss 2.9351, time 5025.19ms 
iter 1156: loss 2.9777, time 5026.63ms 
iter 1157: loss 2.9384, time 5033.31ms 
iter 1158: loss 3.0666, time 5028.64ms 
iter 1159: loss 3.1852, time 4987.96ms 
iter 1160: loss 2.9173, time 4993.00ms 
iter 1161: loss 3.0389, time 5010.19ms 
iter 1162: loss 3.0822, time 5022.52ms 
iter 1163: loss 3.1763, time 5022.09ms 
iter 1164: loss 2.9831, time 5015.78ms 
iter 1165: loss 3.2061, time 5024.92ms 
iter 1166: loss 3.2537, time 5020.36ms 
iter 1167: loss 3.1062, time 4994.68ms 
iter 1168: loss 3.3081, time 4964.18ms 
iter 1169: loss 2.9415, time 5022.44ms 
iter 1170: loss 3.1552, time 5023.60ms 
iter 1171: loss 3.0765, time 5017.15ms 
iter 1172: loss 3.0144, time 5021.26ms 
iter 1173: loss 3.1746, time 5023.61ms 
iter 1174: loss 3.0016, time 5020.44ms 
iter 1175: loss 3.0969, time 5023.16ms 
iter 1176: loss 3.0391, time 4944.54ms 
iter 1177: loss 3.0817, time 4938.29ms 
iter 1178: loss 3.1982, time 4989.17ms 
iter 1179: loss 3.0876, time 5020.98ms 
iter 1180: loss 2.9468, time 5021.32ms 
iter 1181: loss 3.0382, time 5025.86ms 
iter 1182: loss 3.0588, time 5019.07ms 
iter 1183: loss 3.0251, time 4997.26ms 
iter 1184: loss 3.0225, time 5017.72ms 
iter 1185: loss 3.0752, time 4929.19ms 
iter 1186: loss 3.1533, time 4915.10ms 
iter 1187: loss 2.9072, time 4973.64ms 
iter 1188: loss 3.1669, time 5025.84ms 
iter 1189: loss 3.0351, time 5024.52ms 
iter 1190: loss 3.0129, time 5017.47ms 
iter 1191: loss 3.1442, time 5019.95ms 
iter 1192: loss 3.0314, time 5022.80ms 
iter 1193: loss 3.0093, time 5020.71ms 
iter 1194: loss 3.1005, time 4972.13ms 
iter 1195: loss 3.0607, time 4950.87ms 
iter 1196: loss 3.2075, time 5025.81ms 
iter 1197: loss 3.0954, time 5023.26ms 
iter 1198: loss 3.0172, time 5016.47ms 
iter 1199: loss 3.0833, time 5016.51ms 
step 1200: train loss 3.0796, val loss 3.1052
iter 1200: loss 3.0147, time 19611.74ms 
iter 1201: loss 2.9176, time 4987.21ms 
iter 1202: loss 2.8880, time 5028.25ms 
iter 1203: loss 3.0535, time 5026.56ms 
iter 1204: loss 2.9689, time 5027.76ms 
iter 1205: loss 2.8843, time 5032.43ms 
iter 1206: loss 2.9070, time 5037.34ms 
iter 1207: loss 3.0255, time 5015.82ms 
iter 1208: loss 2.9455, time 4983.16ms 
iter 1209: loss 3.1341, time 4927.81ms 
iter 1210: loss 3.0892, time 5037.85ms 
iter 1211: loss 2.9362, time 4995.80ms 
iter 1212: loss 3.2353, time 5027.34ms 
iter 1213: loss 3.1397, time 5023.00ms 
iter 1214: loss 2.9824, time 5021.48ms 
iter 1215: loss 3.0778, time 5046.41ms 
iter 1216: loss 2.9281, time 4990.56ms 
iter 1217: loss 3.0022, time 4918.86ms 
iter 1218: loss 2.9787, time 4986.99ms 
iter 1219: loss 3.1845, time 5028.13ms 
iter 1220: loss 2.9177, time 5026.13ms 
iter 1221: loss 3.1649, time 5032.41ms 
iter 1222: loss 3.0352, time 5029.31ms 
iter 1223: loss 3.1125, time 5008.62ms 
iter 1224: loss 3.0395, time 5005.14ms 
iter 1225: loss 3.0608, time 4925.50ms 
iter 1226: loss 3.1145, time 4929.46ms 
iter 1227: loss 3.0515, time 5033.05ms 
iter 1228: loss 3.0785, time 5022.51ms 
iter 1229: loss 3.3350, time 5010.99ms 
iter 1230: loss 3.1298, time 5030.46ms 
iter 1231: loss 2.9617, time 5031.09ms 
iter 1232: loss 2.8239, time 4999.66ms 
iter 1233: loss 3.0823, time 5014.67ms 
iter 1234: loss 3.0932, time 4919.75ms 
iter 1235: loss 2.9673, time 4979.23ms 
iter 1236: loss 2.9696, time 5036.50ms 
iter 1237: loss 2.9843, time 5038.24ms 
iter 1238: loss 2.9660, time 5028.65ms 
iter 1239: loss 3.0549, time 5022.96ms 
iter 1240: loss 3.1464, time 5025.80ms 
iter 1241: loss 3.1387, time 5023.87ms 
iter 1242: loss 3.0674, time 5001.52ms 
iter 1243: loss 3.1475, time 4921.94ms 
iter 1244: loss 3.0881, time 4993.20ms 
iter 1245: loss 3.0328, time 5026.81ms 
iter 1246: loss 3.0656, time 5022.29ms 
iter 1247: loss 3.1951, time 5029.90ms 
iter 1248: loss 3.2106, time 5029.86ms 
iter 1249: loss 3.1658, time 5012.79ms 
step 1250: train loss 3.0566, val loss 3.0822
iter 1250: loss 2.9268, time 19723.56ms 
iter 1251: loss 2.8770, time 5021.53ms 
iter 1252: loss 3.0885, time 5021.32ms 
iter 1253: loss 2.8829, time 5025.91ms 
iter 1254: loss 3.0386, time 5010.04ms 
iter 1255: loss 2.9376, time 5014.42ms 
iter 1256: loss 2.9491, time 5027.46ms 
iter 1257: loss 2.9234, time 4978.46ms 
iter 1258: loss 2.9817, time 5001.89ms 
iter 1259: loss 3.2395, time 5007.08ms 
iter 1260: loss 3.1060, time 5019.10ms 
iter 1261: loss 2.9777, time 5013.27ms 
iter 1262: loss 3.1784, time 5015.07ms 
iter 1263: loss 3.0946, time 5029.27ms 
iter 1264: loss 3.0172, time 5019.19ms 
iter 1265: loss 3.0846, time 5012.10ms 
iter 1266: loss 2.9528, time 4995.09ms 
iter 1267: loss 3.0505, time 5029.67ms 
iter 1268: loss 2.9510, time 5021.13ms 
iter 1269: loss 2.9316, time 5033.84ms 
iter 1270: loss 2.9698, time 5021.71ms 
iter 1271: loss 2.9818, time 5025.63ms 
iter 1272: loss 2.9316, time 5025.92ms 
iter 1273: loss 2.9254, time 4978.75ms 
iter 1274: loss 2.9714, time 4949.69ms 
iter 1275: loss 3.1134, time 4999.66ms 
iter 1276: loss 3.1323, time 5031.25ms 
iter 1277: loss 3.0398, time 4968.60ms 
iter 1278: loss 2.9800, time 4980.45ms 
iter 1279: loss 3.0264, time 5017.79ms 
iter 1280: loss 2.9166, time 4972.34ms 
iter 1281: loss 2.8848, time 4985.76ms 
iter 1282: loss 2.9178, time 4924.81ms 
iter 1283: loss 2.9745, time 4981.34ms 
iter 1284: loss 3.2156, time 4993.54ms 
iter 1285: loss 3.1070, time 5023.70ms 
iter 1286: loss 2.9935, time 5015.53ms 
iter 1287: loss 3.0353, time 5041.93ms 
iter 1288: loss 3.0101, time 5027.42ms 
iter 1289: loss 2.8935, time 4981.93ms 
iter 1290: loss 3.1792, time 4927.96ms 
iter 1291: loss 3.2439, time 4927.75ms 
iter 1292: loss 3.0472, time 4959.04ms 
iter 1293: loss 3.1605, time 5031.49ms 
iter 1294: loss 3.0757, time 5018.84ms 
iter 1295: loss 3.1165, time 4996.21ms 
iter 1296: loss 2.8612, time 5017.79ms 
iter 1297: loss 3.0127, time 5035.10ms 
iter 1298: loss 3.1016, time 4975.99ms 
iter 1299: loss 3.1604, time 5023.08ms 
step 1300: train loss 3.0275, val loss 3.0884
iter 1300: loss 3.1905, time 19732.70ms 
iter 1301: loss 3.4110, time 5018.69ms 
iter 1302: loss 2.8666, time 4999.54ms 
iter 1303: loss 3.0498, time 4919.44ms 
iter 1304: loss 3.0674, time 4984.48ms 
iter 1305: loss 3.1835, time 5026.94ms 
iter 1306: loss 3.0570, time 5047.16ms 
iter 1307: loss 2.9693, time 5024.14ms 
iter 1308: loss 3.1425, time 5018.20ms 
iter 1309: loss 3.1867, time 5025.01ms 
iter 1310: loss 2.8672, time 5021.75ms 
iter 1311: loss 3.1427, time 4973.04ms 
iter 1312: loss 2.9697, time 5006.69ms 
iter 1313: loss 3.0717, time 4952.11ms 
iter 1314: loss 2.8660, time 4945.86ms 
iter 1315: loss 3.0634, time 4919.44ms 
iter 1316: loss 2.8546, time 5023.74ms 
iter 1317: loss 2.8770, time 5021.53ms 
iter 1318: loss 3.0190, time 5001.64ms 
iter 1319: loss 2.8803, time 4951.19ms 
iter 1320: loss 2.9010, time 4924.61ms 
iter 1321: loss 3.1035, time 4934.55ms 
iter 1322: loss 3.1392, time 4941.44ms 
iter 1323: loss 2.9922, time 4984.47ms 
iter 1324: loss 3.2151, time 5046.13ms 
iter 1325: loss 2.9942, time 5031.92ms 
iter 1326: loss 2.9812, time 5016.85ms 
iter 1327: loss 3.0663, time 5065.69ms 
iter 1328: loss 3.1271, time 4990.01ms 
iter 1329: loss 2.9582, time 4929.32ms 
iter 1330: loss 3.0570, time 4995.30ms 
iter 1331: loss 3.0779, time 5033.95ms 
iter 1332: loss 2.9739, time 4978.09ms 
iter 1333: loss 3.1498, time 4986.95ms 
iter 1334: loss 3.1645, time 5032.27ms 
iter 1335: loss 3.0465, time 4988.33ms 
iter 1336: loss 3.0050, time 4957.57ms 
iter 1337: loss 2.9901, time 4993.07ms 
iter 1338: loss 3.1405, time 4978.21ms 
iter 1339: loss 2.9482, time 4960.18ms 
iter 1340: loss 3.1404, time 5027.62ms 
iter 1341: loss 2.9484, time 5024.53ms 
iter 1342: loss 3.1395, time 5029.63ms 
iter 1343: loss 3.0089, time 5032.13ms 
iter 1344: loss 3.0930, time 5011.07ms 
iter 1345: loss 3.0480, time 5020.78ms 
iter 1346: loss 3.0421, time 4984.57ms 
iter 1347: loss 2.8928, time 4957.08ms 
iter 1348: loss 3.1130, time 5060.34ms 
iter 1349: loss 2.9548, time 5044.84ms 
step 1350: train loss 3.0192, val loss 3.0603
iter 1350: loss 3.1428, time 19734.14ms 
iter 1351: loss 3.0292, time 4986.27ms 
iter 1352: loss 2.9750, time 4920.12ms 
iter 1353: loss 3.0845, time 4947.69ms 
iter 1354: loss 3.0636, time 4997.45ms 
iter 1355: loss 3.0995, time 4919.75ms 
iter 1356: loss 3.2118, time 4999.73ms 
iter 1357: loss 3.0550, time 5022.27ms 
iter 1358: loss 2.8004, time 5029.78ms 
iter 1359: loss 2.9922, time 5032.20ms 
iter 1360: loss 2.9700, time 5022.09ms 
iter 1361: loss 3.0599, time 5007.86ms 
iter 1362: loss 2.9400, time 5032.53ms 
iter 1363: loss 2.9746, time 4974.98ms 
iter 1364: loss 2.9238, time 4959.56ms 
iter 1365: loss 2.9671, time 5020.77ms 
iter 1366: loss 3.0818, time 5028.76ms 
iter 1367: loss 2.9623, time 5028.38ms 
iter 1368: loss 2.9525, time 5034.39ms 
iter 1369: loss 3.0422, time 5026.47ms 
iter 1370: loss 2.9729, time 5019.73ms 
iter 1371: loss 3.0181, time 5034.67ms 
iter 1372: loss 3.1984, time 4983.22ms 
iter 1373: loss 3.3086, time 4969.95ms 
iter 1374: loss 3.0373, time 5018.35ms 
iter 1375: loss 2.9167, time 5017.62ms 
iter 1376: loss 2.9785, time 5032.30ms 
iter 1377: loss 3.1314, time 5030.12ms 
iter 1378: loss 3.0347, time 5026.69ms 
iter 1379: loss 3.0288, time 5037.46ms 
iter 1380: loss 2.8942, time 4981.46ms 
iter 1381: loss 2.9122, time 4990.46ms 
iter 1382: loss 3.1167, time 5018.97ms 
iter 1383: loss 3.1572, time 4977.93ms 
iter 1384: loss 3.1375, time 4925.87ms 
iter 1385: loss 3.0480, time 5025.52ms 
iter 1386: loss 3.0974, time 5019.08ms 
iter 1387: loss 2.9278, time 4931.33ms 
iter 1388: loss 3.1760, time 4954.75ms 
iter 1389: loss 3.1157, time 4968.27ms 
iter 1390: loss 3.1184, time 4986.49ms 
iter 1391: loss 3.0062, time 4971.92ms 
iter 1392: loss 2.8503, time 4963.92ms 
iter 1393: loss 2.8620, time 4977.40ms 
iter 1394: loss 3.0204, time 4953.69ms 
iter 1395: loss 2.9432, time 4959.66ms 
iter 1396: loss 3.1770, time 4969.77ms 
iter 1397: loss 3.1617, time 4968.72ms 
iter 1398: loss 2.9122, time 4992.16ms 
iter 1399: loss 3.1422, time 5036.17ms 
step 1400: train loss 3.0098, val loss 3.0555
iter 1400: loss 2.8849, time 19869.38ms 
iter 1401: loss 3.1912, time 5041.09ms 
iter 1402: loss 3.0184, time 5016.75ms 
iter 1403: loss 3.1542, time 5027.29ms 
iter 1404: loss 3.2513, time 5030.24ms 
iter 1405: loss 2.8720, time 5033.16ms 
iter 1406: loss 3.0417, time 4974.35ms 
iter 1407: loss 2.9748, time 4981.52ms 
iter 1408: loss 3.2609, time 5013.13ms 
iter 1409: loss 3.1146, time 5020.28ms 
iter 1410: loss 2.8972, time 5031.53ms 
iter 1411: loss 3.0099, time 5025.25ms 
iter 1412: loss 2.8604, time 5017.32ms 
iter 1413: loss 3.0945, time 5025.67ms 
iter 1414: loss 2.9427, time 4961.41ms 
iter 1415: loss 3.0710, time 4944.08ms 
iter 1416: loss 3.1489, time 5030.41ms 
iter 1417: loss 3.0489, time 5027.91ms 
iter 1418: loss 2.9443, time 5021.24ms 
iter 1419: loss 2.8158, time 5022.68ms 
iter 1420: loss 2.8631, time 5026.93ms 
iter 1421: loss 3.0509, time 5025.68ms 
iter 1422: loss 3.1109, time 5027.02ms 
iter 1423: loss 3.2300, time 4976.02ms 
iter 1424: loss 2.8872, time 4994.61ms 
iter 1425: loss 3.0333, time 5026.89ms 
iter 1426: loss 2.9635, time 5012.39ms 
iter 1427: loss 3.0540, time 5025.20ms 
iter 1428: loss 2.9745, time 5024.01ms 
iter 1429: loss 2.8570, time 5027.01ms 
iter 1430: loss 2.9035, time 5024.83ms 
iter 1431: loss 2.9190, time 4974.17ms 
iter 1432: loss 2.8567, time 4942.45ms 
iter 1433: loss 3.1833, time 5014.98ms 
iter 1434: loss 2.9195, time 5026.83ms 
iter 1435: loss 3.0492, time 5031.48ms 
iter 1436: loss 2.9698, time 5029.10ms 
iter 1437: loss 3.1052, time 5026.89ms 
iter 1438: loss 3.1083, time 5009.27ms 
iter 1439: loss 2.9637, time 5027.33ms 
iter 1440: loss 3.0142, time 4977.17ms 
iter 1441: loss 2.9805, time 4983.71ms 
iter 1442: loss 2.9422, time 5026.63ms 
iter 1443: loss 3.1446, time 5023.66ms 
iter 1444: loss 3.1450, time 4993.79ms 
iter 1445: loss 3.0399, time 4915.34ms 
iter 1446: loss 2.9360, time 5015.57ms 
iter 1447: loss 2.9562, time 5014.18ms 
iter 1448: loss 2.9075, time 4966.69ms 
iter 1449: loss 2.8820, time 4914.90ms 
step 1450: train loss 2.9981, val loss 3.0458
iter 1450: loss 3.0538, time 19681.27ms 
iter 1451: loss 3.0275, time 5017.58ms 
iter 1452: loss 2.9804, time 5029.24ms 
iter 1453: loss 2.8616, time 5028.74ms 
iter 1454: loss 2.9169, time 4976.62ms 
iter 1455: loss 2.9252, time 4980.33ms 
iter 1456: loss 3.1043, time 5030.53ms 
iter 1457: loss 3.0684, time 5025.48ms 
iter 1458: loss 3.1267, time 5024.38ms 
iter 1459: loss 2.8736, time 5022.92ms 
iter 1460: loss 3.0124, time 5027.41ms 
iter 1461: loss 2.9343, time 5026.06ms 
iter 1462: loss 2.8847, time 4972.94ms 
iter 1463: loss 3.0368, time 4916.29ms 
iter 1464: loss 2.9713, time 4985.28ms 
iter 1465: loss 3.0340, time 5025.31ms 
iter 1466: loss 3.1802, time 5029.26ms 
iter 1467: loss 2.9263, time 5026.89ms 
iter 1468: loss 3.0674, time 5000.73ms 
iter 1469: loss 3.0744, time 4916.27ms 
iter 1470: loss 2.9783, time 4995.62ms 
iter 1471: loss 3.1116, time 5019.64ms 
iter 1472: loss 3.0722, time 5021.68ms 
iter 1473: loss 3.0937, time 5023.11ms 
iter 1474: loss 3.1652, time 5024.95ms 
iter 1475: loss 2.9349, time 5023.72ms 
iter 1476: loss 3.0384, time 5025.35ms 
iter 1477: loss 3.0255, time 4974.11ms 
iter 1478: loss 3.1011, time 4945.25ms 
iter 1479: loss 2.8862, time 5039.18ms 
iter 1480: loss 2.8862, time 5039.46ms 
iter 1481: loss 3.0365, time 5032.91ms 
iter 1482: loss 2.8320, time 5020.83ms 
iter 1483: loss 3.0877, time 5029.94ms 
iter 1484: loss 2.8900, time 5025.60ms 
iter 1485: loss 2.7589, time 4960.81ms 
iter 1486: loss 3.2587, time 4917.05ms 
iter 1487: loss 3.0467, time 4950.92ms 
iter 1488: loss 2.9893, time 5030.13ms 
iter 1489: loss 3.1670, time 5035.82ms 
iter 1490: loss 2.9153, time 5023.38ms 
iter 1491: loss 2.9927, time 5010.25ms 
iter 1492: loss 2.8466, time 5028.78ms 
iter 1493: loss 3.1015, time 5033.20ms 
iter 1494: loss 2.8868, time 4949.84ms 
iter 1495: loss 3.1165, time 4922.45ms 
iter 1496: loss 3.1636, time 4972.37ms 
iter 1497: loss 3.0149, time 4952.27ms 
iter 1498: loss 3.0227, time 5012.92ms 
iter 1499: loss 2.8265, time 4919.87ms 
step 1500: train loss 2.9775, val loss 3.0358
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 1500: loss 2.9321, time 20797.53ms 
iter 1501: loss 3.1389, time 4979.37ms 
iter 1502: loss 3.0706, time 4958.78ms 
iter 1503: loss 2.9535, time 5035.23ms 
iter 1504: loss 2.8864, time 5007.65ms 
iter 1505: loss 2.9923, time 5023.73ms 
iter 1506: loss 2.9716, time 5007.93ms 
iter 1507: loss 2.9653, time 5026.67ms 
iter 1508: loss 2.9248, time 5031.44ms 
iter 1509: loss 2.8233, time 5010.05ms 
iter 1510: loss 2.8693, time 4915.58ms 
iter 1511: loss 3.1999, time 4978.29ms 
iter 1512: loss 3.0334, time 5023.76ms 
iter 1513: loss 2.8526, time 5014.35ms 
iter 1514: loss 2.8943, time 5027.09ms 
iter 1515: loss 2.8754, time 5013.84ms 
iter 1516: loss 3.0086, time 5017.70ms 
iter 1517: loss 2.8998, time 5022.76ms 
iter 1518: loss 2.9589, time 4973.00ms 
iter 1519: loss 3.0326, time 4931.13ms 
iter 1520: loss 2.8934, time 4999.17ms 
iter 1521: loss 2.8947, time 5019.10ms 
iter 1522: loss 2.8789, time 5005.96ms 
iter 1523: loss 3.0236, time 4946.30ms 
iter 1524: loss 3.1420, time 4911.40ms 
iter 1525: loss 3.0701, time 4928.46ms 
iter 1526: loss 3.0833, time 4993.15ms 
iter 1527: loss 3.0672, time 4924.92ms 
iter 1528: loss 3.0771, time 4976.51ms 
iter 1529: loss 2.7696, time 5026.67ms 
iter 1530: loss 2.8430, time 5029.57ms 
iter 1531: loss 2.8497, time 5031.12ms 
iter 1532: loss 3.0243, time 5032.29ms 
iter 1533: loss 2.8267, time 5022.42ms 
iter 1534: loss 2.9172, time 5034.71ms 
iter 1535: loss 3.2835, time 4981.28ms 
iter 1536: loss 3.0564, time 4915.77ms 
iter 1537: loss 2.8666, time 4990.12ms 
iter 1538: loss 2.9573, time 5019.87ms 
iter 1539: loss 3.0935, time 5031.20ms 
iter 1540: loss 2.9617, time 5023.88ms 
iter 1541: loss 2.7436, time 5032.24ms 
iter 1542: loss 2.8486, time 5033.67ms 
iter 1543: loss 2.8702, time 4963.27ms 
iter 1544: loss 3.1443, time 4913.76ms 
iter 1545: loss 3.0629, time 4914.65ms 
iter 1546: loss 3.1103, time 4974.33ms 
iter 1547: loss 2.8263, time 5025.56ms 
iter 1548: loss 2.9001, time 5038.00ms 
iter 1549: loss 2.9420, time 5008.39ms 
step 1550: train loss 2.9870, val loss 3.0281
iter 1550: loss 2.9646, time 19637.54ms 
iter 1551: loss 2.9382, time 4922.04ms 
iter 1552: loss 2.8642, time 4971.67ms 
iter 1553: loss 3.1075, time 4954.27ms 
iter 1554: loss 2.8734, time 4973.68ms 
iter 1555: loss 3.0241, time 4935.94ms 
iter 1556: loss 2.9432, time 4951.44ms 
iter 1557: loss 2.8630, time 5030.55ms 
iter 1558: loss 2.8966, time 5036.34ms 
iter 1559: loss 2.8842, time 4969.61ms 
iter 1560: loss 2.7955, time 4917.11ms 
iter 1561: loss 2.9770, time 5006.06ms 
iter 1562: loss 3.0345, time 5033.48ms 
iter 1563: loss 2.7290, time 5032.51ms 
iter 1564: loss 3.0626, time 5027.45ms 
iter 1565: loss 2.9280, time 5028.33ms 
iter 1566: loss 3.0567, time 5027.61ms 
iter 1567: loss 3.0205, time 5028.09ms 
iter 1568: loss 3.0313, time 5016.86ms 
iter 1569: loss 2.9277, time 4980.94ms 
iter 1570: loss 3.0920, time 5006.19ms 
iter 1571: loss 2.8888, time 5035.88ms 
iter 1572: loss 3.1716, time 5036.58ms 
iter 1573: loss 2.9026, time 5000.76ms 
iter 1574: loss 3.0591, time 5006.86ms 
iter 1575: loss 3.1905, time 5019.54ms 
iter 1576: loss 3.0401, time 5022.90ms 
iter 1577: loss 3.0807, time 4982.46ms 
iter 1578: loss 2.8636, time 4953.48ms 
iter 1579: loss 2.8569, time 5004.22ms 
iter 1580: loss 2.8901, time 5022.15ms 
iter 1581: loss 2.9339, time 5032.65ms 
iter 1582: loss 2.9358, time 5029.89ms 
iter 1583: loss 2.7938, time 4918.56ms 
iter 1584: loss 3.0399, time 4930.17ms 
iter 1585: loss 2.9218, time 4964.25ms 
iter 1586: loss 2.6560, time 4942.29ms 
iter 1587: loss 3.0290, time 4944.49ms 
iter 1588: loss 2.9487, time 5007.54ms 
iter 1589: loss 2.9688, time 5032.76ms 
iter 1590: loss 3.0311, time 4969.78ms 
iter 1591: loss 3.0329, time 4975.02ms 
iter 1592: loss 2.9327, time 4976.18ms 
iter 1593: loss 3.0911, time 5008.81ms 
iter 1594: loss 2.9133, time 4967.26ms 
iter 1595: loss 2.8425, time 4919.40ms 
iter 1596: loss 2.8355, time 4969.44ms 
iter 1597: loss 2.9463, time 5033.51ms 
iter 1598: loss 3.0273, time 5033.21ms 
iter 1599: loss 2.9997, time 5030.48ms 
step 1600: train loss 2.9653, val loss 3.0235
iter 1600: loss 2.8551, time 19693.68ms 
iter 1601: loss 2.8847, time 4959.78ms 
iter 1602: loss 3.0077, time 4958.91ms 
iter 1603: loss 3.0073, time 5029.07ms 
iter 1604: loss 2.8927, time 5017.41ms 
iter 1605: loss 3.1381, time 5010.89ms 
iter 1606: loss 2.9050, time 5019.09ms 
iter 1607: loss 2.8012, time 5019.30ms 
iter 1608: loss 2.8078, time 5028.05ms 
iter 1609: loss 2.7486, time 5030.11ms 
iter 1610: loss 2.8954, time 4982.00ms 
iter 1611: loss 3.2683, time 5002.73ms 
iter 1612: loss 2.8907, time 5032.43ms 
iter 1613: loss 3.0822, time 4951.26ms 
iter 1614: loss 2.9942, time 4990.49ms 
iter 1615: loss 2.7988, time 5022.24ms 
iter 1616: loss 2.8085, time 5028.62ms 
iter 1617: loss 2.9630, time 4989.14ms 
iter 1618: loss 2.8882, time 5029.96ms 
iter 1619: loss 2.8623, time 5021.39ms 
iter 1620: loss 2.9614, time 5028.68ms 
iter 1621: loss 3.0064, time 4977.66ms 
iter 1622: loss 3.0358, time 4949.22ms 
iter 1623: loss 3.1321, time 4937.48ms 
iter 1624: loss 2.9039, time 5027.18ms 
iter 1625: loss 2.9369, time 5000.67ms 
iter 1626: loss 3.0566, time 5021.55ms 
iter 1627: loss 2.9085, time 5025.85ms 
iter 1628: loss 2.9137, time 5025.68ms 
iter 1629: loss 3.0566, time 5023.05ms 
iter 1630: loss 2.9000, time 4998.87ms 
iter 1631: loss 2.8892, time 4915.25ms 
iter 1632: loss 3.0909, time 4995.94ms 
iter 1633: loss 2.9099, time 4990.08ms 
iter 1634: loss 3.0176, time 4925.85ms 
iter 1635: loss 2.8941, time 4926.57ms 
iter 1636: loss 2.7719, time 4926.42ms 
iter 1637: loss 2.9752, time 4937.28ms 
iter 1638: loss 2.9556, time 5032.03ms 
iter 1639: loss 3.0034, time 4964.68ms 
iter 1640: loss 2.9839, time 4948.28ms 
iter 1641: loss 2.9303, time 4974.23ms 
iter 1642: loss 2.9190, time 4944.29ms 
iter 1643: loss 2.8531, time 4962.23ms 
iter 1644: loss 3.0691, time 5036.19ms 
iter 1645: loss 2.9025, time 5033.39ms 
iter 1646: loss 2.8314, time 5016.01ms 
iter 1647: loss 3.0042, time 5030.58ms 
iter 1648: loss 3.0384, time 5033.61ms 
iter 1649: loss 2.9637, time 4977.85ms 
step 1650: train loss 2.9422, val loss 3.0207
iter 1650: loss 3.0013, time 19693.40ms 
iter 1651: loss 3.0230, time 5000.34ms 
iter 1652: loss 2.8199, time 5013.45ms 
iter 1653: loss 2.9100, time 5029.49ms 
iter 1654: loss 3.1159, time 4979.02ms 
iter 1655: loss 2.7396, time 4917.16ms 
iter 1656: loss 2.9683, time 4991.70ms 
iter 1657: loss 2.9676, time 5022.24ms 
iter 1658: loss 2.8408, time 5023.88ms 
iter 1659: loss 3.0717, time 4982.84ms 
iter 1660: loss 2.8185, time 5004.65ms 
iter 1661: loss 2.9652, time 5001.46ms 
iter 1662: loss 2.7565, time 5032.36ms 
iter 1663: loss 2.7712, time 4966.13ms 
iter 1664: loss 2.9134, time 4922.86ms 
iter 1665: loss 2.8866, time 5028.13ms 
iter 1666: loss 2.9782, time 5029.13ms 
iter 1667: loss 2.8997, time 5021.24ms 
iter 1668: loss 2.9528, time 5034.55ms 
iter 1669: loss 2.8953, time 5032.61ms 
iter 1670: loss 2.9184, time 5024.48ms 
iter 1671: loss 2.8214, time 5028.96ms 
iter 1672: loss 3.1406, time 4978.53ms 
iter 1673: loss 2.9048, time 4988.23ms 
iter 1674: loss 2.8920, time 5028.87ms 
iter 1675: loss 2.8412, time 5032.70ms 
iter 1676: loss 2.8877, time 5024.18ms 
iter 1677: loss 2.9403, time 5009.76ms 
iter 1678: loss 3.0816, time 5013.49ms 
iter 1679: loss 3.0604, time 5024.08ms 
iter 1680: loss 3.0480, time 4978.51ms 
iter 1681: loss 2.8589, time 4917.42ms 
iter 1682: loss 3.0957, time 4913.55ms 
iter 1683: loss 3.0932, time 4913.68ms 
iter 1684: loss 3.2431, time 4914.33ms 
iter 1685: loss 3.0630, time 4914.22ms 
iter 1686: loss 3.1259, time 4920.27ms 
iter 1687: loss 2.9825, time 4923.68ms 
iter 1688: loss 3.2743, time 4914.33ms 
iter 1689: loss 2.9373, time 4914.35ms 
iter 1690: loss 2.8999, time 4914.55ms 
iter 1691: loss 2.9013, time 4928.87ms 
iter 1692: loss 2.9422, time 5023.46ms 
iter 1693: loss 3.1356, time 4940.31ms 
iter 1694: loss 2.9776, time 4913.97ms 
iter 1695: loss 2.7513, time 4914.40ms 
iter 1696: loss 2.9002, time 4914.88ms 
iter 1697: loss 2.9344, time 4914.82ms 
iter 1698: loss 2.5938, time 4914.60ms 
iter 1699: loss 3.1575, time 4915.08ms 
step 1700: train loss 2.9282, val loss 3.0097
iter 1700: loss 2.8010, time 19718.17ms 
iter 1701: loss 2.8063, time 5029.11ms 
iter 1702: loss 2.8592, time 5024.48ms 
iter 1703: loss 3.3740, time 5032.00ms 
iter 1704: loss 3.1167, time 4940.81ms 
iter 1705: loss 2.9114, time 4921.64ms 
iter 1706: loss 2.8544, time 4979.87ms 
iter 1707: loss 2.9879, time 4999.21ms 
iter 1708: loss 2.9232, time 5032.03ms 
iter 1709: loss 2.8599, time 4966.61ms 
iter 1710: loss 2.8046, time 5012.88ms 
iter 1711: loss 3.1556, time 5030.24ms 
iter 1712: loss 3.1462, time 5036.40ms 
iter 1713: loss 2.6900, time 4919.77ms 
iter 1714: loss 2.7407, time 4930.69ms 
iter 1715: loss 2.9709, time 5007.53ms 
iter 1716: loss 3.0047, time 4972.10ms 
iter 1717: loss 2.8866, time 4992.19ms 
iter 1718: loss 2.9686, time 4960.06ms 
iter 1719: loss 2.8591, time 5029.54ms 
iter 1720: loss 2.7837, time 5034.52ms 
iter 1721: loss 3.0600, time 5023.90ms 
iter 1722: loss 2.9228, time 4928.69ms 
iter 1723: loss 3.0960, time 4927.75ms 
iter 1724: loss 2.9521, time 5020.50ms 
iter 1725: loss 2.8127, time 5033.86ms 
iter 1726: loss 3.0382, time 5035.85ms 
iter 1727: loss 2.8947, time 5051.08ms 
iter 1728: loss 2.8049, time 5011.42ms 
iter 1729: loss 3.0049, time 5006.72ms 
iter 1730: loss 2.7512, time 5023.17ms 
iter 1731: loss 2.7819, time 4957.18ms 
iter 1732: loss 3.0306, time 4986.86ms 
iter 1733: loss 3.0181, time 5029.41ms 
iter 1734: loss 3.0187, time 5029.30ms 
iter 1735: loss 2.9237, time 5010.57ms 
iter 1736: loss 2.7558, time 5030.50ms 
iter 1737: loss 2.8462, time 5030.36ms 
iter 1738: loss 2.9909, time 5026.14ms 
iter 1739: loss 2.9259, time 5029.69ms 
iter 1740: loss 2.8974, time 4916.05ms 
iter 1741: loss 2.8767, time 4979.58ms 
iter 1742: loss 2.9510, time 5028.42ms 
iter 1743: loss 2.9773, time 5037.59ms 
iter 1744: loss 3.1014, time 5007.32ms 
iter 1745: loss 2.9422, time 4917.61ms 
iter 1746: loss 2.8166, time 4918.31ms 
iter 1747: loss 2.9553, time 5009.88ms 
iter 1748: loss 2.8188, time 5041.68ms 
iter 1749: loss 2.8432, time 5033.51ms 
step 1750: train loss 2.9348, val loss 3.0018
iter 1750: loss 3.0055, time 19716.24ms 
iter 1751: loss 2.9480, time 4973.32ms 
iter 1752: loss 2.8734, time 4955.48ms 
iter 1753: loss 2.9934, time 5021.80ms 
iter 1754: loss 3.0769, time 5032.94ms 
iter 1755: loss 2.8864, time 5024.03ms 
iter 1756: loss 2.7995, time 5024.73ms 
iter 1757: loss 3.1170, time 5028.06ms 
iter 1758: loss 2.8063, time 5029.29ms 
iter 1759: loss 3.0178, time 5030.04ms 
iter 1760: loss 2.9227, time 4980.39ms 
iter 1761: loss 3.0143, time 5007.16ms 
iter 1762: loss 2.8437, time 5042.13ms 
iter 1763: loss 2.9111, time 5032.30ms 
iter 1764: loss 2.8415, time 5015.60ms 
iter 1765: loss 2.8782, time 5029.65ms 
iter 1766: loss 3.1016, time 4976.81ms 
iter 1767: loss 2.8620, time 4962.02ms 
iter 1768: loss 3.0326, time 5030.05ms 
iter 1769: loss 3.1643, time 5029.33ms 
iter 1770: loss 2.8541, time 5028.35ms 
iter 1771: loss 3.1442, time 5019.69ms 
iter 1772: loss 3.1601, time 5033.33ms 
iter 1773: loss 2.9046, time 5029.77ms 
iter 1774: loss 3.0083, time 5031.57ms 
iter 1775: loss 2.9704, time 4977.03ms 
iter 1776: loss 2.9132, time 4966.27ms 
iter 1777: loss 3.0824, time 5031.86ms 
iter 1778: loss 2.9285, time 5035.45ms 
iter 1779: loss 2.8145, time 5031.94ms 
iter 1780: loss 2.9094, time 5022.51ms 
iter 1781: loss 2.9064, time 5031.46ms 
iter 1782: loss 3.0202, time 5019.08ms 
iter 1783: loss 2.8807, time 4963.21ms 
iter 1784: loss 2.8517, time 4918.75ms 
iter 1785: loss 2.9046, time 4983.14ms 
iter 1786: loss 2.9110, time 5025.37ms 
iter 1787: loss 2.8680, time 5028.53ms 
iter 1788: loss 2.9437, time 5023.20ms 
iter 1789: loss 2.9117, time 5023.09ms 
iter 1790: loss 2.7751, time 5014.67ms 
iter 1791: loss 2.9013, time 5031.98ms 
iter 1792: loss 2.7367, time 5043.03ms 
iter 1793: loss 2.9257, time 4993.29ms 
iter 1794: loss 2.9805, time 4972.98ms 
iter 1795: loss 2.8883, time 4989.45ms 
iter 1796: loss 2.9811, time 5004.30ms 
iter 1797: loss 3.0144, time 5025.33ms 
iter 1798: loss 2.9127, time 5030.11ms 
iter 1799: loss 2.6804, time 5024.18ms 
step 1800: train loss 2.9187, val loss 2.9954
iter 1800: loss 2.8460, time 19749.34ms 
iter 1801: loss 2.9136, time 5043.86ms 
iter 1802: loss 2.9350, time 5024.90ms 
iter 1803: loss 2.8866, time 5043.77ms 
iter 1804: loss 2.9969, time 5020.84ms 
iter 1805: loss 2.8681, time 5015.54ms 
iter 1806: loss 3.0489, time 4914.19ms 
iter 1807: loss 2.8500, time 4945.25ms 
iter 1808: loss 2.8673, time 5024.66ms 
iter 1809: loss 2.9214, time 5007.41ms 
iter 1810: loss 2.9603, time 5021.62ms 
iter 1811: loss 3.0374, time 5030.58ms 
iter 1812: loss 2.8852, time 5029.00ms 
iter 1813: loss 2.9296, time 5024.50ms 
iter 1814: loss 2.7955, time 5026.97ms 
iter 1815: loss 2.8423, time 4971.33ms 
iter 1816: loss 2.8756, time 4985.00ms 
iter 1817: loss 2.9348, time 5017.56ms 
iter 1818: loss 2.9160, time 5011.96ms 
iter 1819: loss 3.0815, time 4919.18ms 
iter 1820: loss 3.1713, time 4914.18ms 
iter 1821: loss 2.8376, time 4915.15ms 
iter 1822: loss 2.8441, time 4914.22ms 
iter 1823: loss 2.7457, time 4913.76ms 
iter 1824: loss 2.8479, time 4913.37ms 
iter 1825: loss 2.7377, time 4913.24ms 
iter 1826: loss 2.8392, time 4913.65ms 
iter 1827: loss 2.9013, time 4913.72ms 
iter 1828: loss 3.0926, time 4916.46ms 
iter 1829: loss 3.0900, time 4914.39ms 
iter 1830: loss 2.8129, time 4913.63ms 
iter 1831: loss 2.6844, time 4914.74ms 
iter 1832: loss 3.0353, time 4913.99ms 
iter 1833: loss 2.7433, time 4913.72ms 
iter 1834: loss 2.8579, time 4971.55ms 
iter 1835: loss 2.8624, time 5019.82ms 
iter 1836: loss 3.1419, time 4927.88ms 
iter 1837: loss 2.9997, time 4928.15ms 
iter 1838: loss 3.0124, time 4927.51ms 
iter 1839: loss 2.8367, time 4928.98ms 
iter 1840: loss 2.8905, time 4930.56ms 
iter 1841: loss 2.8466, time 4926.94ms 
iter 1842: loss 3.0538, time 4962.39ms 
iter 1843: loss 2.9343, time 5023.15ms 
iter 1844: loss 2.7658, time 5032.24ms 
iter 1845: loss 2.8204, time 5028.36ms 
iter 1846: loss 2.9726, time 5014.65ms 
iter 1847: loss 2.9469, time 5026.98ms 
iter 1848: loss 3.1020, time 4919.28ms 
iter 1849: loss 2.7822, time 4927.63ms 
step 1850: train loss 2.9132, val loss 2.9785
iter 1850: loss 2.8617, time 19668.13ms 
iter 1851: loss 2.9698, time 4957.77ms 
iter 1852: loss 3.1075, time 5024.14ms 
iter 1853: loss 2.6911, time 5031.53ms 
iter 1854: loss 3.1168, time 4980.38ms 
iter 1855: loss 2.8034, time 4968.61ms 
iter 1856: loss 2.8391, time 4914.67ms 
iter 1857: loss 2.8189, time 4913.65ms 
iter 1858: loss 2.9934, time 4928.77ms 
iter 1859: loss 2.9203, time 4971.12ms 
iter 1860: loss 2.9044, time 4998.48ms 
iter 1861: loss 2.9059, time 5036.36ms 
iter 1862: loss 3.1189, time 4982.44ms 
iter 1863: loss 2.9203, time 4922.56ms 
iter 1864: loss 2.7684, time 5000.49ms 
iter 1865: loss 3.1139, time 5035.59ms 
iter 1866: loss 2.9441, time 5038.45ms 
iter 1867: loss 2.9776, time 5038.20ms 
iter 1868: loss 3.0521, time 5030.26ms 
iter 1869: loss 2.8158, time 5031.02ms 
iter 1870: loss 3.2663, time 5018.29ms 
iter 1871: loss 3.1289, time 4916.59ms 
iter 1872: loss 2.7796, time 4965.46ms 
iter 1873: loss 2.8796, time 5023.70ms 
iter 1874: loss 2.9657, time 5027.73ms 
iter 1875: loss 2.9834, time 5047.51ms 
iter 1876: loss 2.9014, time 5041.91ms 
iter 1877: loss 3.0134, time 5033.27ms 
iter 1878: loss 2.8037, time 5034.98ms 
iter 1879: loss 2.6775, time 5011.61ms 
iter 1880: loss 2.9056, time 4995.69ms 
iter 1881: loss 3.0594, time 5038.34ms 
iter 1882: loss 2.8431, time 5032.63ms 
iter 1883: loss 2.9063, time 5019.12ms 
iter 1884: loss 2.8805, time 5018.66ms 
iter 1885: loss 2.8718, time 5025.19ms 
iter 1886: loss 2.9136, time 5031.59ms 
iter 1887: loss 2.8819, time 4980.11ms 
iter 1888: loss 2.9541, time 4959.09ms 
iter 1889: loss 2.7644, time 5022.98ms 
iter 1890: loss 2.8996, time 5022.25ms 
iter 1891: loss 2.9463, time 5035.24ms 
iter 1892: loss 2.9998, time 5027.97ms 
iter 1893: loss 3.0704, time 5027.69ms 
iter 1894: loss 2.7998, time 5023.67ms 
iter 1895: loss 2.8951, time 5030.66ms 
iter 1896: loss 3.0032, time 4939.84ms 
iter 1897: loss 2.9745, time 4998.66ms 
iter 1898: loss 2.8086, time 5025.13ms 
iter 1899: loss 2.8284, time 5015.37ms 
step 1900: train loss 2.8989, val loss 2.9774
iter 1900: loss 2.8395, time 19727.11ms 
iter 1901: loss 2.8019, time 4969.89ms 
iter 1902: loss 2.9174, time 4992.51ms 
iter 1903: loss 2.7647, time 4984.89ms 
iter 1904: loss 2.7435, time 4993.05ms 
iter 1905: loss 2.9383, time 5016.87ms 
iter 1906: loss 2.8066, time 5026.56ms 
iter 1907: loss 3.0794, time 5027.24ms 
iter 1908: loss 2.7949, time 5028.79ms 
iter 1909: loss 3.1396, time 4964.80ms 
iter 1910: loss 2.8871, time 4996.47ms 
iter 1911: loss 2.8938, time 5020.39ms 
iter 1912: loss 2.9503, time 5022.54ms 
iter 1913: loss 2.9270, time 5023.52ms 
iter 1914: loss 2.8602, time 5018.76ms 
iter 1915: loss 3.0485, time 5023.90ms 
iter 1916: loss 2.7859, time 4959.00ms 
iter 1917: loss 2.9561, time 4926.99ms 
iter 1918: loss 2.8828, time 5020.51ms 
iter 1919: loss 3.0523, time 5020.31ms 
iter 1920: loss 2.9289, time 5019.80ms 
iter 1921: loss 2.8734, time 5013.40ms 
iter 1922: loss 3.1353, time 5023.19ms 
iter 1923: loss 2.7219, time 5025.02ms 
iter 1924: loss 2.9447, time 5032.63ms 
iter 1925: loss 2.7624, time 4951.61ms 
iter 1926: loss 3.0962, time 5026.46ms 
iter 1927: loss 2.8031, time 5026.57ms 
iter 1928: loss 2.9329, time 5027.08ms 
iter 1929: loss 2.9223, time 5024.30ms 
iter 1930: loss 2.9803, time 5024.02ms 
iter 1931: loss 2.6659, time 5017.11ms 
iter 1932: loss 2.9418, time 5079.23ms 
iter 1933: loss 2.9932, time 5019.60ms 
iter 1934: loss 2.9790, time 5012.28ms 
iter 1935: loss 3.0630, time 5027.64ms 
iter 1936: loss 2.7611, time 5020.13ms 
iter 1937: loss 2.9725, time 5048.78ms 
iter 1938: loss 2.9003, time 5021.94ms 
iter 1939: loss 2.8657, time 5025.14ms 
iter 1940: loss 3.0601, time 4998.15ms 
iter 1941: loss 2.9566, time 4918.44ms 
iter 1942: loss 2.7308, time 4934.35ms 
iter 1943: loss 2.8287, time 5033.24ms 
iter 1944: loss 2.7833, time 5024.47ms 
iter 1945: loss 2.8566, time 5022.74ms 
iter 1946: loss 2.9048, time 5006.73ms 
iter 1947: loss 2.9900, time 5023.55ms 
iter 1948: loss 2.8992, time 5025.27ms 
iter 1949: loss 2.8256, time 5025.40ms 
step 1950: train loss 2.8911, val loss 2.9851
iter 1950: loss 2.9461, time 19681.54ms 
iter 1951: loss 2.9074, time 5013.31ms 
iter 1952: loss 2.6683, time 5020.11ms 
iter 1953: loss 2.8665, time 5022.07ms 
iter 1954: loss 2.8483, time 5021.87ms 
iter 1955: loss 3.0044, time 4971.78ms 
iter 1956: loss 2.9936, time 4937.10ms 
iter 1957: loss 3.0488, time 5023.68ms 
iter 1958: loss 3.0067, time 5022.35ms 
iter 1959: loss 2.9106, time 5019.86ms 
iter 1960: loss 2.7129, time 5022.50ms 
iter 1961: loss 3.0162, time 5025.51ms 
iter 1962: loss 2.8808, time 5027.16ms 
iter 1963: loss 2.7799, time 5023.61ms 
iter 1964: loss 3.0059, time 4972.06ms 
iter 1965: loss 2.8066, time 5006.80ms 
iter 1966: loss 2.9506, time 5018.54ms 
iter 1967: loss 3.0576, time 5014.67ms 
iter 1968: loss 2.9236, time 5014.19ms 
iter 1969: loss 2.9363, time 5023.64ms 
iter 1970: loss 2.6950, time 5028.67ms 
iter 1971: loss 2.9484, time 5025.13ms 
iter 1972: loss 2.8564, time 4953.99ms 
iter 1973: loss 2.8534, time 4944.82ms 
iter 1974: loss 2.7544, time 5022.84ms 
iter 1975: loss 2.8882, time 5023.48ms 
iter 1976: loss 2.8798, time 5023.51ms 
iter 1977: loss 2.8190, time 5012.70ms 
iter 1978: loss 2.9208, time 5026.29ms 
iter 1979: loss 2.8973, time 5022.96ms 
iter 1980: loss 2.9272, time 4970.82ms 
iter 1981: loss 2.9782, time 4934.00ms 
iter 1982: loss 2.8224, time 5024.50ms 
iter 1983: loss 3.0051, time 5024.80ms 
iter 1984: loss 2.8710, time 5014.16ms 
iter 1985: loss 2.7088, time 5021.55ms 
iter 1986: loss 2.8063, time 5023.84ms 
iter 1987: loss 2.7297, time 5025.87ms 
iter 1988: loss 3.0668, time 5028.46ms 
iter 1989: loss 2.9330, time 4916.32ms 
iter 1990: loss 2.8249, time 4985.61ms 
iter 1991: loss 2.9726, time 5024.63ms 
iter 1992: loss 2.7065, time 5024.08ms 
iter 1993: loss 2.9462, time 5027.11ms 
iter 1994: loss 2.7924, time 5024.75ms 
iter 1995: loss 2.8645, time 5036.43ms 
iter 1996: loss 2.7235, time 5026.80ms 
iter 1997: loss 2.9489, time 4977.90ms 
iter 1998: loss 2.8970, time 4970.14ms 
iter 1999: loss 3.0122, time 5022.35ms 
step 2000: train loss 2.8840, val loss 2.9564
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2000: loss 2.8709, time 20821.18ms 
iter 2001: loss 2.9248, time 5028.20ms 
iter 2002: loss 2.9155, time 4973.66ms 
iter 2003: loss 2.8816, time 4955.16ms 
iter 2004: loss 2.9621, time 5024.49ms 
iter 2005: loss 2.9527, time 5028.52ms 
iter 2006: loss 3.0154, time 5019.29ms 
iter 2007: loss 2.8330, time 5031.44ms 
iter 2008: loss 2.8725, time 5018.33ms 
iter 2009: loss 2.8294, time 5025.92ms 
iter 2010: loss 2.9087, time 4977.18ms 
iter 2011: loss 2.8659, time 4952.59ms 
iter 2012: loss 2.8295, time 5031.45ms 
iter 2013: loss 2.7372, time 5034.82ms 
iter 2014: loss 2.8796, time 5027.45ms 
iter 2015: loss 2.7854, time 5030.63ms 
iter 2016: loss 2.8392, time 5026.08ms 
iter 2017: loss 2.7900, time 5034.40ms 
iter 2018: loss 2.9374, time 4984.53ms 
iter 2019: loss 3.0107, time 4922.18ms 
iter 2020: loss 2.8481, time 5036.73ms 
iter 2021: loss 2.9554, time 5035.22ms 
iter 2022: loss 2.8884, time 4995.47ms 
iter 2023: loss 2.9343, time 4981.32ms 
iter 2024: loss 2.9391, time 4950.00ms 
iter 2025: loss 2.8940, time 5022.77ms 
iter 2026: loss 2.8030, time 5031.94ms 
iter 2027: loss 2.7968, time 4977.69ms 
iter 2028: loss 2.6496, time 4990.63ms 
iter 2029: loss 3.1335, time 5030.75ms 
iter 2030: loss 2.7809, time 5029.01ms 
iter 2031: loss 2.9184, time 5026.39ms 
iter 2032: loss 2.8541, time 5041.60ms 
iter 2033: loss 2.8894, time 5043.66ms 
iter 2034: loss 2.9823, time 5038.15ms 
iter 2035: loss 2.9410, time 4959.77ms 
iter 2036: loss 2.9611, time 4990.27ms 
iter 2037: loss 2.6585, time 4998.69ms 
iter 2038: loss 2.9066, time 4987.47ms 
iter 2039: loss 2.7725, time 5006.07ms 
iter 2040: loss 3.0534, time 5031.80ms 
iter 2041: loss 3.0238, time 5038.85ms 
iter 2042: loss 2.9157, time 5034.00ms 
iter 2043: loss 2.9414, time 4977.16ms 
iter 2044: loss 2.8818, time 4930.30ms 
iter 2045: loss 3.0981, time 5022.83ms 
iter 2046: loss 2.8584, time 4997.69ms 
iter 2047: loss 2.8774, time 5024.97ms 
iter 2048: loss 2.8296, time 5027.95ms 
iter 2049: loss 2.8901, time 5022.47ms 
step 2050: train loss 2.8772, val loss 2.9518
iter 2050: loss 2.7248, time 19677.29ms 
iter 2051: loss 2.8054, time 5026.27ms 
iter 2052: loss 2.7542, time 5023.07ms 
iter 2053: loss 2.8661, time 5028.59ms 
iter 2054: loss 2.7815, time 5017.34ms 
iter 2055: loss 2.9027, time 5019.02ms 
iter 2056: loss 2.9897, time 5027.42ms 
iter 2057: loss 2.7979, time 5028.81ms 
iter 2058: loss 3.0282, time 4956.04ms 
iter 2059: loss 2.8439, time 5037.21ms 
iter 2060: loss 2.9184, time 5036.14ms 
iter 2061: loss 2.8222, time 5038.60ms 
iter 2062: loss 2.7805, time 5034.94ms 
iter 2063: loss 2.9011, time 5032.88ms 
iter 2064: loss 2.8928, time 4980.98ms 
iter 2065: loss 2.9426, time 4991.91ms 
iter 2066: loss 2.8376, time 4950.92ms 
iter 2067: loss 2.7197, time 5030.37ms 
iter 2068: loss 2.9882, time 5033.67ms 
iter 2069: loss 3.0090, time 5044.79ms 
iter 2070: loss 3.0542, time 5035.16ms 
iter 2071: loss 2.9565, time 5031.06ms 
iter 2072: loss 2.8228, time 5049.39ms 
iter 2073: loss 2.9951, time 5036.42ms 
iter 2074: loss 2.8838, time 4982.16ms 
iter 2075: loss 2.7529, time 5033.55ms 
iter 2076: loss 2.9617, time 5035.62ms 
iter 2077: loss 2.9193, time 5014.63ms 
iter 2078: loss 2.7699, time 5002.65ms 
iter 2079: loss 2.9483, time 5032.76ms 
iter 2080: loss 2.8251, time 5028.72ms 
iter 2081: loss 3.0860, time 5036.81ms 
iter 2082: loss 2.8063, time 4976.96ms 
iter 2083: loss 2.8701, time 4993.26ms 
iter 2084: loss 2.9032, time 4940.96ms 
iter 2085: loss 2.8799, time 5015.20ms 
iter 2086: loss 2.8025, time 5030.22ms 
iter 2087: loss 2.6664, time 5036.39ms 
iter 2088: loss 3.0106, time 5032.44ms 
iter 2089: loss 2.7906, time 5039.55ms 
iter 2090: loss 2.9354, time 4980.73ms 
iter 2091: loss 2.8388, time 4969.59ms 
iter 2092: loss 2.8534, time 5008.65ms 
iter 2093: loss 2.8342, time 5022.28ms 
iter 2094: loss 2.8024, time 5029.29ms 
iter 2095: loss 2.9796, time 5024.06ms 
iter 2096: loss 2.7782, time 5023.92ms 
iter 2097: loss 2.6801, time 5028.50ms 
iter 2098: loss 2.9643, time 4970.23ms 
iter 2099: loss 2.8387, time 4953.71ms 
step 2100: train loss 2.8568, val loss 2.9515
iter 2100: loss 2.8592, time 19693.61ms 
iter 2101: loss 2.7518, time 5026.69ms 
iter 2102: loss 2.7559, time 5027.73ms 
iter 2103: loss 2.7438, time 4917.44ms 
iter 2104: loss 2.6318, time 4935.97ms 
iter 2105: loss 2.7111, time 5029.16ms 
iter 2106: loss 2.9107, time 5036.43ms 
iter 2107: loss 2.7851, time 5045.36ms 
iter 2108: loss 2.6908, time 5019.71ms 
iter 2109: loss 2.7984, time 5007.12ms 
iter 2110: loss 2.8831, time 5004.22ms 
iter 2111: loss 2.8386, time 5061.52ms 
iter 2112: loss 2.8566, time 4917.41ms 
iter 2113: loss 2.7665, time 4926.22ms 
iter 2114: loss 2.7452, time 5030.77ms 
iter 2115: loss 3.0085, time 4941.30ms 
iter 2116: loss 3.1260, time 4915.96ms 
iter 2117: loss 2.7225, time 4915.87ms 
iter 2118: loss 2.7556, time 4916.34ms 
iter 2119: loss 2.8838, time 4916.12ms 
iter 2120: loss 2.8948, time 4916.74ms 
iter 2121: loss 2.9419, time 4917.08ms 
iter 2122: loss 2.8437, time 4915.55ms 
iter 2123: loss 2.9438, time 4914.10ms 
iter 2124: loss 2.9588, time 4913.34ms 
iter 2125: loss 3.0720, time 4948.91ms 
iter 2126: loss 2.8389, time 5026.38ms 
iter 2127: loss 2.6937, time 5026.47ms 
iter 2128: loss 2.8158, time 5025.93ms 
iter 2129: loss 2.9373, time 4993.54ms 
iter 2130: loss 2.8539, time 5025.69ms 
iter 2131: loss 2.9384, time 5027.25ms 
iter 2132: loss 2.7939, time 5019.67ms 
iter 2133: loss 2.9289, time 5018.97ms 
iter 2134: loss 2.8080, time 5024.24ms 
iter 2135: loss 2.7126, time 5022.54ms 
iter 2136: loss 2.7464, time 4971.84ms 
iter 2137: loss 2.8482, time 4918.28ms 
iter 2138: loss 2.9429, time 5018.55ms 
iter 2139: loss 2.8733, time 5024.29ms 
iter 2140: loss 2.9438, time 5022.17ms 
iter 2141: loss 2.9344, time 5015.71ms 
iter 2142: loss 2.7065, time 5034.71ms 
iter 2143: loss 2.9259, time 5015.62ms 
iter 2144: loss 2.7052, time 5014.72ms 
iter 2145: loss 2.8719, time 4992.36ms 
iter 2146: loss 2.9265, time 4997.21ms 
iter 2147: loss 2.6516, time 5032.49ms 
iter 2148: loss 2.7368, time 5025.84ms 
iter 2149: loss 2.8770, time 5030.94ms 
step 2150: train loss 2.8498, val loss 2.9452
iter 2150: loss 2.7998, time 19600.54ms 
iter 2151: loss 2.7585, time 4984.27ms 
iter 2152: loss 2.7726, time 5020.94ms 
iter 2153: loss 3.0572, time 5021.44ms 
iter 2154: loss 2.7492, time 5019.51ms 
iter 2155: loss 2.8258, time 5017.96ms 
iter 2156: loss 2.7196, time 5018.59ms 
iter 2157: loss 2.7145, time 5020.67ms 
iter 2158: loss 2.7736, time 4965.33ms 
iter 2159: loss 2.9604, time 4913.44ms 
iter 2160: loss 3.1243, time 4998.59ms 
iter 2161: loss 2.9205, time 5022.60ms 
iter 2162: loss 2.6944, time 5023.15ms 
iter 2163: loss 2.9834, time 5022.13ms 
iter 2164: loss 2.9400, time 5023.88ms 
iter 2165: loss 2.7121, time 5016.60ms 
iter 2166: loss 2.8220, time 5024.03ms 
iter 2167: loss 2.7779, time 4972.31ms 
iter 2168: loss 2.7582, time 4985.31ms 
iter 2169: loss 2.8651, time 5021.62ms 
iter 2170: loss 2.9841, time 5021.10ms 
iter 2171: loss 2.8492, time 5022.13ms 
iter 2172: loss 2.7646, time 5023.48ms 
iter 2173: loss 2.8776, time 5025.21ms 
iter 2174: loss 2.7912, time 5022.86ms 
iter 2175: loss 2.8197, time 4971.06ms 
iter 2176: loss 2.7727, time 4974.92ms 
iter 2177: loss 2.6743, time 5012.43ms 
iter 2178: loss 2.9227, time 5004.75ms 
iter 2179: loss 2.9170, time 5022.77ms 
iter 2180: loss 2.9439, time 5023.22ms 
iter 2181: loss 2.7408, time 5022.04ms 
iter 2182: loss 2.9633, time 5019.05ms 
iter 2183: loss 2.9575, time 4971.47ms 
iter 2184: loss 2.6098, time 4956.20ms 
iter 2185: loss 3.0172, time 5006.29ms 
iter 2186: loss 2.9519, time 5022.99ms 
iter 2187: loss 2.9084, time 5030.38ms 
iter 2188: loss 2.9037, time 5013.34ms 
iter 2189: loss 2.9195, time 5030.14ms 
iter 2190: loss 2.8217, time 5006.73ms 
iter 2191: loss 2.7310, time 5026.62ms 
iter 2192: loss 2.9028, time 5000.33ms 
iter 2193: loss 2.7874, time 5022.02ms 
iter 2194: loss 2.8170, time 5026.01ms 
iter 2195: loss 2.8923, time 4995.75ms 
iter 2196: loss 2.8688, time 5023.37ms 
iter 2197: loss 2.6916, time 5023.18ms 
iter 2198: loss 2.8414, time 5028.97ms 
iter 2199: loss 2.8674, time 4972.41ms 
step 2200: train loss 2.8405, val loss 2.9239
iter 2200: loss 2.7024, time 19672.19ms 
iter 2201: loss 2.6686, time 5007.95ms 
iter 2202: loss 2.8791, time 5026.26ms 
iter 2203: loss 2.8936, time 5014.27ms 
iter 2204: loss 3.0139, time 4976.90ms 
iter 2205: loss 2.8645, time 4922.02ms 
iter 2206: loss 2.8280, time 4970.18ms 
iter 2207: loss 2.8601, time 5032.63ms 
iter 2208: loss 2.7835, time 5003.25ms 
iter 2209: loss 2.9188, time 5022.08ms 
iter 2210: loss 2.9684, time 5019.27ms 
iter 2211: loss 2.6345, time 5022.83ms 
iter 2212: loss 2.8556, time 5015.79ms 
iter 2213: loss 2.7736, time 4925.79ms 
iter 2214: loss 3.0020, time 4916.03ms 
iter 2215: loss 2.9680, time 4999.47ms 
iter 2216: loss 2.6491, time 5023.74ms 
iter 2217: loss 2.8717, time 5019.76ms 
iter 2218: loss 2.4685, time 5000.28ms 
iter 2219: loss 2.7731, time 5021.92ms 
iter 2220: loss 2.8699, time 5022.32ms 
iter 2221: loss 2.7172, time 5026.75ms 
iter 2222: loss 2.9837, time 4974.47ms 
iter 2223: loss 3.0793, time 4961.51ms 
iter 2224: loss 3.0832, time 5008.44ms 
iter 2225: loss 2.8325, time 5024.80ms 
iter 2226: loss 2.9301, time 5023.77ms 
iter 2227: loss 2.8881, time 5025.87ms 
iter 2228: loss 3.1042, time 5025.56ms 
iter 2229: loss 3.0207, time 4980.06ms 
iter 2230: loss 3.0830, time 4958.32ms 
iter 2231: loss 2.7065, time 4916.57ms 
iter 2232: loss 2.6154, time 5003.36ms 
iter 2233: loss 2.7973, time 5029.91ms 
iter 2234: loss 2.6481, time 5022.22ms 
iter 2235: loss 2.8635, time 5017.31ms 
iter 2236: loss 2.7240, time 5023.00ms 
iter 2237: loss 2.8477, time 5019.81ms 
iter 2238: loss 2.8146, time 5035.35ms 
iter 2239: loss 2.9944, time 4958.89ms 
iter 2240: loss 2.8994, time 4988.04ms 
iter 2241: loss 2.8836, time 5037.21ms 
iter 2242: loss 2.9068, time 5029.55ms 
iter 2243: loss 2.7419, time 5026.78ms 
iter 2244: loss 2.8842, time 5024.87ms 
iter 2245: loss 2.7486, time 5022.23ms 
iter 2246: loss 2.7255, time 5001.61ms 
iter 2247: loss 2.8350, time 4975.12ms 
iter 2248: loss 2.8440, time 4998.23ms 
iter 2249: loss 2.8146, time 5028.54ms 
step 2250: train loss 2.8200, val loss 2.9337
iter 2250: loss 2.8055, time 19730.98ms 
iter 2251: loss 2.6472, time 5035.56ms 
iter 2252: loss 2.8110, time 4983.39ms 
iter 2253: loss 2.9402, time 4937.98ms 
iter 2254: loss 2.9092, time 5012.14ms 
iter 2255: loss 2.8160, time 5032.58ms 
iter 2256: loss 3.0348, time 5030.38ms 
iter 2257: loss 2.7608, time 5040.27ms 
iter 2258: loss 2.7172, time 5030.00ms 
iter 2259: loss 2.7824, time 5018.05ms 
iter 2260: loss 2.8010, time 4973.74ms 
iter 2261: loss 2.7820, time 4979.19ms 
iter 2262: loss 2.9092, time 4994.82ms 
iter 2263: loss 2.9595, time 5027.25ms 
iter 2264: loss 2.9518, time 5023.25ms 
iter 2265: loss 2.7379, time 5011.91ms 
iter 2266: loss 2.8584, time 5022.60ms 
iter 2267: loss 2.6573, time 5022.29ms 
iter 2268: loss 2.8239, time 4970.27ms 
iter 2269: loss 2.8976, time 4947.25ms 
iter 2270: loss 2.8529, time 5021.59ms 
iter 2271: loss 2.8090, time 5025.95ms 
iter 2272: loss 2.7579, time 4978.61ms 
iter 2273: loss 2.9455, time 4913.96ms 
iter 2274: loss 2.9655, time 4914.77ms 
iter 2275: loss 2.6254, time 4913.76ms 
iter 2276: loss 2.7940, time 4914.57ms 
iter 2277: loss 2.7156, time 4973.75ms 
iter 2278: loss 2.8320, time 5021.74ms 
iter 2279: loss 2.8087, time 5016.85ms 
iter 2280: loss 2.6881, time 4954.49ms 
iter 2281: loss 2.7514, time 4980.67ms 
iter 2282: loss 2.9463, time 4979.04ms 
iter 2283: loss 2.8221, time 4973.13ms 
iter 2284: loss 2.7757, time 4950.60ms 
iter 2285: loss 2.8316, time 4924.78ms 
iter 2286: loss 2.8346, time 4931.52ms 
iter 2287: loss 3.0848, time 5015.32ms 
iter 2288: loss 3.0302, time 5029.34ms 
iter 2289: loss 2.6598, time 5018.35ms 
iter 2290: loss 2.7953, time 4977.61ms 
iter 2291: loss 2.7325, time 5028.09ms 
iter 2292: loss 2.7282, time 5028.79ms 
iter 2293: loss 2.9105, time 4970.25ms 
iter 2294: loss 2.7254, time 4928.95ms 
iter 2295: loss 2.9108, time 5032.22ms 
iter 2296: loss 2.8928, time 5024.60ms 
iter 2297: loss 2.7827, time 5025.89ms 
iter 2298: loss 3.1083, time 5033.11ms 
iter 2299: loss 2.7229, time 5037.39ms 
step 2300: train loss 2.8296, val loss 2.9303
iter 2300: loss 2.9827, time 19740.21ms 
iter 2301: loss 2.5794, time 5024.49ms 
iter 2302: loss 2.9190, time 5030.72ms 
iter 2303: loss 2.8801, time 5013.23ms 
iter 2304: loss 2.7941, time 5029.52ms 
iter 2305: loss 2.6709, time 4999.42ms 
iter 2306: loss 2.7438, time 4979.84ms 
iter 2307: loss 2.8131, time 4925.11ms 
iter 2308: loss 2.9192, time 5029.66ms 
iter 2309: loss 2.7568, time 5026.66ms 
iter 2310: loss 2.6897, time 5014.69ms 
iter 2311: loss 2.6890, time 4989.56ms 
iter 2312: loss 2.8668, time 5020.14ms 
iter 2313: loss 2.8020, time 5027.94ms 
iter 2314: loss 2.7169, time 5019.35ms 
iter 2315: loss 2.8050, time 4934.25ms 
iter 2316: loss 2.9976, time 4993.71ms 
iter 2317: loss 2.9398, time 5030.23ms 
iter 2318: loss 2.7800, time 5030.43ms 
iter 2319: loss 3.0245, time 5009.43ms 
iter 2320: loss 2.7913, time 5021.99ms 
iter 2321: loss 2.6608, time 5023.57ms 
iter 2322: loss 3.1833, time 5033.83ms 
iter 2323: loss 2.8868, time 4972.84ms 
iter 2324: loss 2.9056, time 4982.84ms 
iter 2325: loss 2.9037, time 5000.15ms 
iter 2326: loss 2.8225, time 5007.35ms 
iter 2327: loss 2.7594, time 5008.86ms 
iter 2328: loss 2.7768, time 5012.10ms 
iter 2329: loss 2.7752, time 5019.99ms 
iter 2330: loss 2.9038, time 5004.04ms 
iter 2331: loss 2.9719, time 4980.55ms 
iter 2332: loss 2.7603, time 4983.96ms 
iter 2333: loss 2.8998, time 4999.01ms 
iter 2334: loss 2.7416, time 4976.37ms 
iter 2335: loss 2.8049, time 5034.05ms 
iter 2336: loss 2.6438, time 5035.07ms 
iter 2337: loss 2.7359, time 5007.89ms 
iter 2338: loss 2.7525, time 5040.67ms 
iter 2339: loss 2.8639, time 4981.87ms 
iter 2340: loss 2.7578, time 4917.21ms 
iter 2341: loss 2.7727, time 4999.62ms 
iter 2342: loss 2.7697, time 5018.32ms 
iter 2343: loss 2.7821, time 4995.06ms 
iter 2344: loss 2.8134, time 5001.59ms 
iter 2345: loss 2.8314, time 5021.85ms 
iter 2346: loss 2.7191, time 5027.90ms 
iter 2347: loss 2.8218, time 5032.38ms 
iter 2348: loss 2.8218, time 4951.24ms 
iter 2349: loss 2.7298, time 4915.76ms 
step 2350: train loss 2.8104, val loss 2.9370
iter 2350: loss 2.7125, time 19682.75ms 
iter 2351: loss 3.0874, time 5029.29ms 
iter 2352: loss 2.9628, time 5034.62ms 
iter 2353: loss 2.7509, time 5027.93ms 
iter 2354: loss 2.8729, time 5029.58ms 
iter 2355: loss 3.0279, time 4978.34ms 
iter 2356: loss 2.7929, time 4974.62ms 
iter 2357: loss 2.5146, time 5026.98ms 
iter 2358: loss 2.6216, time 4989.19ms 
iter 2359: loss 2.7904, time 5026.57ms 
iter 2360: loss 2.7668, time 5034.29ms 
iter 2361: loss 2.7712, time 5020.41ms 
iter 2362: loss 2.8865, time 5036.67ms 
iter 2363: loss 2.9801, time 4979.34ms 
iter 2364: loss 2.8007, time 4925.05ms 
iter 2365: loss 3.0819, time 5032.93ms 
iter 2366: loss 2.8202, time 5032.92ms 
iter 2367: loss 2.8091, time 5028.95ms 
iter 2368: loss 2.7686, time 5027.99ms 
iter 2369: loss 2.7984, time 5031.09ms 
iter 2370: loss 2.5186, time 5025.10ms 
iter 2371: loss 2.7527, time 5029.81ms 
iter 2372: loss 2.7534, time 4981.74ms 
iter 2373: loss 2.6709, time 5028.81ms 
iter 2374: loss 2.8938, time 5028.14ms 
iter 2375: loss 2.6528, time 5027.49ms 
iter 2376: loss 2.9231, time 5037.73ms 
iter 2377: loss 2.7627, time 5028.73ms 
iter 2378: loss 2.8950, time 5027.00ms 
iter 2379: loss 2.6481, time 5031.56ms 
iter 2380: loss 2.6472, time 4935.35ms 
iter 2381: loss 2.8622, time 5021.21ms 
iter 2382: loss 2.6974, time 5031.85ms 
iter 2383: loss 2.8182, time 5031.79ms 
iter 2384: loss 2.8295, time 5027.31ms 
iter 2385: loss 2.7426, time 5028.33ms 
iter 2386: loss 2.7873, time 5028.98ms 
iter 2387: loss 2.8102, time 5030.04ms 
iter 2388: loss 2.8247, time 4993.61ms 
iter 2389: loss 2.7744, time 5026.95ms 
iter 2390: loss 2.7041, time 5025.99ms 
iter 2391: loss 2.8299, time 5029.56ms 
iter 2392: loss 2.9042, time 5026.28ms 
iter 2393: loss 2.8160, time 5021.02ms 
iter 2394: loss 2.8154, time 5024.72ms 
iter 2395: loss 2.6334, time 5029.81ms 
iter 2396: loss 2.6468, time 4976.88ms 
iter 2397: loss 3.0042, time 4987.44ms 
iter 2398: loss 2.7367, time 4995.87ms 
iter 2399: loss 2.8623, time 5029.82ms 
step 2400: train loss 2.8096, val loss 2.9259
iter 2400: loss 2.9648, time 19668.98ms 
iter 2401: loss 2.8608, time 4977.31ms 
iter 2402: loss 2.9653, time 5009.36ms 
iter 2403: loss 2.7957, time 5028.50ms 
iter 2404: loss 2.7483, time 5027.24ms 
iter 2405: loss 2.7128, time 5017.81ms 
iter 2406: loss 2.7921, time 5029.16ms 
iter 2407: loss 2.7207, time 5024.20ms 
iter 2408: loss 2.7106, time 5027.77ms 
iter 2409: loss 2.7343, time 4979.13ms 
iter 2410: loss 2.7789, time 4988.81ms 
iter 2411: loss 2.7149, time 5017.26ms 
iter 2412: loss 2.9058, time 4974.44ms 
iter 2413: loss 2.8009, time 5016.34ms 
iter 2414: loss 2.7840, time 5023.92ms 
iter 2415: loss 2.8705, time 5015.28ms 
iter 2416: loss 2.7793, time 5027.94ms 
iter 2417: loss 2.7352, time 4913.56ms 
iter 2418: loss 2.7720, time 4914.17ms 
iter 2419: loss 2.6965, time 4968.63ms 
iter 2420: loss 2.8603, time 5026.41ms 
iter 2421: loss 2.7693, time 5021.16ms 
iter 2422: loss 2.6924, time 5001.06ms 
iter 2423: loss 2.6738, time 5021.70ms 
iter 2424: loss 2.9239, time 5019.00ms 
iter 2425: loss 2.9912, time 5019.26ms 
iter 2426: loss 2.8281, time 4971.55ms 
iter 2427: loss 2.7838, time 4915.49ms 
iter 2428: loss 2.8978, time 5009.91ms 
iter 2429: loss 2.7746, time 5021.17ms 
iter 2430: loss 2.8228, time 5022.20ms 
iter 2431: loss 2.7855, time 5021.98ms 
iter 2432: loss 3.0195, time 5022.28ms 
iter 2433: loss 2.7494, time 5022.60ms 
iter 2434: loss 2.7751, time 5021.24ms 
iter 2435: loss 2.5353, time 4971.95ms 
iter 2436: loss 2.8599, time 4971.66ms 
iter 2437: loss 2.6310, time 5021.23ms 
iter 2438: loss 2.7441, time 5021.37ms 
iter 2439: loss 2.8109, time 5022.75ms 
iter 2440: loss 2.8282, time 5020.81ms 
iter 2441: loss 2.7961, time 5025.56ms 
iter 2442: loss 2.6782, time 5024.94ms 
iter 2443: loss 2.8260, time 4984.70ms 
iter 2444: loss 2.9031, time 5018.77ms 
iter 2445: loss 2.8247, time 5027.98ms 
iter 2446: loss 3.1665, time 5017.45ms 
iter 2447: loss 3.1131, time 5029.49ms 
iter 2448: loss 2.9407, time 5033.06ms 
iter 2449: loss 2.7290, time 5030.77ms 
step 2450: train loss 2.8005, val loss 2.9018
iter 2450: loss 2.7895, time 19674.85ms 
iter 2451: loss 2.9532, time 5031.08ms 
iter 2452: loss 2.7664, time 5014.17ms 
iter 2453: loss 2.7020, time 5027.87ms 
iter 2454: loss 2.6208, time 5029.33ms 
iter 2455: loss 2.8814, time 5030.36ms 
iter 2456: loss 2.8474, time 5030.51ms 
iter 2457: loss 2.6860, time 4956.61ms 
iter 2458: loss 2.9097, time 4984.59ms 
iter 2459: loss 2.8120, time 5030.35ms 
iter 2460: loss 2.7771, time 5027.92ms 
iter 2461: loss 2.9094, time 5026.44ms 
iter 2462: loss 3.0679, time 5028.00ms 
iter 2463: loss 2.8160, time 5011.49ms 
iter 2464: loss 2.7226, time 5031.46ms 
iter 2465: loss 2.8682, time 4977.22ms 
iter 2466: loss 2.8833, time 4944.32ms 
iter 2467: loss 3.0056, time 4932.74ms 
iter 2468: loss 2.9515, time 5027.35ms 
iter 2469: loss 2.7625, time 5027.45ms 
iter 2470: loss 2.8800, time 5026.67ms 
iter 2471: loss 2.7715, time 5028.56ms 
iter 2472: loss 2.8578, time 5023.17ms 
iter 2473: loss 2.9604, time 5032.11ms 
iter 2474: loss 2.7508, time 4983.50ms 
iter 2475: loss 2.8661, time 4942.89ms 
iter 2476: loss 2.7795, time 5008.48ms 
iter 2477: loss 2.6581, time 5009.87ms 
iter 2478: loss 2.8682, time 5029.69ms 
iter 2479: loss 2.7978, time 5028.19ms 
iter 2480: loss 2.8299, time 5026.14ms 
iter 2481: loss 2.8839, time 5020.87ms 
iter 2482: loss 2.8928, time 5011.64ms 
iter 2483: loss 2.7346, time 4969.33ms 
iter 2484: loss 2.7517, time 4917.33ms 
iter 2485: loss 2.8096, time 4968.99ms 
iter 2486: loss 2.8546, time 5021.18ms 
iter 2487: loss 2.8571, time 5031.29ms 
iter 2488: loss 2.7553, time 5020.16ms 
iter 2489: loss 2.9190, time 5024.83ms 
iter 2490: loss 2.8584, time 5027.87ms 
iter 2491: loss 2.6608, time 4990.41ms 
iter 2492: loss 2.7888, time 4962.04ms 
iter 2493: loss 2.8755, time 5028.19ms 
iter 2494: loss 3.1566, time 5029.76ms 
iter 2495: loss 2.8663, time 5026.49ms 
iter 2496: loss 2.8231, time 5015.77ms 
iter 2497: loss 2.7364, time 5029.16ms 
iter 2498: loss 2.6372, time 5035.96ms 
iter 2499: loss 2.8369, time 4977.86ms 
step 2500: train loss 2.7867, val loss 2.9112
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 2500: loss 2.6833, time 20762.43ms 
iter 2501: loss 2.7535, time 4990.88ms 
iter 2502: loss 2.9621, time 4952.39ms 
iter 2503: loss 2.9338, time 4966.13ms 
iter 2504: loss 2.7746, time 4924.07ms 
iter 2505: loss 2.8572, time 5010.93ms 
iter 2506: loss 2.8127, time 5026.10ms 
iter 2507: loss 2.7997, time 5027.58ms 
iter 2508: loss 2.9784, time 5024.86ms 
iter 2509: loss 2.6818, time 5008.95ms 
iter 2510: loss 3.0015, time 5022.91ms 
iter 2511: loss 2.6784, time 4957.09ms 
iter 2512: loss 2.7047, time 4913.72ms 
iter 2513: loss 2.8493, time 4945.02ms 
iter 2514: loss 2.7923, time 5021.62ms 
iter 2515: loss 2.7331, time 5014.78ms 
iter 2516: loss 2.6784, time 4996.64ms 
iter 2517: loss 2.8077, time 4995.17ms 
iter 2518: loss 2.7532, time 5011.30ms 
iter 2519: loss 2.6628, time 5029.09ms 
iter 2520: loss 2.8003, time 4964.57ms 
iter 2521: loss 2.6744, time 4943.59ms 
iter 2522: loss 2.5890, time 5013.53ms 
iter 2523: loss 2.6630, time 5005.02ms 
iter 2524: loss 2.7684, time 5025.65ms 
iter 2525: loss 2.7057, time 5007.91ms 
iter 2526: loss 2.8782, time 5024.09ms 
iter 2527: loss 2.6544, time 5003.85ms 
iter 2528: loss 2.7319, time 4982.53ms 
iter 2529: loss 2.7981, time 4992.10ms 
iter 2530: loss 2.7612, time 5045.44ms 
iter 2531: loss 2.6491, time 5052.39ms 
iter 2532: loss 2.7832, time 5043.77ms 
iter 2533: loss 2.7107, time 5039.10ms 
iter 2534: loss 2.6925, time 5023.29ms 
iter 2535: loss 2.8070, time 5022.00ms 
iter 2536: loss 2.9314, time 4980.09ms 
iter 2537: loss 2.8717, time 4976.06ms 
iter 2538: loss 2.8730, time 5023.47ms 
iter 2539: loss 2.7999, time 5025.41ms 
iter 2540: loss 2.8844, time 5028.22ms 
iter 2541: loss 2.7391, time 5026.46ms 
iter 2542: loss 2.6379, time 5028.30ms 
iter 2543: loss 2.7675, time 5027.99ms 
iter 2544: loss 2.6495, time 5015.80ms 
iter 2545: loss 2.6926, time 5004.25ms 
iter 2546: loss 2.8391, time 5027.61ms 
iter 2547: loss 2.8642, time 5026.88ms 
iter 2548: loss 2.9815, time 5025.94ms 
iter 2549: loss 2.9783, time 5027.99ms 
step 2550: train loss 2.7759, val loss 2.9143
iter 2550: loss 2.6591, time 19679.66ms 
iter 2551: loss 2.7958, time 4993.77ms 
iter 2552: loss 2.8312, time 4999.27ms 
iter 2553: loss 2.7117, time 4934.82ms 
iter 2554: loss 2.9912, time 4927.50ms 
iter 2555: loss 2.7032, time 5004.67ms 
iter 2556: loss 2.7771, time 5033.17ms 
iter 2557: loss 2.9396, time 5004.35ms 
iter 2558: loss 2.9038, time 5028.12ms 
iter 2559: loss 2.8697, time 5023.68ms 
iter 2560: loss 2.9839, time 5029.36ms 
iter 2561: loss 2.8379, time 5013.75ms 
iter 2562: loss 2.8955, time 5007.83ms 
iter 2563: loss 2.4969, time 5032.74ms 
iter 2564: loss 2.5411, time 4962.87ms 
iter 2565: loss 2.8039, time 4977.29ms 
iter 2566: loss 2.8191, time 5029.53ms 
iter 2567: loss 2.6090, time 5026.81ms 
iter 2568: loss 2.6394, time 5020.95ms 
iter 2569: loss 2.6613, time 5027.31ms 
iter 2570: loss 2.7000, time 5029.28ms 
iter 2571: loss 2.5265, time 5028.84ms 
iter 2572: loss 3.0700, time 5034.99ms 
iter 2573: loss 2.7609, time 5022.35ms 
iter 2574: loss 2.6506, time 4997.02ms 
iter 2575: loss 2.7584, time 5029.74ms 
iter 2576: loss 2.7998, time 5007.63ms 
iter 2577: loss 2.8409, time 5030.09ms 
iter 2578: loss 2.6684, time 5031.94ms 
iter 2579: loss 2.7295, time 5030.74ms 
iter 2580: loss 2.6652, time 4979.59ms 
iter 2581: loss 2.7887, time 4984.82ms 
iter 2582: loss 2.5714, time 5024.56ms 
iter 2583: loss 2.7129, time 5000.20ms 
iter 2584: loss 3.0792, time 5028.46ms 
iter 2585: loss 2.7360, time 5029.66ms 
iter 2586: loss 2.8688, time 5028.95ms 
iter 2587: loss 2.6137, time 5027.15ms 
iter 2588: loss 3.0008, time 4973.16ms 
iter 2589: loss 2.7304, time 4967.74ms 
iter 2590: loss 2.9178, time 4980.03ms 
iter 2591: loss 2.6521, time 5004.90ms 
iter 2592: loss 2.8761, time 5005.84ms 
iter 2593: loss 2.8059, time 5024.69ms 
iter 2594: loss 2.7126, time 5013.62ms 
iter 2595: loss 2.8632, time 5027.36ms 
iter 2596: loss 2.6635, time 5031.64ms 
iter 2597: loss 2.7387, time 4977.92ms 
iter 2598: loss 2.8222, time 5022.27ms 
iter 2599: loss 2.9029, time 5029.85ms 
step 2600: train loss 2.7875, val loss 2.9085
iter 2600: loss 2.8835, time 19681.31ms 
iter 2601: loss 2.8801, time 5026.42ms 
iter 2602: loss 2.6678, time 4973.92ms 
iter 2603: loss 2.8436, time 4998.13ms 
iter 2604: loss 2.9417, time 5021.46ms 
iter 2605: loss 2.5595, time 5027.13ms 
iter 2606: loss 2.5442, time 5028.16ms 
iter 2607: loss 2.6419, time 5022.60ms 
iter 2608: loss 2.8371, time 4992.81ms 
iter 2609: loss 2.6629, time 5043.98ms 
iter 2610: loss 2.6344, time 5026.40ms 
iter 2611: loss 2.7964, time 4935.36ms 
iter 2612: loss 2.8858, time 4924.02ms 
iter 2613: loss 2.5635, time 4927.38ms 
iter 2614: loss 2.6630, time 4927.51ms 
iter 2615: loss 2.5116, time 4928.02ms 
iter 2616: loss 2.7536, time 4977.15ms 
iter 2617: loss 2.9514, time 5020.73ms 
iter 2618: loss 2.7113, time 4958.57ms 
iter 2619: loss 2.9626, time 5007.72ms 
iter 2620: loss 2.4871, time 5026.16ms 
iter 2621: loss 2.7041, time 5041.55ms 
iter 2622: loss 2.9297, time 5031.12ms 
iter 2623: loss 2.8737, time 5027.52ms 
iter 2624: loss 2.6476, time 5031.90ms 
iter 2625: loss 2.8471, time 5034.47ms 
iter 2626: loss 2.7282, time 4988.10ms 
iter 2627: loss 2.9114, time 5031.14ms 
iter 2628: loss 2.6076, time 5034.16ms 
iter 2629: loss 2.7420, time 5022.78ms 
iter 2630: loss 2.6535, time 5013.28ms 
iter 2631: loss 2.9180, time 5006.50ms 
iter 2632: loss 2.6500, time 5028.44ms 
iter 2633: loss 2.6177, time 5032.66ms 
iter 2634: loss 2.7995, time 5014.19ms 
iter 2635: loss 2.8209, time 5015.40ms 
iter 2636: loss 2.7968, time 5007.47ms 
iter 2637: loss 2.7373, time 5030.50ms 
iter 2638: loss 2.7575, time 5029.58ms 
iter 2639: loss 2.8942, time 5026.70ms 
iter 2640: loss 2.8738, time 5027.28ms 
iter 2641: loss 2.7543, time 5034.72ms 
iter 2642: loss 2.6713, time 5023.91ms 
iter 2643: loss 2.8814, time 5019.48ms 
iter 2644: loss 2.6244, time 5024.28ms 
iter 2645: loss 2.5844, time 5021.66ms 
iter 2646: loss 2.7769, time 5017.30ms 
iter 2647: loss 2.7516, time 4993.03ms 
iter 2648: loss 2.7325, time 5015.69ms 
iter 2649: loss 2.7007, time 4982.89ms 
step 2650: train loss 2.7601, val loss 2.8970
iter 2650: loss 2.7407, time 19675.12ms 
iter 2651: loss 2.8519, time 5027.08ms 
iter 2652: loss 2.8490, time 5027.55ms 
iter 2653: loss 2.9547, time 5029.91ms 
iter 2654: loss 2.7855, time 4934.13ms 
iter 2655: loss 2.8265, time 5004.00ms 
iter 2656: loss 2.8044, time 5027.73ms 
iter 2657: loss 2.6899, time 5007.17ms 
iter 2658: loss 2.7396, time 5032.98ms 
iter 2659: loss 2.8720, time 5033.18ms 
iter 2660: loss 2.7796, time 5028.16ms 
iter 2661: loss 2.7886, time 4936.06ms 
iter 2662: loss 2.6596, time 4921.78ms 
iter 2663: loss 2.6188, time 4920.73ms 
iter 2664: loss 2.6758, time 4984.60ms 
iter 2665: loss 2.8002, time 5025.14ms 
iter 2666: loss 2.7441, time 5025.98ms 
iter 2667: loss 2.7913, time 5028.43ms 
iter 2668: loss 2.9035, time 5027.54ms 
iter 2669: loss 2.5713, time 5026.37ms 
iter 2670: loss 2.7045, time 5033.98ms 
iter 2671: loss 2.9816, time 4975.56ms 
iter 2672: loss 2.7685, time 5008.31ms 
iter 2673: loss 2.7583, time 5026.44ms 
iter 2674: loss 2.7497, time 5029.67ms 
iter 2675: loss 2.9083, time 5028.72ms 
iter 2676: loss 2.8250, time 5025.11ms 
iter 2677: loss 2.6957, time 5025.40ms 
iter 2678: loss 2.7564, time 5029.11ms 
iter 2679: loss 2.6762, time 4981.71ms 
iter 2680: loss 2.8926, time 4951.72ms 
iter 2681: loss 2.5710, time 5028.74ms 
iter 2682: loss 2.6276, time 5038.82ms 
iter 2683: loss 2.5521, time 5032.23ms 
iter 2684: loss 2.7520, time 5030.10ms 
iter 2685: loss 2.8886, time 5029.18ms 
iter 2686: loss 2.7247, time 5023.98ms 
iter 2687: loss 2.8708, time 5024.92ms 
iter 2688: loss 2.7096, time 4970.82ms 
iter 2689: loss 2.8440, time 4931.05ms 
iter 2690: loss 2.9000, time 5024.03ms 
iter 2691: loss 2.8197, time 5022.03ms 
iter 2692: loss 2.8378, time 5027.16ms 
iter 2693: loss 2.7459, time 5026.04ms 
iter 2694: loss 2.7439, time 5021.70ms 
iter 2695: loss 2.7444, time 5025.48ms 
iter 2696: loss 2.4829, time 4996.61ms 
iter 2697: loss 2.9110, time 5004.24ms 
iter 2698: loss 2.9350, time 5021.60ms 
iter 2699: loss 2.7175, time 5024.19ms 
step 2700: train loss 2.7540, val loss 2.9137
iter 2700: loss 2.7843, time 19680.54ms 
iter 2701: loss 2.7742, time 5025.21ms 
iter 2702: loss 2.4904, time 4976.26ms 
iter 2703: loss 2.6576, time 5010.28ms 
iter 2704: loss 2.7447, time 5020.59ms 
iter 2705: loss 2.6917, time 5018.37ms 
iter 2706: loss 2.7314, time 5010.16ms 
iter 2707: loss 2.7468, time 5013.70ms 
iter 2708: loss 2.7474, time 4997.92ms 
iter 2709: loss 2.7599, time 5017.99ms 
iter 2710: loss 2.7641, time 4935.78ms 
iter 2711: loss 2.8512, time 4954.60ms 
iter 2712: loss 2.8466, time 4997.07ms 
iter 2713: loss 2.6845, time 5024.12ms 
iter 2714: loss 2.8556, time 5023.24ms 
iter 2715: loss 2.6539, time 5021.01ms 
iter 2716: loss 2.7997, time 5022.29ms 
iter 2717: loss 2.6945, time 5022.85ms 
iter 2718: loss 2.6834, time 4969.86ms 
iter 2719: loss 2.9390, time 4925.69ms 
iter 2720: loss 2.7521, time 5017.30ms 
iter 2721: loss 2.7457, time 5024.89ms 
iter 2722: loss 2.7738, time 5022.32ms 
iter 2723: loss 2.7922, time 5013.77ms 
iter 2724: loss 2.8908, time 5022.44ms 
iter 2725: loss 2.6975, time 5011.08ms 
iter 2726: loss 2.7186, time 4996.68ms 
iter 2727: loss 2.6137, time 4944.56ms 
iter 2728: loss 2.7443, time 5020.46ms 
iter 2729: loss 2.6018, time 5009.49ms 
iter 2730: loss 2.7715, time 5021.38ms 
iter 2731: loss 2.5649, time 5021.01ms 
iter 2732: loss 2.7495, time 5008.57ms 
iter 2733: loss 2.7530, time 5021.78ms 
iter 2734: loss 2.8068, time 5025.77ms 
iter 2735: loss 3.0524, time 5018.02ms 
iter 2736: loss 2.6212, time 5015.34ms 
iter 2737: loss 2.9283, time 5020.36ms 
iter 2738: loss 2.9425, time 5020.05ms 
iter 2739: loss 2.6604, time 5026.77ms 
iter 2740: loss 2.8817, time 5027.24ms 
iter 2741: loss 2.7296, time 5029.23ms 
iter 2742: loss 2.9643, time 5032.01ms 
iter 2743: loss 2.6605, time 4979.11ms 
iter 2744: loss 2.9196, time 5015.49ms 
iter 2745: loss 2.7745, time 5025.62ms 
iter 2746: loss 2.7540, time 5023.65ms 
iter 2747: loss 2.7018, time 5026.14ms 
iter 2748: loss 2.9083, time 5015.96ms 
iter 2749: loss 2.6961, time 5027.17ms 
step 2750: train loss 2.7763, val loss 2.8984
iter 2750: loss 2.8521, time 19691.78ms 
iter 2751: loss 2.7937, time 5022.58ms 
iter 2752: loss 2.6930, time 5026.30ms 
iter 2753: loss 2.9465, time 5030.96ms 
iter 2754: loss 2.7470, time 5029.12ms 
iter 2755: loss 2.7012, time 5033.02ms 
iter 2756: loss 2.7373, time 4978.45ms 
iter 2757: loss 2.8504, time 5016.11ms 
iter 2758: loss 2.7702, time 4988.28ms 
iter 2759: loss 2.6457, time 5026.13ms 
iter 2760: loss 2.9450, time 5029.71ms 
iter 2761: loss 2.7143, time 5031.98ms 
iter 2762: loss 2.7671, time 5033.34ms 
iter 2763: loss 2.6391, time 5036.57ms 
iter 2764: loss 2.7016, time 4982.18ms 
iter 2765: loss 2.5887, time 4977.13ms 
iter 2766: loss 2.7407, time 5026.83ms 
iter 2767: loss 2.7957, time 5033.94ms 
iter 2768: loss 2.8174, time 5037.58ms 
iter 2769: loss 2.7186, time 5047.87ms 
iter 2770: loss 2.7354, time 5035.30ms 
iter 2771: loss 2.8160, time 5035.11ms 
iter 2772: loss 2.5882, time 5042.83ms 
iter 2773: loss 2.5838, time 5005.84ms 
iter 2774: loss 2.7991, time 5037.67ms 
iter 2775: loss 2.7708, time 5034.26ms 
iter 2776: loss 2.6801, time 5034.59ms 
iter 2777: loss 2.7901, time 5036.23ms 
iter 2778: loss 2.7077, time 4969.66ms 
iter 2779: loss 2.8545, time 4999.34ms 
iter 2780: loss 2.8244, time 4965.29ms 
iter 2781: loss 2.9679, time 4989.72ms 
iter 2782: loss 2.6580, time 5023.81ms 
iter 2783: loss 2.8548, time 5025.07ms 
iter 2784: loss 2.6024, time 5031.78ms 
iter 2785: loss 2.7344, time 5032.09ms 
iter 2786: loss 2.8099, time 5030.51ms 
iter 2787: loss 2.6883, time 5005.68ms 
iter 2788: loss 2.5256, time 4919.03ms 
iter 2789: loss 2.8358, time 5008.48ms 
iter 2790: loss 2.7415, time 5034.33ms 
iter 2791: loss 2.7404, time 5034.55ms 
iter 2792: loss 2.6395, time 5031.54ms 
iter 2793: loss 2.4735, time 5031.31ms 
iter 2794: loss 2.7274, time 5030.57ms 
iter 2795: loss 2.7301, time 5031.88ms 
iter 2796: loss 2.6633, time 4979.90ms 
iter 2797: loss 2.5515, time 4938.13ms 
iter 2798: loss 2.6904, time 5004.38ms 
iter 2799: loss 2.7070, time 5014.09ms 
step 2800: train loss 2.7530, val loss 2.8900
iter 2800: loss 2.6847, time 19730.22ms 
iter 2801: loss 2.8677, time 5038.00ms 
iter 2802: loss 2.6986, time 4976.60ms 
iter 2803: loss 2.8881, time 4931.57ms 
iter 2804: loss 2.9288, time 4929.63ms 
iter 2805: loss 2.8985, time 4928.58ms 
iter 2806: loss 2.8230, time 4929.86ms 
iter 2807: loss 2.8027, time 4929.90ms 
iter 2808: loss 2.7155, time 4951.60ms 
iter 2809: loss 2.8134, time 5033.27ms 
iter 2810: loss 2.6338, time 4988.38ms 
iter 2811: loss 2.7937, time 5027.57ms 
iter 2812: loss 2.7759, time 5022.81ms 
iter 2813: loss 2.6495, time 5028.80ms 
iter 2814: loss 2.8279, time 5034.90ms 
iter 2815: loss 2.9746, time 5035.51ms 
iter 2816: loss 2.6456, time 5030.86ms 
iter 2817: loss 2.8186, time 5030.93ms 
iter 2818: loss 2.6789, time 4935.58ms 
iter 2819: loss 2.7227, time 4974.82ms 
iter 2820: loss 3.0098, time 5043.81ms 
iter 2821: loss 2.7987, time 5028.70ms 
iter 2822: loss 2.8271, time 5027.65ms 
iter 2823: loss 2.7870, time 5013.10ms 
iter 2824: loss 2.8395, time 5014.38ms 
iter 2825: loss 2.8581, time 5028.15ms 
iter 2826: loss 2.8487, time 5031.82ms 
iter 2827: loss 2.7707, time 5016.20ms 
iter 2828: loss 2.7958, time 5027.99ms 
iter 2829: loss 2.8246, time 5029.53ms 
iter 2830: loss 2.8586, time 5026.05ms 
iter 2831: loss 2.7164, time 5030.82ms 
iter 2832: loss 2.7631, time 5009.40ms 
iter 2833: loss 2.7884, time 5025.56ms 
iter 2834: loss 2.6914, time 4997.68ms 
iter 2835: loss 2.8349, time 5029.08ms 
iter 2836: loss 2.6616, time 5026.71ms 
iter 2837: loss 2.5862, time 5028.36ms 
iter 2838: loss 2.7598, time 5030.33ms 
iter 2839: loss 2.7638, time 4998.04ms 
iter 2840: loss 2.7021, time 5018.89ms 
iter 2841: loss 2.9538, time 4972.11ms 
iter 2842: loss 2.7692, time 4928.53ms 
iter 2843: loss 2.7748, time 5025.71ms 
iter 2844: loss 2.7644, time 5018.31ms 
iter 2845: loss 2.7947, time 5000.57ms 
iter 2846: loss 2.9092, time 5027.34ms 
iter 2847: loss 2.7079, time 5027.74ms 
iter 2848: loss 2.5747, time 5026.80ms 
iter 2849: loss 2.9293, time 5028.72ms 
step 2850: train loss 2.7558, val loss 2.8712
iter 2850: loss 2.7406, time 19695.89ms 
iter 2851: loss 3.0330, time 5025.13ms 
iter 2852: loss 2.8563, time 5025.79ms 
iter 2853: loss 2.6914, time 5025.71ms 
iter 2854: loss 2.6192, time 5025.48ms 
iter 2855: loss 2.7788, time 4978.04ms 
iter 2856: loss 2.7189, time 4932.90ms 
iter 2857: loss 2.6483, time 5021.41ms 
iter 2858: loss 2.8203, time 5017.77ms 
iter 2859: loss 2.6388, time 5005.45ms 
iter 2860: loss 2.7957, time 5022.64ms 
iter 2861: loss 2.7717, time 5022.94ms 
iter 2862: loss 2.9246, time 5021.17ms 
iter 2863: loss 2.8371, time 5024.89ms 
iter 2864: loss 2.7867, time 4960.24ms 
iter 2865: loss 2.6210, time 4955.10ms 
iter 2866: loss 2.9710, time 5004.28ms 
iter 2867: loss 2.7040, time 5013.25ms 
iter 2868: loss 2.7261, time 5021.08ms 
iter 2869: loss 2.7255, time 5021.90ms 
iter 2870: loss 2.7339, time 5021.64ms 
iter 2871: loss 2.4225, time 5020.37ms 
iter 2872: loss 2.8600, time 5024.90ms 
iter 2873: loss 2.9939, time 4993.03ms 
iter 2874: loss 2.9176, time 5009.32ms 
iter 2875: loss 2.7108, time 5022.94ms 
iter 2876: loss 2.7327, time 5022.82ms 
iter 2877: loss 2.7578, time 5017.76ms 
iter 2878: loss 2.6891, time 5001.24ms 
iter 2879: loss 2.7478, time 5022.28ms 
iter 2880: loss 2.6692, time 5027.68ms 
iter 2881: loss 2.8094, time 5010.60ms 
iter 2882: loss 2.6875, time 4971.10ms 
iter 2883: loss 2.6923, time 5015.03ms 
iter 2884: loss 2.8875, time 5001.50ms 
iter 2885: loss 2.8192, time 5022.26ms 
iter 2886: loss 2.8783, time 5021.53ms 
iter 2887: loss 2.9525, time 5023.95ms 
iter 2888: loss 2.6606, time 5024.56ms 
iter 2889: loss 2.7270, time 4997.06ms 
iter 2890: loss 2.7454, time 4919.91ms 
iter 2891: loss 2.6511, time 5022.92ms 
iter 2892: loss 2.7185, time 5023.77ms 
iter 2893: loss 2.9920, time 5021.43ms 
iter 2894: loss 2.6715, time 5022.35ms 
iter 2895: loss 2.7682, time 5024.67ms 
iter 2896: loss 2.7669, time 5029.53ms 
iter 2897: loss 2.7555, time 5029.11ms 
iter 2898: loss 2.7530, time 4976.67ms 
iter 2899: loss 2.8797, time 4915.71ms 
step 2900: train loss 2.7415, val loss 2.8787
iter 2900: loss 2.9411, time 19662.97ms 
iter 2901: loss 2.8430, time 5022.04ms 
iter 2902: loss 2.7517, time 5027.16ms 
iter 2903: loss 2.7363, time 5027.43ms 
iter 2904: loss 2.8094, time 4975.42ms 
iter 2905: loss 2.8275, time 4995.08ms 
iter 2906: loss 2.5748, time 5023.80ms 
iter 2907: loss 2.5778, time 5026.49ms 
iter 2908: loss 2.5704, time 4992.10ms 
iter 2909: loss 2.5864, time 5026.19ms 
iter 2910: loss 2.8183, time 5024.28ms 
iter 2911: loss 2.4965, time 4999.57ms 
iter 2912: loss 2.6746, time 4914.82ms 
iter 2913: loss 2.9492, time 4936.85ms 
iter 2914: loss 2.8104, time 5020.22ms 
iter 2915: loss 2.6272, time 5024.74ms 
iter 2916: loss 2.8591, time 5026.57ms 
iter 2917: loss 2.8963, time 5027.79ms 
iter 2918: loss 2.7295, time 5028.41ms 
iter 2919: loss 2.6172, time 5012.42ms 
iter 2920: loss 2.6524, time 5030.99ms 
iter 2921: loss 2.8251, time 5004.49ms 
iter 2922: loss 2.8213, time 5028.27ms 
iter 2923: loss 2.7227, time 5026.46ms 
iter 2924: loss 2.9050, time 5025.59ms 
iter 2925: loss 2.8072, time 5010.05ms 
iter 2926: loss 2.8608, time 5025.96ms 
iter 2927: loss 2.5695, time 5025.74ms 
iter 2928: loss 2.6907, time 5030.87ms 
iter 2929: loss 2.6165, time 4976.07ms 
iter 2930: loss 2.4868, time 4987.92ms 
iter 2931: loss 2.7583, time 5007.32ms 
iter 2932: loss 2.3902, time 5026.47ms 
iter 2933: loss 2.7160, time 5026.43ms 
iter 2934: loss 2.7142, time 5027.85ms 
iter 2935: loss 2.6716, time 5029.24ms 
iter 2936: loss 2.5805, time 5028.75ms 
iter 2937: loss 2.6518, time 4977.32ms 
iter 2938: loss 2.7578, time 5012.78ms 
iter 2939: loss 2.8855, time 5027.07ms 
iter 2940: loss 2.7060, time 5042.23ms 
iter 2941: loss 2.7562, time 5052.42ms 
iter 2942: loss 2.7903, time 5044.47ms 
iter 2943: loss 2.5609, time 5016.92ms 
iter 2944: loss 2.7316, time 5029.10ms 
iter 2945: loss 2.7199, time 4957.56ms 
iter 2946: loss 2.7362, time 4959.06ms 
iter 2947: loss 2.6380, time 5028.59ms 
iter 2948: loss 2.7575, time 5018.10ms 
iter 2949: loss 2.7669, time 5005.84ms 
step 2950: train loss 2.7410, val loss 2.8803
iter 2950: loss 2.7138, time 19597.84ms 
iter 2951: loss 2.5605, time 4977.62ms 
iter 2952: loss 2.7285, time 5025.39ms 
iter 2953: loss 2.7436, time 5025.86ms 
iter 2954: loss 2.7786, time 5028.66ms 
iter 2955: loss 2.8344, time 5025.55ms 
iter 2956: loss 2.7789, time 5026.89ms 
iter 2957: loss 2.8105, time 5029.90ms 
iter 2958: loss 2.6536, time 4978.12ms 
iter 2959: loss 2.6734, time 4914.55ms 
iter 2960: loss 2.7166, time 4993.63ms 
iter 2961: loss 2.7327, time 5002.88ms 
iter 2962: loss 2.8561, time 5026.87ms 
iter 2963: loss 2.7364, time 5028.48ms 
iter 2964: loss 2.8581, time 4996.60ms 
iter 2965: loss 2.7694, time 4998.48ms 
iter 2966: loss 2.6952, time 5027.17ms 
iter 2967: loss 2.7730, time 4979.86ms 
iter 2968: loss 2.8040, time 4972.88ms 
iter 2969: loss 2.5614, time 5026.92ms 
iter 2970: loss 2.6364, time 5029.60ms 
iter 2971: loss 2.6621, time 5007.30ms 
iter 2972: loss 2.8557, time 5031.61ms 
iter 2973: loss 2.7569, time 5027.25ms 
iter 2974: loss 2.6533, time 5032.01ms 
iter 2975: loss 2.8018, time 4969.14ms 
iter 2976: loss 2.5951, time 4977.08ms 
iter 2977: loss 2.7006, time 4952.56ms 
iter 2978: loss 2.5727, time 5029.03ms 
iter 2979: loss 2.7105, time 5027.39ms 
iter 2980: loss 2.5392, time 5012.86ms 
iter 2981: loss 2.5767, time 5027.36ms 
iter 2982: loss 2.7224, time 5028.23ms 
iter 2983: loss 2.6684, time 5025.74ms 
iter 2984: loss 2.7622, time 5020.76ms 
iter 2985: loss 2.8028, time 5020.81ms 
iter 2986: loss 2.7413, time 5026.02ms 
iter 2987: loss 2.7873, time 5025.88ms 
iter 2988: loss 2.8279, time 5006.28ms 
iter 2989: loss 2.5662, time 5028.90ms 
iter 2990: loss 2.8392, time 5025.53ms 
iter 2991: loss 2.7749, time 5026.44ms 
iter 2992: loss 2.8430, time 5032.91ms 
iter 2993: loss 2.5557, time 4992.29ms 
iter 2994: loss 2.7681, time 5017.97ms 
iter 2995: loss 2.7279, time 5030.06ms 
iter 2996: loss 2.7481, time 5022.57ms 
iter 2997: loss 2.6670, time 5019.70ms 
iter 2998: loss 2.8530, time 5027.99ms 
iter 2999: loss 2.5805, time 5029.18ms 
step 3000: train loss 2.7378, val loss 2.8846
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 3000: loss 2.7804, time 20779.94ms 
iter 3001: loss 2.7852, time 5030.70ms 
iter 3002: loss 2.8043, time 5029.09ms 
iter 3003: loss 2.6494, time 5029.32ms 
iter 3004: loss 2.9724, time 5008.56ms 
iter 3005: loss 2.6556, time 5031.56ms 
iter 3006: loss 3.0075, time 4980.57ms 
iter 3007: loss 2.5608, time 4917.63ms 
iter 3008: loss 2.8643, time 4972.79ms 
iter 3009: loss 2.6106, time 5029.09ms 
iter 3010: loss 2.6887, time 5029.69ms 
iter 3011: loss 3.0510, time 5030.99ms 
iter 3012: loss 2.7309, time 5028.76ms 
iter 3013: loss 2.7589, time 5034.62ms 
iter 3014: loss 2.7026, time 5030.86ms 
iter 3015: loss 2.5059, time 4957.51ms 
iter 3016: loss 2.7215, time 4926.73ms 
iter 3017: loss 2.8634, time 4959.77ms 
iter 3018: loss 2.6947, time 5020.30ms 
iter 3019: loss 2.6763, time 5031.89ms 
iter 3020: loss 2.7054, time 5028.62ms 
iter 3021: loss 2.8541, time 5020.83ms 
iter 3022: loss 2.7789, time 5018.68ms 
iter 3023: loss 2.8460, time 5030.22ms 
iter 3024: loss 2.7266, time 4975.64ms 
iter 3025: loss 2.5868, time 4941.68ms 
iter 3026: loss 2.7251, time 5024.46ms 
iter 3027: loss 2.6884, time 5028.21ms 
iter 3028: loss 2.5809, time 5031.34ms 
iter 3029: loss 2.8036, time 5027.99ms 
iter 3030: loss 2.5280, time 5030.58ms 
iter 3031: loss 2.6127, time 5028.26ms 
iter 3032: loss 2.6787, time 4964.15ms 
iter 3033: loss 2.7341, time 4918.41ms 
iter 3034: loss 2.8579, time 5023.14ms 
iter 3035: loss 2.8427, time 5029.70ms 
iter 3036: loss 2.4266, time 5030.08ms 
iter 3037: loss 2.6385, time 5033.50ms 
iter 3038: loss 2.7804, time 5019.83ms 
iter 3039: loss 2.6839, time 5028.37ms 
iter 3040: loss 2.6943, time 5031.24ms 
iter 3041: loss 2.6005, time 5001.23ms 
iter 3042: loss 2.8226, time 5015.10ms 
iter 3043: loss 2.6545, time 5030.66ms 
iter 3044: loss 2.8006, time 4956.20ms 
iter 3045: loss 2.9549, time 4925.63ms 
iter 3046: loss 2.6397, time 4981.68ms 
iter 3047: loss 2.5892, time 5030.27ms 
iter 3048: loss 3.0419, time 4986.59ms 
iter 3049: loss 2.7566, time 5033.43ms 
step 3050: train loss 2.7276, val loss 2.8815
iter 3050: loss 2.7078, time 19757.12ms 
iter 3051: loss 2.9020, time 5007.53ms 
iter 3052: loss 2.7314, time 4945.92ms 
iter 3053: loss 2.7418, time 4918.80ms 
iter 3054: loss 2.7953, time 4967.88ms 
iter 3055: loss 2.7920, time 4926.27ms 
iter 3056: loss 2.6628, time 4926.79ms 
iter 3057: loss 2.7177, time 5004.06ms 
iter 3058: loss 2.5158, time 5023.32ms 
iter 3059: loss 2.7242, time 5027.40ms 
iter 3060: loss 2.9033, time 5030.55ms 
iter 3061: loss 2.8343, time 5034.70ms 
iter 3062: loss 2.6558, time 5013.77ms 
iter 3063: loss 2.7955, time 5019.73ms 
iter 3064: loss 2.5824, time 5030.00ms 
iter 3065: loss 2.6732, time 5028.35ms 
iter 3066: loss 3.0173, time 5024.33ms 
iter 3067: loss 2.6666, time 5025.85ms 
iter 3068: loss 2.7541, time 5028.43ms 
iter 3069: loss 2.7161, time 4975.30ms 
iter 3070: loss 2.8640, time 4938.28ms 
iter 3071: loss 2.6632, time 5030.57ms 
iter 3072: loss 2.8163, time 5024.00ms 
iter 3073: loss 2.7940, time 5030.98ms 
iter 3074: loss 2.6799, time 5029.56ms 
iter 3075: loss 2.6549, time 5030.17ms 
iter 3076: loss 2.6373, time 5028.90ms 
iter 3077: loss 2.5597, time 5034.95ms 
iter 3078: loss 2.8258, time 4992.69ms 
iter 3079: loss 2.6936, time 5026.88ms 
iter 3080: loss 2.6707, time 5028.73ms 
iter 3081: loss 2.8717, time 5030.16ms 
iter 3082: loss 2.7708, time 5029.13ms 
iter 3083: loss 2.5576, time 5028.67ms 
iter 3084: loss 2.8347, time 5013.69ms 
iter 3085: loss 2.6566, time 5031.14ms 
iter 3086: loss 2.7493, time 5003.56ms 
iter 3087: loss 2.7076, time 5000.49ms 
iter 3088: loss 2.6277, time 5022.76ms 
iter 3089: loss 2.8162, time 5035.40ms 
iter 3090: loss 2.8522, time 5017.94ms 
iter 3091: loss 2.6908, time 5030.30ms 
iter 3092: loss 2.8401, time 5017.54ms 
iter 3093: loss 2.7978, time 5035.23ms 
iter 3094: loss 2.7194, time 4942.59ms 
iter 3095: loss 2.8676, time 4914.91ms 
iter 3096: loss 2.4458, time 4980.35ms 
iter 3097: loss 2.7125, time 5027.36ms 
iter 3098: loss 2.7743, time 5025.30ms 
iter 3099: loss 2.7503, time 5011.97ms 
step 3100: train loss 2.7260, val loss 2.8644
iter 3100: loss 2.8737, time 19610.00ms 
iter 3101: loss 2.6465, time 4935.11ms 
iter 3102: loss 2.7054, time 5004.78ms 
iter 3103: loss 2.5183, time 4956.05ms 
iter 3104: loss 2.6152, time 5013.70ms 
iter 3105: loss 2.7827, time 5025.68ms 
iter 3106: loss 2.8116, time 5016.64ms 
iter 3107: loss 2.7919, time 5027.75ms 
iter 3108: loss 2.9223, time 5026.94ms 
iter 3109: loss 2.8392, time 4979.95ms 
iter 3110: loss 2.6812, time 5019.26ms 
iter 3111: loss 2.7140, time 5025.05ms 
iter 3112: loss 2.8906, time 5024.26ms 
iter 3113: loss 2.7756, time 5026.32ms 
iter 3114: loss 2.6938, time 5030.14ms 
iter 3115: loss 2.5837, time 5025.39ms 
iter 3116: loss 2.5798, time 5026.20ms 
iter 3117: loss 2.7861, time 5024.08ms 
iter 3118: loss 2.7913, time 5000.03ms 
iter 3119: loss 2.7429, time 5027.81ms 
iter 3120: loss 2.6513, time 5023.89ms 
iter 3121: loss 2.5869, time 5026.01ms 
iter 3122: loss 2.7129, time 5024.35ms 
iter 3123: loss 2.7716, time 5026.14ms 
iter 3124: loss 2.7873, time 5028.75ms 
iter 3125: loss 2.6314, time 5027.43ms 
iter 3126: loss 2.7507, time 4994.26ms 
iter 3127: loss 2.5807, time 4995.01ms 
iter 3128: loss 2.6117, time 4998.08ms 
iter 3129: loss 2.7049, time 5024.37ms 
iter 3130: loss 2.7211, time 5024.82ms 
iter 3131: loss 2.7546, time 5025.71ms 
iter 3132: loss 2.6466, time 4967.32ms 
iter 3133: loss 2.7979, time 4969.63ms 
iter 3134: loss 2.7016, time 4960.76ms 
iter 3135: loss 2.6063, time 5026.03ms 
iter 3136: loss 2.6002, time 5021.43ms 
iter 3137: loss 2.7344, time 4985.69ms 
iter 3138: loss 2.6951, time 4945.61ms 
iter 3139: loss 2.7449, time 5014.09ms 
iter 3140: loss 2.7809, time 5023.94ms 
iter 3141: loss 2.8295, time 4982.07ms 
iter 3142: loss 2.6372, time 5027.16ms 
iter 3143: loss 2.7265, time 5018.43ms 
iter 3144: loss 2.7768, time 5024.17ms 
iter 3145: loss 2.8290, time 5025.66ms 
iter 3146: loss 2.8611, time 5023.37ms 
iter 3147: loss 2.7902, time 5025.34ms 
iter 3148: loss 2.6382, time 4987.00ms 
iter 3149: loss 2.6690, time 4917.54ms 
step 3150: train loss 2.7179, val loss 2.8901
iter 3150: loss 2.8085, time 19702.03ms 
iter 3151: loss 2.7097, time 5030.58ms 
iter 3152: loss 2.8472, time 5020.17ms 
iter 3153: loss 2.7961, time 5023.10ms 
iter 3154: loss 2.8609, time 4971.93ms 
iter 3155: loss 2.5701, time 4963.17ms 
iter 3156: loss 2.6737, time 5027.30ms 
iter 3157: loss 2.7787, time 5023.62ms 
iter 3158: loss 2.6676, time 5021.56ms 
iter 3159: loss 2.7920, time 5019.12ms 
iter 3160: loss 2.9256, time 5022.74ms 
iter 3161: loss 2.6037, time 5021.79ms 
iter 3162: loss 2.6624, time 4969.03ms 
iter 3163: loss 2.6129, time 4940.69ms 
iter 3164: loss 2.6078, time 5017.25ms 
iter 3165: loss 2.5959, time 5017.71ms 
iter 3166: loss 2.6921, time 5017.04ms 
iter 3167: loss 2.6238, time 5018.90ms 
iter 3168: loss 2.6056, time 5017.32ms 
iter 3169: loss 2.5469, time 5016.10ms 
iter 3170: loss 2.6361, time 5020.04ms 
iter 3171: loss 2.4227, time 4968.63ms 
iter 3172: loss 2.6720, time 4987.65ms 
iter 3173: loss 2.7677, time 5015.97ms 
iter 3174: loss 2.7314, time 5016.71ms 
iter 3175: loss 2.5162, time 5016.95ms 
iter 3176: loss 2.8007, time 5017.74ms 
iter 3177: loss 3.0576, time 5016.21ms 
iter 3178: loss 2.7262, time 5019.06ms 
iter 3179: loss 2.9510, time 4969.05ms 
iter 3180: loss 2.6753, time 4911.26ms 
iter 3181: loss 2.9010, time 4988.60ms 
iter 3182: loss 2.6576, time 5017.50ms 
iter 3183: loss 2.9541, time 5017.94ms 
iter 3184: loss 2.7748, time 5010.24ms 
iter 3185: loss 2.6743, time 5001.82ms 
iter 3186: loss 2.3974, time 5010.43ms 
iter 3187: loss 2.7488, time 5013.98ms 
iter 3188: loss 2.8116, time 4958.91ms 
iter 3189: loss 2.6710, time 5003.58ms 
iter 3190: loss 2.9196, time 5023.37ms 
iter 3191: loss 2.6498, time 5018.60ms 
iter 3192: loss 2.6215, time 5019.32ms 
iter 3193: loss 2.7754, time 5011.35ms 
iter 3194: loss 2.7229, time 5021.75ms 
iter 3195: loss 2.7738, time 5018.98ms 
iter 3196: loss 2.6276, time 4968.64ms 
iter 3197: loss 2.8556, time 4992.90ms 
iter 3198: loss 2.7042, time 5017.44ms 
iter 3199: loss 3.0024, time 5018.29ms 
step 3200: train loss 2.7021, val loss 2.8772
iter 3200: loss 2.5080, time 19682.58ms 
iter 3201: loss 2.7172, time 4974.40ms 
iter 3202: loss 2.8834, time 4970.05ms 
iter 3203: loss 2.7181, time 5013.50ms 
iter 3204: loss 2.8501, time 5011.75ms 
iter 3205: loss 2.6511, time 5019.82ms 
iter 3206: loss 2.8432, time 5019.86ms 
iter 3207: loss 2.6296, time 5030.22ms 
iter 3208: loss 2.7619, time 5027.64ms 
iter 3209: loss 2.7513, time 4979.06ms 
iter 3210: loss 2.6375, time 4962.30ms 
iter 3211: loss 2.6618, time 5022.05ms 
iter 3212: loss 2.7712, time 5027.58ms 
iter 3213: loss 2.5792, time 5027.51ms 
iter 3214: loss 2.6974, time 5031.73ms 
iter 3215: loss 2.6301, time 5031.14ms 
iter 3216: loss 2.8044, time 5028.06ms 
iter 3217: loss 2.5996, time 5019.55ms 
iter 3218: loss 2.7878, time 5020.20ms 
iter 3219: loss 2.6389, time 5025.46ms 
iter 3220: loss 2.8492, time 5020.77ms 
iter 3221: loss 2.6433, time 5025.74ms 
iter 3222: loss 2.6814, time 5020.34ms 
iter 3223: loss 2.7170, time 5012.37ms 
iter 3224: loss 2.5649, time 5022.71ms 
iter 3225: loss 2.7065, time 4989.10ms 
iter 3226: loss 2.4422, time 5021.88ms 
iter 3227: loss 2.3479, time 5025.98ms 
iter 3228: loss 2.6554, time 5024.71ms 
iter 3229: loss 2.8305, time 5028.55ms 
iter 3230: loss 2.5876, time 5025.00ms 
iter 3231: loss 2.6916, time 5026.85ms 
iter 3232: loss 2.6082, time 4972.79ms 
iter 3233: loss 2.5407, time 4989.36ms 
iter 3234: loss 2.7330, time 5022.26ms 
iter 3235: loss 2.6111, time 5022.94ms 
iter 3236: loss 2.7052, time 5020.44ms 
iter 3237: loss 2.7964, time 5004.97ms 
iter 3238: loss 2.8679, time 5007.47ms 
iter 3239: loss 2.6539, time 5027.64ms 
iter 3240: loss 2.7492, time 4926.99ms 
iter 3241: loss 2.7010, time 4926.59ms 
iter 3242: loss 2.7486, time 4928.39ms 
iter 3243: loss 2.8367, time 4926.32ms 
iter 3244: loss 2.7501, time 4982.56ms 
iter 3245: loss 2.7478, time 4913.79ms 
iter 3246: loss 2.5438, time 4912.76ms 
iter 3247: loss 2.4945, time 4995.63ms 
iter 3248: loss 2.8458, time 4970.24ms 
iter 3249: loss 2.5603, time 4995.97ms 
step 3250: train loss 2.7022, val loss 2.8694
iter 3250: loss 2.7694, time 19667.27ms 
iter 3251: loss 2.5654, time 5014.29ms 
iter 3252: loss 2.8217, time 5020.34ms 
iter 3253: loss 2.6749, time 4972.79ms 
iter 3254: loss 2.6899, time 4939.04ms 
iter 3255: loss 2.8225, time 4954.79ms 
iter 3256: loss 2.6724, time 5001.13ms 
iter 3257: loss 2.8471, time 5034.21ms 
iter 3258: loss 3.0127, time 5031.88ms 
iter 3259: loss 2.7153, time 5035.85ms 
iter 3260: loss 2.7095, time 5010.18ms 
iter 3261: loss 2.7034, time 5033.69ms 
iter 3262: loss 2.6077, time 5037.87ms 
iter 3263: loss 2.4510, time 4981.18ms 
iter 3264: loss 2.5337, time 4920.54ms 
iter 3265: loss 2.6249, time 4964.43ms 
iter 3266: loss 2.4937, time 5034.13ms 
iter 3267: loss 2.7628, time 5018.31ms 
iter 3268: loss 2.7917, time 5027.12ms 
iter 3269: loss 2.8935, time 5035.47ms 
iter 3270: loss 2.6477, time 5009.87ms 
iter 3271: loss 2.6868, time 4976.87ms 
iter 3272: loss 2.8379, time 5001.54ms 
iter 3273: loss 2.7796, time 5009.61ms 
iter 3274: loss 2.6292, time 5007.02ms 
iter 3275: loss 2.9169, time 4934.22ms 
iter 3276: loss 2.8084, time 4996.08ms 
iter 3277: loss 2.7020, time 4917.49ms 
iter 3278: loss 2.7503, time 4927.15ms 
iter 3279: loss 2.6163, time 4941.21ms 
iter 3280: loss 2.8294, time 4917.03ms 
iter 3281: loss 2.7963, time 4960.75ms 
iter 3282: loss 2.5655, time 5026.86ms 
iter 3283: loss 2.6494, time 5025.94ms 
iter 3284: loss 2.9481, time 5030.60ms 
iter 3285: loss 2.6873, time 5014.96ms 
iter 3286: loss 2.5797, time 5027.98ms 
iter 3287: loss 2.5523, time 5029.47ms 
iter 3288: loss 2.7265, time 5033.16ms 
iter 3289: loss 2.4401, time 4958.64ms 
iter 3290: loss 2.7676, time 4916.37ms 
iter 3291: loss 2.6045, time 4914.80ms 
iter 3292: loss 2.9673, time 4976.99ms 
iter 3293: loss 2.6625, time 5029.37ms 
iter 3294: loss 2.7261, time 5034.00ms 
iter 3295: loss 2.7395, time 5029.94ms 
iter 3296: loss 2.5566, time 5028.16ms 
iter 3297: loss 2.7593, time 5030.15ms 
iter 3298: loss 2.7339, time 5025.90ms 
iter 3299: loss 2.6383, time 4948.25ms 
step 3300: train loss 2.7139, val loss 2.8692
iter 3300: loss 2.7001, time 19626.96ms 
iter 3301: loss 2.6602, time 5023.31ms 
iter 3302: loss 2.5583, time 5021.19ms 
iter 3303: loss 2.7785, time 5023.81ms 
iter 3304: loss 2.9781, time 5027.45ms 
iter 3305: loss 2.6305, time 5010.61ms 
iter 3306: loss 2.7875, time 5009.63ms 
iter 3307: loss 2.7632, time 5033.43ms 
iter 3308: loss 2.8801, time 4920.54ms 
iter 3309: loss 2.6570, time 4916.99ms 
iter 3310: loss 2.7362, time 4948.80ms 
iter 3311: loss 2.7683, time 4998.82ms 
iter 3312: loss 2.6667, time 5024.00ms 
iter 3313: loss 2.7473, time 5004.45ms 
iter 3314: loss 2.7461, time 5025.65ms 
iter 3315: loss 2.6491, time 5030.03ms 
iter 3316: loss 2.5974, time 5008.34ms 
iter 3317: loss 2.7872, time 5018.99ms 
iter 3318: loss 2.7590, time 4988.39ms 
iter 3319: loss 2.6306, time 4915.56ms 
iter 3320: loss 2.7344, time 4916.41ms 
iter 3321: loss 2.7386, time 4916.99ms 
iter 3322: loss 2.6909, time 4939.98ms 
iter 3323: loss 2.6552, time 5002.28ms 
iter 3324: loss 2.8637, time 4997.72ms 
iter 3325: loss 2.8195, time 5030.91ms 
iter 3326: loss 2.8149, time 5028.76ms 
iter 3327: loss 2.7419, time 5032.43ms 
iter 3328: loss 2.6994, time 5030.25ms 
iter 3329: loss 2.5866, time 5031.13ms 
iter 3330: loss 2.7913, time 4965.54ms 
iter 3331: loss 2.7399, time 4916.07ms 
iter 3332: loss 2.6281, time 4918.73ms 
iter 3333: loss 2.7417, time 4957.06ms 
iter 3334: loss 2.3398, time 5003.90ms 
iter 3335: loss 2.7023, time 5002.40ms 
iter 3336: loss 2.6921, time 5033.99ms 
iter 3337: loss 2.6861, time 5034.23ms 
iter 3338: loss 2.6052, time 5035.62ms 
iter 3339: loss 2.6006, time 5029.79ms 
iter 3340: loss 2.7998, time 4980.22ms 
iter 3341: loss 2.5810, time 4980.64ms 
iter 3342: loss 2.8240, time 4919.69ms 
iter 3343: loss 2.8196, time 4916.04ms 
iter 3344: loss 2.7163, time 4992.25ms 
iter 3345: loss 2.6612, time 5033.03ms 
iter 3346: loss 2.7719, time 5033.24ms 
iter 3347: loss 2.6956, time 5028.82ms 
iter 3348: loss 2.5807, time 5022.98ms 
iter 3349: loss 2.8358, time 5024.40ms 
step 3350: train loss 2.6973, val loss 2.8596
iter 3350: loss 2.6980, time 19688.74ms 
iter 3351: loss 2.3728, time 4983.37ms 
iter 3352: loss 2.7463, time 5003.37ms 
iter 3353: loss 2.5949, time 5005.24ms 
iter 3354: loss 2.7364, time 4995.90ms 
iter 3355: loss 2.5898, time 4986.91ms 
iter 3356: loss 2.8257, time 4995.70ms 
iter 3357: loss 2.5009, time 5031.89ms 
iter 3358: loss 2.5872, time 4968.90ms 
iter 3359: loss 2.7347, time 4933.81ms 
iter 3360: loss 2.9273, time 4916.95ms 
iter 3361: loss 2.4547, time 5004.25ms 
iter 3362: loss 2.7157, time 5028.11ms 
iter 3363: loss 2.9289, time 5033.23ms 
iter 3364: loss 2.9210, time 5029.98ms 
iter 3365: loss 2.4204, time 5002.86ms 
iter 3366: loss 2.8285, time 5030.81ms 
iter 3367: loss 2.5760, time 5025.31ms 
iter 3368: loss 2.7634, time 5002.64ms 
iter 3369: loss 2.7322, time 4924.92ms 
iter 3370: loss 2.7256, time 4920.71ms 
iter 3371: loss 2.6312, time 5027.13ms 
iter 3372: loss 2.5888, time 5030.16ms 
iter 3373: loss 2.8393, time 5033.07ms 
iter 3374: loss 2.7178, time 5032.54ms 
iter 3375: loss 2.7513, time 5032.48ms 
iter 3376: loss 2.5756, time 5027.43ms 
iter 3377: loss 2.5972, time 5029.83ms 
iter 3378: loss 2.6552, time 4956.45ms 
iter 3379: loss 2.7655, time 4917.97ms 
iter 3380: loss 2.8297, time 4927.99ms 
iter 3381: loss 2.7656, time 5000.63ms 
iter 3382: loss 2.8455, time 5030.72ms 
iter 3383: loss 2.7986, time 5029.63ms 
iter 3384: loss 2.7340, time 5030.94ms 
iter 3385: loss 2.7093, time 4993.09ms 
iter 3386: loss 2.7737, time 5029.72ms 
iter 3387: loss 2.8208, time 5031.62ms 
iter 3388: loss 2.4816, time 4983.33ms 
iter 3389: loss 2.6162, time 4971.84ms 
iter 3390: loss 2.6920, time 4952.28ms 
iter 3391: loss 2.6872, time 5000.50ms 
iter 3392: loss 2.7525, time 4968.82ms 
iter 3393: loss 2.5450, time 5019.08ms 
iter 3394: loss 2.6369, time 5005.99ms 
iter 3395: loss 2.7211, time 5028.41ms 
iter 3396: loss 2.5616, time 5028.64ms 
iter 3397: loss 2.7113, time 5030.92ms 
iter 3398: loss 2.6952, time 4984.76ms 
iter 3399: loss 2.6713, time 4983.92ms 
step 3400: train loss 2.6779, val loss 2.8585
iter 3400: loss 2.6659, time 19714.21ms 
iter 3401: loss 2.9189, time 5036.30ms 
iter 3402: loss 2.7313, time 5033.15ms 
iter 3403: loss 2.6797, time 5029.77ms 
iter 3404: loss 2.7062, time 4980.20ms 
iter 3405: loss 2.6070, time 4917.92ms 
iter 3406: loss 2.7132, time 4917.09ms 
iter 3407: loss 2.6473, time 4984.72ms 
iter 3408: loss 2.8937, time 5028.17ms 
iter 3409: loss 2.7856, time 5016.81ms 
iter 3410: loss 2.4804, time 5007.34ms 
iter 3411: loss 2.6584, time 5023.00ms 
iter 3412: loss 2.7201, time 5031.70ms 
iter 3413: loss 2.6166, time 5033.17ms 
iter 3414: loss 2.5795, time 4980.72ms 
iter 3415: loss 2.8074, time 4918.16ms 
iter 3416: loss 2.6721, time 4980.20ms 
iter 3417: loss 2.6309, time 5028.97ms 
iter 3418: loss 2.6259, time 5034.45ms 
iter 3419: loss 2.9606, time 5035.26ms 
iter 3420: loss 2.6476, time 5033.73ms 
iter 3421: loss 2.6610, time 5034.84ms 
iter 3422: loss 2.3902, time 5021.52ms 
iter 3423: loss 2.6801, time 4920.24ms 
iter 3424: loss 2.6629, time 4917.60ms 
iter 3425: loss 2.7266, time 4930.36ms 
iter 3426: loss 2.6853, time 5031.58ms 
iter 3427: loss 2.5796, time 5013.09ms 
iter 3428: loss 2.8055, time 5025.97ms 
iter 3429: loss 2.5548, time 5028.65ms 
iter 3430: loss 2.7428, time 5031.74ms 
iter 3431: loss 2.7600, time 5028.83ms 
iter 3432: loss 2.7927, time 4997.91ms 
iter 3433: loss 2.6871, time 4976.45ms 
iter 3434: loss 2.6757, time 4984.25ms 
iter 3435: loss 3.0103, time 5031.27ms 
iter 3436: loss 2.7918, time 5026.17ms 
iter 3437: loss 2.5922, time 5024.11ms 
iter 3438: loss 2.6083, time 5026.24ms 
iter 3439: loss 2.7866, time 5027.99ms 
iter 3440: loss 2.7033, time 5025.68ms 
iter 3441: loss 2.5857, time 4974.26ms 
iter 3442: loss 2.4705, time 4918.77ms 
iter 3443: loss 2.8173, time 4969.03ms 
iter 3444: loss 2.6266, time 5028.31ms 
iter 3445: loss 2.5011, time 5032.99ms 
iter 3446: loss 2.6583, time 5032.92ms 
iter 3447: loss 2.6894, time 5031.50ms 
iter 3448: loss 2.7524, time 5033.67ms 
iter 3449: loss 2.7530, time 5029.87ms 
step 3450: train loss 2.6870, val loss 2.8525
iter 3450: loss 2.7778, time 19865.20ms 
iter 3451: loss 2.7664, time 5037.83ms 
iter 3452: loss 2.8183, time 5022.89ms 
iter 3453: loss 2.7147, time 5043.67ms 
iter 3454: loss 2.5514, time 5035.69ms 
iter 3455: loss 2.8607, time 5027.96ms 
iter 3456: loss 2.9321, time 4948.15ms 
iter 3457: loss 2.7781, time 4938.09ms 
iter 3458: loss 2.8219, time 4983.46ms 
iter 3459: loss 2.7156, time 5033.34ms 
iter 3460: loss 2.8811, time 5026.23ms 
iter 3461: loss 2.7261, time 5009.72ms 
iter 3462: loss 2.6292, time 5007.25ms 
iter 3463: loss 2.7778, time 4977.30ms 
iter 3464: loss 2.6005, time 4977.88ms 
iter 3465: loss 2.6698, time 4938.13ms 
iter 3466: loss 2.7994, time 4919.24ms 
iter 3467: loss 2.6265, time 4948.55ms 
iter 3468: loss 2.8126, time 4990.96ms 
iter 3469: loss 2.7680, time 5028.40ms 
iter 3470: loss 2.7352, time 5014.03ms 
iter 3471: loss 2.5011, time 4978.79ms 
iter 3472: loss 2.5659, time 4988.77ms 
iter 3473: loss 2.6619, time 4983.17ms 
iter 3474: loss 2.6413, time 4945.68ms 
iter 3475: loss 2.6975, time 4935.28ms 
iter 3476: loss 2.9142, time 5015.15ms 
iter 3477: loss 2.7224, time 5013.85ms 
iter 3478: loss 2.7022, time 5034.19ms 
iter 3479: loss 2.7599, time 5013.36ms 
iter 3480: loss 2.9172, time 5005.79ms 
iter 3481: loss 2.7193, time 5007.58ms 
iter 3482: loss 2.6889, time 5024.24ms 
iter 3483: loss 2.7449, time 4931.19ms 
iter 3484: loss 2.6588, time 4935.81ms 
iter 3485: loss 2.5745, time 4999.40ms 
iter 3486: loss 2.6136, time 5024.82ms 
iter 3487: loss 2.6604, time 5024.64ms 
iter 3488: loss 2.6526, time 5024.01ms 
iter 3489: loss 2.6354, time 5022.88ms 
iter 3490: loss 2.7085, time 5032.22ms 
iter 3491: loss 2.5100, time 5011.43ms 
iter 3492: loss 2.5958, time 4918.45ms 
iter 3493: loss 2.7241, time 4994.66ms 
iter 3494: loss 2.6997, time 5038.17ms 
iter 3495: loss 2.6244, time 5030.01ms 
iter 3496: loss 2.6930, time 5033.08ms 
iter 3497: loss 2.5131, time 5041.07ms 
iter 3498: loss 2.8065, time 5026.51ms 
iter 3499: loss 2.5366, time 5050.73ms 
step 3500: train loss 2.6878, val loss 2.8721
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 3500: loss 2.5596, time 20793.81ms 
iter 3501: loss 2.8000, time 5036.93ms 
iter 3502: loss 2.6936, time 5045.61ms 
iter 3503: loss 2.5695, time 5018.70ms 
iter 3504: loss 2.6619, time 5013.46ms 
iter 3505: loss 2.6950, time 5046.94ms 
iter 3506: loss 2.8861, time 4979.47ms 
iter 3507: loss 2.7245, time 5030.84ms 
iter 3508: loss 2.7093, time 5030.07ms 
iter 3509: loss 2.7992, time 5029.99ms 
iter 3510: loss 2.5428, time 5022.42ms 
iter 3511: loss 2.6777, time 5030.51ms 
iter 3512: loss 2.9092, time 5026.35ms 
iter 3513: loss 2.7440, time 5011.56ms 
iter 3514: loss 2.7450, time 4969.13ms 
iter 3515: loss 2.7268, time 4922.14ms 
iter 3516: loss 2.6518, time 4985.42ms 
iter 3517: loss 2.6692, time 5022.58ms 
iter 3518: loss 2.7326, time 5021.05ms 
iter 3519: loss 2.7353, time 5027.02ms 
iter 3520: loss 2.6434, time 5025.19ms 
iter 3521: loss 2.7277, time 5028.78ms 
iter 3522: loss 2.6424, time 5028.52ms 
iter 3523: loss 2.8022, time 4982.19ms 
iter 3524: loss 2.4771, time 4997.29ms 
iter 3525: loss 2.5836, time 5025.88ms 
iter 3526: loss 2.7449, time 5027.51ms 
iter 3527: loss 2.8265, time 5030.60ms 
iter 3528: loss 2.5607, time 5030.69ms 
iter 3529: loss 2.7953, time 5021.66ms 
iter 3530: loss 2.9449, time 5025.77ms 
iter 3531: loss 2.8748, time 4952.32ms 
iter 3532: loss 2.7695, time 4928.78ms 
iter 3533: loss 2.8333, time 4988.09ms 
iter 3534: loss 2.6573, time 5029.90ms 
iter 3535: loss 2.7461, time 5020.01ms 
iter 3536: loss 2.6143, time 5026.78ms 
iter 3537: loss 2.4924, time 5027.54ms 
iter 3538: loss 2.6558, time 5029.32ms 
iter 3539: loss 2.6354, time 5028.71ms 
iter 3540: loss 2.6601, time 4918.28ms 
iter 3541: loss 2.6687, time 4974.32ms 
iter 3542: loss 2.6671, time 5026.28ms 
iter 3543: loss 2.6425, time 5035.62ms 
iter 3544: loss 2.6231, time 5030.18ms 
iter 3545: loss 2.6987, time 5031.69ms 
iter 3546: loss 2.8475, time 5033.28ms 
iter 3547: loss 2.5856, time 5027.31ms 
iter 3548: loss 2.6802, time 5012.51ms 
iter 3549: loss 2.7402, time 4917.21ms 
step 3550: train loss 2.6744, val loss 2.8743
iter 3550: loss 2.6612, time 19704.77ms 
iter 3551: loss 2.7345, time 5034.25ms 
iter 3552: loss 2.6239, time 5028.97ms 
iter 3553: loss 2.7988, time 5024.08ms 
iter 3554: loss 2.7003, time 5024.76ms 
iter 3555: loss 2.6531, time 4971.43ms 
iter 3556: loss 2.5772, time 4990.64ms 
iter 3557: loss 2.7742, time 5028.34ms 
iter 3558: loss 2.4377, time 5024.29ms 
iter 3559: loss 2.6599, time 5024.30ms 
iter 3560: loss 3.0601, time 5023.96ms 
iter 3561: loss 2.6917, time 5017.14ms 
iter 3562: loss 2.5512, time 5031.29ms 
iter 3563: loss 2.5063, time 4982.60ms 
iter 3564: loss 2.8851, time 4957.32ms 
iter 3565: loss 2.5310, time 5022.68ms 
iter 3566: loss 2.5921, time 4985.54ms 
iter 3567: loss 2.6449, time 4951.41ms 
iter 3568: loss 2.6576, time 4926.11ms 
iter 3569: loss 2.5917, time 4950.59ms 
iter 3570: loss 2.8760, time 5031.40ms 
iter 3571: loss 2.5754, time 5005.10ms 
iter 3572: loss 2.5946, time 5026.76ms 
iter 3573: loss 2.6227, time 5027.54ms 
iter 3574: loss 2.6713, time 5029.31ms 
iter 3575: loss 2.4899, time 5024.07ms 
iter 3576: loss 2.5460, time 5032.40ms 
iter 3577: loss 2.6185, time 4997.64ms 
iter 3578: loss 2.7243, time 5017.76ms 
iter 3579: loss 2.7729, time 5032.08ms 
iter 3580: loss 2.7579, time 4999.72ms 
iter 3581: loss 2.7370, time 5026.00ms 
iter 3582: loss 2.6513, time 5042.10ms 
iter 3583: loss 2.6278, time 5041.29ms 
iter 3584: loss 2.5876, time 5041.98ms 
iter 3585: loss 2.5656, time 5042.29ms 
iter 3586: loss 2.8272, time 4930.91ms 
iter 3587: loss 2.6760, time 4984.42ms 
iter 3588: loss 2.6354, time 5017.96ms 
iter 3589: loss 2.6160, time 4995.20ms 
iter 3590: loss 2.5695, time 4974.93ms 
iter 3591: loss 2.9896, time 5047.59ms 
iter 3592: loss 2.6836, time 5037.82ms 
iter 3593: loss 2.5483, time 5045.84ms 
iter 3594: loss 2.7639, time 4995.07ms 
iter 3595: loss 2.4928, time 4929.79ms 
iter 3596: loss 2.7340, time 4949.10ms 
iter 3597: loss 2.5810, time 4932.55ms 
iter 3598: loss 2.6667, time 4985.86ms 
iter 3599: loss 2.8078, time 5021.65ms 
step 3600: train loss 2.6826, val loss 2.8468
iter 3600: loss 2.6233, time 19654.72ms 
iter 3601: loss 2.6052, time 4939.76ms 
iter 3602: loss 2.4783, time 4927.22ms 
iter 3603: loss 2.6917, time 4973.53ms 
iter 3604: loss 2.6552, time 5033.30ms 
iter 3605: loss 2.8617, time 5035.52ms 
iter 3606: loss 2.6954, time 5033.96ms 
iter 3607: loss 2.4629, time 5028.52ms 
iter 3608: loss 2.8447, time 4978.49ms 
iter 3609: loss 2.6557, time 4969.12ms 
iter 3610: loss 2.5159, time 4932.95ms 
iter 3611: loss 2.6275, time 5034.12ms 
iter 3612: loss 2.6200, time 5024.86ms 
iter 3613: loss 2.6035, time 4986.04ms 
iter 3614: loss 2.7519, time 4955.70ms 
iter 3615: loss 2.6993, time 4947.84ms 
iter 3616: loss 2.6122, time 4952.43ms 
iter 3617: loss 2.7281, time 5036.21ms 
iter 3618: loss 2.5957, time 4929.59ms 
iter 3619: loss 2.7102, time 4963.37ms 
iter 3620: loss 2.7809, time 5026.25ms 
iter 3621: loss 2.7104, time 5029.27ms 
iter 3622: loss 2.6664, time 5022.10ms 
iter 3623: loss 2.7316, time 5006.62ms 
iter 3624: loss 2.7805, time 4997.14ms 
iter 3625: loss 2.6502, time 5027.01ms 
iter 3626: loss 2.6239, time 4970.12ms 
iter 3627: loss 2.6698, time 4989.63ms 
iter 3628: loss 2.8240, time 5017.14ms 
iter 3629: loss 2.6698, time 5023.27ms 
iter 3630: loss 2.7743, time 5021.40ms 
iter 3631: loss 2.7266, time 5021.88ms 
iter 3632: loss 2.7106, time 5028.81ms 
iter 3633: loss 2.4766, time 5018.33ms 
iter 3634: loss 2.9012, time 5027.42ms 
iter 3635: loss 2.6143, time 4979.66ms 
iter 3636: loss 2.6246, time 4974.93ms 
iter 3637: loss 2.8580, time 5030.69ms 
iter 3638: loss 2.6227, time 5024.92ms 
iter 3639: loss 2.6386, time 5024.63ms 
iter 3640: loss 2.5832, time 5017.06ms 
iter 3641: loss 2.5812, time 4957.74ms 
iter 3642: loss 2.7044, time 4951.45ms 
iter 3643: loss 2.4550, time 4965.00ms 
iter 3644: loss 2.6665, time 4918.53ms 
iter 3645: loss 2.6501, time 4929.14ms 
iter 3646: loss 2.6920, time 4994.86ms 
iter 3647: loss 2.7143, time 5050.02ms 
iter 3648: loss 2.6556, time 5024.23ms 
iter 3649: loss 2.6751, time 5028.43ms 
step 3650: train loss 2.6915, val loss 2.8418
iter 3650: loss 2.6373, time 19654.72ms 
iter 3651: loss 2.7100, time 4989.81ms 
iter 3652: loss 2.6383, time 4989.42ms 
iter 3653: loss 2.7902, time 4989.55ms 
iter 3654: loss 2.7348, time 4976.95ms 
iter 3655: loss 2.7576, time 4959.98ms 
iter 3656: loss 2.5923, time 4916.12ms 
iter 3657: loss 2.6304, time 4946.54ms 
iter 3658: loss 2.6172, time 5005.10ms 
iter 3659: loss 2.6506, time 4999.45ms 
iter 3660: loss 2.6791, time 5014.62ms 
iter 3661: loss 2.9025, time 4979.50ms 
iter 3662: loss 2.6615, time 4967.96ms 
iter 3663: loss 2.7769, time 5014.85ms 
iter 3664: loss 2.7154, time 5024.93ms 
iter 3665: loss 2.5676, time 4968.87ms 
iter 3666: loss 2.8024, time 4950.70ms 
iter 3667: loss 2.5931, time 5026.78ms 
iter 3668: loss 2.5357, time 5019.57ms 
iter 3669: loss 2.6210, time 5017.48ms 
iter 3670: loss 2.5392, time 5026.27ms 
iter 3671: loss 2.8093, time 5027.98ms 
iter 3672: loss 2.7079, time 5031.16ms 
iter 3673: loss 2.5069, time 5023.26ms 
iter 3674: loss 2.6429, time 4916.75ms 
iter 3675: loss 2.7069, time 5000.36ms 
iter 3676: loss 2.7648, time 5030.22ms 
iter 3677: loss 2.7384, time 4994.47ms 
iter 3678: loss 2.6026, time 5016.60ms 
iter 3679: loss 2.8651, time 4994.73ms 
iter 3680: loss 2.4206, time 4987.72ms 
iter 3681: loss 2.8172, time 4997.77ms 
iter 3682: loss 2.8735, time 4959.38ms 
iter 3683: loss 2.4996, time 4924.02ms 
iter 3684: loss 2.5703, time 5009.44ms 
iter 3685: loss 2.9296, time 5042.09ms 
iter 3686: loss 2.7415, time 5040.68ms 
iter 3687: loss 2.8792, time 5034.75ms 
iter 3688: loss 2.4315, time 5034.46ms 
iter 3689: loss 2.7029, time 5037.26ms 
iter 3690: loss 2.5390, time 5019.01ms 
iter 3691: loss 2.7608, time 4936.26ms 
iter 3692: loss 2.7892, time 4928.02ms 
iter 3693: loss 2.7458, time 5040.03ms 
iter 3694: loss 2.6282, time 5026.21ms 
iter 3695: loss 2.2964, time 5039.32ms 
iter 3696: loss 2.5660, time 5028.28ms 
iter 3697: loss 2.7668, time 5033.22ms 
iter 3698: loss 2.7469, time 5020.87ms 
iter 3699: loss 2.5978, time 5019.27ms 
step 3700: train loss 2.6709, val loss 2.8494
iter 3700: loss 2.4896, time 19760.71ms 
iter 3701: loss 2.4013, time 5019.93ms 
iter 3702: loss 2.6273, time 5030.97ms 
iter 3703: loss 2.6734, time 5027.26ms 
iter 3704: loss 2.7039, time 5027.74ms 
iter 3705: loss 2.6827, time 5024.31ms 
iter 3706: loss 2.6908, time 4948.30ms 
iter 3707: loss 2.5754, time 4962.29ms 
iter 3708: loss 2.7664, time 5024.09ms 
iter 3709: loss 2.6024, time 5019.58ms 
iter 3710: loss 2.5761, time 5023.67ms 
iter 3711: loss 2.5953, time 5024.43ms 
iter 3712: loss 2.5149, time 5014.82ms 
iter 3713: loss 2.7383, time 5024.40ms 
iter 3714: loss 2.7674, time 4962.90ms 
iter 3715: loss 2.6488, time 4912.35ms 
iter 3716: loss 2.6432, time 4986.13ms 
iter 3717: loss 2.5533, time 5025.03ms 
iter 3718: loss 2.6516, time 5026.76ms 
iter 3719: loss 2.7279, time 5017.18ms 
iter 3720: loss 2.4486, time 5020.16ms 
iter 3721: loss 2.4963, time 5010.41ms 
iter 3722: loss 2.6605, time 5016.86ms 
iter 3723: loss 2.7004, time 4974.46ms 
iter 3724: loss 2.7050, time 4917.83ms 
iter 3725: loss 2.5352, time 5013.31ms 
iter 3726: loss 2.8789, time 5027.11ms 
iter 3727: loss 2.6553, time 5017.47ms 
iter 3728: loss 2.7357, time 5015.53ms 
iter 3729: loss 2.7175, time 5016.19ms 
iter 3730: loss 2.8755, time 5028.48ms 
iter 3731: loss 2.6871, time 5024.80ms 
iter 3732: loss 2.7527, time 4970.13ms 
iter 3733: loss 2.8244, time 4915.33ms 
iter 3734: loss 2.7368, time 4987.04ms 
iter 3735: loss 2.6527, time 5033.38ms 
iter 3736: loss 2.6725, time 5026.89ms 
iter 3737: loss 2.6279, time 5028.80ms 
iter 3738: loss 2.8819, time 5029.08ms 
iter 3739: loss 2.5445, time 5026.82ms 
iter 3740: loss 2.7752, time 5026.69ms 
iter 3741: loss 2.6561, time 4938.43ms 
iter 3742: loss 2.8009, time 4912.80ms 
iter 3743: loss 2.6249, time 4984.76ms 
iter 3744: loss 2.8082, time 4976.27ms 
iter 3745: loss 2.6680, time 5017.93ms 
iter 3746: loss 2.6842, time 4979.82ms 
iter 3747: loss 2.7327, time 5023.81ms 
iter 3748: loss 2.5726, time 5024.76ms 
iter 3749: loss 2.5413, time 5026.14ms 
step 3750: train loss 2.6669, val loss 2.8590
iter 3750: loss 2.7515, time 19750.29ms 
iter 3751: loss 2.4925, time 5018.69ms 
iter 3752: loss 2.6882, time 5027.66ms 
iter 3753: loss 2.7664, time 5025.34ms 
iter 3754: loss 2.6726, time 5023.22ms 
iter 3755: loss 2.7434, time 5024.53ms 
iter 3756: loss 2.6738, time 4971.42ms 
iter 3757: loss 2.7711, time 4970.49ms 
iter 3758: loss 2.8089, time 5022.18ms 
iter 3759: loss 2.5889, time 5021.86ms 
iter 3760: loss 2.5277, time 5023.21ms 
iter 3761: loss 2.7569, time 5022.08ms 
iter 3762: loss 2.6791, time 5013.73ms 
iter 3763: loss 2.6097, time 5022.55ms 
iter 3764: loss 2.7071, time 4960.43ms 
iter 3765: loss 2.6303, time 4912.57ms 
iter 3766: loss 2.5969, time 4989.97ms 
iter 3767: loss 2.5599, time 5023.47ms 
iter 3768: loss 2.7645, time 5022.82ms 
iter 3769: loss 2.7740, time 5024.33ms 
iter 3770: loss 2.5025, time 5024.48ms 
iter 3771: loss 2.6206, time 5029.54ms 
iter 3772: loss 2.7095, time 5035.46ms 
iter 3773: loss 2.7239, time 4940.80ms 
iter 3774: loss 2.7165, time 4913.51ms 
iter 3775: loss 2.7371, time 4956.87ms 
iter 3776: loss 2.6031, time 5017.47ms 
iter 3777: loss 2.6348, time 5014.91ms 
iter 3778: loss 2.5863, time 5023.21ms 
iter 3779: loss 2.7040, time 5027.80ms 
iter 3780: loss 2.6128, time 5026.72ms 
iter 3781: loss 2.6852, time 5025.34ms 
iter 3782: loss 2.8773, time 4975.29ms 
iter 3783: loss 2.5550, time 4913.53ms 
iter 3784: loss 2.7993, time 5000.62ms 
iter 3785: loss 2.5082, time 5030.43ms 
iter 3786: loss 2.6671, time 5029.19ms 
iter 3787: loss 2.7821, time 5008.84ms 
iter 3788: loss 2.6250, time 4983.40ms 
iter 3789: loss 2.6116, time 4993.26ms 
iter 3790: loss 2.5428, time 5029.50ms 
iter 3791: loss 2.7262, time 4995.92ms 
iter 3792: loss 2.5297, time 4916.06ms 
iter 3793: loss 2.5612, time 4991.55ms 
iter 3794: loss 2.5254, time 5035.72ms 
iter 3795: loss 2.7309, time 5029.71ms 
iter 3796: loss 2.7637, time 5030.50ms 
iter 3797: loss 2.7093, time 5030.21ms 
iter 3798: loss 2.6102, time 5027.36ms 
iter 3799: loss 2.4705, time 5024.61ms 
step 3800: train loss 2.6580, val loss 2.8576
iter 3800: loss 2.6502, time 19717.83ms 
iter 3801: loss 2.9211, time 5027.37ms 
iter 3802: loss 2.6868, time 5003.00ms 
iter 3803: loss 2.7819, time 5023.37ms 
iter 3804: loss 2.7524, time 5029.82ms 
iter 3805: loss 2.7553, time 4975.22ms 
iter 3806: loss 2.8134, time 4942.66ms 
iter 3807: loss 2.8313, time 5026.32ms 
iter 3808: loss 2.4378, time 5027.41ms 
iter 3809: loss 2.5608, time 5030.77ms 
iter 3810: loss 2.7060, time 5027.05ms 
iter 3811: loss 2.6839, time 5023.59ms 
iter 3812: loss 2.5745, time 5014.70ms 
iter 3813: loss 2.7079, time 5032.40ms 
iter 3814: loss 2.5999, time 4996.27ms 
iter 3815: loss 2.5569, time 5019.90ms 
iter 3816: loss 2.7903, time 5031.93ms 
iter 3817: loss 2.5431, time 5028.47ms 
iter 3818: loss 2.7322, time 5027.18ms 
iter 3819: loss 2.5341, time 5030.23ms 
iter 3820: loss 2.8637, time 5003.52ms 
iter 3821: loss 2.6323, time 5030.27ms 
iter 3822: loss 2.5268, time 4961.96ms 
iter 3823: loss 2.8049, time 4941.43ms 
iter 3824: loss 2.6673, time 5026.87ms 
iter 3825: loss 2.5848, time 5015.15ms 
iter 3826: loss 2.7778, time 4998.23ms 
iter 3827: loss 2.7081, time 5028.03ms 
iter 3828: loss 2.6458, time 5034.51ms 
iter 3829: loss 2.7833, time 5030.56ms 
iter 3830: loss 2.5984, time 5033.54ms 
iter 3831: loss 2.7839, time 4978.91ms 
iter 3832: loss 2.5186, time 4939.63ms 
iter 3833: loss 2.5476, time 5029.75ms 
iter 3834: loss 2.4962, time 5029.85ms 
iter 3835: loss 2.6231, time 5031.54ms 
iter 3836: loss 2.6280, time 4995.17ms 
iter 3837: loss 2.4987, time 5023.75ms 
iter 3838: loss 2.5779, time 5023.29ms 
iter 3839: loss 2.7514, time 5003.83ms 
iter 3840: loss 2.4551, time 4955.93ms 
iter 3841: loss 2.7150, time 5018.46ms 
iter 3842: loss 2.5602, time 4972.43ms 
iter 3843: loss 2.8901, time 4948.76ms 
iter 3844: loss 2.5910, time 5005.84ms 
iter 3845: loss 2.7982, time 5030.93ms 
iter 3846: loss 2.8558, time 5054.77ms 
iter 3847: loss 2.7824, time 5047.86ms 
iter 3848: loss 2.6620, time 4975.46ms 
iter 3849: loss 2.8448, time 4981.12ms 
step 3850: train loss 2.6445, val loss 2.8470
iter 3850: loss 2.5258, time 19674.93ms 
iter 3851: loss 2.7685, time 5029.05ms 
iter 3852: loss 2.5539, time 5032.28ms 
iter 3853: loss 2.7928, time 4979.03ms 
iter 3854: loss 2.7724, time 4919.82ms 
iter 3855: loss 2.7008, time 4995.94ms 
iter 3856: loss 2.6727, time 5030.26ms 
iter 3857: loss 2.9469, time 5027.11ms 
iter 3858: loss 2.6160, time 5034.75ms 
iter 3859: loss 2.7599, time 5037.86ms 
iter 3860: loss 2.9262, time 5028.94ms 
iter 3861: loss 2.7419, time 5027.88ms 
iter 3862: loss 2.6994, time 4974.34ms 
iter 3863: loss 2.6948, time 4945.74ms 
iter 3864: loss 2.6886, time 5022.42ms 
iter 3865: loss 2.6304, time 5013.47ms 
iter 3866: loss 2.5573, time 5019.37ms 
iter 3867: loss 2.8260, time 5029.22ms 
iter 3868: loss 2.6739, time 5024.89ms 
iter 3869: loss 2.7942, time 5028.69ms 
iter 3870: loss 2.7935, time 5033.90ms 
iter 3871: loss 2.4441, time 5000.44ms 
iter 3872: loss 2.6177, time 5027.65ms 
iter 3873: loss 2.8146, time 5024.61ms 
iter 3874: loss 2.5797, time 5009.59ms 
iter 3875: loss 2.7005, time 5022.89ms 
iter 3876: loss 2.5985, time 5023.55ms 
iter 3877: loss 2.7145, time 5022.26ms 
iter 3878: loss 2.6993, time 5032.82ms 
iter 3879: loss 2.5904, time 4976.41ms 
iter 3880: loss 2.6129, time 4951.27ms 
iter 3881: loss 2.6606, time 5038.31ms 
iter 3882: loss 2.7211, time 5033.51ms 
iter 3883: loss 2.4936, time 5029.63ms 
iter 3884: loss 2.7899, time 5026.00ms 
iter 3885: loss 2.5176, time 5021.48ms 
iter 3886: loss 2.7174, time 5044.34ms 
iter 3887: loss 2.6128, time 4959.18ms 
iter 3888: loss 2.6711, time 4925.02ms 
iter 3889: loss 2.7458, time 4972.50ms 
iter 3890: loss 2.7506, time 5041.32ms 
iter 3891: loss 2.6422, time 5027.79ms 
iter 3892: loss 2.4655, time 5033.18ms 
iter 3893: loss 2.7396, time 5033.45ms 
iter 3894: loss 2.7310, time 5031.09ms 
iter 3895: loss 2.4945, time 5033.94ms 
iter 3896: loss 2.6988, time 4979.51ms 
iter 3897: loss 2.6017, time 4971.91ms 
iter 3898: loss 2.5775, time 5021.67ms 
iter 3899: loss 2.8786, time 5012.36ms 
step 3900: train loss 2.6645, val loss 2.8392
iter 3900: loss 2.6610, time 19709.45ms 
iter 3901: loss 2.5821, time 5001.12ms 
iter 3902: loss 2.7361, time 4953.56ms 
iter 3903: loss 2.5914, time 5005.40ms 
iter 3904: loss 2.6939, time 5028.01ms 
iter 3905: loss 2.7785, time 5028.58ms 
iter 3906: loss 2.5699, time 5024.98ms 
iter 3907: loss 2.5697, time 4966.84ms 
iter 3908: loss 2.6780, time 4986.26ms 
iter 3909: loss 2.4759, time 5026.36ms 
iter 3910: loss 2.6782, time 4969.89ms 
iter 3911: loss 2.6761, time 4952.77ms 
iter 3912: loss 2.8568, time 5019.22ms 
iter 3913: loss 2.7020, time 5011.17ms 
iter 3914: loss 2.9892, time 5019.37ms 
iter 3915: loss 2.6079, time 5018.15ms 
iter 3916: loss 2.6983, time 5019.34ms 
iter 3917: loss 2.5976, time 5020.87ms 
iter 3918: loss 2.7399, time 4968.32ms 
iter 3919: loss 2.3613, time 4912.68ms 
iter 3920: loss 2.7449, time 4999.38ms 
iter 3921: loss 2.7065, time 5024.96ms 
iter 3922: loss 2.6958, time 5027.57ms 
iter 3923: loss 2.7297, time 5025.29ms 
iter 3924: loss 2.5229, time 5024.82ms 
iter 3925: loss 2.7873, time 5021.81ms 
iter 3926: loss 2.5606, time 5017.89ms 
iter 3927: loss 2.6466, time 4967.86ms 
iter 3928: loss 2.6122, time 4948.57ms 
iter 3929: loss 2.4801, time 5021.69ms 
iter 3930: loss 2.7876, time 5020.50ms 
iter 3931: loss 2.6539, time 5019.71ms 
iter 3932: loss 2.7038, time 4978.19ms 
iter 3933: loss 2.4940, time 4958.31ms 
iter 3934: loss 2.6490, time 4985.87ms 
iter 3935: loss 2.6788, time 4958.34ms 
iter 3936: loss 2.7567, time 4944.23ms 
iter 3937: loss 2.5366, time 5003.61ms 
iter 3938: loss 2.5692, time 5020.51ms 
iter 3939: loss 2.6386, time 5001.15ms 
iter 3940: loss 2.5112, time 5020.07ms 
iter 3941: loss 2.6741, time 5028.50ms 
iter 3942: loss 2.5317, time 5021.23ms 
iter 3943: loss 2.6431, time 5014.40ms 
iter 3944: loss 2.8437, time 4913.85ms 
iter 3945: loss 2.5837, time 4912.84ms 
iter 3946: loss 2.6970, time 5003.36ms 
iter 3947: loss 2.5712, time 5029.10ms 
iter 3948: loss 2.7698, time 5026.03ms 
iter 3949: loss 2.5592, time 5015.54ms 
step 3950: train loss 2.6538, val loss 2.8474
iter 3950: loss 2.5141, time 19691.20ms 
iter 3951: loss 2.6601, time 4917.07ms 
iter 3952: loss 2.3082, time 4985.67ms 
iter 3953: loss 2.6449, time 5021.41ms 
iter 3954: loss 2.6117, time 5019.20ms 
iter 3955: loss 2.6266, time 5017.35ms 
iter 3956: loss 2.5760, time 5021.48ms 
iter 3957: loss 2.4728, time 5019.00ms 
iter 3958: loss 2.6888, time 5020.78ms 
iter 3959: loss 2.6447, time 4970.11ms 
iter 3960: loss 2.4697, time 4947.62ms 
iter 3961: loss 2.5573, time 5018.72ms 
iter 3962: loss 2.5670, time 5021.01ms 
iter 3963: loss 2.7468, time 5022.30ms 
iter 3964: loss 2.6958, time 5020.79ms 
iter 3965: loss 2.5299, time 5014.88ms 
iter 3966: loss 2.7180, time 5018.63ms 
iter 3967: loss 2.8452, time 5021.38ms 
iter 3968: loss 2.7629, time 4925.69ms 
iter 3969: loss 2.6284, time 4931.26ms 
iter 3970: loss 2.7736, time 5018.16ms 
iter 3971: loss 2.6294, time 5016.76ms 
iter 3972: loss 2.9460, time 5032.04ms 
iter 3973: loss 2.8126, time 5035.33ms 
iter 3974: loss 2.6024, time 5034.74ms 
iter 3975: loss 2.5858, time 5028.89ms 
iter 3976: loss 2.6157, time 5023.52ms 
iter 3977: loss 2.8004, time 5009.55ms 
iter 3978: loss 2.5905, time 4940.26ms 
iter 3979: loss 2.6910, time 5030.36ms 
iter 3980: loss 2.6216, time 5035.73ms 
iter 3981: loss 2.9517, time 5037.69ms 
iter 3982: loss 2.6986, time 5034.92ms 
iter 3983: loss 2.6797, time 5037.91ms 
iter 3984: loss 2.6235, time 5030.13ms 
iter 3985: loss 2.7474, time 4974.87ms 
iter 3986: loss 2.5611, time 4961.54ms 
iter 3987: loss 2.7481, time 5017.07ms 
iter 3988: loss 2.7210, time 5024.80ms 
iter 3989: loss 2.7231, time 5028.98ms 
iter 3990: loss 2.8137, time 5013.19ms 
iter 3991: loss 2.3600, time 5028.90ms 
iter 3992: loss 2.5783, time 5032.59ms 
iter 3993: loss 2.6722, time 5032.21ms 
iter 3994: loss 2.7867, time 4977.30ms 
iter 3995: loss 2.6711, time 5020.79ms 
iter 3996: loss 2.8019, time 5031.97ms 
iter 3997: loss 2.7291, time 5030.57ms 
iter 3998: loss 2.5410, time 5027.48ms 
iter 3999: loss 2.6663, time 4998.02ms 
step 4000: train loss 2.6586, val loss 2.8476
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 4000: loss 2.5450, time 20726.20ms 
iter 4001: loss 2.7591, time 5025.31ms 
iter 4002: loss 2.7212, time 5001.30ms 
iter 4003: loss 2.6802, time 5022.10ms 
iter 4004: loss 2.7661, time 5025.06ms 
iter 4005: loss 2.6674, time 5023.14ms 
iter 4006: loss 2.4179, time 5028.20ms 
iter 4007: loss 2.6327, time 4979.94ms 
iter 4008: loss 2.5158, time 4933.19ms 
iter 4009: loss 2.4854, time 5007.03ms 
iter 4010: loss 2.6566, time 5018.79ms 
iter 4011: loss 2.8925, time 5034.03ms 
iter 4012: loss 2.7085, time 5037.21ms 
iter 4013: loss 2.6388, time 4988.36ms 
iter 4014: loss 2.7407, time 4996.31ms 
iter 4015: loss 2.7396, time 4942.48ms 
iter 4016: loss 2.7218, time 4914.01ms 
iter 4017: loss 2.5344, time 4938.33ms 
iter 4018: loss 2.6769, time 4967.62ms 
iter 4019: loss 2.5903, time 5016.06ms 
iter 4020: loss 2.6527, time 5023.95ms 
iter 4021: loss 2.4235, time 5030.69ms 
iter 4022: loss 2.5916, time 5019.10ms 
iter 4023: loss 2.8087, time 5003.04ms 
iter 4024: loss 2.4763, time 4948.55ms 
iter 4025: loss 2.6011, time 4927.52ms 
iter 4026: loss 2.6574, time 4974.12ms 
iter 4027: loss 2.8280, time 5036.59ms 
iter 4028: loss 2.7081, time 5035.18ms 
iter 4029: loss 2.6716, time 5036.62ms 
iter 4030: loss 2.6669, time 5033.20ms 
iter 4031: loss 2.4378, time 5028.58ms 
iter 4032: loss 2.6722, time 5042.90ms 
iter 4033: loss 2.5071, time 4983.81ms 
iter 4034: loss 2.7422, time 4989.33ms 
iter 4035: loss 2.7456, time 5031.83ms 
iter 4036: loss 2.6699, time 5019.42ms 
iter 4037: loss 2.8821, time 5034.03ms 
iter 4038: loss 2.4197, time 5005.19ms 
iter 4039: loss 2.7687, time 4948.05ms 
iter 4040: loss 2.6419, time 5025.23ms 
iter 4041: loss 2.5977, time 5039.17ms 
iter 4042: loss 2.6280, time 4980.75ms 
iter 4043: loss 2.8060, time 4940.56ms 
iter 4044: loss 2.6865, time 4952.64ms 
iter 4045: loss 2.7094, time 4994.31ms 
iter 4046: loss 2.5189, time 4918.73ms 
iter 4047: loss 2.5346, time 4950.75ms 
iter 4048: loss 2.5868, time 5008.56ms 
iter 4049: loss 2.7437, time 4952.61ms 
step 4050: train loss 2.6599, val loss 2.8604
iter 4050: loss 2.6015, time 19690.51ms 
iter 4051: loss 2.4667, time 4992.58ms 
iter 4052: loss 2.5709, time 4929.35ms 
iter 4053: loss 2.6968, time 4928.97ms 
iter 4054: loss 2.5721, time 4934.76ms 
iter 4055: loss 2.6105, time 4921.77ms 
iter 4056: loss 2.6719, time 4920.58ms 
iter 4057: loss 2.4347, time 5007.92ms 
iter 4058: loss 2.6664, time 5043.78ms 
iter 4059: loss 2.6915, time 5035.99ms 
iter 4060: loss 2.5086, time 5039.40ms 
iter 4061: loss 2.7571, time 5041.16ms 
iter 4062: loss 2.4427, time 5036.94ms 
iter 4063: loss 2.5851, time 5011.77ms 
iter 4064: loss 2.5092, time 4919.05ms 
iter 4065: loss 2.7001, time 4925.63ms 
iter 4066: loss 2.4328, time 5034.16ms 
iter 4067: loss 2.6110, time 5040.86ms 
iter 4068: loss 2.6767, time 4991.73ms 
iter 4069: loss 2.7334, time 5004.20ms 
iter 4070: loss 2.6395, time 5012.60ms 
iter 4071: loss 2.6948, time 4994.93ms 
iter 4072: loss 2.8506, time 4956.29ms 
iter 4073: loss 2.7757, time 4923.37ms 
iter 4074: loss 2.3977, time 4931.88ms 
iter 4075: loss 2.8414, time 4979.82ms 
iter 4076: loss 2.6001, time 5008.39ms 
iter 4077: loss 2.5222, time 5005.19ms 
iter 4078: loss 2.5996, time 5030.87ms 
iter 4079: loss 2.7503, time 5012.92ms 
iter 4080: loss 2.7469, time 5020.24ms 
iter 4081: loss 2.8072, time 5031.52ms 
iter 4082: loss 2.6538, time 4967.22ms 
iter 4083: loss 2.6582, time 4997.79ms 
iter 4084: loss 2.5071, time 5035.16ms 
iter 4085: loss 2.6744, time 5025.30ms 
iter 4086: loss 2.6476, time 5020.18ms 
iter 4087: loss 2.5875, time 5005.52ms 
iter 4088: loss 2.5795, time 5015.50ms 
iter 4089: loss 2.6205, time 5023.60ms 
iter 4090: loss 2.7823, time 4977.90ms 
iter 4091: loss 2.5917, time 4964.25ms 
iter 4092: loss 2.4671, time 5000.15ms 
iter 4093: loss 2.6125, time 5027.98ms 
iter 4094: loss 2.5084, time 5030.59ms 
iter 4095: loss 2.7180, time 5020.01ms 
iter 4096: loss 2.6449, time 5033.14ms 
iter 4097: loss 2.4170, time 5030.55ms 
iter 4098: loss 2.7887, time 4983.35ms 
iter 4099: loss 2.5531, time 4916.26ms 
step 4100: train loss 2.6375, val loss 2.8506
iter 4100: loss 2.5074, time 19695.57ms 
iter 4101: loss 2.4905, time 5031.99ms 
iter 4102: loss 2.4989, time 5009.49ms 
iter 4103: loss 2.5540, time 5015.97ms 
iter 4104: loss 2.7105, time 4974.97ms 
iter 4105: loss 2.6241, time 4919.31ms 
iter 4106: loss 2.6463, time 5003.83ms 
iter 4107: loss 2.6051, time 5025.92ms 
iter 4108: loss 2.7244, time 4994.81ms 
iter 4109: loss 2.7074, time 5014.62ms 
iter 4110: loss 2.6985, time 5046.62ms 
iter 4111: loss 2.5867, time 4991.88ms 
iter 4112: loss 2.6438, time 5033.93ms 
iter 4113: loss 2.5878, time 4920.28ms 
iter 4114: loss 2.6770, time 4941.98ms 
iter 4115: loss 2.6542, time 5022.56ms 
iter 4116: loss 2.7497, time 5043.64ms 
iter 4117: loss 2.6981, time 5037.04ms 
iter 4118: loss 2.6458, time 5025.45ms 
iter 4119: loss 2.5884, time 5041.19ms 
iter 4120: loss 2.6290, time 5045.81ms 
iter 4121: loss 2.7105, time 5023.71ms 
iter 4122: loss 2.4820, time 4953.20ms 
iter 4123: loss 2.5223, time 4975.02ms 
iter 4124: loss 2.5242, time 5027.46ms 
iter 4125: loss 2.8708, time 5021.83ms 
iter 4126: loss 2.6236, time 5028.63ms 
iter 4127: loss 2.6691, time 4957.34ms 
iter 4128: loss 2.4229, time 4960.48ms 
iter 4129: loss 2.7494, time 5029.36ms 
iter 4130: loss 2.8917, time 5029.94ms 
iter 4131: loss 2.3853, time 5035.62ms 
iter 4132: loss 2.6788, time 5011.57ms 
iter 4133: loss 2.5707, time 4943.52ms 
iter 4134: loss 2.6543, time 4970.62ms 
iter 4135: loss 2.4605, time 4920.84ms 
iter 4136: loss 2.6693, time 4944.82ms 
iter 4137: loss 2.7272, time 5019.37ms 
iter 4138: loss 2.6138, time 5024.45ms 
iter 4139: loss 2.6067, time 5021.92ms 
iter 4140: loss 2.5906, time 5025.76ms 
iter 4141: loss 2.5262, time 5026.39ms 
iter 4142: loss 2.7177, time 5026.07ms 
iter 4143: loss 2.3944, time 5027.64ms 
iter 4144: loss 2.8664, time 4976.21ms 
iter 4145: loss 2.6193, time 4926.53ms 
iter 4146: loss 2.6385, time 5026.44ms 
iter 4147: loss 2.5910, time 5029.30ms 
iter 4148: loss 2.7032, time 5029.03ms 
iter 4149: loss 2.4873, time 5012.75ms 
step 4150: train loss 2.6399, val loss 2.8442
iter 4150: loss 2.8037, time 19718.90ms 
iter 4151: loss 2.6088, time 4982.44ms 
iter 4152: loss 2.6644, time 4951.48ms 
iter 4153: loss 2.5859, time 4939.74ms 
iter 4154: loss 2.5283, time 4923.84ms 
iter 4155: loss 2.6011, time 5026.38ms 
iter 4156: loss 2.6204, time 5021.97ms 
iter 4157: loss 2.5781, time 5028.95ms 
iter 4158: loss 2.8257, time 5028.11ms 
iter 4159: loss 2.6604, time 5034.73ms 
iter 4160: loss 2.7861, time 5033.27ms 
iter 4161: loss 2.8428, time 5030.62ms 
iter 4162: loss 2.5924, time 4987.60ms 
iter 4163: loss 2.6815, time 4919.60ms 
iter 4164: loss 2.7967, time 4917.58ms 
iter 4165: loss 2.7153, time 4922.29ms 
iter 4166: loss 2.6120, time 5008.01ms 
iter 4167: loss 2.6348, time 5027.10ms 
iter 4168: loss 2.4666, time 5023.07ms 
iter 4169: loss 2.6516, time 5024.11ms 
iter 4170: loss 2.5061, time 5025.31ms 
iter 4171: loss 2.5804, time 5027.79ms 
iter 4172: loss 2.5220, time 5025.77ms 
iter 4173: loss 2.6025, time 4972.22ms 
iter 4174: loss 2.6086, time 4954.89ms 
iter 4175: loss 2.6113, time 4920.23ms 
iter 4176: loss 2.5735, time 4928.52ms 
iter 4177: loss 2.5254, time 5023.84ms 
iter 4178: loss 2.7037, time 5039.23ms 
iter 4179: loss 2.7271, time 5034.65ms 
iter 4180: loss 2.7057, time 5029.45ms 
iter 4181: loss 2.7304, time 5019.83ms 
iter 4182: loss 2.5927, time 5036.07ms 
iter 4183: loss 2.5745, time 5031.68ms 
iter 4184: loss 2.8084, time 5008.91ms 
iter 4185: loss 2.7419, time 4918.54ms 
iter 4186: loss 2.7547, time 4919.44ms 
iter 4187: loss 2.5298, time 5001.21ms 
iter 4188: loss 2.5638, time 5005.54ms 
iter 4189: loss 2.6734, time 5034.15ms 
iter 4190: loss 2.5354, time 5033.00ms 
iter 4191: loss 2.6505, time 5038.32ms 
iter 4192: loss 2.4900, time 5038.55ms 
iter 4193: loss 2.5915, time 5002.56ms 
iter 4194: loss 2.4221, time 5009.66ms 
iter 4195: loss 2.6075, time 4973.03ms 
iter 4196: loss 2.3963, time 4916.14ms 
iter 4197: loss 2.7285, time 4943.19ms 
iter 4198: loss 2.3493, time 5022.97ms 
iter 4199: loss 2.7910, time 5020.83ms 
step 4200: train loss 2.6265, val loss 2.8499
iter 4200: loss 2.5318, time 19728.24ms 
iter 4201: loss 2.8181, time 5034.47ms 
iter 4202: loss 2.6130, time 4982.40ms 
iter 4203: loss 2.5279, time 4920.76ms 
iter 4204: loss 2.8748, time 4917.27ms 
iter 4205: loss 2.4548, time 5008.67ms 
iter 4206: loss 2.6402, time 5036.42ms 
iter 4207: loss 2.6097, time 5035.05ms 
iter 4208: loss 2.6492, time 5066.55ms 
iter 4209: loss 2.6659, time 5036.13ms 
iter 4210: loss 2.4756, time 5031.72ms 
iter 4211: loss 2.6169, time 5036.67ms 
iter 4212: loss 2.6562, time 4983.84ms 
iter 4213: loss 2.6972, time 4919.57ms 
iter 4214: loss 2.5049, time 4917.39ms 
iter 4215: loss 2.6070, time 4936.70ms 
iter 4216: loss 2.5256, time 5026.52ms 
iter 4217: loss 2.6942, time 5014.37ms 
iter 4218: loss 2.7811, time 5035.03ms 
iter 4219: loss 2.7210, time 5038.78ms 
iter 4220: loss 2.6631, time 5034.58ms 
iter 4221: loss 2.5633, time 5023.70ms 
iter 4222: loss 2.6619, time 4994.60ms 
iter 4223: loss 2.6685, time 4916.35ms 
iter 4224: loss 2.7899, time 4921.85ms 
iter 4225: loss 2.6269, time 4921.27ms 
iter 4226: loss 2.5724, time 4942.80ms 
iter 4227: loss 2.7422, time 5032.09ms 
iter 4228: loss 2.5351, time 5031.11ms 
iter 4229: loss 2.4231, time 5025.68ms 
iter 4230: loss 2.4235, time 5030.00ms 
iter 4231: loss 2.6092, time 5023.73ms 
iter 4232: loss 2.4648, time 5027.49ms 
iter 4233: loss 2.6324, time 5036.73ms 
iter 4234: loss 2.6048, time 4966.64ms 
iter 4235: loss 2.6068, time 4917.91ms 
iter 4236: loss 2.8415, time 4917.70ms 
iter 4237: loss 2.8049, time 4981.78ms 
iter 4238: loss 2.8096, time 5032.36ms 
iter 4239: loss 2.7159, time 5030.52ms 
iter 4240: loss 2.7407, time 5033.07ms 
iter 4241: loss 2.8142, time 5031.10ms 
iter 4242: loss 2.6212, time 5033.65ms 
iter 4243: loss 2.7179, time 5029.43ms 
iter 4244: loss 2.6649, time 5029.84ms 
iter 4245: loss 2.5816, time 4979.44ms 
iter 4246: loss 2.5627, time 4917.07ms 
iter 4247: loss 2.7882, time 4915.98ms 
iter 4248: loss 2.5811, time 5003.87ms 
iter 4249: loss 2.5981, time 5029.71ms 
step 4250: train loss 2.6323, val loss 2.8365
iter 4250: loss 2.5662, time 19692.92ms 
iter 4251: loss 2.7967, time 5018.75ms 
iter 4252: loss 2.8204, time 4977.38ms 
iter 4253: loss 2.5749, time 4940.40ms 
iter 4254: loss 2.6661, time 4920.16ms 
iter 4255: loss 2.6448, time 4934.71ms 
iter 4256: loss 2.6485, time 5028.80ms 
iter 4257: loss 2.4711, time 5033.86ms 
iter 4258: loss 2.5884, time 5020.38ms 
iter 4259: loss 2.7276, time 5029.79ms 
iter 4260: loss 2.6355, time 4994.53ms 
iter 4261: loss 2.6819, time 4928.28ms 
iter 4262: loss 2.6765, time 4962.77ms 
iter 4263: loss 2.8165, time 4915.83ms 
iter 4264: loss 2.2620, time 4915.77ms 
iter 4265: loss 2.4854, time 4915.48ms 
iter 4266: loss 2.8948, time 4924.58ms 
iter 4267: loss 2.7492, time 4998.43ms 
iter 4268: loss 2.7399, time 4981.01ms 
iter 4269: loss 2.6493, time 5015.20ms 
iter 4270: loss 2.7095, time 5035.45ms 
iter 4271: loss 2.5731, time 4953.89ms 
iter 4272: loss 2.6400, time 4983.70ms 
iter 4273: loss 2.5878, time 4947.94ms 
iter 4274: loss 2.8538, time 4917.77ms 
iter 4275: loss 2.8037, time 4951.12ms 
iter 4276: loss 2.8146, time 4925.41ms 
iter 4277: loss 2.4166, time 4934.12ms 
iter 4278: loss 2.4571, time 5007.47ms 
iter 4279: loss 2.7290, time 5007.51ms 
iter 4280: loss 2.7788, time 5029.00ms 
iter 4281: loss 2.6088, time 5027.78ms 
iter 4282: loss 2.5301, time 5022.81ms 
iter 4283: loss 2.6939, time 5022.35ms 
iter 4284: loss 2.6452, time 5037.23ms 
iter 4285: loss 2.6748, time 4979.63ms 
iter 4286: loss 2.7322, time 4924.38ms 
iter 4287: loss 2.7265, time 4924.12ms 
iter 4288: loss 2.6608, time 4936.31ms 
iter 4289: loss 2.4612, time 5029.49ms 
iter 4290: loss 2.6034, time 5032.72ms 
iter 4291: loss 2.7374, time 5035.29ms 
iter 4292: loss 2.8701, time 5039.19ms 
iter 4293: loss 2.8607, time 5038.42ms 
iter 4294: loss 2.6939, time 5037.69ms 
iter 4295: loss 2.6565, time 5036.00ms 
iter 4296: loss 2.8474, time 4983.10ms 
iter 4297: loss 2.6178, time 4936.84ms 
iter 4298: loss 2.5803, time 4920.58ms 
iter 4299: loss 2.7139, time 4929.72ms 
step 4300: train loss 2.6286, val loss 2.8459
iter 4300: loss 2.8173, time 19774.73ms 
iter 4301: loss 2.5818, time 5034.22ms 
iter 4302: loss 2.6846, time 5024.83ms 
iter 4303: loss 2.5100, time 5035.05ms 
iter 4304: loss 2.6746, time 4971.69ms 
iter 4305: loss 2.7250, time 4919.66ms 
iter 4306: loss 2.7459, time 4918.60ms 
iter 4307: loss 2.5928, time 4919.86ms 
iter 4308: loss 2.6284, time 5038.47ms 
iter 4309: loss 2.5360, time 5044.23ms 
iter 4310: loss 2.6020, time 5045.53ms 
iter 4311: loss 2.8697, time 5040.42ms 
iter 4312: loss 2.8661, time 5040.81ms 
iter 4313: loss 2.7431, time 5034.40ms 
iter 4314: loss 2.8052, time 5031.77ms 
iter 4315: loss 2.6295, time 4981.76ms 
iter 4316: loss 2.7137, time 4919.77ms 
iter 4317: loss 2.6453, time 4917.83ms 
iter 4318: loss 2.5989, time 4931.35ms 
iter 4319: loss 2.4896, time 5026.92ms 
iter 4320: loss 2.5088, time 5026.74ms 
iter 4321: loss 2.6301, time 5011.96ms 
iter 4322: loss 2.6335, time 5021.48ms 
iter 4323: loss 2.4984, time 5039.16ms 
iter 4324: loss 2.4740, time 5032.03ms 
iter 4325: loss 2.5309, time 5015.95ms 
iter 4326: loss 2.5329, time 4937.78ms 
iter 4327: loss 2.6120, time 4918.66ms 
iter 4328: loss 2.8126, time 4920.56ms 
iter 4329: loss 2.8120, time 4948.87ms 
iter 4330: loss 2.6320, time 5011.17ms 
iter 4331: loss 2.7279, time 5037.68ms 
iter 4332: loss 2.5549, time 5044.14ms 
iter 4333: loss 2.5342, time 5034.56ms 
iter 4334: loss 2.5442, time 5033.90ms 
iter 4335: loss 2.4909, time 5034.52ms 
iter 4336: loss 2.6996, time 5018.64ms 
iter 4337: loss 2.4697, time 4922.37ms 
iter 4338: loss 2.7351, time 4932.82ms 
iter 4339: loss 2.5577, time 4922.56ms 
iter 4340: loss 2.7237, time 4921.38ms 
iter 4341: loss 2.6470, time 4957.03ms 
iter 4342: loss 2.5857, time 5027.20ms 
iter 4343: loss 2.6674, time 5032.27ms 
iter 4344: loss 2.6268, time 5031.93ms 
iter 4345: loss 2.7451, time 5042.55ms 
iter 4346: loss 2.6438, time 5024.88ms 
iter 4347: loss 2.5084, time 5030.18ms 
iter 4348: loss 2.8554, time 5030.46ms 
iter 4349: loss 2.4366, time 4976.01ms 
step 4350: train loss 2.6322, val loss 2.8616
iter 4350: loss 2.4959, time 19715.65ms 
iter 4351: loss 2.6196, time 5033.65ms 
iter 4352: loss 2.6563, time 5032.28ms 
iter 4353: loss 2.5779, time 5026.68ms 
iter 4354: loss 2.5547, time 5034.33ms 
iter 4355: loss 2.5642, time 5033.74ms 
iter 4356: loss 2.6180, time 5032.04ms 
iter 4357: loss 2.6906, time 5034.77ms 
iter 4358: loss 2.5765, time 4992.24ms 
iter 4359: loss 2.5725, time 4977.60ms 
iter 4360: loss 2.6044, time 4935.10ms 
iter 4361: loss 2.5151, time 5007.96ms 
iter 4362: loss 2.8453, time 4954.78ms 
iter 4363: loss 2.5998, time 4954.56ms 
iter 4364: loss 2.4631, time 4953.77ms 
iter 4365: loss 2.5139, time 4938.08ms 
iter 4366: loss 2.6597, time 5031.98ms 
iter 4367: loss 2.8383, time 5032.86ms 
iter 4368: loss 2.5429, time 4979.49ms 
iter 4369: loss 2.3897, time 4918.60ms 
iter 4370: loss 2.7824, time 4917.34ms 
iter 4371: loss 2.7010, time 4914.91ms 
iter 4372: loss 2.7422, time 4993.65ms 
iter 4373: loss 2.6947, time 5005.59ms 
iter 4374: loss 2.4875, time 5032.62ms 
iter 4375: loss 2.8322, time 5031.02ms 
iter 4376: loss 2.6632, time 5031.17ms 
iter 4377: loss 2.6250, time 5032.30ms 
iter 4378: loss 2.7795, time 4939.51ms 
iter 4379: loss 2.6237, time 4925.25ms 
iter 4380: loss 2.5690, time 4923.10ms 
iter 4381: loss 2.5452, time 4923.91ms 
iter 4382: loss 2.7327, time 4924.73ms 
iter 4383: loss 2.5743, time 4982.39ms 
iter 4384: loss 2.6913, time 5034.11ms 
iter 4385: loss 2.6304, time 4993.61ms 
iter 4386: loss 2.5314, time 5031.78ms 
iter 4387: loss 2.7752, time 5030.67ms 
iter 4388: loss 2.5388, time 5031.01ms 
iter 4389: loss 2.5189, time 5031.32ms 
iter 4390: loss 2.5961, time 4974.59ms 
iter 4391: loss 2.6217, time 4925.95ms 
iter 4392: loss 2.6169, time 4963.53ms 
iter 4393: loss 2.7817, time 4924.31ms 
iter 4394: loss 2.7262, time 5027.35ms 
iter 4395: loss 2.5776, time 5037.90ms 
iter 4396: loss 2.6240, time 5033.02ms 
iter 4397: loss 2.6472, time 5033.20ms 
iter 4398: loss 2.5977, time 5017.60ms 
iter 4399: loss 2.6667, time 5022.79ms 
step 4400: train loss 2.6217, val loss 2.8534
iter 4400: loss 2.6241, time 19688.02ms 
iter 4401: loss 2.8095, time 4944.00ms 
iter 4402: loss 2.5574, time 4988.09ms 
iter 4403: loss 2.6447, time 5018.63ms 
iter 4404: loss 2.4776, time 5031.35ms 
iter 4405: loss 2.4834, time 5013.88ms 
iter 4406: loss 2.5545, time 5025.10ms 
iter 4407: loss 2.4593, time 5024.37ms 
iter 4408: loss 2.5345, time 5028.26ms 
iter 4409: loss 2.7667, time 4972.26ms 
iter 4410: loss 2.7131, time 4916.89ms 
iter 4411: loss 2.6926, time 4915.08ms 
iter 4412: loss 2.6527, time 4915.03ms 
iter 4413: loss 2.5644, time 4998.76ms 
iter 4414: loss 2.7386, time 5019.40ms 
iter 4415: loss 2.4235, time 5020.96ms 
iter 4416: loss 2.7618, time 5011.15ms 
iter 4417: loss 2.5860, time 5013.36ms 
iter 4418: loss 2.4489, time 5023.12ms 
iter 4419: loss 2.7506, time 5022.75ms 
iter 4420: loss 2.5697, time 4970.46ms 
iter 4421: loss 2.7149, time 4915.69ms 
iter 4422: loss 2.5034, time 4916.03ms 
iter 4423: loss 2.5298, time 4945.18ms 
iter 4424: loss 2.5482, time 5029.84ms 
iter 4425: loss 2.7098, time 5022.96ms 
iter 4426: loss 2.5717, time 5027.46ms 
iter 4427: loss 2.7317, time 5033.17ms 
iter 4428: loss 2.5741, time 5032.34ms 
iter 4429: loss 2.6009, time 5030.09ms 
iter 4430: loss 2.8139, time 5028.65ms 
iter 4431: loss 2.7578, time 4975.88ms 
iter 4432: loss 2.7305, time 4917.97ms 
iter 4433: loss 2.5416, time 4932.72ms 
iter 4434: loss 2.6304, time 4937.76ms 
iter 4435: loss 2.6765, time 5030.18ms 
iter 4436: loss 2.6730, time 5022.38ms 
iter 4437: loss 2.5738, time 5009.76ms 
iter 4438: loss 2.6099, time 5023.16ms 
iter 4439: loss 2.5803, time 5021.17ms 
iter 4440: loss 2.6447, time 5026.72ms 
iter 4441: loss 2.6670, time 5016.71ms 
iter 4442: loss 2.7965, time 4926.31ms 
iter 4443: loss 2.5361, time 4918.98ms 
iter 4444: loss 2.7191, time 4933.60ms 
iter 4445: loss 2.6901, time 4915.67ms 
iter 4446: loss 2.7103, time 4998.23ms 
iter 4447: loss 2.6408, time 5000.49ms 
iter 4448: loss 2.5955, time 5010.31ms 
iter 4449: loss 2.6829, time 5025.51ms 
step 4450: train loss 2.6122, val loss 2.8475
iter 4450: loss 2.6758, time 19615.53ms 
iter 4451: loss 2.4208, time 4915.37ms 
iter 4452: loss 2.6294, time 4922.14ms 
iter 4453: loss 2.7106, time 4925.62ms 
iter 4454: loss 2.6323, time 4961.21ms 
iter 4455: loss 2.4996, time 4963.26ms 
iter 4456: loss 2.5821, time 5022.93ms 
iter 4457: loss 2.6993, time 5026.79ms 
iter 4458: loss 2.6785, time 5022.12ms 
iter 4459: loss 2.5336, time 5022.41ms 
iter 4460: loss 2.4964, time 5014.23ms 
iter 4461: loss 2.5104, time 4922.14ms 
iter 4462: loss 2.6360, time 4920.24ms 
iter 4463: loss 2.7629, time 4920.26ms 
iter 4464: loss 2.7699, time 4916.53ms 
iter 4465: loss 2.7241, time 4982.08ms 
iter 4466: loss 2.6229, time 4970.62ms 
iter 4467: loss 2.7591, time 4948.58ms 
iter 4468: loss 2.4515, time 4944.20ms 
iter 4469: loss 2.4131, time 4972.26ms 
iter 4470: loss 2.6226, time 4967.54ms 
iter 4471: loss 2.4991, time 4963.57ms 
iter 4472: loss 2.5236, time 4921.03ms 
iter 4473: loss 2.7051, time 4920.35ms 
iter 4474: loss 2.6665, time 4919.95ms 
iter 4475: loss 2.7291, time 4919.00ms 
iter 4476: loss 2.5543, time 4984.80ms 
iter 4477: loss 2.6503, time 5020.00ms 
iter 4478: loss 2.6770, time 5020.75ms 
iter 4479: loss 2.5020, time 5021.10ms 
iter 4480: loss 2.6886, time 5021.87ms 
iter 4481: loss 2.5248, time 5021.11ms 
iter 4482: loss 2.6029, time 5021.17ms 
iter 4483: loss 2.6444, time 4970.20ms 
iter 4484: loss 2.5476, time 4915.00ms 
iter 4485: loss 2.4331, time 4913.75ms 
iter 4486: loss 2.6094, time 4926.12ms 
iter 4487: loss 2.8824, time 5019.54ms 
iter 4488: loss 2.7309, time 5021.12ms 
iter 4489: loss 2.6162, time 5022.03ms 
iter 4490: loss 2.4781, time 5023.56ms 
iter 4491: loss 2.7889, time 5024.12ms 
iter 4492: loss 2.4724, time 5026.48ms 
iter 4493: loss 2.4679, time 5026.22ms 
iter 4494: loss 2.8159, time 4972.71ms 
iter 4495: loss 2.7376, time 4936.67ms 
iter 4496: loss 2.5760, time 4920.43ms 
iter 4497: loss 2.6005, time 4927.23ms 
iter 4498: loss 2.6001, time 4976.94ms 
iter 4499: loss 2.5353, time 4970.57ms 
step 4500: train loss 2.6001, val loss 2.8464
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 4500: loss 2.6730, time 20750.73ms 
iter 4501: loss 2.4720, time 5005.16ms 
iter 4502: loss 2.6530, time 4915.08ms 
iter 4503: loss 2.5038, time 4914.79ms 
iter 4504: loss 2.4310, time 4919.61ms 
iter 4505: loss 2.5276, time 5024.05ms 
iter 4506: loss 2.7680, time 5026.30ms 
iter 4507: loss 2.6885, time 5025.70ms 
iter 4508: loss 2.7921, time 5030.73ms 
iter 4509: loss 2.5092, time 5027.33ms 
iter 4510: loss 2.7400, time 5018.03ms 
iter 4511: loss 2.6028, time 4953.48ms 
iter 4512: loss 2.5169, time 4917.91ms 
iter 4513: loss 2.5722, time 4913.57ms 
iter 4514: loss 2.7236, time 4914.33ms 
iter 4515: loss 2.7013, time 4990.48ms 
iter 4516: loss 2.4053, time 5015.08ms 
iter 4517: loss 2.6415, time 5021.80ms 
iter 4518: loss 2.6053, time 5021.81ms 
iter 4519: loss 2.5850, time 5020.98ms 
iter 4520: loss 2.4593, time 5023.41ms 
iter 4521: loss 2.3673, time 5027.23ms 
iter 4522: loss 2.6025, time 4976.51ms 
iter 4523: loss 2.4417, time 4916.54ms 
iter 4524: loss 2.7276, time 4934.75ms 
iter 4525: loss 2.8121, time 5014.03ms 
iter 4526: loss 2.6590, time 5022.87ms 
iter 4527: loss 2.7281, time 5002.26ms 
iter 4528: loss 2.6779, time 5002.31ms 
iter 4529: loss 2.3916, time 5025.00ms 
iter 4530: loss 2.4635, time 5020.23ms 
iter 4531: loss 2.6700, time 5022.32ms 
iter 4532: loss 2.7191, time 4989.63ms 
iter 4533: loss 2.5479, time 4925.53ms 
iter 4534: loss 2.6162, time 4924.11ms 
iter 4535: loss 2.6819, time 4915.83ms 
iter 4536: loss 2.5666, time 5011.72ms 
iter 4537: loss 2.8144, time 5028.33ms 
iter 4538: loss 2.6140, time 5024.11ms 
iter 4539: loss 2.5930, time 5016.40ms 
iter 4540: loss 2.4481, time 5023.47ms 
iter 4541: loss 2.5763, time 5060.50ms 
iter 4542: loss 2.7060, time 4971.34ms 
iter 4543: loss 2.6736, time 4956.08ms 
iter 4544: loss 2.6071, time 4930.18ms 
iter 4545: loss 2.4876, time 4970.59ms 
iter 4546: loss 2.5426, time 4960.45ms 
iter 4547: loss 2.5907, time 5021.81ms 
iter 4548: loss 2.6307, time 5026.35ms 
iter 4549: loss 2.7454, time 5031.48ms 
step 4550: train loss 2.6167, val loss 2.8420
iter 4550: loss 2.5331, time 19667.51ms 
iter 4551: loss 2.6175, time 4916.69ms 
iter 4552: loss 2.4250, time 4914.11ms 
iter 4553: loss 2.6981, time 4914.83ms 
iter 4554: loss 2.5876, time 4987.56ms 
iter 4555: loss 2.5009, time 5024.56ms 
iter 4556: loss 2.5563, time 5016.58ms 
iter 4557: loss 2.2637, time 5021.48ms 
iter 4558: loss 2.6795, time 5023.54ms 
iter 4559: loss 2.5719, time 5024.21ms 
iter 4560: loss 2.5596, time 5023.46ms 
iter 4561: loss 2.6642, time 5006.78ms 
iter 4562: loss 2.6280, time 4971.72ms 
iter 4563: loss 2.6507, time 4921.64ms 
iter 4564: loss 2.6002, time 4915.49ms 
iter 4565: loss 2.6220, time 4995.97ms 
iter 4566: loss 2.4993, time 5020.08ms 
iter 4567: loss 2.4291, time 5012.20ms 
iter 4568: loss 2.6435, time 5029.97ms 
iter 4569: loss 2.5360, time 5016.47ms 
iter 4570: loss 2.5343, time 5027.67ms 
iter 4571: loss 2.5448, time 5030.06ms 
iter 4572: loss 2.7360, time 4977.56ms 
iter 4573: loss 2.4872, time 4915.72ms 
iter 4574: loss 2.4701, time 4914.73ms 
iter 4575: loss 2.7147, time 4954.34ms 
iter 4576: loss 2.8008, time 5028.47ms 
iter 4577: loss 2.7883, time 5027.81ms 
iter 4578: loss 2.7113, time 5014.90ms 
iter 4579: loss 2.7289, time 5022.52ms 
iter 4580: loss 2.8399, time 5029.09ms 
iter 4581: loss 2.4294, time 5029.75ms 
iter 4582: loss 2.7879, time 5035.38ms 
iter 4583: loss 2.5721, time 5008.41ms 
iter 4584: loss 2.6913, time 4971.07ms 
iter 4585: loss 2.5923, time 4963.31ms 
iter 4586: loss 2.5900, time 5027.75ms 
iter 4587: loss 2.7268, time 5026.86ms 
iter 4588: loss 2.6928, time 5030.56ms 
iter 4589: loss 2.5451, time 5027.37ms 
iter 4590: loss 2.6612, time 5029.49ms 
iter 4591: loss 2.5131, time 5029.69ms 
iter 4592: loss 2.8174, time 5031.71ms 
iter 4593: loss 2.5259, time 4978.05ms 
iter 4594: loss 2.7371, time 4924.02ms 
iter 4595: loss 2.5421, time 4919.62ms 
iter 4596: loss 2.7037, time 4991.12ms 
iter 4597: loss 2.7060, time 5028.12ms 
iter 4598: loss 2.6285, time 5011.97ms 
iter 4599: loss 2.6229, time 5028.07ms 
step 4600: train loss 2.6086, val loss 2.8336
iter 4600: loss 2.6676, time 19691.92ms 
iter 4601: loss 2.5782, time 4938.47ms 
iter 4602: loss 2.5011, time 4932.70ms 
iter 4603: loss 2.6691, time 4931.91ms 
iter 4604: loss 2.7794, time 4973.62ms 
iter 4605: loss 2.7844, time 5020.31ms 
iter 4606: loss 2.6958, time 5036.88ms 
iter 4607: loss 2.6585, time 5025.36ms 
iter 4608: loss 2.4381, time 5027.44ms 
iter 4609: loss 2.4852, time 5033.72ms 
iter 4610: loss 2.8096, time 4996.85ms 
iter 4611: loss 2.5369, time 4970.30ms 
iter 4612: loss 2.8412, time 4931.34ms 
iter 4613: loss 2.6365, time 4984.92ms 
iter 4614: loss 2.4559, time 5014.62ms 
iter 4615: loss 2.4971, time 5029.57ms 
iter 4616: loss 2.3359, time 5029.53ms 
iter 4617: loss 2.6152, time 5029.36ms 
iter 4618: loss 2.7091, time 5029.60ms 
iter 4619: loss 2.5482, time 5026.59ms 
iter 4620: loss 2.4986, time 4947.62ms 
iter 4621: loss 2.5816, time 4940.29ms 
iter 4622: loss 2.3948, time 4951.20ms 
iter 4623: loss 2.6124, time 4914.14ms 
iter 4624: loss 2.7607, time 5015.26ms 
iter 4625: loss 2.6320, time 5022.00ms 
iter 4626: loss 2.5514, time 5019.50ms 
iter 4627: loss 2.5515, time 5027.23ms 
iter 4628: loss 2.7173, time 5029.09ms 
iter 4629: loss 2.6207, time 5028.88ms 
iter 4630: loss 2.5605, time 5027.82ms 
iter 4631: loss 2.6081, time 4977.33ms 
iter 4632: loss 2.6689, time 4916.64ms 
iter 4633: loss 2.7468, time 4918.26ms 
iter 4634: loss 2.5539, time 5012.47ms 
iter 4635: loss 2.5243, time 5029.25ms 
iter 4636: loss 2.4055, time 5015.75ms 
iter 4637: loss 2.6172, time 5017.19ms 
iter 4638: loss 2.4990, time 5028.43ms 
iter 4639: loss 2.5857, time 5025.61ms 
iter 4640: loss 2.5287, time 4939.00ms 
iter 4641: loss 2.5718, time 4914.72ms 
iter 4642: loss 2.6065, time 4914.42ms 
iter 4643: loss 2.5380, time 4915.85ms 
iter 4644: loss 2.7264, time 4994.86ms 
iter 4645: loss 2.7205, time 5018.05ms 
iter 4646: loss 2.5167, time 4993.90ms 
iter 4647: loss 2.6843, time 4992.09ms 
iter 4648: loss 2.3621, time 4949.26ms 
iter 4649: loss 2.6618, time 5001.35ms 
step 4650: train loss 2.6031, val loss 2.8228
iter 4650: loss 2.7976, time 19657.22ms 
iter 4651: loss 2.6940, time 4914.90ms 
iter 4652: loss 2.5192, time 4913.19ms 
iter 4653: loss 2.4894, time 4928.96ms 
iter 4654: loss 2.7192, time 4913.30ms 
iter 4655: loss 2.4900, time 4946.76ms 
iter 4656: loss 2.5392, time 4977.75ms 
iter 4657: loss 2.7792, time 4976.38ms 
iter 4658: loss 2.6596, time 4924.27ms 
iter 4659: loss 2.9938, time 4926.30ms 
iter 4660: loss 2.5187, time 4935.38ms 
iter 4661: loss 2.5265, time 4960.07ms 
iter 4662: loss 2.4942, time 5006.41ms 
iter 4663: loss 2.1876, time 5015.46ms 
iter 4664: loss 2.5419, time 5029.49ms 
iter 4665: loss 2.6313, time 5030.21ms 
iter 4666: loss 2.5613, time 5029.28ms 
iter 4667: loss 2.7905, time 5038.95ms 
iter 4668: loss 2.7104, time 5028.61ms 
iter 4669: loss 2.4791, time 5006.22ms 
iter 4670: loss 2.5859, time 4981.96ms 
iter 4671: loss 2.3778, time 4983.03ms 
iter 4672: loss 2.4148, time 5014.04ms 
iter 4673: loss 2.5994, time 5031.16ms 
iter 4674: loss 2.5888, time 4984.64ms 
iter 4675: loss 2.7224, time 5007.68ms 
iter 4676: loss 2.4681, time 5028.41ms 
iter 4677: loss 2.4681, time 5023.61ms 
iter 4678: loss 2.6793, time 5035.26ms 
iter 4679: loss 2.7350, time 4945.55ms 
iter 4680: loss 2.5229, time 4917.83ms 
iter 4681: loss 2.6254, time 4955.97ms 
iter 4682: loss 2.4279, time 4941.28ms 
iter 4683: loss 2.5536, time 4971.86ms 
iter 4684: loss 2.6397, time 5027.24ms 
iter 4685: loss 2.7187, time 5025.30ms 
iter 4686: loss 2.4108, time 5028.43ms 
iter 4687: loss 2.5883, time 5031.08ms 
iter 4688: loss 2.3314, time 4998.24ms 
iter 4689: loss 2.5288, time 5032.56ms 
iter 4690: loss 2.4756, time 4977.17ms 
iter 4691: loss 2.6085, time 4917.82ms 
iter 4692: loss 2.3512, time 4918.18ms 
iter 4693: loss 2.6755, time 4970.08ms 
iter 4694: loss 2.6179, time 5014.68ms 
iter 4695: loss 2.4825, time 5027.56ms 
iter 4696: loss 2.5748, time 5032.99ms 
iter 4697: loss 2.6978, time 5033.75ms 
iter 4698: loss 2.8491, time 5030.69ms 
iter 4699: loss 2.6061, time 5030.41ms 
step 4700: train loss 2.5999, val loss 2.8508
iter 4700: loss 2.6501, time 19673.83ms 
iter 4701: loss 2.5662, time 5003.14ms 
iter 4702: loss 2.7044, time 5028.96ms 
iter 4703: loss 2.7111, time 5036.07ms 
iter 4704: loss 2.8252, time 5026.32ms 
iter 4705: loss 2.5581, time 5024.14ms 
iter 4706: loss 2.7208, time 5028.20ms 
iter 4707: loss 2.6697, time 5031.82ms 
iter 4708: loss 2.8000, time 4975.39ms 
iter 4709: loss 2.6488, time 4919.88ms 
iter 4710: loss 2.6857, time 4917.67ms 
iter 4711: loss 2.5894, time 4919.58ms 
iter 4712: loss 2.9829, time 4993.60ms 
iter 4713: loss 2.6558, time 5028.58ms 
iter 4714: loss 2.4584, time 5025.69ms 
iter 4715: loss 2.4911, time 5033.58ms 
iter 4716: loss 2.6496, time 5032.19ms 
iter 4717: loss 2.5737, time 5028.70ms 
iter 4718: loss 2.5338, time 5028.64ms 
iter 4719: loss 2.5668, time 4974.86ms 
iter 4720: loss 2.5457, time 4917.92ms 
iter 4721: loss 2.4916, time 4925.05ms 
iter 4722: loss 2.5235, time 5023.90ms 
iter 4723: loss 2.6262, time 5024.86ms 
iter 4724: loss 2.6996, time 5023.72ms 
iter 4725: loss 2.6236, time 5029.26ms 
iter 4726: loss 2.4761, time 4991.22ms 
iter 4727: loss 2.5895, time 5029.91ms 
iter 4728: loss 2.7270, time 5028.48ms 
iter 4729: loss 2.4537, time 4974.45ms 
iter 4730: loss 2.7357, time 4917.72ms 
iter 4731: loss 2.4465, time 4932.79ms 
iter 4732: loss 2.6877, time 5026.00ms 
iter 4733: loss 2.7920, time 5034.94ms 
iter 4734: loss 2.6547, time 5030.19ms 
iter 4735: loss 2.6925, time 5029.25ms 
iter 4736: loss 2.4780, time 5034.46ms 
iter 4737: loss 2.6661, time 5059.39ms 
iter 4738: loss 2.6550, time 5003.98ms 
iter 4739: loss 2.4908, time 4935.93ms 
iter 4740: loss 2.6384, time 4919.44ms 
iter 4741: loss 2.6446, time 4964.30ms 
iter 4742: loss 2.6183, time 5038.40ms 
iter 4743: loss 2.5767, time 5036.47ms 
iter 4744: loss 2.7766, time 5043.91ms 
iter 4745: loss 2.4985, time 5034.19ms 
iter 4746: loss 2.6737, time 4990.20ms 
iter 4747: loss 2.7210, time 5011.60ms 
iter 4748: loss 2.5549, time 4980.47ms 
iter 4749: loss 2.5264, time 5004.06ms 
step 4750: train loss 2.6030, val loss 2.8495
iter 4750: loss 2.5304, time 19742.78ms 
iter 4751: loss 2.6769, time 5022.27ms 
iter 4752: loss 2.7203, time 5030.89ms 
iter 4753: loss 2.5427, time 5019.93ms 
iter 4754: loss 2.6713, time 5024.06ms 
iter 4755: loss 2.5152, time 4977.77ms 
iter 4756: loss 2.5634, time 4917.86ms 
iter 4757: loss 2.4607, time 4916.48ms 
iter 4758: loss 2.7658, time 4985.36ms 
iter 4759: loss 2.6376, time 5023.09ms 
iter 4760: loss 2.5815, time 5027.09ms 
iter 4761: loss 2.5514, time 5023.50ms 
iter 4762: loss 2.5989, time 5020.19ms 
iter 4763: loss 2.6964, time 5016.04ms 
iter 4764: loss 2.6403, time 5009.64ms 
iter 4765: loss 2.5723, time 5034.27ms 
iter 4766: loss 2.6444, time 4975.40ms 
iter 4767: loss 2.6517, time 4915.61ms 
iter 4768: loss 2.3871, time 4996.24ms 
iter 4769: loss 2.7157, time 5022.29ms 
iter 4770: loss 2.3217, time 5023.62ms 
iter 4771: loss 2.7073, time 5025.11ms 
iter 4772: loss 2.5756, time 5015.52ms 
iter 4773: loss 2.7039, time 5024.75ms 
iter 4774: loss 2.7386, time 5004.38ms 
iter 4775: loss 2.6485, time 4954.49ms 
iter 4776: loss 2.3813, time 4919.90ms 
iter 4777: loss 2.5754, time 4923.89ms 
iter 4778: loss 2.6096, time 4923.11ms 
iter 4779: loss 2.4679, time 4993.46ms 
iter 4780: loss 2.6786, time 5008.04ms 
iter 4781: loss 2.7138, time 5005.81ms 
iter 4782: loss 2.4958, time 5026.29ms 
iter 4783: loss 2.8128, time 5022.21ms 
iter 4784: loss 2.6206, time 5023.58ms 
iter 4785: loss 2.7790, time 4971.74ms 
iter 4786: loss 2.5281, time 4915.15ms 
iter 4787: loss 2.7022, time 4975.83ms 
iter 4788: loss 2.4998, time 5024.39ms 
iter 4789: loss 2.7466, time 5022.56ms 
iter 4790: loss 2.5213, time 5021.50ms 
iter 4791: loss 2.4719, time 5028.46ms 
iter 4792: loss 2.4451, time 5022.90ms 
iter 4793: loss 2.5365, time 5025.59ms 
iter 4794: loss 2.6515, time 4972.28ms 
iter 4795: loss 2.5462, time 4917.78ms 
iter 4796: loss 2.6626, time 4947.46ms 
iter 4797: loss 2.6619, time 5011.71ms 
iter 4798: loss 2.4283, time 5023.70ms 
iter 4799: loss 2.6548, time 5010.51ms 
step 4800: train loss 2.6062, val loss 2.8432
iter 4800: loss 2.7249, time 19736.97ms 
iter 4801: loss 2.5449, time 5044.20ms 
iter 4802: loss 2.7572, time 4944.64ms 
iter 4803: loss 2.7849, time 4918.25ms 
iter 4804: loss 2.4962, time 4979.48ms 
iter 4805: loss 2.7472, time 5033.85ms 
iter 4806: loss 2.7298, time 5037.24ms 
iter 4807: loss 2.4659, time 5037.89ms 
iter 4808: loss 2.6030, time 5036.48ms 
iter 4809: loss 2.5311, time 5008.43ms 
iter 4810: loss 2.6762, time 5032.37ms 
iter 4811: loss 2.5491, time 5038.63ms 
iter 4812: loss 2.6213, time 4981.85ms 
iter 4813: loss 2.6416, time 4920.85ms 
iter 4814: loss 2.5101, time 4986.22ms 
iter 4815: loss 2.5514, time 5026.93ms 
iter 4816: loss 2.6183, time 5032.05ms 
iter 4817: loss 2.6842, time 5029.76ms 
iter 4818: loss 2.5451, time 5028.91ms 
iter 4819: loss 2.5781, time 5031.33ms 
iter 4820: loss 2.5733, time 5028.59ms 
iter 4821: loss 2.8453, time 4981.48ms 
iter 4822: loss 2.5263, time 4921.98ms 
iter 4823: loss 2.5482, time 4981.41ms 
iter 4824: loss 2.7240, time 5037.93ms 
iter 4825: loss 2.5793, time 5029.62ms 
iter 4826: loss 2.2091, time 4983.15ms 
iter 4827: loss 2.8311, time 5035.04ms 
iter 4828: loss 2.6851, time 5033.82ms 
iter 4829: loss 2.4487, time 5034.81ms 
iter 4830: loss 2.5252, time 4969.31ms 
iter 4831: loss 2.4588, time 4920.51ms 
iter 4832: loss 2.4957, time 4919.58ms 
iter 4833: loss 2.7870, time 5012.12ms 
iter 4834: loss 2.6419, time 5031.30ms 
iter 4835: loss 2.5856, time 5028.55ms 
iter 4836: loss 2.5080, time 5008.59ms 
iter 4837: loss 2.7399, time 4985.70ms 
iter 4838: loss 2.8358, time 5029.57ms 
iter 4839: loss 2.7333, time 5034.79ms 
iter 4840: loss 2.7294, time 4924.98ms 
iter 4841: loss 2.7141, time 4918.67ms 
iter 4842: loss 2.4457, time 4945.95ms 
iter 4843: loss 2.5522, time 4984.39ms 
iter 4844: loss 2.5238, time 5002.70ms 
iter 4845: loss 2.6986, time 5033.15ms 
iter 4846: loss 2.6690, time 5030.42ms 
iter 4847: loss 2.3531, time 5016.86ms 
iter 4848: loss 2.4368, time 5030.62ms 
iter 4849: loss 2.6721, time 5020.64ms 
step 4850: train loss 2.6070, val loss 2.8589
iter 4850: loss 2.3606, time 19744.21ms 
iter 4851: loss 2.6427, time 5028.67ms 
iter 4852: loss 2.3890, time 5031.77ms 
iter 4853: loss 2.5008, time 4987.43ms 
iter 4854: loss 2.5322, time 5029.48ms 
iter 4855: loss 2.5893, time 5016.77ms 
iter 4856: loss 2.6525, time 5032.06ms 
iter 4857: loss 2.4402, time 4978.44ms 
iter 4858: loss 2.6798, time 4917.85ms 
iter 4859: loss 2.5686, time 4985.74ms 
iter 4860: loss 2.5436, time 5025.26ms 
iter 4861: loss 2.7085, time 5016.33ms 
iter 4862: loss 2.4937, time 5025.04ms 
iter 4863: loss 2.4496, time 5023.63ms 
iter 4864: loss 2.4962, time 5014.61ms 
iter 4865: loss 2.5498, time 5001.50ms 
iter 4866: loss 2.6530, time 4957.06ms 
iter 4867: loss 2.6118, time 4918.88ms 
iter 4868: loss 2.5763, time 4962.60ms 
iter 4869: loss 2.5476, time 5023.17ms 
iter 4870: loss 2.5593, time 5028.59ms 
iter 4871: loss 2.8454, time 5035.14ms 
iter 4872: loss 2.5210, time 5030.55ms 
iter 4873: loss 2.5735, time 5027.20ms 
iter 4874: loss 2.7785, time 5029.12ms 
iter 4875: loss 2.7370, time 5029.91ms 
iter 4876: loss 2.5791, time 4977.19ms 
iter 4877: loss 2.6035, time 4939.85ms 
iter 4878: loss 2.7652, time 4950.11ms 
iter 4879: loss 2.6020, time 5025.64ms 
iter 4880: loss 2.7294, time 5031.93ms 
iter 4881: loss 2.6643, time 5018.90ms 
iter 4882: loss 2.5939, time 5026.19ms 
iter 4883: loss 2.5723, time 5025.66ms 
iter 4884: loss 2.5433, time 5031.01ms 
iter 4885: loss 2.4511, time 5030.75ms 
iter 4886: loss 2.8454, time 4978.57ms 
iter 4887: loss 2.6813, time 4916.57ms 
iter 4888: loss 2.4052, time 5001.87ms 
iter 4889: loss 2.5023, time 5031.90ms 
iter 4890: loss 2.6901, time 5035.82ms 
iter 4891: loss 2.7862, time 5027.94ms 
iter 4892: loss 2.6759, time 5020.30ms 
iter 4893: loss 2.5308, time 5015.87ms 
iter 4894: loss 2.6295, time 5031.41ms 
iter 4895: loss 2.4282, time 4978.56ms 
iter 4896: loss 2.6204, time 4917.46ms 
iter 4897: loss 2.6254, time 4935.42ms 
iter 4898: loss 2.6329, time 4928.10ms 
iter 4899: loss 2.4152, time 4979.44ms 
step 4900: train loss 2.5866, val loss 2.8463
iter 4900: loss 2.5116, time 19674.90ms 
iter 4901: loss 2.6075, time 5019.60ms 
iter 4902: loss 2.5344, time 4977.06ms 
iter 4903: loss 2.5278, time 4916.07ms 
iter 4904: loss 2.4992, time 4929.93ms 
iter 4905: loss 2.6673, time 5002.97ms 
iter 4906: loss 2.6189, time 5025.48ms 
iter 4907: loss 2.6628, time 5029.05ms 
iter 4908: loss 2.4541, time 5028.91ms 
iter 4909: loss 2.5503, time 5033.12ms 
iter 4910: loss 2.4823, time 5029.16ms 
iter 4911: loss 2.7395, time 5029.55ms 
iter 4912: loss 2.4173, time 4977.05ms 
iter 4913: loss 2.6533, time 4917.59ms 
iter 4914: loss 2.6821, time 4936.53ms 
iter 4915: loss 2.3856, time 5011.68ms 
iter 4916: loss 2.6241, time 5031.61ms 
iter 4917: loss 2.6680, time 5016.68ms 
iter 4918: loss 2.5789, time 4935.50ms 
iter 4919: loss 2.6877, time 4925.32ms 
iter 4920: loss 2.6174, time 4929.03ms 
iter 4921: loss 2.7426, time 4949.82ms 
iter 4922: loss 2.6863, time 4915.76ms 
iter 4923: loss 2.6102, time 4917.21ms 
iter 4924: loss 2.6643, time 4976.21ms 
iter 4925: loss 2.6287, time 4929.26ms 
iter 4926: loss 2.5248, time 4927.26ms 
iter 4927: loss 2.5794, time 4926.90ms 
iter 4928: loss 2.6080, time 4943.23ms 
iter 4929: loss 2.7986, time 5020.34ms 
iter 4930: loss 2.6563, time 5015.59ms 
iter 4931: loss 2.6984, time 5024.64ms 
iter 4932: loss 2.5140, time 4970.61ms 
iter 4933: loss 2.7208, time 4915.35ms 
iter 4934: loss 2.4446, time 4969.54ms 
iter 4935: loss 2.7500, time 5022.22ms 
iter 4936: loss 2.6376, time 5020.34ms 
iter 4937: loss 2.5414, time 5021.09ms 
iter 4938: loss 2.4901, time 5019.90ms 
iter 4939: loss 2.6900, time 5019.60ms 
iter 4940: loss 2.5220, time 5021.37ms 
iter 4941: loss 2.4827, time 4970.54ms 
iter 4942: loss 2.6378, time 4915.10ms 
iter 4943: loss 2.4779, time 4914.33ms 
iter 4944: loss 2.6132, time 4949.04ms 
iter 4945: loss 2.5765, time 5019.03ms 
iter 4946: loss 2.5539, time 5018.19ms 
iter 4947: loss 2.6381, time 4957.35ms 
iter 4948: loss 2.6860, time 5012.92ms 
iter 4949: loss 2.7446, time 5015.70ms 
step 4950: train loss 2.5959, val loss 2.8359
iter 4950: loss 2.6171, time 19621.31ms 
iter 4951: loss 2.6515, time 4926.43ms 
iter 4952: loss 2.6717, time 4926.69ms 
iter 4953: loss 2.5931, time 4979.27ms 
iter 4954: loss 2.8466, time 4950.36ms 
iter 4955: loss 2.5895, time 4963.78ms 
iter 4956: loss 2.4611, time 4940.41ms 
iter 4957: loss 2.3634, time 4998.54ms 
iter 4958: loss 2.5196, time 5050.70ms 
iter 4959: loss 2.5798, time 4977.79ms 
iter 4960: loss 2.7257, time 4922.84ms 
iter 4961: loss 2.3567, time 5018.05ms 
iter 4962: loss 2.5871, time 5028.10ms 
iter 4963: loss 2.5586, time 5019.72ms 
iter 4964: loss 2.7732, time 4991.38ms 
iter 4965: loss 2.4199, time 5033.53ms 
iter 4966: loss 2.6930, time 5030.28ms 
iter 4967: loss 2.7184, time 4984.40ms 
iter 4968: loss 2.6105, time 4925.59ms 
iter 4969: loss 2.7394, time 4926.42ms 
iter 4970: loss 2.4164, time 4924.73ms 
iter 4971: loss 2.5752, time 4975.01ms 
iter 4972: loss 2.8198, time 4957.05ms 
iter 4973: loss 2.6447, time 4968.84ms 
iter 4974: loss 2.5717, time 4947.44ms 
iter 4975: loss 2.6358, time 5007.16ms 
iter 4976: loss 2.7559, time 5047.37ms 
iter 4977: loss 2.6472, time 5018.84ms 
iter 4978: loss 2.6540, time 4955.57ms 
iter 4979: loss 2.5218, time 4919.04ms 
iter 4980: loss 2.6126, time 4984.89ms 
iter 4981: loss 2.6486, time 5014.51ms 
iter 4982: loss 2.7036, time 5006.65ms 
iter 4983: loss 2.5157, time 5025.46ms 
iter 4984: loss 2.8014, time 5022.14ms 
iter 4985: loss 2.5919, time 5016.88ms 
iter 4986: loss 2.5887, time 4994.17ms 
iter 4987: loss 2.5849, time 4934.04ms 
iter 4988: loss 2.7752, time 4928.82ms 
iter 4989: loss 2.7181, time 4935.88ms 
iter 4990: loss 2.5666, time 5020.16ms 
iter 4991: loss 2.3901, time 5004.08ms 
iter 4992: loss 2.5393, time 5010.17ms 
iter 4993: loss 2.5286, time 5001.41ms 
iter 4994: loss 2.6372, time 4984.96ms 
iter 4995: loss 2.7081, time 4983.71ms 
iter 4996: loss 2.6101, time 4990.05ms 
iter 4997: loss 2.6790, time 4924.88ms 
iter 4998: loss 2.4962, time 4988.58ms 
iter 4999: loss 2.5183, time 4997.82ms 
step 5000: train loss 2.5953, val loss 2.8360
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 5000: loss 2.6684, time 20721.04ms 
iter 5001: loss 2.3134, time 5030.63ms 
iter 5002: loss 2.4562, time 5039.57ms 
iter 5003: loss 2.4588, time 5005.91ms 
iter 5004: loss 2.6647, time 5041.67ms 
iter 5005: loss 2.4888, time 4979.78ms 
iter 5006: loss 2.6243, time 5031.62ms 
iter 5007: loss 2.4604, time 5042.06ms 
iter 5008: loss 2.4324, time 5039.76ms 
iter 5009: loss 2.5581, time 5047.91ms 
iter 5010: loss 2.7867, time 5045.00ms 
iter 5011: loss 2.8369, time 5036.41ms 
iter 5012: loss 2.5761, time 5034.50ms 
iter 5013: loss 2.7196, time 4957.08ms 
iter 5014: loss 2.5549, time 4929.71ms 
iter 5015: loss 2.5251, time 5032.18ms 
iter 5016: loss 2.5048, time 5028.43ms 
iter 5017: loss 2.5821, time 5039.66ms 
iter 5018: loss 2.5654, time 5032.21ms 
iter 5019: loss 2.7860, time 5036.20ms 
iter 5020: loss 2.5981, time 5027.40ms 
iter 5021: loss 2.5970, time 5036.68ms 
iter 5022: loss 2.4329, time 4986.47ms 
iter 5023: loss 2.7752, time 4923.00ms 
iter 5024: loss 2.4488, time 4981.97ms 
iter 5025: loss 2.4429, time 5007.85ms 
iter 5026: loss 2.4315, time 5036.47ms 
iter 5027: loss 2.6057, time 5019.93ms 
iter 5028: loss 2.5194, time 5012.90ms 
iter 5029: loss 2.6282, time 5030.49ms 
iter 5030: loss 2.7055, time 5029.57ms 
iter 5031: loss 2.5287, time 4985.31ms 
iter 5032: loss 2.5670, time 4938.06ms 
iter 5033: loss 2.5674, time 4958.48ms 
iter 5034: loss 2.6362, time 5032.83ms 
iter 5035: loss 2.8256, time 5026.36ms 
iter 5036: loss 2.5481, time 5026.38ms 
iter 5037: loss 2.6041, time 5043.27ms 
iter 5038: loss 2.5187, time 5046.21ms 
iter 5039: loss 2.8295, time 5024.63ms 
iter 5040: loss 2.4521, time 5032.97ms 
iter 5041: loss 2.6969, time 4985.70ms 
iter 5042: loss 2.5149, time 4922.84ms 
iter 5043: loss 2.4019, time 5014.23ms 
iter 5044: loss 2.4989, time 5042.43ms 
iter 5045: loss 2.7065, time 5026.17ms 
iter 5046: loss 2.5473, time 5038.46ms 
iter 5047: loss 2.3640, time 5019.44ms 
iter 5048: loss 2.7794, time 5020.65ms 
iter 5049: loss 2.5811, time 5027.49ms 
step 5050: train loss 2.5931, val loss 2.8408
iter 5050: loss 2.6776, time 19750.31ms 
iter 5051: loss 2.5712, time 4915.04ms 
iter 5052: loss 2.8511, time 4972.05ms 
iter 5053: loss 2.5287, time 5032.19ms 
iter 5054: loss 2.3934, time 5030.15ms 
iter 5055: loss 2.7427, time 5036.48ms 
iter 5056: loss 2.3084, time 5045.38ms 
iter 5057: loss 2.4954, time 5009.80ms 
iter 5058: loss 2.5873, time 4977.30ms 
iter 5059: loss 2.5443, time 5031.90ms 
iter 5060: loss 2.5359, time 5045.85ms 
iter 5061: loss 2.4937, time 5046.26ms 
iter 5062: loss 2.6010, time 5038.47ms 
iter 5063: loss 2.5933, time 5032.80ms 
iter 5064: loss 2.5715, time 5038.26ms 
iter 5065: loss 2.5073, time 5040.74ms 
iter 5066: loss 2.4231, time 4976.22ms 
iter 5067: loss 2.4953, time 4916.97ms 
iter 5068: loss 2.7293, time 5001.28ms 
iter 5069: loss 2.6812, time 5041.01ms 
iter 5070: loss 2.5965, time 5036.20ms 
iter 5071: loss 2.6614, time 4936.77ms 
iter 5072: loss 2.5138, time 4988.17ms 
iter 5073: loss 2.4619, time 5014.84ms 
iter 5074: loss 2.8232, time 5023.03ms 
iter 5075: loss 2.2316, time 4979.33ms 
iter 5076: loss 2.6855, time 4922.22ms 
iter 5077: loss 2.5742, time 4949.53ms 
iter 5078: loss 2.5412, time 5026.11ms 
iter 5079: loss 2.6282, time 5041.01ms 
iter 5080: loss 2.5564, time 5045.37ms 
iter 5081: loss 2.8164, time 5032.53ms 
iter 5082: loss 2.6919, time 5048.60ms 
iter 5083: loss 2.6030, time 5032.10ms 
iter 5084: loss 2.6501, time 5038.80ms 
iter 5085: loss 2.5971, time 4928.33ms 
iter 5086: loss 2.5689, time 4935.80ms 
iter 5087: loss 2.3328, time 4945.84ms 
iter 5088: loss 2.6172, time 5033.33ms 
iter 5089: loss 2.6002, time 5041.05ms 
iter 5090: loss 2.6967, time 5027.37ms 
iter 5091: loss 2.5021, time 5028.10ms 
iter 5092: loss 2.6607, time 5026.94ms 
iter 5093: loss 2.6097, time 5012.29ms 
iter 5094: loss 2.5547, time 5032.65ms 
iter 5095: loss 2.8197, time 4970.67ms 
iter 5096: loss 2.2000, time 4917.24ms 
iter 5097: loss 2.5297, time 4987.60ms 
iter 5098: loss 2.5248, time 5027.30ms 
iter 5099: loss 2.7004, time 5017.56ms 
step 5100: train loss 2.5812, val loss 2.8373
iter 5100: loss 2.5651, time 19683.21ms 
iter 5101: loss 2.6152, time 4923.39ms 
iter 5102: loss 2.6185, time 4948.09ms 
iter 5103: loss 2.5005, time 4938.09ms 
iter 5104: loss 2.4577, time 5027.00ms 
iter 5105: loss 2.4882, time 5025.93ms 
iter 5106: loss 2.6569, time 4973.59ms 
iter 5107: loss 2.5393, time 4938.39ms 
iter 5108: loss 2.8324, time 5027.03ms 
iter 5109: loss 2.5610, time 5010.72ms 
iter 5110: loss 2.5825, time 5026.62ms 
iter 5111: loss 2.5333, time 4976.44ms 
iter 5112: loss 2.7524, time 4917.62ms 
iter 5113: loss 2.8059, time 4945.91ms 
iter 5114: loss 2.6296, time 5024.38ms 
iter 5115: loss 2.6627, time 5020.41ms 
iter 5116: loss 2.4528, time 5021.69ms 
iter 5117: loss 2.6107, time 5021.87ms 
iter 5118: loss 2.7657, time 5023.07ms 
iter 5119: loss 2.6337, time 5029.69ms 
iter 5120: loss 2.5100, time 5032.42ms 
iter 5121: loss 2.7539, time 4962.13ms 
iter 5122: loss 2.5972, time 4916.15ms 
iter 5123: loss 2.5362, time 4937.86ms 
iter 5124: loss 2.5192, time 4974.36ms 
iter 5125: loss 2.6465, time 5024.28ms 
iter 5126: loss 2.4657, time 4948.78ms 
iter 5127: loss 2.6282, time 4942.77ms 
iter 5128: loss 2.8138, time 4928.60ms 
iter 5129: loss 2.7170, time 4951.45ms 
iter 5130: loss 2.6260, time 4927.69ms 
iter 5131: loss 2.5634, time 4928.44ms 
iter 5132: loss 2.6767, time 4928.79ms 
iter 5133: loss 2.5442, time 4960.89ms 
iter 5134: loss 2.5127, time 5030.33ms 
iter 5135: loss 2.7058, time 5024.71ms 
iter 5136: loss 2.4312, time 4996.07ms 
iter 5137: loss 2.5208, time 5033.04ms 
iter 5138: loss 2.4324, time 5025.35ms 
iter 5139: loss 2.5106, time 5028.93ms 
iter 5140: loss 2.5207, time 5036.56ms 
iter 5141: loss 2.6215, time 4938.07ms 
iter 5142: loss 2.7944, time 4914.60ms 
iter 5143: loss 2.7111, time 4965.13ms 
iter 5144: loss 2.6046, time 5029.79ms 
iter 5145: loss 2.7054, time 5024.99ms 
iter 5146: loss 2.6108, time 5013.75ms 
iter 5147: loss 2.6166, time 5027.73ms 
iter 5148: loss 2.4952, time 5026.70ms 
iter 5149: loss 2.7137, time 5022.52ms 
step 5150: train loss 2.5881, val loss 2.8331
iter 5150: loss 2.6149, time 19724.85ms 
iter 5151: loss 2.6288, time 5031.91ms 
iter 5152: loss 2.7173, time 5032.24ms 
iter 5153: loss 2.7366, time 5030.57ms 
iter 5154: loss 2.6513, time 5016.51ms 
iter 5155: loss 2.7121, time 5031.41ms 
iter 5156: loss 2.5766, time 5004.87ms 
iter 5157: loss 2.5949, time 5001.06ms 
iter 5158: loss 2.7026, time 5010.10ms 
iter 5159: loss 2.4385, time 5026.01ms 
iter 5160: loss 2.5963, time 5030.29ms 
iter 5161: loss 2.4033, time 5028.12ms 
iter 5162: loss 2.5633, time 5040.59ms 
iter 5163: loss 2.5840, time 5031.78ms 
iter 5164: loss 2.3433, time 5033.11ms 
iter 5165: loss 2.6316, time 5010.75ms 
iter 5166: loss 2.7595, time 4923.17ms 
iter 5167: loss 2.4989, time 4917.64ms 
iter 5168: loss 2.6186, time 4975.33ms 
iter 5169: loss 2.4504, time 5032.38ms 
iter 5170: loss 2.5040, time 5031.78ms 
iter 5171: loss 2.6397, time 5030.73ms 
iter 5172: loss 2.6267, time 5035.89ms 
iter 5173: loss 2.6417, time 5033.74ms 
iter 5174: loss 2.4317, time 5028.03ms 
iter 5175: loss 2.4208, time 4963.18ms 
iter 5176: loss 2.5552, time 4926.15ms 
iter 5177: loss 2.6319, time 4917.99ms 
iter 5178: loss 2.5623, time 4993.68ms 
iter 5179: loss 2.4701, time 5035.51ms 
iter 5180: loss 2.7110, time 5033.90ms 
iter 5181: loss 2.4629, time 5036.94ms 
iter 5182: loss 2.2435, time 5033.06ms 
iter 5183: loss 2.6415, time 5024.42ms 
iter 5184: loss 2.5066, time 5030.52ms 
iter 5185: loss 2.5981, time 4993.80ms 
iter 5186: loss 2.4481, time 4971.19ms 
iter 5187: loss 2.5765, time 5001.72ms 
iter 5188: loss 2.5946, time 4957.23ms 
iter 5189: loss 2.4217, time 5021.06ms 
iter 5190: loss 2.5192, time 5034.77ms 
iter 5191: loss 2.5196, time 5031.24ms 
iter 5192: loss 2.7574, time 5030.02ms 
iter 5193: loss 2.4363, time 5031.65ms 
iter 5194: loss 2.5814, time 4977.43ms 
iter 5195: loss 2.4923, time 4957.79ms 
iter 5196: loss 2.8834, time 4947.99ms 
iter 5197: loss 2.4210, time 5026.99ms 
iter 5198: loss 2.7029, time 5034.43ms 
iter 5199: loss 2.6379, time 5034.13ms 
step 5200: train loss 2.5773, val loss 2.8345
iter 5200: loss 2.6226, time 19710.58ms 
iter 5201: loss 2.4573, time 4971.02ms 
iter 5202: loss 2.5898, time 4956.51ms 
iter 5203: loss 2.6871, time 4946.21ms 
iter 5204: loss 2.7036, time 5023.89ms 
iter 5205: loss 2.7114, time 5030.99ms 
iter 5206: loss 2.5111, time 5006.05ms 
iter 5207: loss 2.5116, time 5011.52ms 
iter 5208: loss 2.5226, time 5020.08ms 
iter 5209: loss 2.5814, time 5007.88ms 
iter 5210: loss 2.4913, time 5019.95ms 
iter 5211: loss 2.6586, time 4968.42ms 
iter 5212: loss 2.6808, time 4914.31ms 
iter 5213: loss 2.4771, time 4971.20ms 
iter 5214: loss 2.5590, time 5019.73ms 
iter 5215: loss 2.4356, time 5019.89ms 
iter 5216: loss 2.6814, time 5025.27ms 
iter 5217: loss 2.5870, time 5019.88ms 
iter 5218: loss 2.4674, time 5013.55ms 
iter 5219: loss 2.4365, time 5016.02ms 
iter 5220: loss 2.4828, time 5025.81ms 
iter 5221: loss 2.6414, time 4975.12ms 
iter 5222: loss 2.7252, time 4922.08ms 
iter 5223: loss 2.5702, time 4998.39ms 
iter 5224: loss 2.6327, time 5019.18ms 
iter 5225: loss 2.5312, time 5019.88ms 
iter 5226: loss 2.4365, time 5020.74ms 
iter 5227: loss 2.3958, time 5018.45ms 
iter 5228: loss 2.6717, time 5022.08ms 
iter 5229: loss 2.5147, time 5024.49ms 
iter 5230: loss 2.6857, time 4973.68ms 
iter 5231: loss 2.5536, time 4949.08ms 
iter 5232: loss 2.4743, time 4975.40ms 
iter 5233: loss 2.6382, time 5021.32ms 
iter 5234: loss 2.3208, time 5019.31ms 
iter 5235: loss 2.3932, time 5007.71ms 
iter 5236: loss 2.5876, time 5023.87ms 
iter 5237: loss 2.4959, time 5021.82ms 
iter 5238: loss 2.6546, time 5004.06ms 
iter 5239: loss 2.5998, time 4926.34ms 
iter 5240: loss 2.5547, time 4925.52ms 
iter 5241: loss 2.6127, time 4937.71ms 
iter 5242: loss 2.5604, time 4946.76ms 
iter 5243: loss 2.6154, time 5022.16ms 
iter 5244: loss 2.5051, time 5019.34ms 
iter 5245: loss 2.6115, time 5021.77ms 
iter 5246: loss 2.5023, time 5009.74ms 
iter 5247: loss 2.4746, time 5028.25ms 
iter 5248: loss 2.6531, time 5025.62ms 
iter 5249: loss 2.7190, time 5033.06ms 
step 5250: train loss 2.5943, val loss 2.8439
iter 5250: loss 2.4709, time 19682.02ms 
iter 5251: loss 2.6566, time 5028.19ms 
iter 5252: loss 2.5284, time 5027.92ms 
iter 5253: loss 2.7404, time 5030.95ms 
iter 5254: loss 2.5698, time 5028.36ms 
iter 5255: loss 2.5873, time 5032.62ms 
iter 5256: loss 2.4598, time 4955.84ms 
iter 5257: loss 2.6265, time 4953.85ms 
iter 5258: loss 2.6189, time 4981.50ms 
iter 5259: loss 2.6387, time 5038.45ms 
iter 5260: loss 2.7116, time 5030.45ms 
iter 5261: loss 2.4658, time 5030.42ms 
iter 5262: loss 2.6566, time 5031.57ms 
iter 5263: loss 2.7127, time 5006.79ms 
iter 5264: loss 2.5322, time 4969.06ms 
iter 5265: loss 2.5029, time 4957.85ms 
iter 5266: loss 2.7185, time 4915.27ms 
iter 5267: loss 2.6082, time 4946.23ms 
iter 5268: loss 2.5242, time 4948.76ms 
iter 5269: loss 2.6195, time 4932.44ms 
iter 5270: loss 2.2835, time 4957.53ms 
iter 5271: loss 2.6360, time 4932.10ms 
iter 5272: loss 2.7277, time 4963.98ms 
iter 5273: loss 2.5960, time 4923.85ms 
iter 5274: loss 2.5513, time 4916.58ms 
iter 5275: loss 2.5828, time 4921.50ms 
iter 5276: loss 2.7157, time 4917.01ms 
iter 5277: loss 2.7281, time 4927.34ms 
iter 5278: loss 2.5834, time 4960.84ms 
iter 5279: loss 2.5482, time 5005.05ms 
iter 5280: loss 2.6737, time 5019.88ms 
iter 5281: loss 2.6458, time 5032.18ms 
iter 5282: loss 2.8221, time 5035.20ms 
iter 5283: loss 2.6936, time 5045.25ms 
iter 5284: loss 2.4338, time 5019.42ms 
iter 5285: loss 2.4166, time 4979.31ms 
iter 5286: loss 2.5996, time 5009.05ms 
iter 5287: loss 2.5774, time 5031.07ms 
iter 5288: loss 2.4942, time 5039.81ms 
iter 5289: loss 2.5812, time 5038.42ms 
iter 5290: loss 2.7556, time 5026.12ms 
iter 5291: loss 2.7447, time 5031.48ms 
iter 5292: loss 2.6063, time 5027.76ms 
iter 5293: loss 2.4677, time 4973.20ms 
iter 5294: loss 2.6684, time 4926.40ms 
iter 5295: loss 2.9075, time 4983.43ms 
iter 5296: loss 2.4736, time 5002.23ms 
iter 5297: loss 2.4237, time 5024.84ms 
iter 5298: loss 2.6787, time 5027.73ms 
iter 5299: loss 2.6466, time 5017.62ms 
step 5300: train loss 2.5795, val loss 2.8382
iter 5300: loss 2.5996, time 19638.00ms 
iter 5301: loss 2.5591, time 5036.79ms 
iter 5302: loss 2.5899, time 5060.12ms 
iter 5303: loss 2.8235, time 5057.75ms 
iter 5304: loss 2.6294, time 4994.70ms 
iter 5305: loss 2.5829, time 5001.61ms 
iter 5306: loss 2.6205, time 5020.85ms 
iter 5307: loss 2.6492, time 5025.50ms 
iter 5308: loss 2.5559, time 4976.41ms 
iter 5309: loss 2.5701, time 4930.48ms 
iter 5310: loss 2.6089, time 4979.73ms 
iter 5311: loss 2.6161, time 4969.54ms 
iter 5312: loss 2.5418, time 5020.60ms 
iter 5313: loss 2.5052, time 4998.07ms 
iter 5314: loss 2.4442, time 5007.57ms 
iter 5315: loss 2.6876, time 5027.65ms 
iter 5316: loss 2.3452, time 5028.42ms 
iter 5317: loss 2.6904, time 5023.35ms 
iter 5318: loss 2.6909, time 4968.07ms 
iter 5319: loss 2.5489, time 4924.30ms 
iter 5320: loss 2.7166, time 4988.46ms 
iter 5321: loss 2.7103, time 5019.58ms 
iter 5322: loss 2.5979, time 5018.98ms 
iter 5323: loss 2.7503, time 5015.14ms 
iter 5324: loss 2.6379, time 5021.48ms 
iter 5325: loss 2.6512, time 5017.35ms 
iter 5326: loss 2.5393, time 5023.05ms 
iter 5327: loss 2.4496, time 4967.96ms 
iter 5328: loss 2.6221, time 4913.45ms 
iter 5329: loss 2.4495, time 4932.23ms 
iter 5330: loss 2.3976, time 5023.12ms 
iter 5331: loss 2.2849, time 5020.05ms 
iter 5332: loss 2.6748, time 5021.02ms 
iter 5333: loss 2.6122, time 5019.77ms 
iter 5334: loss 2.6730, time 4996.81ms 
iter 5335: loss 2.5673, time 5014.48ms 
iter 5336: loss 2.6546, time 5027.76ms 
iter 5337: loss 2.6064, time 4977.70ms 
iter 5338: loss 2.8117, time 4915.10ms 
iter 5339: loss 2.3545, time 4962.02ms 
iter 5340: loss 2.5388, time 5029.20ms 
iter 5341: loss 2.4935, time 5028.06ms 
iter 5342: loss 2.7807, time 5031.70ms 
iter 5343: loss 2.7031, time 5025.90ms 
iter 5344: loss 2.7554, time 5028.45ms 
iter 5345: loss 2.7417, time 5032.91ms 
iter 5346: loss 2.4858, time 5029.69ms 
iter 5347: loss 2.4229, time 4960.71ms 
iter 5348: loss 2.2578, time 4914.97ms 
iter 5349: loss 2.7105, time 4992.29ms 
step 5350: train loss 2.5781, val loss 2.8437
iter 5350: loss 2.6394, time 19653.47ms 
iter 5351: loss 2.4325, time 5022.60ms 
iter 5352: loss 2.4761, time 5020.77ms 
iter 5353: loss 2.4628, time 4975.10ms 
iter 5354: loss 2.6080, time 4973.58ms 
iter 5355: loss 2.4539, time 4974.32ms 
iter 5356: loss 2.6760, time 5027.89ms 
iter 5357: loss 2.6180, time 5027.55ms 
iter 5358: loss 2.5392, time 5028.29ms 
iter 5359: loss 2.6249, time 5025.50ms 
iter 5360: loss 2.5469, time 5021.51ms 
iter 5361: loss 2.7691, time 5022.64ms 
iter 5362: loss 2.7590, time 4974.88ms 
iter 5363: loss 2.5576, time 4915.28ms 
iter 5364: loss 2.5092, time 4916.92ms 
iter 5365: loss 2.5752, time 4948.50ms 
iter 5366: loss 2.6240, time 5028.89ms 
iter 5367: loss 2.7111, time 5030.29ms 
iter 5368: loss 2.5148, time 5027.20ms 
iter 5369: loss 2.6488, time 5031.27ms 
iter 5370: loss 2.5844, time 5009.82ms 
iter 5371: loss 2.6349, time 5027.60ms 
iter 5372: loss 2.5690, time 5032.69ms 
iter 5373: loss 2.6588, time 4957.79ms 
iter 5374: loss 2.5888, time 4919.30ms 
iter 5375: loss 2.6850, time 4962.48ms 
iter 5376: loss 2.2996, time 5034.65ms 
iter 5377: loss 2.5244, time 5027.42ms 
iter 5378: loss 2.4943, time 5015.36ms 
iter 5379: loss 2.5233, time 5032.09ms 
iter 5380: loss 2.4805, time 5029.48ms 
iter 5381: loss 2.6449, time 5032.06ms 
iter 5382: loss 2.5499, time 4980.77ms 
iter 5383: loss 2.5447, time 4965.31ms 
iter 5384: loss 2.6168, time 4919.28ms 
iter 5385: loss 2.5711, time 4982.63ms 
iter 5386: loss 2.3365, time 5027.12ms 
iter 5387: loss 2.5567, time 5022.45ms 
iter 5388: loss 2.5324, time 5026.96ms 
iter 5389: loss 2.5933, time 5029.33ms 
iter 5390: loss 2.3865, time 5029.75ms 
iter 5391: loss 2.5285, time 5033.00ms 
iter 5392: loss 2.6807, time 4983.09ms 
iter 5393: loss 2.6612, time 4961.01ms 
iter 5394: loss 2.8221, time 5007.29ms 
iter 5395: loss 2.5394, time 5024.50ms 
iter 5396: loss 2.6467, time 5030.87ms 
iter 5397: loss 2.6980, time 4997.08ms 
iter 5398: loss 2.7541, time 4982.31ms 
iter 5399: loss 2.7040, time 5014.15ms 
step 5400: train loss 2.5829, val loss 2.8398
iter 5400: loss 2.6950, time 19668.04ms 
iter 5401: loss 2.6120, time 5011.15ms 
iter 5402: loss 2.6790, time 4988.06ms 
iter 5403: loss 2.6999, time 5030.25ms 
iter 5404: loss 2.6878, time 5026.78ms 
iter 5405: loss 2.6363, time 5030.86ms 
iter 5406: loss 2.4821, time 5031.25ms 
iter 5407: loss 2.4122, time 4959.10ms 
iter 5408: loss 2.5972, time 4923.47ms 
iter 5409: loss 2.8345, time 4921.09ms 
iter 5410: loss 2.5766, time 4991.32ms 
iter 5411: loss 2.4795, time 5022.69ms 
iter 5412: loss 2.4265, time 5025.05ms 
iter 5413: loss 2.8580, time 5022.77ms 
iter 5414: loss 2.4681, time 4961.24ms 
iter 5415: loss 2.3767, time 4984.39ms 
iter 5416: loss 2.5659, time 5026.36ms 
iter 5417: loss 2.4402, time 4966.95ms 
iter 5418: loss 2.6618, time 4934.28ms 
iter 5419: loss 2.5861, time 4983.50ms 
iter 5420: loss 2.5468, time 5030.28ms 
iter 5421: loss 2.7152, time 5029.80ms 
iter 5422: loss 2.5539, time 5018.45ms 
iter 5423: loss 2.5576, time 5034.88ms 
iter 5424: loss 2.4069, time 5038.40ms 
iter 5425: loss 2.4819, time 5031.95ms 
iter 5426: loss 2.6207, time 4999.86ms 
iter 5427: loss 2.4431, time 4962.30ms 
iter 5428: loss 2.4514, time 4914.18ms 
iter 5429: loss 2.5735, time 5000.25ms 
iter 5430: loss 2.5726, time 5031.63ms 
iter 5431: loss 2.4769, time 5027.11ms 
iter 5432: loss 2.6291, time 5026.83ms 
iter 5433: loss 2.6854, time 5025.63ms 
iter 5434: loss 2.6056, time 5026.77ms 
iter 5435: loss 2.6614, time 4964.69ms 
iter 5436: loss 2.2335, time 4914.25ms 
iter 5437: loss 2.6130, time 4934.73ms 
iter 5438: loss 2.3833, time 4956.34ms 
iter 5439: loss 2.3886, time 5027.26ms 
iter 5440: loss 2.7006, time 5019.35ms 
iter 5441: loss 2.5912, time 5026.85ms 
iter 5442: loss 2.6749, time 5022.35ms 
iter 5443: loss 2.4436, time 5020.96ms 
iter 5444: loss 2.4011, time 5017.75ms 
iter 5445: loss 2.6013, time 5016.33ms 
iter 5446: loss 2.5683, time 4916.19ms 
iter 5447: loss 2.7299, time 4915.18ms 
iter 5448: loss 2.5101, time 4964.02ms 
iter 5449: loss 3.0440, time 5006.12ms 
step 5450: train loss 2.5666, val loss 2.8432
iter 5450: loss 2.6319, time 19710.94ms 
iter 5451: loss 2.6096, time 5018.63ms 
iter 5452: loss 2.5929, time 4955.15ms 
iter 5453: loss 2.4723, time 4914.71ms 
iter 5454: loss 2.5538, time 4978.27ms 
iter 5455: loss 2.5313, time 5004.11ms 
iter 5456: loss 2.5910, time 5020.23ms 
iter 5457: loss 2.6298, time 5020.68ms 
iter 5458: loss 2.5131, time 5015.18ms 
iter 5459: loss 2.4545, time 5006.88ms 
iter 5460: loss 2.5666, time 5023.59ms 
iter 5461: loss 2.3123, time 4924.26ms 
iter 5462: loss 2.5070, time 4915.16ms 
iter 5463: loss 2.4533, time 4976.18ms 
iter 5464: loss 2.5321, time 5022.49ms 
iter 5465: loss 2.4214, time 5005.52ms 
iter 5466: loss 2.6289, time 5021.16ms 
iter 5467: loss 2.6114, time 5023.91ms 
iter 5468: loss 2.5586, time 5012.95ms 
iter 5469: loss 2.5032, time 4998.07ms 
iter 5470: loss 2.4372, time 4919.07ms 
iter 5471: loss 2.6362, time 4928.09ms 
iter 5472: loss 2.6640, time 4930.75ms 
iter 5473: loss 2.6001, time 4929.87ms 
iter 5474: loss 2.5075, time 4931.41ms 
iter 5475: loss 2.3998, time 4926.15ms 
iter 5476: loss 2.6251, time 4955.47ms 
iter 5477: loss 2.4632, time 4930.15ms 
iter 5478: loss 2.5393, time 4998.94ms 
iter 5479: loss 2.6769, time 5033.83ms 
iter 5480: loss 2.4865, time 4966.12ms 
iter 5481: loss 2.3843, time 4921.02ms 
iter 5482: loss 2.6857, time 4958.56ms 
iter 5483: loss 2.5995, time 5028.12ms 
iter 5484: loss 2.5054, time 5031.46ms 
iter 5485: loss 2.7010, time 5029.97ms 
iter 5486: loss 2.4430, time 5028.77ms 
iter 5487: loss 2.6073, time 5009.56ms 
iter 5488: loss 2.6221, time 5034.82ms 
iter 5489: loss 2.5425, time 5013.60ms 
iter 5490: loss 2.7209, time 5000.24ms 
iter 5491: loss 2.6227, time 5044.17ms 
iter 5492: loss 2.9519, time 4995.22ms 
iter 5493: loss 2.6367, time 5031.24ms 
iter 5494: loss 2.5112, time 5030.59ms 
iter 5495: loss 2.5245, time 5028.67ms 
iter 5496: loss 2.5717, time 4998.65ms 
iter 5497: loss 2.4809, time 5022.78ms 
iter 5498: loss 2.5040, time 4979.27ms 
iter 5499: loss 2.4648, time 4953.72ms 
step 5500: train loss 2.5731, val loss 2.8361
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 5500: loss 2.5925, time 20827.57ms 
iter 5501: loss 2.4992, time 4974.78ms 
iter 5502: loss 2.5278, time 4969.35ms 
iter 5503: loss 2.5375, time 5012.36ms 
iter 5504: loss 2.7193, time 4973.80ms 
iter 5505: loss 2.4234, time 4916.31ms 
iter 5506: loss 2.3400, time 4959.19ms 
iter 5507: loss 2.5380, time 5006.55ms 
iter 5508: loss 2.7329, time 4970.62ms 
iter 5509: loss 2.5693, time 5024.28ms 
iter 5510: loss 2.4532, time 4993.85ms 
iter 5511: loss 2.6037, time 5018.00ms 
iter 5512: loss 2.7804, time 5007.89ms 
iter 5513: loss 2.5433, time 4990.69ms 
iter 5514: loss 2.6888, time 4992.27ms 
iter 5515: loss 2.5791, time 4989.81ms 
iter 5516: loss 2.7277, time 5024.67ms 
iter 5517: loss 2.4424, time 5027.73ms 
iter 5518: loss 2.6242, time 5028.70ms 
iter 5519: loss 2.5357, time 5032.84ms 
iter 5520: loss 2.8347, time 5026.73ms 
iter 5521: loss 2.5168, time 5031.05ms 
iter 5522: loss 2.6473, time 5010.12ms 
iter 5523: loss 2.5070, time 4952.87ms 
iter 5524: loss 2.4716, time 4921.71ms 
iter 5525: loss 2.7451, time 5029.56ms 
iter 5526: loss 2.6377, time 5028.80ms 
iter 5527: loss 2.5360, time 5021.20ms 
iter 5528: loss 2.6443, time 5012.30ms 
iter 5529: loss 2.4378, time 5014.76ms 
iter 5530: loss 2.5651, time 5027.88ms 
iter 5531: loss 2.3860, time 5028.05ms 
iter 5532: loss 2.6265, time 4936.57ms 
iter 5533: loss 2.5937, time 4916.01ms 
iter 5534: loss 2.5078, time 4951.35ms 
iter 5535: loss 2.4390, time 5025.17ms 
iter 5536: loss 2.4507, time 5023.40ms 
iter 5537: loss 2.6726, time 5023.96ms 
iter 5538: loss 2.7501, time 5028.46ms 
iter 5539: loss 2.5963, time 5024.72ms 
iter 5540: loss 2.5335, time 5010.75ms 
iter 5541: loss 2.4182, time 4986.74ms 
iter 5542: loss 2.6075, time 4916.86ms 
iter 5543: loss 2.5116, time 4915.73ms 
iter 5544: loss 2.6270, time 4987.77ms 
iter 5545: loss 2.3711, time 5031.57ms 
iter 5546: loss 2.6959, time 5031.31ms 
iter 5547: loss 2.7280, time 5030.02ms 
iter 5548: loss 2.3864, time 5031.45ms 
iter 5549: loss 2.7753, time 5024.17ms 
step 5550: train loss 2.5572, val loss 2.8325
iter 5550: loss 2.5499, time 19665.03ms 
iter 5551: loss 2.5909, time 5033.79ms 
iter 5552: loss 2.5688, time 5028.99ms 
iter 5553: loss 2.5620, time 5027.52ms 
iter 5554: loss 2.6162, time 5028.18ms 
iter 5555: loss 2.4511, time 5006.30ms 
iter 5556: loss 2.3914, time 5016.86ms 
iter 5557: loss 2.7617, time 5024.97ms 
iter 5558: loss 2.5810, time 4963.22ms 
iter 5559: loss 2.6263, time 4915.46ms 
iter 5560: loss 2.5201, time 5000.64ms 
iter 5561: loss 2.4580, time 5009.97ms 
iter 5562: loss 2.5163, time 5025.44ms 
iter 5563: loss 2.5839, time 5022.33ms 
iter 5564: loss 2.5850, time 5027.60ms 
iter 5565: loss 2.4191, time 5031.71ms 
iter 5566: loss 2.5226, time 4962.61ms 
iter 5567: loss 2.3624, time 4915.57ms 
iter 5568: loss 2.5639, time 4925.11ms 
iter 5569: loss 2.6794, time 4982.14ms 
iter 5570: loss 2.5617, time 4996.46ms 
iter 5571: loss 2.4265, time 4996.77ms 
iter 5572: loss 2.4302, time 5010.14ms 
iter 5573: loss 2.5511, time 5016.13ms 
iter 5574: loss 2.6174, time 5018.70ms 
iter 5575: loss 2.5764, time 5003.69ms 
iter 5576: loss 2.6120, time 4916.81ms 
iter 5577: loss 2.6057, time 4928.33ms 
iter 5578: loss 2.3564, time 5018.59ms 
iter 5579: loss 2.8605, time 5028.52ms 
iter 5580: loss 2.5049, time 5030.05ms 
iter 5581: loss 2.6826, time 5033.67ms 
iter 5582: loss 2.5745, time 5026.31ms 
iter 5583: loss 2.5484, time 4985.69ms 
iter 5584: loss 2.3876, time 4964.93ms 
iter 5585: loss 2.6298, time 4973.74ms 
iter 5586: loss 2.3794, time 4968.87ms 
iter 5587: loss 2.4693, time 5023.55ms 
iter 5588: loss 2.5151, time 5029.57ms 
iter 5589: loss 2.4407, time 5026.76ms 
iter 5590: loss 2.6047, time 5027.88ms 
iter 5591: loss 2.5306, time 5027.01ms 
iter 5592: loss 2.4739, time 5028.43ms 
iter 5593: loss 2.7669, time 5028.34ms 
iter 5594: loss 2.5867, time 4962.22ms 
iter 5595: loss 2.4115, time 4941.61ms 
iter 5596: loss 2.5582, time 5029.09ms 
iter 5597: loss 2.2696, time 5025.02ms 
iter 5598: loss 2.7299, time 5019.50ms 
iter 5599: loss 2.6270, time 5025.91ms 
step 5600: train loss 2.5554, val loss 2.8405
iter 5600: loss 2.7420, time 19645.76ms 
iter 5601: loss 2.4729, time 4952.20ms 
iter 5602: loss 2.4067, time 4998.22ms 
iter 5603: loss 2.4501, time 5032.34ms 
iter 5604: loss 2.5999, time 4988.90ms 
iter 5605: loss 2.6659, time 5006.38ms 
iter 5606: loss 2.4541, time 4985.51ms 
iter 5607: loss 2.6056, time 4977.56ms 
iter 5608: loss 2.4510, time 5029.17ms 
iter 5609: loss 2.5685, time 4946.46ms 
iter 5610: loss 2.3579, time 4926.64ms 
iter 5611: loss 2.4492, time 4927.11ms 
iter 5612: loss 2.5199, time 4937.20ms 
iter 5613: loss 2.4208, time 4957.50ms 
iter 5614: loss 2.7779, time 4927.72ms 
iter 5615: loss 2.5322, time 4934.77ms 
iter 5616: loss 2.6844, time 4971.95ms 
iter 5617: loss 2.5413, time 4999.55ms 
iter 5618: loss 2.6328, time 4918.52ms 
iter 5619: loss 2.4368, time 4923.68ms 
iter 5620: loss 2.5796, time 4927.78ms 
iter 5621: loss 2.6925, time 4931.81ms 
iter 5622: loss 2.3882, time 4985.45ms 
iter 5623: loss 2.4593, time 4948.40ms 
iter 5624: loss 2.7084, time 4969.55ms 
iter 5625: loss 2.6647, time 4952.62ms 
iter 5626: loss 2.7530, time 4990.92ms 
iter 5627: loss 2.7290, time 4961.98ms 
iter 5628: loss 2.6616, time 4917.43ms 
iter 5629: loss 2.4780, time 4928.81ms 
iter 5630: loss 2.5328, time 4943.26ms 
iter 5631: loss 2.6926, time 4946.58ms 
iter 5632: loss 2.6322, time 4935.02ms 
iter 5633: loss 2.4840, time 4961.36ms 
iter 5634: loss 2.5147, time 4936.31ms 
iter 5635: loss 2.7042, time 4938.87ms 
iter 5636: loss 2.5576, time 4924.15ms 
iter 5637: loss 2.4971, time 4918.39ms 
iter 5638: loss 2.5785, time 4944.91ms 
iter 5639: loss 2.7055, time 4928.01ms 
iter 5640: loss 2.4987, time 4926.76ms 
iter 5641: loss 2.4814, time 4944.67ms 
iter 5642: loss 2.6373, time 4946.38ms 
iter 5643: loss 2.7644, time 4930.85ms 
iter 5644: loss 2.7507, time 4927.61ms 
iter 5645: loss 2.6162, time 4922.64ms 
iter 5646: loss 2.9197, time 4958.13ms 
iter 5647: loss 2.6169, time 4950.37ms 
iter 5648: loss 2.6797, time 5018.48ms 
iter 5649: loss 2.6109, time 5023.54ms 
step 5650: train loss 2.5562, val loss 2.8479
iter 5650: loss 2.6155, time 19750.37ms 
iter 5651: loss 2.4384, time 5016.03ms 
iter 5652: loss 2.7149, time 4917.68ms 
iter 5653: loss 2.6485, time 4936.61ms 
iter 5654: loss 2.5417, time 5030.56ms 
iter 5655: loss 2.6782, time 5016.17ms 
iter 5656: loss 2.5025, time 5036.27ms 
iter 5657: loss 2.6097, time 5013.52ms 
iter 5658: loss 2.4400, time 5026.11ms 
iter 5659: loss 2.5829, time 5016.15ms 
iter 5660: loss 2.4189, time 5038.74ms 
iter 5661: loss 2.5314, time 4935.77ms 
iter 5662: loss 2.4261, time 4915.73ms 
iter 5663: loss 2.6118, time 4983.64ms 
iter 5664: loss 2.5428, time 5017.12ms 
iter 5665: loss 2.5976, time 5012.56ms 
iter 5666: loss 2.3728, time 5038.92ms 
iter 5667: loss 2.5111, time 5046.88ms 
iter 5668: loss 2.4277, time 5041.38ms 
iter 5669: loss 2.3257, time 5042.54ms 
iter 5670: loss 2.5143, time 4982.91ms 
iter 5671: loss 2.5046, time 4914.26ms 
iter 5672: loss 2.6524, time 4951.24ms 
iter 5673: loss 2.4728, time 5032.51ms 
iter 5674: loss 2.7834, time 5039.22ms 
iter 5675: loss 2.5521, time 5032.93ms 
iter 5676: loss 2.6714, time 5015.15ms 
iter 5677: loss 2.7828, time 5000.98ms 
iter 5678: loss 2.3720, time 5003.12ms 
iter 5679: loss 2.4846, time 4973.37ms 
iter 5680: loss 2.6146, time 4913.59ms 
iter 5681: loss 2.6678, time 4915.33ms 
iter 5682: loss 2.4201, time 4989.90ms 
iter 5683: loss 2.5488, time 5029.29ms 
iter 5684: loss 2.5286, time 5028.06ms 
iter 5685: loss 2.3455, time 5026.63ms 
iter 5686: loss 2.8075, time 5032.60ms 
iter 5687: loss 2.5919, time 5014.89ms 
iter 5688: loss 2.5956, time 5001.73ms 
iter 5689: loss 2.7394, time 4961.20ms 
iter 5690: loss 2.6835, time 4915.21ms 
iter 5691: loss 2.3927, time 4937.14ms 
iter 5692: loss 2.5681, time 5028.22ms 
iter 5693: loss 2.5228, time 5032.37ms 
iter 5694: loss 2.6442, time 5029.82ms 
iter 5695: loss 2.7370, time 5010.06ms 
iter 5696: loss 2.7037, time 5027.17ms 
iter 5697: loss 2.6048, time 5015.19ms 
iter 5698: loss 2.3078, time 5003.03ms 
iter 5699: loss 2.6287, time 4916.39ms 
step 5700: train loss 2.5473, val loss 2.8385
iter 5700: loss 2.5943, time 19709.96ms 
iter 5701: loss 2.5831, time 5026.49ms 
iter 5702: loss 2.8471, time 5026.72ms 
iter 5703: loss 2.4201, time 5028.00ms 
iter 5704: loss 2.7931, time 5040.81ms 
iter 5705: loss 2.4614, time 4973.88ms 
iter 5706: loss 2.6917, time 4942.06ms 
iter 5707: loss 2.6011, time 5020.69ms 
iter 5708: loss 2.6956, time 5031.66ms 
iter 5709: loss 2.5230, time 5030.56ms 
iter 5710: loss 2.5943, time 5029.67ms 
iter 5711: loss 2.6919, time 5030.94ms 
iter 5712: loss 2.6362, time 5028.49ms 
iter 5713: loss 2.7052, time 5024.38ms 
iter 5714: loss 2.3916, time 4974.09ms 
iter 5715: loss 2.5723, time 4917.68ms 
iter 5716: loss 2.5355, time 5004.31ms 
iter 5717: loss 2.5149, time 5028.17ms 
iter 5718: loss 2.7554, time 5025.44ms 
iter 5719: loss 2.6332, time 5023.93ms 
iter 5720: loss 2.8388, time 5030.30ms 
iter 5721: loss 2.5917, time 5033.50ms 
iter 5722: loss 2.5474, time 5029.84ms 
iter 5723: loss 2.6192, time 4979.64ms 
iter 5724: loss 2.5162, time 4917.53ms 
iter 5725: loss 2.5974, time 4996.67ms 
iter 5726: loss 2.5213, time 5019.39ms 
iter 5727: loss 2.5097, time 5041.95ms 
iter 5728: loss 2.5878, time 5034.36ms 
iter 5729: loss 2.5191, time 5034.38ms 
iter 5730: loss 2.4896, time 5010.80ms 
iter 5731: loss 2.5693, time 5027.66ms 
iter 5732: loss 2.4040, time 4983.50ms 
iter 5733: loss 2.4574, time 4917.15ms 
iter 5734: loss 2.6430, time 4916.07ms 
iter 5735: loss 2.3982, time 4994.16ms 
iter 5736: loss 2.3332, time 5034.72ms 
iter 5737: loss 2.6234, time 5037.70ms 
iter 5738: loss 2.5311, time 5037.96ms 
iter 5739: loss 2.5968, time 5031.34ms 
iter 5740: loss 2.5551, time 5018.88ms 
iter 5741: loss 2.5629, time 5031.30ms 
iter 5742: loss 2.5760, time 4974.58ms 
iter 5743: loss 2.5583, time 4919.16ms 
iter 5744: loss 2.5028, time 4968.18ms 
iter 5745: loss 2.7182, time 5047.43ms 
iter 5746: loss 2.6865, time 5040.20ms 
iter 5747: loss 2.4336, time 5035.65ms 
iter 5748: loss 2.3571, time 5034.13ms 
iter 5749: loss 2.3102, time 5033.34ms 
step 5750: train loss 2.5485, val loss 2.8362
iter 5750: loss 2.6578, time 19722.85ms 
iter 5751: loss 2.4393, time 5016.24ms 
iter 5752: loss 2.5989, time 4992.98ms 
iter 5753: loss 2.7183, time 5004.26ms 
iter 5754: loss 2.4210, time 5022.29ms 
iter 5755: loss 2.4565, time 4993.96ms 
iter 5756: loss 2.3959, time 4998.81ms 
iter 5757: loss 2.4610, time 4932.55ms 
iter 5758: loss 2.6432, time 4915.93ms 
iter 5759: loss 2.6307, time 4964.02ms 
iter 5760: loss 2.4523, time 5023.60ms 
iter 5761: loss 2.4446, time 5027.81ms 
iter 5762: loss 2.6160, time 5017.75ms 
iter 5763: loss 2.6230, time 5032.52ms 
iter 5764: loss 2.4889, time 5023.12ms 
iter 5765: loss 2.4796, time 5006.62ms 
iter 5766: loss 2.5365, time 4918.59ms 
iter 5767: loss 2.4351, time 4915.42ms 
iter 5768: loss 2.5259, time 4972.70ms 
iter 5769: loss 2.6325, time 5010.88ms 
iter 5770: loss 2.5549, time 5000.84ms 
iter 5771: loss 2.5781, time 5024.55ms 
iter 5772: loss 2.6163, time 5027.14ms 
iter 5773: loss 2.6153, time 5029.75ms 
iter 5774: loss 2.5403, time 5030.13ms 
iter 5775: loss 2.4406, time 5025.39ms 
iter 5776: loss 2.3938, time 4940.47ms 
iter 5777: loss 2.4742, time 4999.28ms 
iter 5778: loss 2.4504, time 5015.78ms 
iter 5779: loss 2.6807, time 5023.53ms 
iter 5780: loss 2.5374, time 5025.57ms 
iter 5781: loss 2.5832, time 5006.29ms 
iter 5782: loss 2.5588, time 5027.49ms 
iter 5783: loss 2.5879, time 5007.81ms 
iter 5784: loss 2.7387, time 4916.52ms 
iter 5785: loss 2.5859, time 4919.61ms 
iter 5786: loss 2.8246, time 4989.17ms 
iter 5787: loss 2.4471, time 5044.31ms 
iter 5788: loss 2.4395, time 5024.34ms 
iter 5789: loss 2.4918, time 5041.28ms 
iter 5790: loss 2.4510, time 5026.55ms 
iter 5791: loss 2.3316, time 5014.15ms 
iter 5792: loss 2.5945, time 5073.17ms 
iter 5793: loss 2.4437, time 4980.05ms 
iter 5794: loss 2.6734, time 4928.48ms 
iter 5795: loss 2.4546, time 4934.13ms 
iter 5796: loss 2.4274, time 5026.68ms 
iter 5797: loss 2.4321, time 4992.07ms 
iter 5798: loss 2.3644, time 5018.15ms 
iter 5799: loss 2.5081, time 5023.13ms 
step 5800: train loss 2.5539, val loss 2.8308
iter 5800: loss 2.6437, time 19607.74ms 
iter 5801: loss 2.5289, time 4914.68ms 
iter 5802: loss 2.4385, time 4991.53ms 
iter 5803: loss 2.3559, time 5018.40ms 
iter 5804: loss 2.4015, time 5022.88ms 
iter 5805: loss 2.2371, time 5012.76ms 
iter 5806: loss 2.5987, time 5026.18ms 
iter 5807: loss 2.7092, time 5025.76ms 
iter 5808: loss 2.3211, time 5024.34ms 
iter 5809: loss 2.7719, time 4971.03ms 
iter 5810: loss 2.5331, time 4925.37ms 
iter 5811: loss 2.3513, time 5027.11ms 
iter 5812: loss 2.5990, time 5026.89ms 
iter 5813: loss 2.5575, time 5024.60ms 
iter 5814: loss 2.4474, time 5022.67ms 
iter 5815: loss 2.7064, time 5003.50ms 
iter 5816: loss 2.4755, time 5003.43ms 
iter 5817: loss 2.3265, time 5030.51ms 
iter 5818: loss 2.6307, time 4975.25ms 
iter 5819: loss 2.5785, time 5011.45ms 
iter 5820: loss 2.3590, time 5030.40ms 
iter 5821: loss 2.6848, time 5026.59ms 
iter 5822: loss 2.5142, time 5025.05ms 
iter 5823: loss 2.6282, time 5023.86ms 
iter 5824: loss 2.5545, time 5023.04ms 
iter 5825: loss 2.4984, time 5026.35ms 
iter 5826: loss 2.6412, time 4975.21ms 
iter 5827: loss 2.3669, time 4936.80ms 
iter 5828: loss 2.7409, time 5022.84ms 
iter 5829: loss 2.5032, time 5026.50ms 
iter 5830: loss 2.3247, time 5021.33ms 
iter 5831: loss 2.6736, time 5023.97ms 
iter 5832: loss 2.5892, time 5017.63ms 
iter 5833: loss 2.5877, time 5001.56ms 
iter 5834: loss 2.5008, time 5027.33ms 
iter 5835: loss 2.4592, time 5022.24ms 
iter 5836: loss 2.5727, time 5005.14ms 
iter 5837: loss 2.6107, time 5023.39ms 
iter 5838: loss 2.4115, time 5011.50ms 
iter 5839: loss 2.6722, time 5015.12ms 
iter 5840: loss 2.6047, time 5023.39ms 
iter 5841: loss 2.4290, time 5021.25ms 
iter 5842: loss 2.3946, time 5017.22ms 
iter 5843: loss 2.3949, time 4984.07ms 
iter 5844: loss 2.4632, time 4914.57ms 
iter 5845: loss 2.6487, time 4950.22ms 
iter 5846: loss 2.5639, time 5012.18ms 
iter 5847: loss 2.4382, time 5026.03ms 
iter 5848: loss 2.6693, time 5029.87ms 
iter 5849: loss 2.5768, time 5003.66ms 
step 5850: train loss 2.5500, val loss 2.8366
iter 5850: loss 2.5002, time 19674.06ms 
iter 5851: loss 2.6020, time 4964.60ms 
iter 5852: loss 2.4567, time 4978.88ms 
iter 5853: loss 2.5444, time 4995.80ms 
iter 5854: loss 2.6146, time 4994.62ms 
iter 5855: loss 2.5258, time 5013.01ms 
iter 5856: loss 2.5959, time 5002.81ms 
iter 5857: loss 2.6170, time 5023.96ms 
iter 5858: loss 2.2686, time 4961.92ms 
iter 5859: loss 2.6057, time 4916.87ms 
iter 5860: loss 2.4690, time 4986.51ms 
iter 5861: loss 2.4521, time 4957.27ms 
iter 5862: loss 2.4675, time 4984.71ms 
iter 5863: loss 2.5069, time 4993.66ms 
iter 5864: loss 2.5166, time 5015.08ms 
iter 5865: loss 2.8068, time 5015.16ms 
iter 5866: loss 2.5738, time 5007.56ms 
iter 5867: loss 2.8011, time 4950.49ms 
iter 5868: loss 2.5960, time 4915.44ms 
iter 5869: loss 2.6951, time 4981.85ms 
iter 5870: loss 2.4911, time 4989.18ms 
iter 5871: loss 2.3439, time 5013.13ms 
iter 5872: loss 2.4698, time 5015.90ms 
iter 5873: loss 2.4876, time 5023.30ms 
iter 5874: loss 2.6367, time 5043.87ms 
iter 5875: loss 2.5413, time 5035.77ms 
iter 5876: loss 2.4200, time 4938.70ms 
iter 5877: loss 2.5521, time 4952.86ms 
iter 5878: loss 2.6311, time 4981.93ms 
iter 5879: loss 2.6449, time 5009.56ms 
iter 5880: loss 2.7253, time 5016.12ms 
iter 5881: loss 2.7261, time 5024.32ms 
iter 5882: loss 2.5254, time 5031.53ms 
iter 5883: loss 2.4420, time 5023.26ms 
iter 5884: loss 2.6107, time 5034.28ms 
iter 5885: loss 2.6004, time 4979.35ms 
iter 5886: loss 2.4646, time 4968.03ms 
iter 5887: loss 2.4317, time 5036.89ms 
iter 5888: loss 2.5920, time 5020.30ms 
iter 5889: loss 2.8197, time 5025.57ms 
iter 5890: loss 2.4029, time 5022.88ms 
iter 5891: loss 2.6067, time 5029.66ms 
iter 5892: loss 2.5050, time 5011.23ms 
iter 5893: loss 2.6086, time 4993.88ms 
iter 5894: loss 2.6250, time 4929.40ms 
iter 5895: loss 2.8256, time 4946.28ms 
iter 5896: loss 2.5475, time 5012.73ms 
iter 5897: loss 2.4761, time 5030.36ms 
iter 5898: loss 2.6151, time 5025.14ms 
iter 5899: loss 2.6532, time 5028.49ms 
step 5900: train loss 2.5457, val loss 2.8365
iter 5900: loss 2.3564, time 19677.92ms 
iter 5901: loss 2.2863, time 4922.12ms 
iter 5902: loss 2.6498, time 4992.54ms 
iter 5903: loss 2.6009, time 5010.11ms 
iter 5904: loss 2.4126, time 5008.60ms 
iter 5905: loss 2.5179, time 5028.44ms 
iter 5906: loss 2.5292, time 5005.78ms 
iter 5907: loss 2.3989, time 5025.75ms 
iter 5908: loss 2.5008, time 5032.03ms 
iter 5909: loss 2.6611, time 4976.82ms 
iter 5910: loss 2.7439, time 4938.41ms 
iter 5911: loss 2.6226, time 5030.45ms 
iter 5912: loss 2.7199, time 5021.33ms 
iter 5913: loss 2.4912, time 5028.04ms 
iter 5914: loss 2.4555, time 5008.33ms 
iter 5915: loss 2.4823, time 5003.40ms 
iter 5916: loss 2.7866, time 5023.61ms 
iter 5917: loss 2.6572, time 5028.42ms 
iter 5918: loss 2.7554, time 4967.73ms 
iter 5919: loss 2.7257, time 4916.55ms 
iter 5920: loss 2.5848, time 4975.69ms 
iter 5921: loss 2.5457, time 5025.03ms 
iter 5922: loss 2.5784, time 5029.10ms 
iter 5923: loss 2.5984, time 5024.99ms 
iter 5924: loss 2.6262, time 4950.12ms 
iter 5925: loss 2.5947, time 4993.29ms 
iter 5926: loss 2.6345, time 4971.54ms 
iter 5927: loss 2.6877, time 4915.60ms 
iter 5928: loss 2.6250, time 4951.43ms 
iter 5929: loss 2.6379, time 5026.63ms 
iter 5930: loss 2.4526, time 5022.21ms 
iter 5931: loss 2.4330, time 5021.65ms 
iter 5932: loss 2.5849, time 4992.69ms 
iter 5933: loss 2.6563, time 4970.89ms 
iter 5934: loss 2.4092, time 4960.20ms 
iter 5935: loss 2.5033, time 4993.37ms 
iter 5936: loss 2.3194, time 4925.27ms 
iter 5937: loss 2.3214, time 4941.45ms 
iter 5938: loss 2.5100, time 4990.18ms 
iter 5939: loss 2.5720, time 5006.53ms 
iter 5940: loss 2.4746, time 5018.13ms 
iter 5941: loss 2.3504, time 5021.41ms 
iter 5942: loss 2.4577, time 5020.11ms 
iter 5943: loss 2.3336, time 5025.70ms 
iter 5944: loss 2.4434, time 4974.41ms 
iter 5945: loss 2.6264, time 4916.38ms 
iter 5946: loss 2.6250, time 4983.15ms 
iter 5947: loss 2.5938, time 5017.64ms 
iter 5948: loss 2.3459, time 5028.85ms 
iter 5949: loss 2.4641, time 4957.86ms 
step 5950: train loss 2.5521, val loss 2.8466
iter 5950: loss 2.5988, time 19639.41ms 
iter 5951: loss 2.6208, time 4919.96ms 
iter 5952: loss 2.5864, time 4991.68ms 
iter 5953: loss 2.7267, time 5026.45ms 
iter 5954: loss 2.5432, time 4997.87ms 
iter 5955: loss 2.5688, time 5024.97ms 
iter 5956: loss 2.6676, time 5023.21ms 
iter 5957: loss 2.6247, time 5026.20ms 
iter 5958: loss 2.3754, time 5022.01ms 
iter 5959: loss 2.4134, time 4944.83ms 
iter 5960: loss 2.4837, time 4918.01ms 
iter 5961: loss 2.5136, time 4995.97ms 
iter 5962: loss 2.6069, time 5023.94ms 
iter 5963: loss 2.4727, time 5035.13ms 
iter 5964: loss 2.7050, time 5030.31ms 
iter 5965: loss 2.5994, time 5023.35ms 
iter 5966: loss 2.5348, time 5019.90ms 
iter 5967: loss 2.5095, time 5026.17ms 
iter 5968: loss 2.8283, time 4979.62ms 
iter 5969: loss 2.3958, time 5024.11ms 
iter 5970: loss 2.5626, time 5027.96ms 
iter 5971: loss 2.6130, time 5033.21ms 
iter 5972: loss 2.4659, time 5032.45ms 
iter 5973: loss 2.6044, time 5031.98ms 
iter 5974: loss 2.6754, time 5022.89ms 
iter 5975: loss 2.5879, time 4972.36ms 
iter 5976: loss 2.4225, time 4917.49ms 
iter 5977: loss 2.3601, time 4957.76ms 
iter 5978: loss 2.6280, time 4950.82ms 
iter 5979: loss 2.5679, time 4938.79ms 
iter 5980: loss 2.4927, time 5026.85ms 
iter 5981: loss 2.3663, time 4929.55ms 
iter 5982: loss 2.4947, time 4959.06ms 
iter 5983: loss 2.5616, time 4930.39ms 
iter 5984: loss 2.6484, time 4970.70ms 
iter 5985: loss 2.3953, time 4917.92ms 
iter 5986: loss 2.6648, time 5003.37ms 
iter 5987: loss 2.5364, time 5027.75ms 
iter 5988: loss 2.5664, time 5012.42ms 
iter 5989: loss 2.4797, time 4982.17ms 
iter 5990: loss 2.3521, time 4965.24ms 
iter 5991: loss 2.8082, time 4929.75ms 
iter 5992: loss 2.5240, time 4958.69ms 
iter 5993: loss 2.4618, time 4934.80ms 
iter 5994: loss 2.4543, time 4924.76ms 
iter 5995: loss 2.5228, time 5006.46ms 
iter 5996: loss 2.5292, time 5028.66ms 
iter 5997: loss 2.5728, time 5023.28ms 
iter 5998: loss 2.5384, time 4962.37ms 
iter 5999: loss 2.8585, time 4981.04ms 
step 6000: train loss 2.5474, val loss 2.8285
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 6000: loss 2.3518, time 20725.55ms 
iter 6001: loss 2.6710, time 5031.53ms 
iter 6002: loss 2.8737, time 5028.23ms 
iter 6003: loss 2.2597, time 5021.23ms 
iter 6004: loss 2.5039, time 5034.33ms 
iter 6005: loss 2.3466, time 5033.89ms 
iter 6006: loss 2.5140, time 5044.65ms 
iter 6007: loss 2.5535, time 4970.95ms 
iter 6008: loss 2.6878, time 4956.26ms 
iter 6009: loss 2.4567, time 5023.63ms 
iter 6010: loss 2.6326, time 5017.28ms 
iter 6011: loss 2.7495, time 5024.45ms 
iter 6012: loss 2.6937, time 5023.98ms 
iter 6013: loss 2.4397, time 5022.38ms 
iter 6014: loss 2.6870, time 5024.92ms 
iter 6015: loss 2.7105, time 5029.90ms 
iter 6016: loss 2.5818, time 4973.17ms 
iter 6017: loss 2.5465, time 4960.31ms 
iter 6018: loss 2.3558, time 5019.09ms 
iter 6019: loss 2.4161, time 5026.41ms 
iter 6020: loss 2.5409, time 5026.66ms 
iter 6021: loss 2.7231, time 5024.09ms 
iter 6022: loss 2.6514, time 5023.27ms 
iter 6023: loss 2.6464, time 5014.38ms 
iter 6024: loss 2.7801, time 5004.58ms 
iter 6025: loss 2.5923, time 4926.21ms 
iter 6026: loss 2.5540, time 5012.76ms 
iter 6027: loss 2.6000, time 5022.11ms 
iter 6028: loss 2.5979, time 5024.02ms 
iter 6029: loss 2.4824, time 5026.38ms 
iter 6030: loss 2.4771, time 5023.71ms 
iter 6031: loss 2.6792, time 5025.81ms 
iter 6032: loss 2.5134, time 5027.34ms 
iter 6033: loss 2.7265, time 4944.99ms 
iter 6034: loss 2.5224, time 4956.07ms 
iter 6035: loss 2.3864, time 5015.71ms 
iter 6036: loss 2.6695, time 4995.03ms 
iter 6037: loss 2.5434, time 5019.74ms 
iter 6038: loss 2.3842, time 5021.34ms 
iter 6039: loss 2.5413, time 5020.97ms 
iter 6040: loss 2.5221, time 5023.28ms 
iter 6041: loss 2.7194, time 4972.57ms 
iter 6042: loss 2.5804, time 4939.41ms 
iter 6043: loss 2.6275, time 5022.31ms 
iter 6044: loss 2.4374, time 5022.86ms 
iter 6045: loss 2.5096, time 5019.53ms 
iter 6046: loss 2.5592, time 5023.06ms 
iter 6047: loss 2.6337, time 5029.79ms 
iter 6048: loss 2.6037, time 5025.29ms 
iter 6049: loss 2.4211, time 4988.49ms 
step 6050: train loss 2.5456, val loss 2.8249
iter 6050: loss 2.5948, time 19669.77ms 
iter 6051: loss 2.6001, time 5027.81ms 
iter 6052: loss 2.4643, time 5027.12ms 
iter 6053: loss 2.4180, time 5009.32ms 
iter 6054: loss 2.4498, time 5008.76ms 
iter 6055: loss 2.5772, time 4952.29ms 
iter 6056: loss 2.7005, time 4927.10ms 
iter 6057: loss 2.4894, time 5016.28ms 
iter 6058: loss 2.5909, time 5027.89ms 
iter 6059: loss 2.5069, time 5027.44ms 
iter 6060: loss 2.5793, time 5048.76ms 
iter 6061: loss 2.4956, time 5052.73ms 
iter 6062: loss 2.5817, time 5046.83ms 
iter 6063: loss 2.4819, time 5033.79ms 
iter 6064: loss 2.5051, time 4988.56ms 
iter 6065: loss 2.4462, time 5023.99ms 
iter 6066: loss 2.5806, time 5020.84ms 
iter 6067: loss 2.4296, time 5010.50ms 
iter 6068: loss 2.4694, time 5014.58ms 
iter 6069: loss 2.4636, time 5030.03ms 
iter 6070: loss 2.5239, time 5031.40ms 
iter 6071: loss 2.4583, time 5032.93ms 
iter 6072: loss 2.4285, time 4970.12ms 
iter 6073: loss 2.5008, time 4919.63ms 
iter 6074: loss 2.7041, time 4950.91ms 
iter 6075: loss 2.6298, time 5002.48ms 
iter 6076: loss 2.5993, time 5041.14ms 
iter 6077: loss 2.5728, time 5035.79ms 
iter 6078: loss 2.7266, time 5031.30ms 
iter 6079: loss 2.6675, time 5031.21ms 
iter 6080: loss 2.5679, time 5029.54ms 
iter 6081: loss 2.5031, time 5042.12ms 
iter 6082: loss 2.6228, time 4984.52ms 
iter 6083: loss 2.4046, time 4932.82ms 
iter 6084: loss 2.6437, time 4933.25ms 
iter 6085: loss 2.5339, time 4979.10ms 
iter 6086: loss 2.6775, time 5012.75ms 
iter 6087: loss 2.5977, time 5025.07ms 
iter 6088: loss 2.6644, time 5034.06ms 
iter 6089: loss 2.4705, time 5026.29ms 
iter 6090: loss 2.4146, time 5027.26ms 
iter 6091: loss 2.5719, time 5029.05ms 
iter 6092: loss 2.6161, time 5031.96ms 
iter 6093: loss 2.5598, time 5022.56ms 
iter 6094: loss 2.4797, time 5027.56ms 
iter 6095: loss 2.6797, time 5027.93ms 
iter 6096: loss 2.5379, time 5028.51ms 
iter 6097: loss 2.6974, time 5028.41ms 
iter 6098: loss 2.5256, time 5027.54ms 
iter 6099: loss 2.4115, time 5030.04ms 
step 6100: train loss 2.5537, val loss 2.8431
iter 6100: loss 2.4742, time 19697.75ms 
iter 6101: loss 2.4773, time 5023.06ms 
iter 6102: loss 2.6477, time 5026.66ms 
iter 6103: loss 2.7363, time 5022.11ms 
iter 6104: loss 2.4509, time 5024.32ms 
iter 6105: loss 2.5808, time 4981.30ms 
iter 6106: loss 2.8178, time 5022.42ms 
iter 6107: loss 2.6508, time 5022.36ms 
iter 6108: loss 2.4766, time 5024.79ms 
iter 6109: loss 2.5849, time 5021.20ms 
iter 6110: loss 2.5222, time 5023.23ms 
iter 6111: loss 2.6221, time 5021.76ms 
iter 6112: loss 2.6278, time 5027.05ms 
iter 6113: loss 2.5948, time 4986.76ms 
iter 6114: loss 2.4206, time 5020.68ms 
iter 6115: loss 2.3494, time 5021.14ms 
iter 6116: loss 2.5780, time 5022.46ms 
iter 6117: loss 2.5229, time 5020.80ms 
iter 6118: loss 2.4339, time 5021.46ms 
iter 6119: loss 2.3211, time 5022.79ms 
iter 6120: loss 2.5749, time 5032.00ms 
iter 6121: loss 2.4157, time 5027.72ms 
iter 6122: loss 2.5906, time 5015.30ms 
iter 6123: loss 2.6252, time 5027.69ms 
iter 6124: loss 2.4141, time 5027.03ms 
iter 6125: loss 2.6376, time 5031.02ms 
iter 6126: loss 2.4875, time 5031.69ms 
iter 6127: loss 2.6061, time 5033.18ms 
iter 6128: loss 2.7119, time 4976.41ms 
iter 6129: loss 2.6401, time 4915.64ms 
iter 6130: loss 2.6044, time 4979.46ms 
iter 6131: loss 2.5137, time 5030.92ms 
iter 6132: loss 2.3839, time 5029.33ms 
iter 6133: loss 2.6974, time 5021.17ms 
iter 6134: loss 2.6324, time 5037.99ms 
iter 6135: loss 2.3916, time 5038.09ms 
iter 6136: loss 2.6454, time 5030.49ms 
iter 6137: loss 2.5640, time 5026.83ms 
iter 6138: loss 2.5396, time 5010.84ms 
iter 6139: loss 2.5106, time 5028.30ms 
iter 6140: loss 2.4818, time 5034.08ms 
iter 6141: loss 2.5901, time 5037.21ms 
iter 6142: loss 2.5948, time 5004.18ms 
iter 6143: loss 2.5813, time 5028.19ms 
iter 6144: loss 2.4581, time 5017.90ms 
iter 6145: loss 2.4233, time 4994.43ms 
iter 6146: loss 2.6607, time 5031.00ms 
iter 6147: loss 2.5733, time 5036.21ms 
iter 6148: loss 2.6600, time 5043.40ms 
iter 6149: loss 2.5780, time 5039.69ms 
step 6150: train loss 2.5453, val loss 2.8393
iter 6150: loss 2.7143, time 19621.08ms 
iter 6151: loss 2.7200, time 4918.62ms 
iter 6152: loss 2.4565, time 4955.80ms 
iter 6153: loss 2.7467, time 5034.39ms 
iter 6154: loss 2.5058, time 5032.20ms 
iter 6155: loss 2.4988, time 5033.49ms 
iter 6156: loss 2.5809, time 5029.89ms 
iter 6157: loss 2.5906, time 5042.12ms 
iter 6158: loss 2.5551, time 5034.58ms 
iter 6159: loss 2.5348, time 5040.62ms 
iter 6160: loss 2.6237, time 5032.23ms 
iter 6161: loss 2.5026, time 5031.45ms 
iter 6162: loss 2.4115, time 5032.74ms 
iter 6163: loss 2.4983, time 5034.70ms 
iter 6164: loss 2.7748, time 5030.43ms 
iter 6165: loss 2.5483, time 5031.84ms 
iter 6166: loss 2.4568, time 5033.15ms 
iter 6167: loss 2.4161, time 4980.57ms 
iter 6168: loss 2.4936, time 4921.07ms 
iter 6169: loss 2.7085, time 5009.28ms 
iter 6170: loss 2.6158, time 5038.53ms 
iter 6171: loss 2.5833, time 5039.76ms 
iter 6172: loss 2.6443, time 5030.93ms 
iter 6173: loss 2.7697, time 5035.00ms 
iter 6174: loss 2.5544, time 5033.44ms 
iter 6175: loss 2.4994, time 5033.39ms 
iter 6176: loss 2.3774, time 4978.49ms 
iter 6177: loss 2.5934, time 5024.49ms 
iter 6178: loss 2.4334, time 5028.71ms 
iter 6179: loss 2.4861, time 5030.85ms 
iter 6180: loss 2.5248, time 5027.52ms 
iter 6181: loss 2.5693, time 5029.41ms 
iter 6182: loss 2.7422, time 5031.17ms 
iter 6183: loss 2.5179, time 5021.75ms 
iter 6184: loss 2.6606, time 4980.86ms 
iter 6185: loss 2.5504, time 4942.71ms 
iter 6186: loss 2.5632, time 5033.73ms 
iter 6187: loss 2.3191, time 5028.20ms 
iter 6188: loss 2.4387, time 5028.91ms 
iter 6189: loss 2.4553, time 5028.09ms 
iter 6190: loss 2.7260, time 5029.61ms 
iter 6191: loss 2.5448, time 5031.28ms 
iter 6192: loss 2.3855, time 5038.16ms 
iter 6193: loss 2.4486, time 4976.02ms 
iter 6194: loss 2.4719, time 4973.90ms 
iter 6195: loss 2.5102, time 5028.90ms 
iter 6196: loss 2.6678, time 5031.34ms 
iter 6197: loss 2.4702, time 5028.23ms 
iter 6198: loss 2.6360, time 5030.34ms 
iter 6199: loss 2.5934, time 5029.60ms 
step 6200: train loss 2.5333, val loss 2.8326
iter 6200: loss 2.7431, time 19699.73ms 
iter 6201: loss 2.2689, time 5032.69ms 
iter 6202: loss 2.3949, time 5030.76ms 
iter 6203: loss 2.6886, time 5019.22ms 
iter 6204: loss 2.5124, time 5032.10ms 
iter 6205: loss 2.4148, time 5031.38ms 
iter 6206: loss 2.7855, time 5037.72ms 
iter 6207: loss 2.5490, time 5021.83ms 
iter 6208: loss 2.5826, time 5025.35ms 
iter 6209: loss 2.5447, time 5034.80ms 
iter 6210: loss 2.7095, time 5033.49ms 
iter 6211: loss 2.5098, time 5033.40ms 
iter 6212: loss 2.4091, time 5027.99ms 
iter 6213: loss 2.4444, time 5027.27ms 
iter 6214: loss 2.5828, time 5028.18ms 
iter 6215: loss 2.4178, time 4974.19ms 
iter 6216: loss 2.2780, time 4916.16ms 
iter 6217: loss 2.5477, time 4936.70ms 
iter 6218: loss 2.1990, time 5031.24ms 
iter 6219: loss 2.2598, time 5032.30ms 
iter 6220: loss 2.4452, time 5029.76ms 
iter 6221: loss 2.5319, time 5029.76ms 
iter 6222: loss 2.4470, time 5032.42ms 
iter 6223: loss 2.6525, time 5030.63ms 
iter 6224: loss 2.6873, time 5031.98ms 
iter 6225: loss 2.4979, time 4976.83ms 
iter 6226: loss 2.5312, time 4998.86ms 
iter 6227: loss 2.4490, time 5025.32ms 
iter 6228: loss 2.5540, time 5029.36ms 
iter 6229: loss 2.4958, time 5028.41ms 
iter 6230: loss 2.6153, time 5028.15ms 
iter 6231: loss 2.4727, time 5027.35ms 
iter 6232: loss 2.4631, time 5032.10ms 
iter 6233: loss 2.5560, time 4999.31ms 
iter 6234: loss 2.7231, time 5026.88ms 
iter 6235: loss 2.3668, time 5025.62ms 
iter 6236: loss 2.7074, time 5030.71ms 
iter 6237: loss 2.6161, time 5028.53ms 
iter 6238: loss 2.7997, time 5018.21ms 
iter 6239: loss 2.5395, time 5028.30ms 
iter 6240: loss 2.6193, time 5030.08ms 
iter 6241: loss 2.3276, time 5022.03ms 
iter 6242: loss 2.6474, time 5029.50ms 
iter 6243: loss 2.5379, time 5031.90ms 
iter 6244: loss 2.4444, time 5012.42ms 
iter 6245: loss 2.5204, time 5021.42ms 
iter 6246: loss 2.5472, time 5025.63ms 
iter 6247: loss 2.6291, time 5028.97ms 
iter 6248: loss 2.4095, time 4987.44ms 
iter 6249: loss 2.4986, time 4926.20ms 
step 6250: train loss 2.5401, val loss 2.8310
iter 6250: loss 2.5537, time 19678.56ms 
iter 6251: loss 2.5643, time 5027.02ms 
iter 6252: loss 2.7493, time 5032.84ms 
iter 6253: loss 2.5005, time 4995.17ms 
iter 6254: loss 2.4512, time 4983.05ms 
iter 6255: loss 2.4209, time 4926.05ms 
iter 6256: loss 2.8083, time 4927.09ms 
iter 6257: loss 2.4791, time 4927.28ms 
iter 6258: loss 2.5829, time 4926.79ms 
iter 6259: loss 2.4190, time 4927.18ms 
iter 6260: loss 2.2997, time 4927.33ms 
iter 6261: loss 2.5186, time 4928.29ms 
iter 6262: loss 2.4916, time 4927.15ms 
iter 6263: loss 2.3268, time 4952.12ms 
iter 6264: loss 2.6055, time 5037.00ms 
iter 6265: loss 2.3998, time 5022.16ms 
iter 6266: loss 2.5167, time 4961.44ms 
iter 6267: loss 2.3771, time 4913.24ms 
iter 6268: loss 2.5349, time 4914.04ms 
iter 6269: loss 2.3639, time 4913.75ms 
iter 6270: loss 2.6043, time 4912.74ms 
iter 6271: loss 2.2550, time 4913.24ms 
iter 6272: loss 2.5983, time 4913.04ms 
iter 6273: loss 2.5144, time 4913.02ms 
iter 6274: loss 2.5875, time 4913.78ms 
iter 6275: loss 2.5417, time 4913.25ms 
iter 6276: loss 2.3800, time 4913.78ms 
iter 6277: loss 2.5053, time 4913.78ms 
iter 6278: loss 2.4723, time 4913.82ms 
iter 6279: loss 2.5744, time 4913.14ms 
iter 6280: loss 2.3272, time 4913.34ms 
iter 6281: loss 2.4148, time 4912.99ms 
iter 6282: loss 2.5573, time 4913.50ms 
iter 6283: loss 2.7744, time 4913.25ms 
iter 6284: loss 2.5476, time 4912.99ms 
iter 6285: loss 2.6270, time 4913.11ms 
iter 6286: loss 2.5298, time 4913.97ms 
iter 6287: loss 2.4760, time 4913.93ms 
iter 6288: loss 2.5645, time 4913.36ms 
iter 6289: loss 2.5103, time 4913.23ms 
iter 6290: loss 2.5719, time 4914.06ms 
iter 6291: loss 2.4599, time 4913.18ms 
iter 6292: loss 2.6818, time 4913.81ms 
iter 6293: loss 2.5495, time 4913.70ms 
iter 6294: loss 2.3210, time 4914.28ms 
iter 6295: loss 2.5256, time 4913.74ms 
iter 6296: loss 2.6259, time 4913.79ms 
iter 6297: loss 2.5344, time 4912.72ms 
iter 6298: loss 2.5876, time 4913.32ms 
iter 6299: loss 2.7107, time 4913.31ms 
step 6300: train loss 2.5388, val loss 2.8451
iter 6300: loss 2.5313, time 19589.28ms 
iter 6301: loss 2.3690, time 4940.94ms 
iter 6302: loss 2.4822, time 5023.19ms 
iter 6303: loss 2.5820, time 5021.86ms 
iter 6304: loss 2.5480, time 4993.62ms 
iter 6305: loss 2.6516, time 4913.37ms 
iter 6306: loss 2.4490, time 4972.95ms 
iter 6307: loss 2.6516, time 5022.37ms 
iter 6308: loss 2.5635, time 5027.87ms 
iter 6309: loss 2.6726, time 4972.37ms 
iter 6310: loss 2.7464, time 5011.25ms 
iter 6311: loss 2.6160, time 5012.47ms 
iter 6312: loss 2.3210, time 5021.62ms 
iter 6313: loss 2.5164, time 5020.75ms 
iter 6314: loss 2.3718, time 5023.24ms 
iter 6315: loss 2.4375, time 5023.47ms 
iter 6316: loss 2.5028, time 5025.93ms 
iter 6317: loss 2.5060, time 4971.57ms 
iter 6318: loss 2.5417, time 4919.26ms 
iter 6319: loss 2.5477, time 5015.70ms 
iter 6320: loss 2.6829, time 5020.70ms 
iter 6321: loss 2.6864, time 5020.50ms 
iter 6322: loss 2.4627, time 5021.98ms 
iter 6323: loss 2.5588, time 5020.82ms 
iter 6324: loss 2.5331, time 5021.53ms 
iter 6325: loss 2.6530, time 4976.23ms 
iter 6326: loss 2.5142, time 4999.15ms 
iter 6327: loss 2.5562, time 5016.93ms 
iter 6328: loss 2.5586, time 5022.56ms 
iter 6329: loss 2.5161, time 5017.53ms 
iter 6330: loss 2.6234, time 5020.18ms 
iter 6331: loss 2.5859, time 5021.43ms 
iter 6332: loss 2.4462, time 5024.21ms 
iter 6333: loss 2.6725, time 4996.52ms 
iter 6334: loss 2.6544, time 5024.49ms 
iter 6335: loss 2.4908, time 5023.74ms 
iter 6336: loss 2.5837, time 5020.61ms 
iter 6337: loss 2.5093, time 5020.28ms 
iter 6338: loss 2.4096, time 5022.85ms 
iter 6339: loss 2.4606, time 5023.04ms 
iter 6340: loss 2.4187, time 4968.09ms 
iter 6341: loss 2.5906, time 4974.80ms 
iter 6342: loss 2.3507, time 5017.15ms 
iter 6343: loss 2.6094, time 5027.09ms 
iter 6344: loss 2.4516, time 5027.07ms 
iter 6345: loss 2.5970, time 5028.47ms 
iter 6346: loss 2.4080, time 5026.76ms 
iter 6347: loss 2.2281, time 5028.44ms 
iter 6348: loss 2.4662, time 5028.85ms 
iter 6349: loss 2.5470, time 4990.55ms 
step 6350: train loss 2.5179, val loss 2.8431
iter 6350: loss 2.5876, time 19668.17ms 
iter 6351: loss 2.3680, time 5019.18ms 
iter 6352: loss 2.4804, time 5009.25ms 
iter 6353: loss 2.4206, time 5021.87ms 
iter 6354: loss 2.5672, time 5013.54ms 
iter 6355: loss 2.4145, time 5025.98ms 
iter 6356: loss 2.5334, time 5027.54ms 
iter 6357: loss 2.4085, time 5027.39ms 
iter 6358: loss 2.6987, time 5025.14ms 
iter 6359: loss 2.4345, time 5027.08ms 
iter 6360: loss 2.7110, time 5027.16ms 
iter 6361: loss 2.5141, time 4963.78ms 
iter 6362: loss 2.5626, time 4948.42ms 
iter 6363: loss 2.6494, time 5024.03ms 
iter 6364: loss 2.5046, time 5028.44ms 
iter 6365: loss 2.4710, time 5026.14ms 
iter 6366: loss 2.6780, time 5026.80ms 
iter 6367: loss 2.5075, time 5026.06ms 
iter 6368: loss 2.7191, time 5027.84ms 
iter 6369: loss 2.4762, time 5034.57ms 
iter 6370: loss 2.3991, time 5025.92ms 
iter 6371: loss 2.6613, time 5025.78ms 
iter 6372: loss 2.5840, time 5025.95ms 
iter 6373: loss 2.4544, time 5031.16ms 
iter 6374: loss 2.5445, time 5010.89ms 
iter 6375: loss 2.4684, time 5026.39ms 
iter 6376: loss 2.5196, time 5028.62ms 
iter 6377: loss 2.5028, time 4976.55ms 
iter 6378: loss 2.3902, time 4942.70ms 
iter 6379: loss 2.6519, time 5026.03ms 
iter 6380: loss 2.5662, time 5027.09ms 
iter 6381: loss 2.3792, time 5019.62ms 
iter 6382: loss 2.4205, time 5024.60ms 
iter 6383: loss 2.4586, time 5002.71ms 
iter 6384: loss 2.4494, time 5027.49ms 
iter 6385: loss 2.4336, time 5007.37ms 
iter 6386: loss 2.4709, time 4915.55ms 
iter 6387: loss 2.4188, time 4990.51ms 
iter 6388: loss 2.4808, time 5018.82ms 
iter 6389: loss 2.3920, time 5026.67ms 
iter 6390: loss 2.6916, time 5027.17ms 
iter 6391: loss 2.2962, time 5025.69ms 
iter 6392: loss 2.2999, time 5016.31ms 
iter 6393: loss 2.6811, time 5009.34ms 
iter 6394: loss 2.5190, time 4924.17ms 
iter 6395: loss 2.4623, time 4942.21ms 
iter 6396: loss 2.5772, time 5015.53ms 
iter 6397: loss 2.3913, time 5015.30ms 
iter 6398: loss 2.6131, time 5025.78ms 
iter 6399: loss 2.5099, time 5025.95ms 
step 6400: train loss 2.5281, val loss 2.8409
iter 6400: loss 2.5735, time 19662.05ms 
iter 6401: loss 2.4098, time 5029.45ms 
iter 6402: loss 2.5153, time 5027.02ms 
iter 6403: loss 2.8048, time 5015.95ms 
iter 6404: loss 2.4712, time 5025.97ms 
iter 6405: loss 2.3798, time 5021.29ms 
iter 6406: loss 2.6589, time 5032.51ms 
iter 6407: loss 2.6113, time 4990.67ms 
iter 6408: loss 2.4830, time 5025.58ms 
iter 6409: loss 2.5102, time 5026.44ms 
iter 6410: loss 2.3920, time 5028.41ms 
iter 6411: loss 2.4906, time 5026.08ms 
iter 6412: loss 2.4357, time 5025.75ms 
iter 6413: loss 2.6896, time 5025.91ms 
iter 6414: loss 2.5225, time 5023.20ms 
iter 6415: loss 2.5907, time 4916.73ms 
iter 6416: loss 2.4744, time 4961.66ms 
iter 6417: loss 2.5012, time 5026.47ms 
iter 6418: loss 2.4439, time 5017.99ms 
iter 6419: loss 2.6180, time 5026.41ms 
iter 6420: loss 2.3730, time 5026.21ms 
iter 6421: loss 2.4249, time 5028.08ms 
iter 6422: loss 2.5507, time 5028.26ms 
iter 6423: loss 2.7275, time 5030.43ms 
iter 6424: loss 2.4486, time 4950.60ms 
iter 6425: loss 2.2890, time 4996.76ms 
iter 6426: loss 2.5645, time 5026.51ms 
iter 6427: loss 2.3964, time 5026.81ms 
iter 6428: loss 2.5992, time 5013.87ms 
iter 6429: loss 2.2699, time 5024.82ms 
iter 6430: loss 2.4119, time 5008.87ms 
iter 6431: loss 2.5050, time 5027.04ms 
iter 6432: loss 2.7513, time 4977.05ms 
iter 6433: loss 2.5817, time 5005.33ms 
iter 6434: loss 2.4068, time 5029.89ms 
iter 6435: loss 2.5710, time 5030.66ms 
iter 6436: loss 2.4823, time 5042.34ms 
iter 6437: loss 2.5566, time 5040.40ms 
iter 6438: loss 2.5373, time 5041.60ms 
iter 6439: loss 2.5161, time 5029.10ms 
iter 6440: loss 2.4611, time 4981.39ms 
iter 6441: loss 2.5499, time 4929.79ms 
iter 6442: loss 2.6123, time 4943.48ms 
iter 6443: loss 2.6011, time 5021.14ms 
iter 6444: loss 2.4966, time 5023.59ms 
iter 6445: loss 2.6272, time 5024.74ms 
iter 6446: loss 2.6765, time 5019.37ms 
iter 6447: loss 2.6239, time 4998.47ms 
iter 6448: loss 2.5834, time 5013.50ms 
iter 6449: loss 2.5026, time 5029.65ms 
step 6450: train loss 2.5343, val loss 2.8366
iter 6450: loss 2.4574, time 19661.94ms 
iter 6451: loss 2.4593, time 5022.62ms 
iter 6452: loss 2.6975, time 5027.49ms 
iter 6453: loss 2.5759, time 5023.85ms 
iter 6454: loss 2.5510, time 5025.97ms 
iter 6455: loss 2.4273, time 4990.99ms 
iter 6456: loss 2.7568, time 5024.05ms 
iter 6457: loss 2.6867, time 5024.58ms 
iter 6458: loss 2.5007, time 5024.01ms 
iter 6459: loss 2.3954, time 5023.17ms 
iter 6460: loss 2.9903, time 5026.66ms 
iter 6461: loss 2.5776, time 5023.94ms 
iter 6462: loss 2.4316, time 5027.05ms 
iter 6463: loss 2.3308, time 4971.66ms 
iter 6464: loss 2.3551, time 4915.68ms 
iter 6465: loss 2.4361, time 4999.64ms 
iter 6466: loss 2.8027, time 5025.31ms 
iter 6467: loss 2.3220, time 4997.25ms 
iter 6468: loss 2.4495, time 5020.85ms 
iter 6469: loss 2.5355, time 5021.69ms 
iter 6470: loss 2.5823, time 5024.00ms 
iter 6471: loss 2.3576, time 5025.62ms 
iter 6472: loss 2.5207, time 5004.02ms 
iter 6473: loss 2.5749, time 5022.03ms 
iter 6474: loss 2.4886, time 5022.69ms 
iter 6475: loss 2.5051, time 5023.53ms 
iter 6476: loss 2.6166, time 5024.92ms 
iter 6477: loss 2.5891, time 5021.30ms 
iter 6478: loss 2.4090, time 5022.81ms 
iter 6479: loss 2.6997, time 5029.95ms 
iter 6480: loss 2.3682, time 4960.44ms 
iter 6481: loss 2.3418, time 4982.93ms 
iter 6482: loss 2.6549, time 5024.88ms 
iter 6483: loss 2.5870, time 5023.38ms 
iter 6484: loss 2.6692, time 5020.75ms 
iter 6485: loss 2.3403, time 5022.32ms 
iter 6486: loss 2.6689, time 5023.03ms 
iter 6487: loss 2.6344, time 5024.53ms 
iter 6488: loss 2.5350, time 4994.75ms 
iter 6489: loss 2.4675, time 5024.30ms 
iter 6490: loss 2.4732, time 5018.41ms 
iter 6491: loss 2.6431, time 5023.73ms 
iter 6492: loss 2.5216, time 5013.09ms 
iter 6493: loss 2.6084, time 5006.75ms 
iter 6494: loss 2.4983, time 5021.99ms 
iter 6495: loss 2.5831, time 5029.84ms 
iter 6496: loss 2.5775, time 4988.48ms 
iter 6497: loss 2.3794, time 5023.36ms 
iter 6498: loss 2.6669, time 5024.72ms 
iter 6499: loss 2.7624, time 5025.30ms 
step 6500: train loss 2.5221, val loss 2.8470
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 6500: loss 2.5108, time 20673.77ms 
iter 6501: loss 2.4773, time 4914.96ms 
iter 6502: loss 2.6289, time 4941.40ms 
iter 6503: loss 2.4282, time 5023.11ms 
iter 6504: loss 2.6743, time 5017.32ms 
iter 6505: loss 2.4339, time 5018.01ms 
iter 6506: loss 2.4113, time 5007.20ms 
iter 6507: loss 2.7623, time 5025.02ms 
iter 6508: loss 1.8139, time 5001.20ms 
iter 6509: loss 2.3749, time 5028.98ms 
iter 6510: loss 2.2884, time 4974.04ms 
iter 6511: loss 2.6292, time 4960.11ms 
iter 6512: loss 2.6780, time 5012.91ms 
iter 6513: loss 2.6666, time 5009.57ms 
iter 6514: loss 2.7844, time 5026.42ms 
iter 6515: loss 2.6636, time 5029.26ms 
iter 6516: loss 2.4159, time 5026.08ms 
iter 6517: loss 2.6103, time 5020.60ms 
iter 6518: loss 2.4135, time 5032.37ms 
iter 6519: loss 2.7269, time 5000.13ms 
iter 6520: loss 2.7309, time 5026.43ms 
iter 6521: loss 2.3437, time 5031.29ms 
iter 6522: loss 2.7331, time 5024.74ms 
iter 6523: loss 2.6605, time 5023.05ms 
iter 6524: loss 2.7225, time 5029.48ms 
iter 6525: loss 2.8024, time 5029.12ms 
iter 6526: loss 2.6222, time 4975.68ms 
iter 6527: loss 2.5429, time 4918.04ms 
iter 6528: loss 2.3658, time 4992.44ms 
iter 6529: loss 2.4121, time 5028.26ms 
iter 6530: loss 2.4543, time 5040.14ms 
iter 6531: loss 2.4856, time 5025.31ms 
iter 6532: loss 2.5972, time 5011.04ms 
iter 6533: loss 2.4757, time 5032.17ms 
iter 6534: loss 2.5566, time 5019.42ms 
iter 6535: loss 2.3590, time 4981.06ms 
iter 6536: loss 2.6181, time 5031.51ms 
iter 6537: loss 2.7844, time 5034.29ms 
iter 6538: loss 2.5084, time 5031.56ms 
iter 6539: loss 2.5429, time 5032.37ms 
iter 6540: loss 2.3608, time 5030.44ms 
iter 6541: loss 2.6363, time 5037.84ms 
iter 6542: loss 2.5993, time 5010.47ms 
iter 6543: loss 2.7696, time 4964.83ms 
iter 6544: loss 2.3888, time 4915.95ms 
iter 6545: loss 2.6112, time 5001.18ms 
iter 6546: loss 2.3938, time 5028.20ms 
iter 6547: loss 2.6458, time 5027.97ms 
iter 6548: loss 2.6377, time 5028.52ms 
iter 6549: loss 2.5377, time 5027.71ms 
step 6550: train loss 2.5214, val loss 2.8538
iter 6550: loss 2.7129, time 19608.40ms 
iter 6551: loss 2.4849, time 5002.88ms 
iter 6552: loss 2.4352, time 5030.78ms 
iter 6553: loss 2.3666, time 5018.01ms 
iter 6554: loss 2.8196, time 5030.03ms 
iter 6555: loss 2.5561, time 5020.60ms 
iter 6556: loss 2.2441, time 5029.70ms 
iter 6557: loss 2.6778, time 5033.08ms 
iter 6558: loss 2.5018, time 4981.29ms 
iter 6559: loss 2.7100, time 4957.48ms 
iter 6560: loss 2.5952, time 5030.85ms 
iter 6561: loss 2.1655, time 5030.31ms 
iter 6562: loss 2.3733, time 5030.77ms 
iter 6563: loss 2.5571, time 5029.13ms 
iter 6564: loss 2.6058, time 5020.37ms 
iter 6565: loss 2.3696, time 5030.75ms 
iter 6566: loss 2.4305, time 5035.84ms 
iter 6567: loss 2.6227, time 5022.25ms 
iter 6568: loss 2.5699, time 5030.97ms 
iter 6569: loss 2.7449, time 5030.66ms 
iter 6570: loss 2.3739, time 5029.74ms 
iter 6571: loss 2.6318, time 5029.69ms 
iter 6572: loss 2.5076, time 5029.75ms 
iter 6573: loss 2.5456, time 5033.82ms 
iter 6574: loss 2.5040, time 5018.84ms 
iter 6575: loss 2.4316, time 5028.61ms 
iter 6576: loss 2.5748, time 5031.33ms 
iter 6577: loss 2.3338, time 5032.06ms 
iter 6578: loss 2.4891, time 5030.05ms 
iter 6579: loss 2.4889, time 5022.65ms 
iter 6580: loss 2.5035, time 5025.98ms 
iter 6581: loss 2.6561, time 4927.98ms 
iter 6582: loss 2.5767, time 4928.86ms 
iter 6583: loss 2.5483, time 4928.50ms 
iter 6584: loss 2.5592, time 4928.32ms 
iter 6585: loss 2.5541, time 4989.62ms 
iter 6586: loss 2.4323, time 4987.70ms 
iter 6587: loss 2.6177, time 4928.35ms 
iter 6588: loss 2.4270, time 4925.38ms 
iter 6589: loss 2.5750, time 4928.42ms 
iter 6590: loss 2.5741, time 4928.79ms 
iter 6591: loss 2.3079, time 4928.11ms 
iter 6592: loss 2.6011, time 4928.38ms 
iter 6593: loss 2.6076, time 4928.67ms 
iter 6594: loss 2.3858, time 4926.92ms 
iter 6595: loss 2.4483, time 4928.61ms 
iter 6596: loss 2.5192, time 4927.82ms 
iter 6597: loss 2.3559, time 4929.23ms 
iter 6598: loss 2.4249, time 4927.85ms 
iter 6599: loss 2.3070, time 4928.50ms 
step 6600: train loss 2.5228, val loss 2.8390
iter 6600: loss 2.5240, time 19682.97ms 
iter 6601: loss 2.5037, time 5034.82ms 
iter 6602: loss 2.5274, time 5045.93ms 
iter 6603: loss 2.4362, time 4981.17ms 
iter 6604: loss 2.5448, time 4999.82ms 
iter 6605: loss 2.3704, time 5032.11ms 
iter 6606: loss 2.6133, time 5037.01ms 
iter 6607: loss 2.6054, time 5018.36ms 
iter 6608: loss 2.4824, time 5038.28ms 
iter 6609: loss 2.4688, time 5034.40ms 
iter 6610: loss 2.7655, time 5032.32ms 
iter 6611: loss 2.4387, time 4983.95ms 
iter 6612: loss 2.4893, time 5029.87ms 
iter 6613: loss 2.2852, time 5028.21ms 
iter 6614: loss 2.6432, time 5026.39ms 
iter 6615: loss 2.3690, time 5028.10ms 
iter 6616: loss 2.3958, time 5036.81ms 
iter 6617: loss 2.5621, time 5024.37ms 
iter 6618: loss 2.4663, time 5037.41ms 
iter 6619: loss 2.5724, time 5029.80ms 
iter 6620: loss 2.6070, time 5036.02ms 
iter 6621: loss 2.4063, time 5029.05ms 
iter 6622: loss 2.5446, time 5036.24ms 
iter 6623: loss 2.6159, time 4990.52ms 
iter 6624: loss 2.6698, time 4937.49ms 
iter 6625: loss 2.3677, time 4924.58ms 
iter 6626: loss 2.4625, time 4920.71ms 
iter 6627: loss 2.4806, time 4924.30ms 
iter 6628: loss 2.3009, time 4923.61ms 
iter 6629: loss 2.4683, time 4924.22ms 
iter 6630: loss 2.5319, time 4923.95ms 
iter 6631: loss 2.5334, time 4923.92ms 
iter 6632: loss 2.6553, time 4924.72ms 
iter 6633: loss 2.4905, time 4924.35ms 
iter 6634: loss 2.5379, time 4923.82ms 
iter 6635: loss 2.4559, time 4948.71ms 
iter 6636: loss 2.5687, time 4973.58ms 
iter 6637: loss 2.5459, time 4960.82ms 
iter 6638: loss 2.4841, time 5019.25ms 
iter 6639: loss 2.4791, time 5026.42ms 
iter 6640: loss 2.5207, time 5017.53ms 
iter 6641: loss 2.5241, time 5026.39ms 
iter 6642: loss 2.4146, time 5022.63ms 
iter 6643: loss 2.5895, time 5020.71ms 
iter 6644: loss 2.4904, time 5025.70ms 
iter 6645: loss 2.5364, time 4994.75ms 
iter 6646: loss 2.5320, time 5015.82ms 
iter 6647: loss 2.3689, time 5023.72ms 
iter 6648: loss 2.5396, time 5017.26ms 
iter 6649: loss 2.6638, time 5023.41ms 
step 6650: train loss 2.5299, val loss 2.8420
iter 6650: loss 2.3738, time 19618.54ms 
iter 6651: loss 2.5481, time 4993.65ms 
iter 6652: loss 2.6156, time 5031.79ms 
iter 6653: loss 2.7006, time 5005.56ms 
iter 6654: loss 2.2247, time 5005.80ms 
iter 6655: loss 2.5028, time 5002.34ms 
iter 6656: loss 2.5416, time 5022.08ms 
iter 6657: loss 2.7493, time 5000.34ms 
iter 6658: loss 2.5932, time 4917.25ms 
iter 6659: loss 2.5164, time 4956.46ms 
iter 6660: loss 2.8014, time 5007.48ms 
iter 6661: loss 2.6435, time 5004.72ms 
iter 6662: loss 2.6422, time 5027.66ms 
iter 6663: loss 2.4792, time 5028.90ms 
iter 6664: loss 2.4291, time 5007.63ms 
iter 6665: loss 2.4206, time 5025.42ms 
iter 6666: loss 2.6628, time 5016.43ms 
iter 6667: loss 2.5745, time 4916.29ms 
iter 6668: loss 2.6506, time 4930.78ms 
iter 6669: loss 2.6660, time 5026.65ms 
iter 6670: loss 2.5133, time 5027.65ms 
iter 6671: loss 2.6264, time 5006.78ms 
iter 6672: loss 2.4437, time 5019.29ms 
iter 6673: loss 2.4064, time 4965.31ms 
iter 6674: loss 2.4123, time 5022.90ms 
iter 6675: loss 2.5313, time 5029.88ms 
iter 6676: loss 2.5835, time 4976.25ms 
iter 6677: loss 2.7731, time 4915.89ms 
iter 6678: loss 2.7476, time 4922.57ms 
iter 6679: loss 2.4912, time 5030.00ms 
iter 6680: loss 2.4590, time 5024.67ms 
iter 6681: loss 2.5090, time 5024.61ms 
iter 6682: loss 2.3788, time 5024.39ms 
iter 6683: loss 2.5823, time 5022.14ms 
iter 6684: loss 2.6379, time 5020.66ms 
iter 6685: loss 2.7899, time 5026.40ms 
iter 6686: loss 2.6311, time 4973.78ms 
iter 6687: loss 2.3733, time 4916.51ms 
iter 6688: loss 2.5205, time 4918.62ms 
iter 6689: loss 2.6554, time 5015.26ms 
iter 6690: loss 2.5497, time 5022.37ms 
iter 6691: loss 2.7379, time 5024.64ms 
iter 6692: loss 2.5700, time 5013.67ms 
iter 6693: loss 2.6933, time 5036.03ms 
iter 6694: loss 2.5120, time 5030.33ms 
iter 6695: loss 2.5227, time 5000.96ms 
iter 6696: loss 2.5687, time 4918.03ms 
iter 6697: loss 2.5235, time 4915.77ms 
iter 6698: loss 2.4501, time 4916.78ms 
iter 6699: loss 2.7306, time 5003.02ms 
step 6700: train loss 2.5216, val loss 2.8353
iter 6700: loss 2.2774, time 19695.54ms 
iter 6701: loss 2.4738, time 5029.09ms 
iter 6702: loss 2.4789, time 5030.59ms 
iter 6703: loss 2.5998, time 4978.06ms 
iter 6704: loss 2.5393, time 4916.18ms 
iter 6705: loss 2.5199, time 4915.29ms 
iter 6706: loss 2.6276, time 4992.34ms 
iter 6707: loss 2.3833, time 5028.33ms 
iter 6708: loss 2.5597, time 5027.11ms 
iter 6709: loss 2.5275, time 5027.67ms 
iter 6710: loss 2.3931, time 5025.18ms 
iter 6711: loss 2.2446, time 5027.24ms 
iter 6712: loss 2.3754, time 5024.16ms 
iter 6713: loss 2.6586, time 4975.69ms 
iter 6714: loss 2.5215, time 4914.69ms 
iter 6715: loss 2.6529, time 4942.37ms 
iter 6716: loss 2.5165, time 5028.17ms 
iter 6717: loss 2.6673, time 5007.92ms 
iter 6718: loss 2.5764, time 5005.17ms 
iter 6719: loss 2.4879, time 5026.55ms 
iter 6720: loss 2.6186, time 5032.45ms 
iter 6721: loss 2.5078, time 5001.53ms 
iter 6722: loss 2.7178, time 5020.36ms 
iter 6723: loss 2.3944, time 4979.16ms 
iter 6724: loss 2.5986, time 4918.03ms 
iter 6725: loss 2.6343, time 4945.78ms 
iter 6726: loss 2.4358, time 5034.19ms 
iter 6727: loss 2.4961, time 5037.18ms 
iter 6728: loss 2.1663, time 5033.02ms 
iter 6729: loss 2.5425, time 5030.12ms 
iter 6730: loss 2.4541, time 5026.40ms 
iter 6731: loss 2.3540, time 5027.69ms 
iter 6732: loss 2.4808, time 5012.37ms 
iter 6733: loss 2.5079, time 4916.15ms 
iter 6734: loss 2.3759, time 4916.08ms 
iter 6735: loss 2.5932, time 5008.41ms 
iter 6736: loss 2.6678, time 5030.25ms 
iter 6737: loss 2.3469, time 5030.91ms 
iter 6738: loss 2.5533, time 5027.84ms 
iter 6739: loss 2.4137, time 5028.28ms 
iter 6740: loss 2.5721, time 5032.08ms 
iter 6741: loss 2.4324, time 5036.73ms 
iter 6742: loss 2.3974, time 5004.31ms 
iter 6743: loss 2.5775, time 4969.12ms 
iter 6744: loss 2.4210, time 4916.82ms 
iter 6745: loss 2.4882, time 4990.59ms 
iter 6746: loss 2.6560, time 5007.27ms 
iter 6747: loss 2.5014, time 4996.61ms 
iter 6748: loss 2.5637, time 5028.79ms 
iter 6749: loss 2.4245, time 5012.30ms 
step 6750: train loss 2.5376, val loss 2.8500
iter 6750: loss 2.3238, time 19600.77ms 
iter 6751: loss 2.6245, time 4955.42ms 
iter 6752: loss 2.5533, time 4988.24ms 
iter 6753: loss 2.5569, time 5010.61ms 
iter 6754: loss 2.4831, time 4982.93ms 
iter 6755: loss 2.7210, time 5003.80ms 
iter 6756: loss 2.4298, time 5007.94ms 
iter 6757: loss 2.5669, time 5001.49ms 
iter 6758: loss 2.4948, time 4925.91ms 
iter 6759: loss 2.3992, time 4916.84ms 
iter 6760: loss 2.6204, time 4915.28ms 
iter 6761: loss 2.6202, time 4953.20ms 
iter 6762: loss 2.6596, time 5010.52ms 
iter 6763: loss 2.3275, time 4988.37ms 
iter 6764: loss 2.4238, time 4968.27ms 
iter 6765: loss 2.4262, time 5006.85ms 
iter 6766: loss 2.3747, time 4985.57ms 
iter 6767: loss 2.4719, time 5011.50ms 
iter 6768: loss 2.6345, time 4947.57ms 
iter 6769: loss 2.4501, time 4933.01ms 
iter 6770: loss 2.5595, time 4916.50ms 
iter 6771: loss 2.7151, time 4923.64ms 
iter 6772: loss 2.5308, time 5029.86ms 
iter 6773: loss 2.4702, time 5028.36ms 
iter 6774: loss 2.6467, time 5028.63ms 
iter 6775: loss 2.6472, time 5027.78ms 
iter 6776: loss 2.5006, time 5025.97ms 
iter 6777: loss 2.4342, time 5027.79ms 
iter 6778: loss 2.2585, time 5031.56ms 
iter 6779: loss 2.5978, time 4977.17ms 
iter 6780: loss 2.5664, time 4917.83ms 
iter 6781: loss 2.6249, time 4992.63ms 
iter 6782: loss 2.5053, time 5022.35ms 
iter 6783: loss 2.2726, time 5006.41ms 
iter 6784: loss 2.5057, time 5004.50ms 
iter 6785: loss 2.5690, time 5013.20ms 
iter 6786: loss 2.6701, time 5014.17ms 
iter 6787: loss 2.4114, time 5001.39ms 
iter 6788: loss 2.4906, time 4918.83ms 
iter 6789: loss 2.3916, time 4915.32ms 
iter 6790: loss 2.4832, time 4915.26ms 
iter 6791: loss 2.6120, time 4915.55ms 
iter 6792: loss 2.4320, time 5000.01ms 
iter 6793: loss 2.5655, time 5027.56ms 
iter 6794: loss 2.4067, time 5023.78ms 
iter 6795: loss 2.4488, time 5025.71ms 
iter 6796: loss 2.5194, time 5027.14ms 
iter 6797: loss 2.5472, time 5027.80ms 
iter 6798: loss 2.4836, time 5028.05ms 
iter 6799: loss 2.2564, time 4977.10ms 
step 6800: train loss 2.4857, val loss 2.8568
iter 6800: loss 2.3387, time 19673.95ms 
iter 6801: loss 2.5808, time 5025.51ms 
iter 6802: loss 2.6975, time 5024.52ms 
iter 6803: loss 2.6085, time 4991.48ms 
iter 6804: loss 2.5517, time 5027.90ms 
iter 6805: loss 2.6679, time 5016.29ms 
iter 6806: loss 2.3992, time 4976.94ms 
iter 6807: loss 2.6214, time 4916.32ms 
iter 6808: loss 2.4660, time 4915.06ms 
iter 6809: loss 2.5462, time 4954.17ms 
iter 6810: loss 2.6200, time 5014.76ms 
iter 6811: loss 2.5695, time 4987.30ms 
iter 6812: loss 2.4950, time 5003.09ms 
iter 6813: loss 2.3914, time 5015.94ms 
iter 6814: loss 2.5749, time 5030.43ms 
iter 6815: loss 2.4719, time 5027.89ms 
iter 6816: loss 2.6150, time 4984.59ms 
iter 6817: loss 2.4781, time 4916.47ms 
iter 6818: loss 2.3495, time 4915.79ms 
iter 6819: loss 2.5239, time 4967.01ms 
iter 6820: loss 2.6118, time 5026.02ms 
iter 6821: loss 2.3368, time 5028.28ms 
iter 6822: loss 2.4659, time 5030.75ms 
iter 6823: loss 2.4217, time 5028.88ms 
iter 6824: loss 2.5319, time 5028.24ms 
iter 6825: loss 2.7038, time 5029.36ms 
iter 6826: loss 2.5936, time 4978.43ms 
iter 6827: loss 2.3636, time 4917.50ms 
iter 6828: loss 2.3819, time 4914.22ms 
iter 6829: loss 2.5382, time 4999.47ms 
iter 6830: loss 2.3827, time 5026.05ms 
iter 6831: loss 2.5630, time 5027.10ms 
iter 6832: loss 2.4467, time 4955.81ms 
iter 6833: loss 2.2361, time 5025.12ms 
iter 6834: loss 2.5810, time 5030.28ms 
iter 6835: loss 2.3791, time 5024.97ms 
iter 6836: loss 2.5361, time 4971.51ms 
iter 6837: loss 2.5469, time 4916.30ms 
iter 6838: loss 2.7908, time 4919.49ms 
iter 6839: loss 2.4460, time 5018.77ms 
iter 6840: loss 2.5368, time 5027.29ms 
iter 6841: loss 2.3580, time 5030.93ms 
iter 6842: loss 2.4352, time 5030.66ms 
iter 6843: loss 2.3634, time 5029.29ms 
iter 6844: loss 2.5884, time 5004.88ms 
iter 6845: loss 2.4826, time 4989.82ms 
iter 6846: loss 2.7459, time 4916.95ms 
iter 6847: loss 2.5818, time 4918.41ms 
iter 6848: loss 2.4992, time 4964.54ms 
iter 6849: loss 2.4316, time 5027.80ms 
step 6850: train loss 2.5119, val loss 2.8436
iter 6850: loss 2.3115, time 19716.84ms 
iter 6851: loss 2.4734, time 5030.29ms 
iter 6852: loss 2.7101, time 5030.30ms 
iter 6853: loss 2.5794, time 4929.95ms 
iter 6854: loss 2.4317, time 4954.17ms 
iter 6855: loss 2.3065, time 5024.08ms 
iter 6856: loss 2.6045, time 5019.50ms 
iter 6857: loss 2.5687, time 5028.57ms 
iter 6858: loss 2.3485, time 5014.30ms 
iter 6859: loss 2.6513, time 5016.13ms 
iter 6860: loss 2.3429, time 5010.36ms 
iter 6861: loss 2.5192, time 5018.11ms 
iter 6862: loss 2.5854, time 4963.19ms 
iter 6863: loss 2.6709, time 4935.43ms 
iter 6864: loss 2.3052, time 4922.43ms 
iter 6865: loss 2.5642, time 5016.62ms 
iter 6866: loss 2.3405, time 5020.81ms 
iter 6867: loss 2.6087, time 5020.33ms 
iter 6868: loss 2.4130, time 5023.46ms 
iter 6869: loss 2.3912, time 5022.11ms 
iter 6870: loss 2.5210, time 5024.31ms 
iter 6871: loss 2.4333, time 5026.13ms 
iter 6872: loss 2.5626, time 4971.07ms 
iter 6873: loss 2.5724, time 4916.32ms 
iter 6874: loss 2.4190, time 4983.46ms 
iter 6875: loss 2.3925, time 5025.54ms 
iter 6876: loss 2.7503, time 5000.86ms 
iter 6877: loss 2.5734, time 5020.46ms 
iter 6878: loss 2.4860, time 5020.25ms 
iter 6879: loss 2.4474, time 5024.86ms 
iter 6880: loss 2.4333, time 5022.83ms 
iter 6881: loss 2.6176, time 4970.86ms 
iter 6882: loss 2.1593, time 4916.17ms 
iter 6883: loss 2.4966, time 4943.26ms 
iter 6884: loss 2.5835, time 5021.99ms 
iter 6885: loss 2.4140, time 5021.54ms 
iter 6886: loss 2.4447, time 5021.76ms 
iter 6887: loss 2.6807, time 5024.33ms 
iter 6888: loss 2.4917, time 5022.71ms 
iter 6889: loss 2.5853, time 5010.75ms 
iter 6890: loss 2.3796, time 5026.04ms 
iter 6891: loss 2.5779, time 4970.02ms 
iter 6892: loss 2.5996, time 4914.33ms 
iter 6893: loss 2.6408, time 4954.41ms 
iter 6894: loss 2.6104, time 5023.42ms 
iter 6895: loss 2.4641, time 5022.72ms 
iter 6896: loss 2.3830, time 5022.51ms 
iter 6897: loss 2.4660, time 5021.22ms 
iter 6898: loss 2.2676, time 5024.96ms 
iter 6899: loss 2.4106, time 5022.70ms 
step 6900: train loss 2.5125, val loss 2.8577
iter 6900: loss 2.4744, time 19648.06ms 
iter 6901: loss 2.2770, time 5025.47ms 
iter 6902: loss 2.6981, time 5014.92ms 
iter 6903: loss 2.3034, time 5020.30ms 
iter 6904: loss 2.7348, time 5021.01ms 
iter 6905: loss 2.6840, time 5025.67ms 
iter 6906: loss 2.4766, time 5002.55ms 
iter 6907: loss 2.6133, time 4971.60ms 
iter 6908: loss 2.2920, time 4943.00ms 
iter 6909: loss 2.5095, time 4947.92ms 
iter 6910: loss 2.1735, time 5022.44ms 
iter 6911: loss 2.4549, time 5021.19ms 
iter 6912: loss 2.6248, time 5022.15ms 
iter 6913: loss 2.4604, time 5021.84ms 
iter 6914: loss 2.6623, time 5024.73ms 
iter 6915: loss 2.4332, time 5021.49ms 
iter 6916: loss 2.5495, time 5024.08ms 
iter 6917: loss 2.5761, time 5021.50ms 
iter 6918: loss 2.6235, time 4968.72ms 
iter 6919: loss 2.5173, time 5018.58ms 
iter 6920: loss 2.3074, time 5024.05ms 
iter 6921: loss 2.4328, time 5021.18ms 
iter 6922: loss 2.5181, time 5021.42ms 
iter 6923: loss 2.4933, time 5019.95ms 
iter 6924: loss 2.3127, time 5020.18ms 
iter 6925: loss 2.3305, time 5022.44ms 
iter 6926: loss 2.5657, time 4970.23ms 
iter 6927: loss 2.5613, time 4916.61ms 
iter 6928: loss 2.4359, time 4954.37ms 
iter 6929: loss 2.5179, time 5025.62ms 
iter 6930: loss 2.5573, time 4983.87ms 
iter 6931: loss 2.5435, time 4943.83ms 
iter 6932: loss 2.5219, time 4964.19ms 
iter 6933: loss 2.4196, time 5018.53ms 
iter 6934: loss 2.5767, time 5022.95ms 
iter 6935: loss 2.7343, time 5025.32ms 
iter 6936: loss 2.4433, time 4971.73ms 
iter 6937: loss 2.7695, time 4916.85ms 
iter 6938: loss 2.6430, time 4956.42ms 
iter 6939: loss 2.4836, time 5021.55ms 
iter 6940: loss 2.5974, time 5013.08ms 
iter 6941: loss 2.6716, time 5022.60ms 
iter 6942: loss 2.5069, time 5021.80ms 
iter 6943: loss 2.5835, time 5023.01ms 
iter 6944: loss 2.6375, time 5023.87ms 
iter 6945: loss 2.4031, time 5029.67ms 
iter 6946: loss 2.6590, time 4971.67ms 
iter 6947: loss 2.5923, time 4915.94ms 
iter 6948: loss 2.3539, time 4924.40ms 
iter 6949: loss 2.5374, time 5021.10ms 
step 6950: train loss 2.5159, val loss 2.8278
iter 6950: loss 2.4319, time 19704.21ms 
iter 6951: loss 2.3796, time 5034.45ms 
iter 6952: loss 2.4819, time 4999.31ms 
iter 6953: loss 2.6561, time 4918.74ms 
iter 6954: loss 2.3592, time 4918.74ms 
iter 6955: loss 2.4946, time 4990.15ms 
iter 6956: loss 2.4122, time 5023.45ms 
iter 6957: loss 2.4269, time 5001.06ms 
iter 6958: loss 2.5250, time 5011.51ms 
iter 6959: loss 2.5266, time 4959.14ms 
iter 6960: loss 2.5191, time 4990.49ms 
iter 6961: loss 2.5539, time 5010.57ms 
iter 6962: loss 2.6099, time 4928.08ms 
iter 6963: loss 2.5602, time 4923.88ms 
iter 6964: loss 2.8335, time 4924.31ms 
iter 6965: loss 2.4892, time 5003.73ms 
iter 6966: loss 2.5952, time 4986.86ms 
iter 6967: loss 2.4414, time 5015.93ms 
iter 6968: loss 2.3349, time 5010.76ms 
iter 6969: loss 2.5053, time 5008.20ms 
iter 6970: loss 2.7378, time 4996.94ms 
iter 6971: loss 2.4114, time 4963.25ms 
iter 6972: loss 2.5448, time 4919.35ms 
iter 6973: loss 2.4462, time 4924.74ms 
iter 6974: loss 2.7313, time 4998.33ms 
iter 6975: loss 2.5355, time 4998.28ms 
iter 6976: loss 2.3990, time 4950.00ms 
iter 6977: loss 2.5542, time 4985.51ms 
iter 6978: loss 2.5950, time 5030.40ms 
iter 6979: loss 2.6296, time 5030.47ms 
iter 6980: loss 2.3671, time 5007.68ms 
iter 6981: loss 2.5633, time 4920.64ms 
iter 6982: loss 2.5843, time 4917.87ms 
iter 6983: loss 2.3346, time 4952.03ms 
iter 6984: loss 2.3493, time 5016.59ms 
iter 6985: loss 2.9037, time 5036.99ms 
iter 6986: loss 2.6238, time 5034.69ms 
iter 6987: loss 2.3415, time 5036.32ms 
iter 6988: loss 2.5371, time 5032.77ms 
iter 6989: loss 2.4788, time 5033.56ms 
iter 6990: loss 2.5236, time 5030.83ms 
iter 6991: loss 2.4915, time 4978.43ms 
iter 6992: loss 2.4809, time 4916.78ms 
iter 6993: loss 2.6377, time 4961.12ms 
iter 6994: loss 2.5053, time 5029.72ms 
iter 6995: loss 2.5396, time 5025.00ms 
iter 6996: loss 2.1279, time 5026.46ms 
iter 6997: loss 2.4616, time 5031.73ms 
iter 6998: loss 2.3557, time 5034.86ms 
iter 6999: loss 2.6266, time 5019.81ms 
step 7000: train loss 2.5088, val loss 2.8383
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 7000: loss 2.8310, time 20716.43ms 
iter 7001: loss 2.5577, time 5030.21ms 
iter 7002: loss 2.4648, time 5031.57ms 
iter 7003: loss 2.5564, time 5032.98ms 
iter 7004: loss 2.6474, time 5034.12ms 
iter 7005: loss 2.6608, time 5034.71ms 
iter 7006: loss 2.6200, time 5026.30ms 
iter 7007: loss 2.6033, time 4978.90ms 
iter 7008: loss 2.5695, time 4917.67ms 
iter 7009: loss 2.3236, time 4923.01ms 
iter 7010: loss 2.6383, time 4920.97ms 
iter 7011: loss 2.5438, time 4927.91ms 
iter 7012: loss 2.4228, time 4922.87ms 
iter 7013: loss 2.4027, time 4914.37ms 
iter 7014: loss 2.6385, time 4917.62ms 
iter 7015: loss 2.5851, time 4964.29ms 
iter 7016: loss 2.5089, time 5012.79ms 
iter 7017: loss 2.3939, time 5029.51ms 
iter 7018: loss 2.3811, time 5038.81ms 
iter 7019: loss 2.5012, time 4999.66ms 
iter 7020: loss 2.5981, time 4929.08ms 
iter 7021: loss 2.5396, time 5020.34ms 
iter 7022: loss 2.3893, time 4973.02ms 
iter 7023: loss 2.4282, time 4914.79ms 
iter 7024: loss 2.4817, time 4935.29ms 
iter 7025: loss 2.5973, time 5025.92ms 
iter 7026: loss 2.5838, time 5030.08ms 
iter 7027: loss 2.2121, time 5027.98ms 
iter 7028: loss 2.3142, time 5025.03ms 
iter 7029: loss 2.2306, time 5025.69ms 
iter 7030: loss 2.4184, time 5015.62ms 
iter 7031: loss 2.2976, time 4968.10ms 
iter 7032: loss 2.3455, time 4916.02ms 
iter 7033: loss 2.4179, time 4929.19ms 
iter 7034: loss 2.5278, time 4926.33ms 
iter 7035: loss 2.6006, time 4928.08ms 
iter 7036: loss 2.4246, time 4927.13ms 
iter 7037: loss 2.6495, time 4927.85ms 
iter 7038: loss 2.3230, time 4927.98ms 
iter 7039: loss 2.3464, time 4924.32ms 
iter 7040: loss 2.5239, time 4931.98ms 
iter 7041: loss 2.5835, time 4917.52ms 
iter 7042: loss 2.5746, time 4915.08ms 
iter 7043: loss 2.5565, time 4971.36ms 
iter 7044: loss 2.4478, time 5026.54ms 
iter 7045: loss 2.5949, time 5024.54ms 
iter 7046: loss 2.6702, time 5025.64ms 
iter 7047: loss 2.4763, time 5010.55ms 
iter 7048: loss 2.3823, time 5013.45ms 
iter 7049: loss 2.4891, time 5024.60ms 
step 7050: train loss 2.5113, val loss 2.8411
iter 7050: loss 2.6097, time 19681.18ms 
iter 7051: loss 2.5484, time 5023.40ms 
iter 7052: loss 2.7621, time 5028.30ms 
iter 7053: loss 2.4780, time 5028.91ms 
iter 7054: loss 2.5118, time 5026.85ms 
iter 7055: loss 2.6697, time 5028.30ms 
iter 7056: loss 2.6683, time 4932.55ms 
iter 7057: loss 2.5587, time 4924.69ms 
iter 7058: loss 2.3943, time 4944.24ms 
iter 7059: loss 2.4232, time 5025.81ms 
iter 7060: loss 2.4330, time 5028.75ms 
iter 7061: loss 2.6836, time 5032.07ms 
iter 7062: loss 2.4290, time 5028.38ms 
iter 7063: loss 2.6769, time 5038.38ms 
iter 7064: loss 2.3719, time 5041.57ms 
iter 7065: loss 2.5752, time 4950.39ms 
iter 7066: loss 2.3713, time 4945.38ms 
iter 7067: loss 2.7052, time 4929.84ms 
iter 7068: loss 2.3658, time 5014.48ms 
iter 7069: loss 2.4632, time 5030.31ms 
iter 7070: loss 2.5582, time 5027.68ms 
iter 7071: loss 2.2311, time 5025.44ms 
iter 7072: loss 2.6448, time 5027.47ms 
iter 7073: loss 2.6920, time 5020.82ms 
iter 7074: loss 2.6027, time 5028.20ms 
iter 7075: loss 2.2492, time 4976.27ms 
iter 7076: loss 2.6717, time 4916.29ms 
iter 7077: loss 2.4308, time 4995.81ms 
iter 7078: loss 2.4134, time 5032.99ms 
iter 7079: loss 2.4445, time 5026.24ms 
iter 7080: loss 2.4355, time 5025.98ms 
iter 7081: loss 2.6311, time 5028.47ms 
iter 7082: loss 2.4876, time 5021.10ms 
iter 7083: loss 2.3956, time 5020.75ms 
iter 7084: loss 2.4779, time 4917.48ms 
iter 7085: loss 2.6808, time 4915.34ms 
iter 7086: loss 2.6166, time 4944.72ms 
iter 7087: loss 2.5805, time 4978.09ms 
iter 7088: loss 2.2267, time 4978.38ms 
iter 7089: loss 2.3340, time 5004.43ms 
iter 7090: loss 2.5153, time 5031.09ms 
iter 7091: loss 2.4519, time 5031.19ms 
iter 7092: loss 2.4073, time 5032.25ms 
iter 7093: loss 2.6139, time 5030.85ms 
iter 7094: loss 2.4878, time 5028.04ms 
iter 7095: loss 2.6925, time 5030.67ms 
iter 7096: loss 2.3073, time 4977.80ms 
iter 7097: loss 2.5518, time 4916.29ms 
iter 7098: loss 2.3989, time 4948.43ms 
iter 7099: loss 2.7606, time 5026.81ms 
step 7100: train loss 2.5118, val loss 2.8609
iter 7100: loss 2.7445, time 19689.94ms 
iter 7101: loss 2.5377, time 5028.42ms 
iter 7102: loss 2.3272, time 5029.81ms 
iter 7103: loss 2.4392, time 4923.90ms 
iter 7104: loss 2.5180, time 4931.04ms 
iter 7105: loss 2.4526, time 5025.71ms 
iter 7106: loss 2.5619, time 5019.65ms 
iter 7107: loss 2.5024, time 5032.17ms 
iter 7108: loss 2.4110, time 5031.91ms 
iter 7109: loss 2.5487, time 5012.80ms 
iter 7110: loss 2.6388, time 5029.13ms 
iter 7111: loss 2.6290, time 5031.09ms 
iter 7112: loss 2.4790, time 4977.91ms 
iter 7113: loss 2.5824, time 4974.61ms 
iter 7114: loss 2.4833, time 5031.50ms 
iter 7115: loss 2.4987, time 5032.86ms 
iter 7116: loss 2.2105, time 5032.56ms 
iter 7117: loss 2.7111, time 5034.89ms 
iter 7118: loss 2.3683, time 5034.60ms 
iter 7119: loss 2.4087, time 5031.79ms 
iter 7120: loss 2.6213, time 5037.06ms 
iter 7121: loss 2.5295, time 4977.73ms 
iter 7122: loss 2.5591, time 4919.09ms 
iter 7123: loss 2.7588, time 5004.43ms 
iter 7124: loss 2.5299, time 5034.47ms 
iter 7125: loss 2.5913, time 5030.91ms 
iter 7126: loss 2.6715, time 5029.83ms 
iter 7127: loss 2.6541, time 5027.44ms 
iter 7128: loss 2.3449, time 5032.97ms 
iter 7129: loss 2.4468, time 5044.91ms 
iter 7130: loss 2.4959, time 4960.25ms 
iter 7131: loss 2.5275, time 4916.37ms 
iter 7132: loss 2.4189, time 4992.16ms 
iter 7133: loss 2.4253, time 5003.20ms 
iter 7134: loss 2.6724, time 5026.16ms 
iter 7135: loss 2.7493, time 5023.86ms 
iter 7136: loss 2.6270, time 5026.45ms 
iter 7137: loss 2.3905, time 5027.61ms 
iter 7138: loss 2.4387, time 5029.27ms 
iter 7139: loss 2.5961, time 5009.59ms 
iter 7140: loss 2.5667, time 4978.04ms 
iter 7141: loss 2.5579, time 4987.48ms 
iter 7142: loss 2.4711, time 5030.75ms 
iter 7143: loss 2.4847, time 5032.32ms 
iter 7144: loss 2.4643, time 5028.62ms 
iter 7145: loss 2.4469, time 5026.84ms 
iter 7146: loss 2.7393, time 5026.53ms 
iter 7147: loss 2.3724, time 5026.10ms 
iter 7148: loss 2.6256, time 5032.00ms 
iter 7149: loss 2.6444, time 5024.57ms 
step 7150: train loss 2.4994, val loss 2.8510
iter 7150: loss 2.4772, time 19681.05ms 
iter 7151: loss 2.6311, time 5019.63ms 
iter 7152: loss 2.5901, time 5020.81ms 
iter 7153: loss 2.5026, time 5025.58ms 
iter 7154: loss 2.4524, time 5030.98ms 
iter 7155: loss 2.4510, time 5010.01ms 
iter 7156: loss 2.3422, time 5013.80ms 
iter 7157: loss 2.6351, time 5028.99ms 
iter 7158: loss 2.6130, time 5027.25ms 
iter 7159: loss 2.4797, time 5021.91ms 
iter 7160: loss 2.4290, time 5021.08ms 
iter 7161: loss 2.4358, time 5023.53ms 
iter 7162: loss 2.4740, time 5027.83ms 
iter 7163: loss 2.3653, time 4969.89ms 
iter 7164: loss 2.3984, time 4916.35ms 
iter 7165: loss 2.4623, time 4931.28ms 
iter 7166: loss 2.3556, time 5019.28ms 
iter 7167: loss 2.5260, time 4963.74ms 
iter 7168: loss 2.4707, time 4989.87ms 
iter 7169: loss 2.6006, time 5021.24ms 
iter 7170: loss 2.6324, time 5006.89ms 
iter 7171: loss 2.7114, time 5007.72ms 
iter 7172: loss 2.5264, time 4985.59ms 
iter 7173: loss 2.5643, time 4917.78ms 
iter 7174: loss 2.5442, time 4961.97ms 
iter 7175: loss 2.5415, time 5011.75ms 
iter 7176: loss 2.6415, time 5012.92ms 
iter 7177: loss 2.4023, time 5012.78ms 
iter 7178: loss 2.5954, time 5015.90ms 
iter 7179: loss 2.7145, time 5032.33ms 
iter 7180: loss 2.8360, time 5033.60ms 
iter 7181: loss 2.3915, time 5031.88ms 
iter 7182: loss 2.4911, time 4978.47ms 
iter 7183: loss 2.5115, time 4917.27ms 
iter 7184: loss 2.4910, time 5005.66ms 
iter 7185: loss 2.7020, time 5024.77ms 
iter 7186: loss 2.5208, time 5031.61ms 
iter 7187: loss 2.2119, time 5030.70ms 
iter 7188: loss 2.4477, time 5020.28ms 
iter 7189: loss 2.6277, time 5026.30ms 
iter 7190: loss 2.5990, time 5033.75ms 
iter 7191: loss 2.5215, time 4978.55ms 
iter 7192: loss 2.6209, time 4939.19ms 
iter 7193: loss 2.6015, time 4990.57ms 
iter 7194: loss 2.5983, time 5029.59ms 
iter 7195: loss 2.6105, time 5029.12ms 
iter 7196: loss 2.4349, time 5022.01ms 
iter 7197: loss 2.5960, time 5029.56ms 
iter 7198: loss 2.4170, time 5001.71ms 
iter 7199: loss 2.4938, time 4974.07ms 
step 7200: train loss 2.5039, val loss 2.8587
iter 7200: loss 2.3871, time 19655.80ms 
iter 7201: loss 2.3585, time 4931.14ms 
iter 7202: loss 2.4692, time 4953.08ms 
iter 7203: loss 2.3791, time 5041.18ms 
iter 7204: loss 2.6212, time 5022.02ms 
iter 7205: loss 2.4918, time 4978.51ms 
iter 7206: loss 2.4172, time 4961.53ms 
iter 7207: loss 2.5538, time 5015.96ms 
iter 7208: loss 2.6307, time 5024.65ms 
iter 7209: loss 2.5378, time 5023.68ms 
iter 7210: loss 2.2332, time 5031.36ms 
iter 7211: loss 2.6108, time 5011.22ms 
iter 7212: loss 2.6432, time 5014.09ms 
iter 7213: loss 2.5166, time 4979.62ms 
iter 7214: loss 2.3795, time 4918.63ms 
iter 7215: loss 2.3818, time 5007.51ms 
iter 7216: loss 2.3556, time 4962.45ms 
iter 7217: loss 2.4343, time 5028.39ms 
iter 7218: loss 2.7474, time 5014.88ms 
iter 7219: loss 2.6803, time 5029.25ms 
iter 7220: loss 2.6788, time 5022.65ms 
iter 7221: loss 2.6331, time 4996.24ms 
iter 7222: loss 2.5602, time 4933.13ms 
iter 7223: loss 2.5008, time 4921.57ms 
iter 7224: loss 2.7586, time 5005.41ms 
iter 7225: loss 2.4522, time 5021.87ms 
iter 7226: loss 2.4905, time 5023.26ms 
iter 7227: loss 2.3339, time 5025.07ms 
iter 7228: loss 2.6474, time 5012.26ms 
iter 7229: loss 2.3793, time 5009.21ms 
iter 7230: loss 2.5306, time 4993.39ms 
iter 7231: loss 2.5228, time 4923.29ms 
iter 7232: loss 2.2491, time 4916.19ms 
iter 7233: loss 2.6735, time 4963.02ms 
iter 7234: loss 2.5484, time 5016.89ms 
iter 7235: loss 2.6059, time 5004.29ms 
iter 7236: loss 2.4383, time 5019.18ms 
iter 7237: loss 2.5796, time 5028.01ms 
iter 7238: loss 2.6971, time 5002.22ms 
iter 7239: loss 2.6454, time 5009.04ms 
iter 7240: loss 2.4361, time 4918.68ms 
iter 7241: loss 2.6183, time 4936.43ms 
iter 7242: loss 2.4869, time 5032.36ms 
iter 7243: loss 2.5580, time 5023.06ms 
iter 7244: loss 2.5216, time 5028.01ms 
iter 7245: loss 2.5569, time 5020.50ms 
iter 7246: loss 2.2727, time 5018.80ms 
iter 7247: loss 2.4586, time 5026.96ms 
iter 7248: loss 2.4417, time 5000.47ms 
iter 7249: loss 2.5780, time 4927.11ms 
step 7250: train loss 2.5064, val loss 2.8479
iter 7250: loss 2.5021, time 19711.03ms 
iter 7251: loss 2.4895, time 5027.53ms 
iter 7252: loss 2.3868, time 5026.61ms 
iter 7253: loss 2.4295, time 5014.44ms 
iter 7254: loss 2.5558, time 4977.76ms 
iter 7255: loss 2.5230, time 4920.64ms 
iter 7256: loss 2.2776, time 5025.60ms 
iter 7257: loss 2.3747, time 5008.70ms 
iter 7258: loss 2.2937, time 5016.03ms 
iter 7259: loss 2.3230, time 5022.21ms 
iter 7260: loss 2.2526, time 5019.35ms 
iter 7261: loss 2.6332, time 4915.34ms 
iter 7262: loss 2.4566, time 4914.65ms 
iter 7263: loss 2.4377, time 4915.51ms 
iter 7264: loss 2.4612, time 4915.74ms 
iter 7265: loss 2.6836, time 4987.46ms 
iter 7266: loss 2.3701, time 5023.84ms 
iter 7267: loss 2.4912, time 5024.27ms 
iter 7268: loss 2.4645, time 5023.32ms 
iter 7269: loss 2.2429, time 5025.38ms 
iter 7270: loss 2.2901, time 5029.09ms 
iter 7271: loss 2.3274, time 4972.51ms 
iter 7272: loss 2.6818, time 4917.15ms 
iter 7273: loss 2.5800, time 5009.80ms 
iter 7274: loss 2.3766, time 5022.27ms 
iter 7275: loss 2.6107, time 5021.11ms 
iter 7276: loss 2.3572, time 5024.53ms 
iter 7277: loss 2.4478, time 5025.98ms 
iter 7278: loss 2.4352, time 5027.86ms 
iter 7279: loss 2.5604, time 5015.38ms 
iter 7280: loss 2.5789, time 4916.98ms 
iter 7281: loss 2.4902, time 4974.15ms 
iter 7282: loss 2.2546, time 5026.02ms 
iter 7283: loss 2.3578, time 5030.92ms 
iter 7284: loss 2.4958, time 5027.95ms 
iter 7285: loss 2.4590, time 5028.57ms 
iter 7286: loss 2.6589, time 5020.36ms 
iter 7287: loss 2.1798, time 5026.92ms 
iter 7288: loss 2.5264, time 4960.99ms 
iter 7289: loss 2.5132, time 4942.06ms 
iter 7290: loss 2.4271, time 5004.10ms 
iter 7291: loss 2.4808, time 5007.22ms 
iter 7292: loss 2.2365, time 4985.73ms 
iter 7293: loss 2.3459, time 5010.11ms 
iter 7294: loss 2.5834, time 5017.87ms 
iter 7295: loss 2.4650, time 5007.62ms 
iter 7296: loss 2.5523, time 5019.94ms 
iter 7297: loss 2.5779, time 4916.60ms 
iter 7298: loss 2.4521, time 4946.12ms 
iter 7299: loss 2.4622, time 5027.34ms 
step 7300: train loss 2.5075, val loss 2.8367
iter 7300: loss 2.6285, time 19701.50ms 
iter 7301: loss 2.4444, time 5022.97ms 
iter 7302: loss 2.5660, time 5007.77ms 
iter 7303: loss 2.7426, time 4969.00ms 
iter 7304: loss 2.5224, time 4958.68ms 
iter 7305: loss 2.5696, time 5002.67ms 
iter 7306: loss 2.4629, time 5003.38ms 
iter 7307: loss 2.4312, time 5029.38ms 
iter 7308: loss 2.6919, time 5023.66ms 
iter 7309: loss 2.4477, time 5013.29ms 
iter 7310: loss 2.3248, time 5014.17ms 
iter 7311: loss 2.4884, time 4918.23ms 
iter 7312: loss 2.3539, time 4946.98ms 
iter 7313: loss 2.4134, time 5033.43ms 
iter 7314: loss 2.4170, time 5032.81ms 
iter 7315: loss 2.4382, time 5022.00ms 
iter 7316: loss 2.6071, time 5001.77ms 
iter 7317: loss 2.4902, time 5007.31ms 
iter 7318: loss 2.5303, time 5004.57ms 
iter 7319: loss 2.5391, time 5012.19ms 
iter 7320: loss 2.2506, time 4932.24ms 
iter 7321: loss 2.2154, time 4990.92ms 
iter 7322: loss 2.5988, time 5012.57ms 
iter 7323: loss 2.3951, time 5025.08ms 
iter 7324: loss 2.5300, time 5016.44ms 
iter 7325: loss 2.5644, time 5022.21ms 
iter 7326: loss 2.4065, time 5015.41ms 
iter 7327: loss 2.5413, time 4978.55ms 
iter 7328: loss 2.4747, time 4970.36ms 
iter 7329: loss 2.6156, time 5022.02ms 
iter 7330: loss 2.4230, time 5022.08ms 
iter 7331: loss 2.6015, time 5016.41ms 
iter 7332: loss 2.5520, time 4999.48ms 
iter 7333: loss 2.4963, time 5013.24ms 
iter 7334: loss 2.5029, time 5021.75ms 
iter 7335: loss 2.4632, time 5023.71ms 
iter 7336: loss 2.6731, time 4973.76ms 
iter 7337: loss 2.5101, time 4965.95ms 
iter 7338: loss 2.5616, time 5024.82ms 
iter 7339: loss 2.6319, time 5022.31ms 
iter 7340: loss 2.5727, time 5023.78ms 
iter 7341: loss 2.4771, time 5023.64ms 
iter 7342: loss 2.5909, time 5023.78ms 
iter 7343: loss 2.4718, time 5024.28ms 
iter 7344: loss 2.4744, time 4974.12ms 
iter 7345: loss 2.4363, time 4917.98ms 
iter 7346: loss 2.3268, time 4988.41ms 
iter 7347: loss 2.4719, time 5020.05ms 
iter 7348: loss 2.5781, time 5022.75ms 
iter 7349: loss 2.6235, time 5024.18ms 
step 7350: train loss 2.5003, val loss 2.8407
iter 7350: loss 2.2926, time 19613.89ms 
iter 7351: loss 2.4464, time 4972.49ms 
iter 7352: loss 2.5303, time 5025.76ms 
iter 7353: loss 2.5174, time 5023.30ms 
iter 7354: loss 2.5246, time 5023.20ms 
iter 7355: loss 2.4023, time 5026.60ms 
iter 7356: loss 2.4707, time 5024.24ms 
iter 7357: loss 2.4381, time 5030.24ms 
iter 7358: loss 2.5542, time 4976.77ms 
iter 7359: loss 2.2480, time 4971.07ms 
iter 7360: loss 2.5831, time 5033.61ms 
iter 7361: loss 2.4227, time 5029.94ms 
iter 7362: loss 2.4622, time 5033.26ms 
iter 7363: loss 2.5399, time 5031.69ms 
iter 7364: loss 2.6084, time 5024.55ms 
iter 7365: loss 2.5708, time 5021.17ms 
iter 7366: loss 2.3573, time 4983.90ms 
iter 7367: loss 2.5291, time 4924.08ms 
iter 7368: loss 2.4069, time 5028.87ms 
iter 7369: loss 2.6009, time 5031.40ms 
iter 7370: loss 2.5657, time 5001.34ms 
iter 7371: loss 2.3740, time 5023.86ms 
iter 7372: loss 2.7286, time 5018.18ms 
iter 7373: loss 2.5858, time 5021.46ms 
iter 7374: loss 2.6392, time 5010.93ms 
iter 7375: loss 2.6126, time 4915.67ms 
iter 7376: loss 2.3605, time 4919.17ms 
iter 7377: loss 2.5051, time 5020.24ms 
iter 7378: loss 2.3514, time 5029.24ms 
iter 7379: loss 2.5816, time 5021.86ms 
iter 7380: loss 2.5828, time 5028.26ms 
iter 7381: loss 2.5144, time 5022.45ms 
iter 7382: loss 2.5601, time 5028.53ms 
iter 7383: loss 2.7125, time 5006.24ms 
iter 7384: loss 2.8266, time 4915.41ms 
iter 7385: loss 2.5509, time 4941.20ms 
iter 7386: loss 2.5374, time 5024.95ms 
iter 7387: loss 2.4288, time 4998.39ms 
iter 7388: loss 2.4749, time 5001.95ms 
iter 7389: loss 2.5129, time 5026.76ms 
iter 7390: loss 2.5126, time 5024.27ms 
iter 7391: loss 2.6271, time 5029.42ms 
iter 7392: loss 2.4763, time 5030.95ms 
iter 7393: loss 2.4738, time 4976.51ms 
iter 7394: loss 2.4856, time 4960.71ms 
iter 7395: loss 2.5804, time 5015.70ms 
iter 7396: loss 2.5997, time 5025.79ms 
iter 7397: loss 2.4113, time 5026.84ms 
iter 7398: loss 2.6323, time 5029.32ms 
iter 7399: loss 2.8106, time 5028.41ms 
step 7400: train loss 2.4924, val loss 2.8536
iter 7400: loss 2.5814, time 19652.18ms 
iter 7401: loss 2.6109, time 5006.97ms 
iter 7402: loss 2.3959, time 5030.22ms 
iter 7403: loss 2.3440, time 5006.36ms 
iter 7404: loss 2.5042, time 5028.97ms 
iter 7405: loss 2.4974, time 5032.30ms 
iter 7406: loss 2.3223, time 5030.94ms 
iter 7407: loss 2.2627, time 4978.74ms 
iter 7408: loss 2.4050, time 4920.24ms 
iter 7409: loss 2.6189, time 5005.52ms 
iter 7410: loss 2.3461, time 5029.84ms 
iter 7411: loss 2.6472, time 5028.71ms 
iter 7412: loss 2.5800, time 5025.37ms 
iter 7413: loss 2.5043, time 5002.49ms 
iter 7414: loss 2.6870, time 5022.48ms 
iter 7415: loss 2.2995, time 4998.54ms 
iter 7416: loss 2.3474, time 4915.18ms 
iter 7417: loss 2.3270, time 4928.75ms 
iter 7418: loss 2.4995, time 5025.91ms 
iter 7419: loss 2.3289, time 5029.09ms 
iter 7420: loss 2.4188, time 5029.24ms 
iter 7421: loss 2.6489, time 5030.39ms 
iter 7422: loss 2.2485, time 5025.62ms 
iter 7423: loss 2.4097, time 5027.68ms 
iter 7424: loss 2.5739, time 5031.32ms 
iter 7425: loss 2.4722, time 4994.18ms 
iter 7426: loss 2.4372, time 5020.22ms 
iter 7427: loss 2.5819, time 5025.53ms 
iter 7428: loss 2.4330, time 5028.78ms 
iter 7429: loss 2.3880, time 5025.10ms 
iter 7430: loss 2.5916, time 5021.68ms 
iter 7431: loss 2.4539, time 5023.29ms 
iter 7432: loss 2.4416, time 5025.54ms 
iter 7433: loss 2.3606, time 4974.53ms 
iter 7434: loss 2.3827, time 4915.71ms 
iter 7435: loss 2.6654, time 4998.97ms 
iter 7436: loss 2.6170, time 5016.28ms 
iter 7437: loss 2.3392, time 5018.09ms 
iter 7438: loss 2.6390, time 5030.35ms 
iter 7439: loss 2.6341, time 5033.40ms 
iter 7440: loss 2.2044, time 5022.86ms 
iter 7441: loss 2.4918, time 5030.25ms 
iter 7442: loss 2.4600, time 4977.61ms 
iter 7443: loss 2.5597, time 5003.84ms 
iter 7444: loss 2.4914, time 5026.99ms 
iter 7445: loss 2.6617, time 5031.55ms 
iter 7446: loss 2.5951, time 5029.85ms 
iter 7447: loss 2.6058, time 5030.44ms 
iter 7448: loss 2.6579, time 5029.84ms 
iter 7449: loss 2.4279, time 5028.25ms 
step 7450: train loss 2.4939, val loss 2.8346
iter 7450: loss 2.3769, time 19705.99ms 
iter 7451: loss 2.7262, time 5028.51ms 
iter 7452: loss 2.5296, time 5023.39ms 
iter 7453: loss 2.5055, time 5020.56ms 
iter 7454: loss 2.3860, time 5030.11ms 
iter 7455: loss 2.3697, time 5029.57ms 
iter 7456: loss 2.4295, time 4975.11ms 
iter 7457: loss 2.5158, time 4918.77ms 
iter 7458: loss 2.2940, time 4984.49ms 
iter 7459: loss 2.2370, time 5020.26ms 
iter 7460: loss 2.7122, time 5021.10ms 
iter 7461: loss 2.3779, time 5014.60ms 
iter 7462: loss 2.3758, time 5000.30ms 
iter 7463: loss 2.6500, time 4995.50ms 
iter 7464: loss 2.7874, time 4990.06ms 
iter 7465: loss 2.6076, time 4919.97ms 
iter 7466: loss 2.4619, time 4919.52ms 
iter 7467: loss 2.4239, time 4954.09ms 
iter 7468: loss 2.4047, time 4987.40ms 
iter 7469: loss 2.6570, time 5016.26ms 
iter 7470: loss 2.6244, time 5010.31ms 
iter 7471: loss 2.3094, time 4984.12ms 
iter 7472: loss 2.3545, time 5031.50ms 
iter 7473: loss 2.7405, time 5031.04ms 
iter 7474: loss 2.6467, time 4977.83ms 
iter 7475: loss 2.4618, time 4917.71ms 
iter 7476: loss 2.6715, time 5006.37ms 
iter 7477: loss 2.3627, time 5029.91ms 
iter 7478: loss 2.4017, time 5030.54ms 
iter 7479: loss 2.3121, time 5026.31ms 
iter 7480: loss 2.3975, time 5019.27ms 
iter 7481: loss 2.5747, time 5029.64ms 
iter 7482: loss 2.4510, time 5005.92ms 
iter 7483: loss 2.3326, time 4978.71ms 
iter 7484: loss 2.5055, time 4961.39ms 
iter 7485: loss 2.3638, time 5001.18ms 
iter 7486: loss 2.8384, time 4993.59ms 
iter 7487: loss 2.4300, time 4996.41ms 
iter 7488: loss 2.4019, time 5018.55ms 
iter 7489: loss 2.4150, time 5015.15ms 
iter 7490: loss 2.8653, time 5020.77ms 
iter 7491: loss 2.5968, time 4994.38ms 
iter 7492: loss 2.5880, time 4931.17ms 
iter 7493: loss 2.4342, time 4996.88ms 
iter 7494: loss 2.4592, time 5029.76ms 
iter 7495: loss 2.7493, time 5017.24ms 
iter 7496: loss 2.7116, time 5013.08ms 
iter 7497: loss 2.5692, time 5005.51ms 
iter 7498: loss 2.5954, time 5013.42ms 
iter 7499: loss 2.4356, time 4942.36ms 
step 7500: train loss 2.5042, val loss 2.8334
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 7500: loss 2.5938, time 20736.06ms 
iter 7501: loss 2.6073, time 4999.32ms 
iter 7502: loss 2.4876, time 4997.75ms 
iter 7503: loss 2.1789, time 5020.02ms 
iter 7504: loss 2.1241, time 5030.76ms 
iter 7505: loss 2.4272, time 4978.93ms 
iter 7506: loss 2.3765, time 4956.97ms 
iter 7507: loss 2.4888, time 5004.06ms 
iter 7508: loss 2.4814, time 4981.22ms 
iter 7509: loss 2.6347, time 4987.25ms 
iter 7510: loss 2.6347, time 4927.51ms 
iter 7511: loss 2.6665, time 4916.83ms 
iter 7512: loss 2.4320, time 4914.93ms 
iter 7513: loss 2.4163, time 4917.38ms 
iter 7514: loss 2.3508, time 4912.38ms 
iter 7515: loss 2.4450, time 4921.18ms 
iter 7516: loss 2.4834, time 4934.61ms 
iter 7517: loss 2.6667, time 4914.75ms 
iter 7518: loss 2.5161, time 4915.78ms 
iter 7519: loss 2.5706, time 4942.07ms 
iter 7520: loss 2.4577, time 5020.95ms 
iter 7521: loss 2.5472, time 5017.58ms 
iter 7522: loss 2.4839, time 5021.81ms 
iter 7523: loss 2.4291, time 4966.12ms 
iter 7524: loss 2.5311, time 4996.57ms 
iter 7525: loss 2.6182, time 5019.35ms 
iter 7526: loss 2.7306, time 4996.02ms 
iter 7527: loss 2.4733, time 5022.27ms 
iter 7528: loss 2.4344, time 5023.16ms 
iter 7529: loss 2.3416, time 5021.57ms 
iter 7530: loss 2.6568, time 5021.03ms 
iter 7531: loss 2.6188, time 4968.18ms 
iter 7532: loss 2.5077, time 4913.95ms 
iter 7533: loss 2.4611, time 4997.53ms 
iter 7534: loss 2.5891, time 5019.84ms 
iter 7535: loss 2.5162, time 5021.24ms 
iter 7536: loss 2.5793, time 5019.21ms 
iter 7537: loss 2.5520, time 5022.25ms 
iter 7538: loss 2.5714, time 5021.60ms 
iter 7539: loss 2.5477, time 5023.82ms 
iter 7540: loss 2.4620, time 4968.44ms 
iter 7541: loss 2.5569, time 4944.10ms 
iter 7542: loss 2.4331, time 4994.80ms 
iter 7543: loss 2.5646, time 5020.36ms 
iter 7544: loss 2.4638, time 5019.89ms 
iter 7545: loss 2.3593, time 5018.02ms 
iter 7546: loss 2.7789, time 5025.60ms 
iter 7547: loss 2.3874, time 5024.75ms 
iter 7548: loss 2.7512, time 5024.78ms 
iter 7549: loss 2.6294, time 4971.28ms 
step 7550: train loss 2.4977, val loss 2.8463
iter 7550: loss 2.3606, time 19677.93ms 
iter 7551: loss 2.4023, time 5015.90ms 
iter 7552: loss 2.6149, time 5024.88ms 
iter 7553: loss 2.4269, time 5024.86ms 
iter 7554: loss 2.4815, time 4972.39ms 
iter 7555: loss 2.6510, time 4950.43ms 
iter 7556: loss 2.4117, time 5025.36ms 
iter 7557: loss 2.4632, time 5023.62ms 
iter 7558: loss 2.3919, time 5023.32ms 
iter 7559: loss 2.6282, time 5023.31ms 
iter 7560: loss 2.4210, time 5021.22ms 
iter 7561: loss 2.3494, time 5004.01ms 
iter 7562: loss 2.5510, time 5012.02ms 
iter 7563: loss 2.5142, time 4916.90ms 
iter 7564: loss 2.4989, time 4999.38ms 
iter 7565: loss 2.4984, time 5020.71ms 
iter 7566: loss 2.4375, time 5021.52ms 
iter 7567: loss 2.5906, time 5023.39ms 
iter 7568: loss 2.4655, time 5024.36ms 
iter 7569: loss 2.3646, time 5023.97ms 
iter 7570: loss 2.3528, time 5023.40ms 
iter 7571: loss 2.4524, time 4972.55ms 
iter 7572: loss 2.4777, time 4937.60ms 
iter 7573: loss 2.3808, time 5023.35ms 
iter 7574: loss 2.5790, time 5021.77ms 
iter 7575: loss 2.4745, time 5029.68ms 
iter 7576: loss 2.3535, time 4917.84ms 
iter 7577: loss 2.8147, time 4912.91ms 
iter 7578: loss 2.4687, time 4999.69ms 
iter 7579: loss 2.7000, time 5028.81ms 
iter 7580: loss 2.4660, time 4984.48ms 
iter 7581: loss 2.4952, time 4931.61ms 
iter 7582: loss 2.5747, time 4913.74ms 
iter 7583: loss 2.6764, time 4913.56ms 
iter 7584: loss 2.4783, time 4913.53ms 
iter 7585: loss 2.8710, time 4913.66ms 
iter 7586: loss 2.3797, time 4913.85ms 
iter 7587: loss 2.5535, time 4914.15ms 
iter 7588: loss 2.4203, time 4915.63ms 
iter 7589: loss 2.5202, time 4913.40ms 
iter 7590: loss 2.5009, time 4913.77ms 
iter 7591: loss 2.2958, time 4913.85ms 
iter 7592: loss 2.3523, time 4913.73ms 
iter 7593: loss 2.5011, time 4913.95ms 
iter 7594: loss 2.4776, time 4913.65ms 
iter 7595: loss 2.2706, time 4913.91ms 
iter 7596: loss 2.4071, time 4913.84ms 
iter 7597: loss 2.3236, time 4914.77ms 
iter 7598: loss 2.6337, time 4972.23ms 
iter 7599: loss 2.6193, time 5023.05ms 
step 7600: train loss 2.4884, val loss 2.8499
iter 7600: loss 2.5278, time 19683.71ms 
iter 7601: loss 2.5236, time 5028.56ms 
iter 7602: loss 2.2000, time 5022.55ms 
iter 7603: loss 2.2930, time 5019.20ms 
iter 7604: loss 2.3301, time 5021.10ms 
iter 7605: loss 2.4330, time 5022.86ms 
iter 7606: loss 2.3199, time 5021.20ms 
iter 7607: loss 2.6005, time 5021.18ms 
iter 7608: loss 2.6762, time 5024.47ms 
iter 7609: loss 2.6157, time 5020.31ms 
iter 7610: loss 2.4427, time 4966.97ms 
iter 7611: loss 2.4532, time 4913.56ms 
iter 7612: loss 2.4504, time 4913.23ms 
iter 7613: loss 2.3082, time 4913.29ms 
iter 7614: loss 2.5568, time 4916.29ms 
iter 7615: loss 2.5299, time 4915.35ms 
iter 7616: loss 2.3460, time 4919.26ms 
iter 7617: loss 2.4006, time 4928.96ms 
iter 7618: loss 2.5993, time 4971.35ms 
iter 7619: loss 2.3618, time 4946.59ms 
iter 7620: loss 2.3306, time 4978.17ms 
iter 7621: loss 2.4647, time 4982.61ms 
iter 7622: loss 2.4807, time 4999.14ms 
iter 7623: loss 2.5604, time 4951.05ms 
iter 7624: loss 2.5202, time 4920.22ms 
iter 7625: loss 2.6548, time 4931.93ms 
iter 7626: loss 2.4398, time 4959.67ms 
iter 7627: loss 2.5890, time 4940.40ms 
iter 7628: loss 2.0351, time 4928.06ms 
iter 7629: loss 2.6483, time 4972.08ms 
iter 7630: loss 2.4692, time 4964.82ms 
iter 7631: loss 2.5341, time 4987.15ms 
iter 7632: loss 2.5819, time 4973.25ms 
iter 7633: loss 2.5823, time 4915.33ms 
iter 7634: loss 2.5912, time 4914.26ms 
iter 7635: loss 2.5641, time 4997.34ms 
iter 7636: loss 2.5866, time 5019.12ms 
iter 7637: loss 2.6896, time 5023.99ms 
iter 7638: loss 2.6297, time 5025.63ms 
iter 7639: loss 2.3544, time 5026.38ms 
iter 7640: loss 2.7613, time 5020.23ms 
iter 7641: loss 2.4646, time 5031.12ms 
iter 7642: loss 2.4935, time 4992.19ms 
iter 7643: loss 2.6392, time 5028.73ms 
iter 7644: loss 2.6083, time 4961.59ms 
iter 7645: loss 2.6258, time 4971.10ms 
iter 7646: loss 2.5471, time 5022.04ms 
iter 7647: loss 2.6099, time 5017.71ms 
iter 7648: loss 2.6052, time 5020.37ms 
iter 7649: loss 2.4613, time 4923.93ms 
step 7650: train loss 2.4919, val loss 2.8582
iter 7650: loss 2.4286, time 19635.81ms 
iter 7651: loss 2.4391, time 4924.33ms 
iter 7652: loss 2.2676, time 4925.80ms 
iter 7653: loss 2.3201, time 5012.64ms 
iter 7654: loss 2.4075, time 5018.34ms 
iter 7655: loss 2.6407, time 4971.14ms 
iter 7656: loss 2.5396, time 4915.05ms 
iter 7657: loss 2.3530, time 4914.47ms 
iter 7658: loss 2.4595, time 4970.50ms 
iter 7659: loss 2.6157, time 5018.33ms 
iter 7660: loss 2.6159, time 5020.44ms 
iter 7661: loss 2.5435, time 5012.93ms 
iter 7662: loss 2.6691, time 5008.53ms 
iter 7663: loss 2.3891, time 5013.87ms 
iter 7664: loss 2.5896, time 5021.91ms 
iter 7665: loss 2.4593, time 4975.50ms 
iter 7666: loss 2.6171, time 4949.78ms 
iter 7667: loss 2.5077, time 4915.81ms 
iter 7668: loss 2.4047, time 4925.56ms 
iter 7669: loss 2.6920, time 5030.04ms 
iter 7670: loss 2.5752, time 5026.63ms 
iter 7671: loss 2.5585, time 5025.33ms 
iter 7672: loss 2.3627, time 5026.35ms 
iter 7673: loss 2.3890, time 5028.31ms 
iter 7674: loss 2.5021, time 5014.05ms 
iter 7675: loss 2.5024, time 5003.66ms 
iter 7676: loss 2.5742, time 4976.37ms 
iter 7677: loss 2.4605, time 4941.65ms 
iter 7678: loss 2.4447, time 5014.09ms 
iter 7679: loss 2.4192, time 5028.26ms 
iter 7680: loss 2.4410, time 5030.01ms 
iter 7681: loss 2.4942, time 5026.32ms 
iter 7682: loss 2.7407, time 5023.28ms 
iter 7683: loss 2.3682, time 4959.13ms 
iter 7684: loss 2.3652, time 4953.45ms 
iter 7685: loss 2.3502, time 4916.17ms 
iter 7686: loss 2.4711, time 4972.07ms 
iter 7687: loss 2.3212, time 5025.70ms 
iter 7688: loss 2.6428, time 5026.88ms 
iter 7689: loss 2.1770, time 5028.52ms 
iter 7690: loss 2.6873, time 5025.89ms 
iter 7691: loss 2.3978, time 4997.59ms 
iter 7692: loss 2.6566, time 5020.37ms 
iter 7693: loss 2.5228, time 4964.53ms 
iter 7694: loss 2.5790, time 4916.89ms 
iter 7695: loss 2.2877, time 4915.72ms 
iter 7696: loss 2.5361, time 5020.17ms 
iter 7697: loss 2.6149, time 5030.27ms 
iter 7698: loss 2.5379, time 5036.53ms 
iter 7699: loss 2.4920, time 5035.28ms 
step 7700: train loss 2.5021, val loss 2.8562
iter 7700: loss 2.5271, time 19674.44ms 
iter 7701: loss 2.4313, time 4946.93ms 
iter 7702: loss 2.4969, time 5001.60ms 
iter 7703: loss 2.3521, time 5034.59ms 
iter 7704: loss 2.3287, time 5030.12ms 
iter 7705: loss 2.6610, time 5031.10ms 
iter 7706: loss 2.5713, time 5028.90ms 
iter 7707: loss 2.5412, time 5029.71ms 
iter 7708: loss 2.5450, time 5033.13ms 
iter 7709: loss 2.6617, time 4980.57ms 
iter 7710: loss 2.5522, time 4918.96ms 
iter 7711: loss 2.4692, time 4984.71ms 
iter 7712: loss 2.4958, time 5031.75ms 
iter 7713: loss 2.2844, time 5013.60ms 
iter 7714: loss 2.5630, time 5012.77ms 
iter 7715: loss 2.4919, time 5025.56ms 
iter 7716: loss 2.4625, time 5028.61ms 
iter 7717: loss 2.3753, time 5007.15ms 
iter 7718: loss 2.5935, time 4918.63ms 
iter 7719: loss 2.4645, time 4925.68ms 
iter 7720: loss 2.4808, time 4931.56ms 
iter 7721: loss 2.4021, time 4917.39ms 
iter 7722: loss 2.5596, time 4914.59ms 
iter 7723: loss 2.4576, time 4921.48ms 
iter 7724: loss 2.6102, time 5028.66ms 
iter 7725: loss 2.5228, time 5013.02ms 
iter 7726: loss 2.4741, time 5012.96ms 
iter 7727: loss 2.3667, time 5030.19ms 
iter 7728: loss 2.5314, time 5027.73ms 
iter 7729: loss 2.5158, time 5030.03ms 
iter 7730: loss 2.5525, time 4999.60ms 
iter 7731: loss 2.3183, time 4936.30ms 
iter 7732: loss 2.5793, time 4980.35ms 
iter 7733: loss 2.5226, time 4987.54ms 
iter 7734: loss 2.4789, time 5020.12ms 
iter 7735: loss 2.6187, time 5018.97ms 
iter 7736: loss 2.2869, time 5011.22ms 
iter 7737: loss 2.4899, time 5014.17ms 
iter 7738: loss 2.3193, time 5000.38ms 
iter 7739: loss 2.4272, time 5019.49ms 
iter 7740: loss 2.6961, time 4972.69ms 
iter 7741: loss 2.5743, time 4932.41ms 
iter 7742: loss 2.4773, time 4915.98ms 
iter 7743: loss 2.5615, time 4977.41ms 
iter 7744: loss 2.6937, time 4986.20ms 
iter 7745: loss 2.6629, time 4939.51ms 
iter 7746: loss 2.5495, time 4949.19ms 
iter 7747: loss 2.6133, time 4992.78ms 
iter 7748: loss 2.3668, time 5019.43ms 
iter 7749: loss 2.3803, time 4933.06ms 
step 7750: train loss 2.4850, val loss 2.8354
iter 7750: loss 2.4682, time 19686.30ms 
iter 7751: loss 2.3150, time 4951.59ms 
iter 7752: loss 2.4673, time 4998.33ms 
iter 7753: loss 2.3883, time 5022.16ms 
iter 7754: loss 2.5519, time 5022.95ms 
iter 7755: loss 2.5340, time 5027.95ms 
iter 7756: loss 2.6438, time 4976.68ms 
iter 7757: loss 2.4259, time 4917.16ms 
iter 7758: loss 2.7758, time 4962.33ms 
iter 7759: loss 2.3680, time 5028.54ms 
iter 7760: loss 2.5874, time 5030.09ms 
iter 7761: loss 2.5785, time 5031.79ms 
iter 7762: loss 2.5643, time 5029.82ms 
iter 7763: loss 2.5760, time 5030.55ms 
iter 7764: loss 2.4244, time 5007.82ms 
iter 7765: loss 2.5262, time 4916.82ms 
iter 7766: loss 2.7125, time 4917.22ms 
iter 7767: loss 2.4145, time 4999.54ms 
iter 7768: loss 2.6493, time 5034.32ms 
iter 7769: loss 2.6136, time 5031.16ms 
iter 7770: loss 2.3978, time 5032.25ms 
iter 7771: loss 2.5395, time 5034.61ms 
iter 7772: loss 2.5502, time 5030.04ms 
iter 7773: loss 2.4884, time 5035.62ms 
iter 7774: loss 2.3878, time 4981.72ms 
iter 7775: loss 2.4731, time 4979.34ms 
iter 7776: loss 2.5495, time 4996.76ms 
iter 7777: loss 2.2685, time 5031.50ms 
iter 7778: loss 2.6058, time 4944.24ms 
iter 7779: loss 2.2640, time 4964.47ms 
iter 7780: loss 2.5281, time 5031.52ms 
iter 7781: loss 2.3797, time 5035.44ms 
iter 7782: loss 2.3600, time 5033.88ms 
iter 7783: loss 2.5967, time 4973.22ms 
iter 7784: loss 2.3651, time 4918.91ms 
iter 7785: loss 2.3884, time 4923.05ms 
iter 7786: loss 2.3903, time 5004.54ms 
iter 7787: loss 2.4660, time 5032.87ms 
iter 7788: loss 2.4206, time 5033.04ms 
iter 7789: loss 2.4203, time 5032.35ms 
iter 7790: loss 2.4446, time 5033.01ms 
iter 7791: loss 2.4172, time 4987.66ms 
iter 7792: loss 2.7327, time 4956.91ms 
iter 7793: loss 2.3982, time 4919.84ms 
iter 7794: loss 2.3843, time 4920.12ms 
iter 7795: loss 2.5273, time 4976.04ms 
iter 7796: loss 2.4881, time 4964.10ms 
iter 7797: loss 2.6112, time 4956.43ms 
iter 7798: loss 2.7674, time 4922.02ms 
iter 7799: loss 2.6334, time 4927.95ms 
step 7800: train loss 2.4817, val loss 2.8520
iter 7800: loss 2.5971, time 19638.19ms 
iter 7801: loss 2.3111, time 4919.82ms 
iter 7802: loss 2.4983, time 4973.23ms 
iter 7803: loss 2.5523, time 5034.15ms 
iter 7804: loss 2.3348, time 5030.99ms 
iter 7805: loss 2.4581, time 5032.31ms 
iter 7806: loss 2.7117, time 5024.40ms 
iter 7807: loss 2.4259, time 5035.79ms 
iter 7808: loss 2.5411, time 5022.02ms 
iter 7809: loss 2.2220, time 4970.22ms 
iter 7810: loss 2.6371, time 4936.33ms 
iter 7811: loss 2.3095, time 5025.80ms 
iter 7812: loss 2.4028, time 4968.58ms 
iter 7813: loss 2.2331, time 4998.35ms 
iter 7814: loss 2.6215, time 4952.29ms 
iter 7815: loss 2.5592, time 4947.02ms 
iter 7816: loss 2.5387, time 5001.32ms 
iter 7817: loss 2.3241, time 5009.21ms 
iter 7818: loss 2.4615, time 4922.01ms 
iter 7819: loss 2.6935, time 4932.09ms 
iter 7820: loss 2.5565, time 4948.11ms 
iter 7821: loss 2.4788, time 5009.24ms 
iter 7822: loss 2.7944, time 5027.64ms 
iter 7823: loss 2.7820, time 5023.73ms 
iter 7824: loss 2.3850, time 4979.75ms 
iter 7825: loss 2.6846, time 5025.99ms 
iter 7826: loss 2.5498, time 5027.43ms 
iter 7827: loss 2.4742, time 4927.14ms 
iter 7828: loss 2.5244, time 4915.61ms 
iter 7829: loss 2.6096, time 4976.58ms 
iter 7830: loss 2.5220, time 5026.16ms 
iter 7831: loss 2.3964, time 5025.85ms 
iter 7832: loss 2.4951, time 5027.41ms 
iter 7833: loss 2.3502, time 5004.29ms 
iter 7834: loss 2.4010, time 5027.06ms 
iter 7835: loss 2.6730, time 5028.92ms 
iter 7836: loss 2.1433, time 4928.83ms 
iter 7837: loss 2.6709, time 4914.39ms 
iter 7838: loss 2.4004, time 5014.11ms 
iter 7839: loss 2.7468, time 5026.64ms 
iter 7840: loss 2.2735, time 5024.41ms 
iter 7841: loss 2.2999, time 4993.18ms 
iter 7842: loss 2.4498, time 4952.95ms 
iter 7843: loss 2.4981, time 4952.57ms 
iter 7844: loss 2.4241, time 4955.05ms 
iter 7845: loss 2.3441, time 4954.47ms 
iter 7846: loss 2.5938, time 4956.73ms 
iter 7847: loss 2.4217, time 5005.64ms 
iter 7848: loss 2.3948, time 5031.62ms 
iter 7849: loss 2.3786, time 5030.09ms 
step 7850: train loss 2.4832, val loss 2.8516
iter 7850: loss 2.4143, time 19707.46ms 
iter 7851: loss 2.1825, time 4970.79ms 
iter 7852: loss 2.5046, time 4919.60ms 
iter 7853: loss 2.5387, time 5019.57ms 
iter 7854: loss 2.5154, time 5031.66ms 
iter 7855: loss 2.5407, time 5012.50ms 
iter 7856: loss 2.4542, time 5018.41ms 
iter 7857: loss 2.4066, time 5030.53ms 
iter 7858: loss 2.6359, time 5030.27ms 
iter 7859: loss 2.5483, time 5029.89ms 
iter 7860: loss 2.4687, time 4977.79ms 
iter 7861: loss 2.3715, time 4918.02ms 
iter 7862: loss 2.2610, time 5006.97ms 
iter 7863: loss 2.4114, time 5007.68ms 
iter 7864: loss 2.4251, time 5023.65ms 
iter 7865: loss 2.5219, time 5031.30ms 
iter 7866: loss 2.6204, time 5000.50ms 
iter 7867: loss 2.7511, time 5031.51ms 
iter 7868: loss 2.4970, time 5030.02ms 
iter 7869: loss 2.5843, time 4980.38ms 
iter 7870: loss 2.6159, time 4920.71ms 
iter 7871: loss 2.6090, time 4917.43ms 
iter 7872: loss 2.5175, time 4915.51ms 
iter 7873: loss 2.6528, time 4964.61ms 
iter 7874: loss 2.4375, time 5028.09ms 
iter 7875: loss 2.5397, time 5030.82ms 
iter 7876: loss 2.4948, time 5021.32ms 
iter 7877: loss 2.6736, time 4988.03ms 
iter 7878: loss 2.6203, time 5036.77ms 
iter 7879: loss 2.5623, time 5034.29ms 
iter 7880: loss 2.3244, time 5031.89ms 
iter 7881: loss 2.5350, time 5029.34ms 
iter 7882: loss 2.4957, time 5031.53ms 
iter 7883: loss 2.4519, time 5016.33ms 
iter 7884: loss 2.5468, time 5029.41ms 
iter 7885: loss 2.7263, time 5029.47ms 
iter 7886: loss 2.5431, time 5035.52ms 
iter 7887: loss 2.3382, time 5025.52ms 
iter 7888: loss 2.5335, time 5032.61ms 
iter 7889: loss 2.3985, time 5034.98ms 
iter 7890: loss 2.4715, time 5034.50ms 
iter 7891: loss 2.5190, time 5040.36ms 
iter 7892: loss 2.5646, time 5039.18ms 
iter 7893: loss 2.6538, time 5030.63ms 
iter 7894: loss 2.5627, time 5042.00ms 
iter 7895: loss 2.5594, time 5023.15ms 
iter 7896: loss 2.5733, time 4997.11ms 
iter 7897: loss 2.5748, time 5042.91ms 
iter 7898: loss 2.3869, time 5026.32ms 
iter 7899: loss 2.2905, time 5033.48ms 
step 7900: train loss 2.4729, val loss 2.8564
iter 7900: loss 2.7791, time 19723.20ms 
iter 7901: loss 2.4546, time 5034.78ms 
iter 7902: loss 2.4469, time 5036.31ms 
iter 7903: loss 2.5314, time 5045.70ms 
iter 7904: loss 2.5528, time 5004.65ms 
iter 7905: loss 2.5272, time 5025.24ms 
iter 7906: loss 2.5167, time 5044.99ms 
iter 7907: loss 2.2437, time 5008.02ms 
iter 7908: loss 2.5340, time 4934.49ms 
iter 7909: loss 2.4979, time 4939.03ms 
iter 7910: loss 2.6352, time 4937.47ms 
iter 7911: loss 2.5175, time 4938.63ms 
iter 7912: loss 2.6115, time 5056.20ms 
iter 7913: loss 2.3486, time 5043.57ms 
iter 7914: loss 2.5161, time 5039.63ms 
iter 7915: loss 2.5446, time 5040.70ms 
iter 7916: loss 2.5115, time 5040.34ms 
iter 7917: loss 2.5494, time 5033.78ms 
iter 7918: loss 2.4165, time 5035.06ms 
iter 7919: loss 2.5787, time 5042.87ms 
iter 7920: loss 2.2732, time 5039.57ms 
iter 7921: loss 2.5933, time 5008.63ms 
iter 7922: loss 2.5971, time 4960.70ms 
iter 7923: loss 2.3955, time 4935.91ms 
iter 7924: loss 2.6989, time 4983.16ms 
iter 7925: loss 2.4633, time 4993.98ms 
iter 7926: loss 2.5295, time 5007.77ms 
iter 7927: loss 2.7351, time 5035.60ms 
iter 7928: loss 2.5555, time 5030.40ms 
iter 7929: loss 2.4653, time 5038.15ms 
iter 7930: loss 2.4288, time 5037.67ms 
iter 7931: loss 2.3342, time 5034.03ms 
iter 7932: loss 2.2785, time 5033.44ms 
iter 7933: loss 2.6016, time 5038.50ms 
iter 7934: loss 2.5940, time 5038.06ms 
iter 7935: loss 2.2020, time 5036.06ms 
iter 7936: loss 2.4155, time 5032.48ms 
iter 7937: loss 2.4672, time 5031.61ms 
iter 7938: loss 2.5363, time 5030.11ms 
iter 7939: loss 2.5493, time 5031.96ms 
iter 7940: loss 2.5554, time 5038.31ms 
iter 7941: loss 2.4652, time 5032.75ms 
iter 7942: loss 2.4732, time 5035.94ms 
iter 7943: loss 2.4993, time 5043.01ms 
iter 7944: loss 2.5685, time 5036.72ms 
iter 7945: loss 2.7435, time 5034.86ms 
iter 7946: loss 2.4675, time 5041.31ms 
iter 7947: loss 2.3241, time 5033.16ms 
iter 7948: loss 2.2882, time 5029.35ms 
iter 7949: loss 2.4851, time 5034.54ms 
step 7950: train loss 2.4845, val loss 2.8505
iter 7950: loss 2.5275, time 19752.34ms 
iter 7951: loss 2.3719, time 5034.83ms 
iter 7952: loss 2.6497, time 5032.68ms 
iter 7953: loss 2.5164, time 5033.47ms 
iter 7954: loss 2.3151, time 5003.84ms 
iter 7955: loss 2.4843, time 4956.21ms 
iter 7956: loss 2.6394, time 4958.29ms 
iter 7957: loss 2.4917, time 5029.12ms 
iter 7958: loss 2.3016, time 5034.34ms 
iter 7959: loss 2.7438, time 5026.54ms 
iter 7960: loss 2.5274, time 5031.62ms 
iter 7961: loss 2.6489, time 5026.85ms 
iter 7962: loss 2.5039, time 5040.08ms 
iter 7963: loss 2.5566, time 5028.18ms 
iter 7964: loss 2.6199, time 5035.51ms 
iter 7965: loss 2.3086, time 5028.69ms 
iter 7966: loss 2.4329, time 5035.56ms 
iter 7967: loss 2.4127, time 4995.13ms 
iter 7968: loss 2.4783, time 5030.67ms 
iter 7969: loss 2.6567, time 5025.52ms 
iter 7970: loss 2.3941, time 5030.34ms 
iter 7971: loss 2.5396, time 5033.34ms 
iter 7972: loss 2.5047, time 4980.43ms 
iter 7973: loss 2.4007, time 5030.76ms 
iter 7974: loss 2.4131, time 5030.88ms 
iter 7975: loss 2.3910, time 5035.73ms 
iter 7976: loss 2.5648, time 5032.69ms 
iter 7977: loss 2.6189, time 5036.27ms 
iter 7978: loss 2.5207, time 5028.01ms 
iter 7979: loss 2.5422, time 5033.21ms 
iter 7980: loss 2.5929, time 5026.33ms 
iter 7981: loss 2.7866, time 5030.51ms 
iter 7982: loss 2.5436, time 5030.29ms 
iter 7983: loss 2.4064, time 5032.81ms 
iter 7984: loss 2.7181, time 5011.38ms 
iter 7985: loss 2.6029, time 5033.83ms 
iter 7986: loss 2.4088, time 5028.88ms 
iter 7987: loss 2.6652, time 5030.68ms 
iter 7988: loss 2.3661, time 5036.41ms 
iter 7989: loss 2.5427, time 5030.23ms 
iter 7990: loss 2.4800, time 5031.51ms 
iter 7991: loss 2.4655, time 5019.16ms 
iter 7992: loss 2.4067, time 5007.81ms 
iter 7993: loss 2.3554, time 5002.80ms 
iter 7994: loss 2.5002, time 5022.77ms 
iter 7995: loss 2.5955, time 4953.74ms 
iter 7996: loss 2.3792, time 4969.13ms 
iter 7997: loss 2.5624, time 4983.85ms 
iter 7998: loss 2.6526, time 4992.39ms 
iter 7999: loss 2.5863, time 5027.70ms 
step 8000: train loss 2.5028, val loss 2.8494
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 8000: loss 2.4884, time 20790.25ms 
iter 8001: loss 2.5769, time 5020.76ms 
iter 8002: loss 2.4373, time 5024.35ms 
iter 8003: loss 2.6018, time 5020.35ms 
iter 8004: loss 2.5046, time 5029.02ms 
iter 8005: loss 2.7914, time 5031.38ms 
iter 8006: loss 2.5270, time 5026.19ms 
iter 8007: loss 2.6773, time 5017.04ms 
iter 8008: loss 2.3541, time 5020.67ms 
iter 8009: loss 2.5708, time 5026.44ms 
iter 8010: loss 2.6663, time 5025.45ms 
iter 8011: loss 2.6000, time 5010.98ms 
iter 8012: loss 2.5706, time 5024.84ms 
iter 8013: loss 2.4273, time 5026.88ms 
iter 8014: loss 2.5463, time 5031.45ms 
iter 8015: loss 2.7360, time 5032.04ms 
iter 8016: loss 2.4729, time 5029.65ms 
iter 8017: loss 2.3929, time 5030.02ms 
iter 8018: loss 2.4510, time 5022.84ms 
iter 8019: loss 2.4409, time 5024.71ms 
iter 8020: loss 2.5330, time 5017.44ms 
iter 8021: loss 2.7028, time 5003.30ms 
iter 8022: loss 2.5978, time 5029.02ms 
iter 8023: loss 2.4815, time 5020.56ms 
iter 8024: loss 2.6263, time 5020.20ms 
iter 8025: loss 2.4443, time 5020.62ms 
iter 8026: loss 2.3264, time 4994.03ms 
iter 8027: loss 2.5140, time 5017.57ms 
iter 8028: loss 2.5119, time 5020.59ms 
iter 8029: loss 2.5754, time 5013.48ms 
iter 8030: loss 2.3185, time 5028.13ms 
iter 8031: loss 2.5333, time 4992.68ms 
iter 8032: loss 2.4297, time 4976.88ms 
iter 8033: loss 2.4113, time 5027.40ms 
iter 8034: loss 2.4781, time 5020.83ms 
iter 8035: loss 2.5922, time 5007.67ms 
iter 8036: loss 2.5520, time 4965.66ms 
iter 8037: loss 2.4998, time 5025.61ms 
iter 8038: loss 2.7180, time 5008.98ms 
iter 8039: loss 2.4660, time 4985.49ms 
iter 8040: loss 2.6687, time 4978.68ms 
iter 8041: loss 2.5669, time 5006.42ms 
iter 8042: loss 2.5455, time 5007.26ms 
iter 8043: loss 2.4166, time 5000.34ms 
iter 8044: loss 2.3103, time 5031.85ms 
iter 8045: loss 2.3633, time 5011.03ms 
iter 8046: loss 2.4036, time 5027.68ms 
iter 8047: loss 2.3012, time 5028.91ms 
iter 8048: loss 2.3579, time 5027.19ms 
iter 8049: loss 2.4550, time 5030.99ms 
step 8050: train loss 2.4698, val loss 2.8530
iter 8050: loss 2.5048, time 19717.19ms 
iter 8051: loss 2.5535, time 5020.38ms 
iter 8052: loss 2.2644, time 5026.59ms 
iter 8053: loss 2.6929, time 5024.54ms 
iter 8054: loss 2.4505, time 5024.92ms 
iter 8055: loss 2.1590, time 5029.45ms 
iter 8056: loss 2.5557, time 5020.24ms 
iter 8057: loss 2.1841, time 4952.55ms 
iter 8058: loss 2.5152, time 5003.63ms 
iter 8059: loss 2.4669, time 5025.17ms 
iter 8060: loss 2.4900, time 4983.29ms 
iter 8061: loss 2.4671, time 4965.28ms 
iter 8062: loss 2.4537, time 4978.21ms 
iter 8063: loss 2.4815, time 4994.50ms 
iter 8064: loss 2.5098, time 5015.24ms 
iter 8065: loss 2.6160, time 5018.58ms 
iter 8066: loss 2.6059, time 5032.17ms 
iter 8067: loss 2.4431, time 5023.15ms 
iter 8068: loss 2.5654, time 5028.87ms 
iter 8069: loss 2.5344, time 5029.46ms 
iter 8070: loss 2.6996, time 5002.75ms 
iter 8071: loss 2.2015, time 5035.12ms 
iter 8072: loss 2.2632, time 5026.37ms 
iter 8073: loss 2.4277, time 5026.72ms 
iter 8074: loss 2.4776, time 5028.26ms 
iter 8075: loss 2.5075, time 5027.80ms 
iter 8076: loss 2.5514, time 5019.34ms 
iter 8077: loss 2.4653, time 5026.37ms 
iter 8078: loss 2.4143, time 5023.12ms 
iter 8079: loss 2.4297, time 5030.77ms 
iter 8080: loss 2.4473, time 5029.54ms 
iter 8081: loss 2.5714, time 5019.11ms 
iter 8082: loss 2.5153, time 5033.13ms 
iter 8083: loss 2.3161, time 5027.53ms 
iter 8084: loss 2.2935, time 5018.40ms 
iter 8085: loss 2.5812, time 5015.65ms 
iter 8086: loss 2.5302, time 5011.52ms 
iter 8087: loss 2.3774, time 5025.98ms 
iter 8088: loss 2.4487, time 5021.71ms 
iter 8089: loss 2.5011, time 5024.58ms 
iter 8090: loss 2.5960, time 5009.67ms 
iter 8091: loss 2.4663, time 5021.34ms 
iter 8092: loss 2.5704, time 5021.55ms 
iter 8093: loss 2.3951, time 5021.98ms 
iter 8094: loss 2.5533, time 5022.04ms 
iter 8095: loss 2.4253, time 5023.27ms 
iter 8096: loss 2.4665, time 5028.42ms 
iter 8097: loss 2.7104, time 5015.40ms 
iter 8098: loss 2.6031, time 5011.82ms 
iter 8099: loss 2.4913, time 5017.01ms 
step 8100: train loss 2.4903, val loss 2.8353
iter 8100: loss 2.4817, time 19712.59ms 
iter 8101: loss 2.4658, time 5022.86ms 
iter 8102: loss 2.5271, time 5024.95ms 
iter 8103: loss 2.5615, time 5015.90ms 
iter 8104: loss 2.3977, time 5013.64ms 
iter 8105: loss 2.3806, time 5022.04ms 
iter 8106: loss 2.4191, time 5020.86ms 
iter 8107: loss 2.4671, time 5022.74ms 
iter 8108: loss 2.3613, time 5022.62ms 
iter 8109: loss 2.4853, time 5029.04ms 
iter 8110: loss 2.4176, time 5028.24ms 
iter 8111: loss 2.5251, time 4986.90ms 
iter 8112: loss 2.2474, time 5000.81ms 
iter 8113: loss 2.5545, time 5030.97ms 
iter 8114: loss 2.7115, time 5019.14ms 
iter 8115: loss 2.5363, time 5002.66ms 
iter 8116: loss 2.6639, time 5010.87ms 
iter 8117: loss 2.3665, time 5031.83ms 
iter 8118: loss 2.4759, time 5027.36ms 
iter 8119: loss 2.5971, time 5025.36ms 
iter 8120: loss 2.5430, time 5028.18ms 
iter 8121: loss 2.2669, time 5026.75ms 
iter 8122: loss 2.4335, time 5012.41ms 
iter 8123: loss 2.4359, time 5021.23ms 
iter 8124: loss 2.5289, time 5017.86ms 
iter 8125: loss 2.6295, time 4923.89ms 
iter 8126: loss 2.5306, time 4989.34ms 
iter 8127: loss 2.4928, time 5027.94ms 
iter 8128: loss 2.6231, time 5004.80ms 
iter 8129: loss 2.5082, time 5019.41ms 
iter 8130: loss 2.4868, time 5017.29ms 
iter 8131: loss 2.3891, time 5027.93ms 
iter 8132: loss 2.6026, time 4985.76ms 
iter 8133: loss 2.4008, time 4990.32ms 
iter 8134: loss 2.3432, time 5013.29ms 
iter 8135: loss 2.4533, time 4931.14ms 
iter 8136: loss 2.3129, time 4947.17ms 
iter 8137: loss 2.4602, time 4914.60ms 
iter 8138: loss 2.5451, time 4913.43ms 
iter 8139: loss 2.3978, time 4913.52ms 
iter 8140: loss 2.4823, time 4913.73ms 
iter 8141: loss 2.6213, time 4997.10ms 
iter 8142: loss 2.4130, time 5023.08ms 
iter 8143: loss 2.4338, time 5025.03ms 
iter 8144: loss 2.3804, time 5031.81ms 
iter 8145: loss 2.2471, time 5020.56ms 
iter 8146: loss 2.4408, time 5025.26ms 
iter 8147: loss 2.5916, time 4993.67ms 
iter 8148: loss 2.4489, time 5023.55ms 
iter 8149: loss 2.2957, time 5023.41ms 
step 8150: train loss 2.4853, val loss 2.8435
iter 8150: loss 2.3922, time 19676.07ms 
iter 8151: loss 2.3713, time 5026.71ms 
iter 8152: loss 2.3204, time 5025.21ms 
iter 8153: loss 2.3419, time 5026.96ms 
iter 8154: loss 2.2936, time 5001.83ms 
iter 8155: loss 2.3361, time 5022.22ms 
iter 8156: loss 2.4083, time 5026.53ms 
iter 8157: loss 2.4915, time 4995.84ms 
iter 8158: loss 2.3700, time 4949.36ms 
iter 8159: loss 2.2883, time 4984.59ms 
iter 8160: loss 2.3998, time 5011.63ms 
iter 8161: loss 2.5175, time 5031.59ms 
iter 8162: loss 2.4827, time 5025.99ms 
iter 8163: loss 2.2500, time 5026.11ms 
iter 8164: loss 2.5281, time 5024.64ms 
iter 8165: loss 2.4096, time 5026.98ms 
iter 8166: loss 2.5202, time 5030.02ms 
iter 8167: loss 2.3859, time 4943.89ms 
iter 8168: loss 2.4764, time 5020.02ms 
iter 8169: loss 2.7348, time 5023.40ms 
iter 8170: loss 2.2040, time 5028.14ms 
iter 8171: loss 2.4348, time 5030.16ms 
iter 8172: loss 2.2081, time 5022.45ms 
iter 8173: loss 2.5350, time 5020.57ms 
iter 8174: loss 2.3326, time 4950.98ms 
iter 8175: loss 2.5476, time 4955.99ms 
iter 8176: loss 2.5336, time 5015.11ms 
iter 8177: loss 2.5635, time 5001.68ms 
iter 8178: loss 2.4216, time 5009.64ms 
iter 8179: loss 2.4880, time 5020.90ms 
iter 8180: loss 2.4450, time 5015.94ms 
iter 8181: loss 2.4359, time 4987.51ms 
iter 8182: loss 2.6388, time 4917.71ms 
iter 8183: loss 2.2245, time 5012.55ms 
iter 8184: loss 2.4730, time 5018.96ms 
iter 8185: loss 2.5348, time 5030.87ms 
iter 8186: loss 2.5758, time 5026.66ms 
iter 8187: loss 2.6126, time 5025.34ms 
iter 8188: loss 2.5921, time 5024.66ms 
iter 8189: loss 2.3694, time 5028.57ms 
iter 8190: loss 2.5789, time 5012.66ms 
iter 8191: loss 2.3373, time 5026.82ms 
iter 8192: loss 2.5967, time 5019.80ms 
iter 8193: loss 2.4307, time 5021.96ms 
iter 8194: loss 2.3466, time 5006.91ms 
iter 8195: loss 2.3495, time 5019.22ms 
iter 8196: loss 2.4760, time 5017.84ms 
iter 8197: loss 2.4258, time 5013.05ms 
iter 8198: loss 2.3381, time 4958.68ms 
iter 8199: loss 2.4865, time 5000.32ms 
step 8200: train loss 2.5029, val loss 2.8551
iter 8200: loss 2.7441, time 19684.90ms 
iter 8201: loss 2.3982, time 5013.38ms 
iter 8202: loss 2.3962, time 4972.12ms 
iter 8203: loss 2.3493, time 4995.58ms 
iter 8204: loss 2.4226, time 4993.62ms 
iter 8205: loss 2.5651, time 5024.71ms 
iter 8206: loss 2.4007, time 4995.03ms 
iter 8207: loss 2.6662, time 4994.90ms 
iter 8208: loss 2.2446, time 4982.83ms 
iter 8209: loss 2.5043, time 5023.19ms 
iter 8210: loss 2.4614, time 5026.33ms 
iter 8211: loss 2.6487, time 5027.16ms 
iter 8212: loss 2.4039, time 5013.77ms 
iter 8213: loss 2.3620, time 4996.88ms 
iter 8214: loss 2.4822, time 5026.90ms 
iter 8215: loss 2.5200, time 5000.23ms 
iter 8216: loss 2.2331, time 4941.53ms 
iter 8217: loss 2.3350, time 5011.38ms 
iter 8218: loss 2.5439, time 5016.03ms 
iter 8219: loss 2.5005, time 5021.82ms 
iter 8220: loss 2.4381, time 5029.30ms 
iter 8221: loss 2.3863, time 5018.05ms 
iter 8222: loss 2.7131, time 5004.10ms 
iter 8223: loss 2.2378, time 5003.83ms 
iter 8224: loss 2.5641, time 4932.92ms 
iter 8225: loss 2.5645, time 4932.06ms 
iter 8226: loss 2.3901, time 5013.61ms 
iter 8227: loss 2.6155, time 5003.46ms 
iter 8228: loss 2.4658, time 4956.01ms 
iter 8229: loss 2.4896, time 5024.42ms 
iter 8230: loss 2.6314, time 5029.25ms 
iter 8231: loss 2.5567, time 5026.79ms 
iter 8232: loss 2.4012, time 5023.67ms 
iter 8233: loss 2.6902, time 5017.80ms 
iter 8234: loss 2.6369, time 5015.77ms 
iter 8235: loss 2.3004, time 5008.77ms 
iter 8236: loss 2.4356, time 4982.34ms 
iter 8237: loss 2.3507, time 5005.74ms 
iter 8238: loss 2.5472, time 4968.36ms 
iter 8239: loss 2.2838, time 5029.97ms 
iter 8240: loss 2.5270, time 5002.89ms 
iter 8241: loss 2.3335, time 5013.88ms 
iter 8242: loss 2.3259, time 4960.97ms 
iter 8243: loss 2.4267, time 5006.80ms 
iter 8244: loss 2.5296, time 5033.08ms 
iter 8245: loss 2.2669, time 5041.88ms 
iter 8246: loss 2.4853, time 5031.13ms 
iter 8247: loss 2.3797, time 5030.92ms 
iter 8248: loss 2.4104, time 5023.28ms 
iter 8249: loss 2.5512, time 5024.21ms 
step 8250: train loss 2.4715, val loss 2.8587
iter 8250: loss 2.6777, time 19705.06ms 
iter 8251: loss 2.4956, time 5012.81ms 
iter 8252: loss 2.2281, time 4972.78ms 
iter 8253: loss 2.2278, time 5025.24ms 
iter 8254: loss 2.4740, time 5009.74ms 
iter 8255: loss 2.5287, time 5031.76ms 
iter 8256: loss 2.3987, time 5031.04ms 
iter 8257: loss 2.3912, time 5027.81ms 
iter 8258: loss 2.4746, time 5029.29ms 
iter 8259: loss 2.3862, time 5027.57ms 
iter 8260: loss 2.5543, time 5028.15ms 
iter 8261: loss 2.1907, time 5016.11ms 
iter 8262: loss 2.1564, time 5030.76ms 
iter 8263: loss 2.2845, time 5015.40ms 
iter 8264: loss 2.3502, time 5005.73ms 
iter 8265: loss 2.5105, time 5026.31ms 
iter 8266: loss 2.4520, time 5030.54ms 
iter 8267: loss 2.3803, time 5029.59ms 
iter 8268: loss 2.5687, time 5020.43ms 
iter 8269: loss 2.5272, time 5024.88ms 
iter 8270: loss 2.5752, time 4967.43ms 
iter 8271: loss 2.4978, time 5025.87ms 
iter 8272: loss 2.5713, time 5009.02ms 
iter 8273: loss 2.2757, time 5027.14ms 
iter 8274: loss 2.5675, time 4994.11ms 
iter 8275: loss 2.3072, time 5030.22ms 
iter 8276: loss 2.4113, time 5026.72ms 
iter 8277: loss 2.5389, time 5035.77ms 
iter 8278: loss 2.5830, time 5001.51ms 
iter 8279: loss 2.5923, time 4934.40ms 
iter 8280: loss 2.3267, time 5022.90ms 
iter 8281: loss 2.2388, time 5021.32ms 
iter 8282: loss 2.1405, time 5025.69ms 
iter 8283: loss 2.4779, time 5029.69ms 
iter 8284: loss 2.5042, time 4995.93ms 
iter 8285: loss 2.4692, time 5020.58ms 
iter 8286: loss 2.6767, time 4975.33ms 
iter 8287: loss 2.5871, time 4978.30ms 
iter 8288: loss 2.4065, time 5016.22ms 
iter 8289: loss 2.4892, time 5028.71ms 
iter 8290: loss 2.4683, time 5026.52ms 
iter 8291: loss 2.6182, time 5027.20ms 
iter 8292: loss 2.4180, time 5016.59ms 
iter 8293: loss 2.6028, time 4996.43ms 
iter 8294: loss 2.5335, time 4948.14ms 
iter 8295: loss 2.4606, time 4988.63ms 
iter 8296: loss 2.3599, time 5018.01ms 
iter 8297: loss 2.5122, time 5026.00ms 
iter 8298: loss 2.5627, time 5028.47ms 
iter 8299: loss 2.4944, time 5026.31ms 
step 8300: train loss 2.4733, val loss 2.8601
iter 8300: loss 2.2614, time 19748.79ms 
iter 8301: loss 2.5258, time 5034.41ms 
iter 8302: loss 2.6475, time 5027.97ms 
iter 8303: loss 2.4895, time 5029.93ms 
iter 8304: loss 2.3142, time 5031.37ms 
iter 8305: loss 2.3691, time 5034.09ms 
iter 8306: loss 2.5237, time 5025.16ms 
iter 8307: loss 2.4416, time 4983.09ms 
iter 8308: loss 2.4443, time 4974.35ms 
iter 8309: loss 2.3947, time 5008.04ms 
iter 8310: loss 2.5332, time 5028.22ms 
iter 8311: loss 2.3577, time 5038.88ms 
iter 8312: loss 2.3009, time 5023.01ms 
iter 8313: loss 2.4124, time 5029.79ms 
iter 8314: loss 2.2301, time 5015.22ms 
iter 8315: loss 2.5148, time 5032.85ms 
iter 8316: loss 2.4912, time 4999.23ms 
iter 8317: loss 2.6617, time 5032.54ms 
iter 8318: loss 2.4877, time 5021.39ms 
iter 8319: loss 2.6371, time 5036.25ms 
iter 8320: loss 2.2793, time 5003.05ms 
iter 8321: loss 2.5672, time 5035.52ms 
iter 8322: loss 2.6463, time 5027.77ms 
iter 8323: loss 2.5876, time 5042.91ms 
iter 8324: loss 2.4785, time 4998.01ms 
iter 8325: loss 2.4701, time 5028.47ms 
iter 8326: loss 2.4206, time 4997.92ms 
iter 8327: loss 2.4799, time 4988.27ms 
iter 8328: loss 2.5980, time 4938.28ms 
iter 8329: loss 2.4156, time 4949.97ms 
iter 8330: loss 2.4802, time 4941.78ms 
iter 8331: loss 2.5177, time 4937.49ms 
iter 8332: loss 2.4908, time 4917.77ms 
iter 8333: loss 2.3389, time 4946.52ms 
iter 8334: loss 2.4445, time 4944.07ms 
iter 8335: loss 2.5079, time 4936.49ms 
iter 8336: loss 2.5528, time 5008.71ms 
iter 8337: loss 2.4044, time 4977.81ms 
iter 8338: loss 2.3337, time 5000.87ms 
iter 8339: loss 2.5164, time 5027.39ms 
iter 8340: loss 2.3849, time 4963.55ms 
iter 8341: loss 2.3470, time 4957.74ms 
iter 8342: loss 2.5479, time 4999.68ms 
iter 8343: loss 2.2302, time 5030.31ms 
iter 8344: loss 2.1046, time 5032.19ms 
iter 8345: loss 2.4426, time 5030.33ms 
iter 8346: loss 2.2617, time 4995.21ms 
iter 8347: loss 2.5321, time 5024.07ms 
iter 8348: loss 2.6443, time 4969.24ms 
iter 8349: loss 2.8221, time 5027.25ms 
step 8350: train loss 2.4696, val loss 2.8373
iter 8350: loss 2.7865, time 19718.49ms 
iter 8351: loss 2.5406, time 5036.15ms 
iter 8352: loss 2.3138, time 5034.41ms 
iter 8353: loss 2.4169, time 4975.69ms 
iter 8354: loss 2.4270, time 4993.84ms 
iter 8355: loss 2.4959, time 5030.98ms 
iter 8356: loss 2.4456, time 5029.90ms 
iter 8357: loss 2.3749, time 5034.23ms 
iter 8358: loss 2.2963, time 5031.73ms 
iter 8359: loss 2.3949, time 5030.57ms 
iter 8360: loss 2.3106, time 5030.68ms 
iter 8361: loss 2.4429, time 4990.89ms 
iter 8362: loss 2.2822, time 5031.51ms 
iter 8363: loss 2.5138, time 5010.11ms 
iter 8364: loss 2.4617, time 5010.60ms 
iter 8365: loss 2.4293, time 5018.77ms 
iter 8366: loss 2.1184, time 5030.29ms 
iter 8367: loss 2.5721, time 5036.65ms 
iter 8368: loss 2.4754, time 4964.01ms 
iter 8369: loss 2.4404, time 4951.68ms 
iter 8370: loss 2.4660, time 5029.89ms 
iter 8371: loss 2.7598, time 5031.16ms 
iter 8372: loss 2.5912, time 5032.67ms 
iter 8373: loss 2.5622, time 5036.60ms 
iter 8374: loss 2.5970, time 5039.71ms 
iter 8375: loss 2.3515, time 5029.61ms 
iter 8376: loss 2.5971, time 5033.16ms 
iter 8377: loss 2.2483, time 5035.46ms 
iter 8378: loss 2.4542, time 5033.89ms 
iter 8379: loss 2.3636, time 5032.09ms 
iter 8380: loss 2.2528, time 5036.53ms 
iter 8381: loss 2.4146, time 5023.18ms 
iter 8382: loss 2.5434, time 5039.25ms 
iter 8383: loss 2.6279, time 5040.92ms 
iter 8384: loss 2.4665, time 4990.33ms 
iter 8385: loss 2.1942, time 5035.10ms 
iter 8386: loss 2.5462, time 5035.17ms 
iter 8387: loss 2.3033, time 5006.89ms 
iter 8388: loss 2.4504, time 5017.13ms 
iter 8389: loss 2.4803, time 4916.81ms 
iter 8390: loss 2.4844, time 4947.44ms 
iter 8391: loss 2.4283, time 4955.88ms 
iter 8392: loss 2.4693, time 4962.37ms 
iter 8393: loss 2.4658, time 5006.07ms 
iter 8394: loss 2.5137, time 4959.91ms 
iter 8395: loss 2.5809, time 4953.63ms 
iter 8396: loss 2.6327, time 4945.43ms 
iter 8397: loss 2.4501, time 4944.06ms 
iter 8398: loss 2.4652, time 4942.38ms 
iter 8399: loss 2.1002, time 4944.69ms 
step 8400: train loss 2.4827, val loss 2.8540
iter 8400: loss 2.6580, time 19723.69ms 
iter 8401: loss 2.3789, time 4993.27ms 
iter 8402: loss 2.4360, time 4935.75ms 
iter 8403: loss 2.3977, time 4957.36ms 
iter 8404: loss 2.4796, time 4937.25ms 
iter 8405: loss 2.4328, time 4955.79ms 
iter 8406: loss 2.1598, time 4996.55ms 
iter 8407: loss 2.7276, time 4929.38ms 
iter 8408: loss 2.6724, time 4918.36ms 
iter 8409: loss 2.4978, time 4981.26ms 
iter 8410: loss 2.2959, time 5022.80ms 
iter 8411: loss 2.2878, time 5024.49ms 
iter 8412: loss 2.6029, time 4958.31ms 
iter 8413: loss 2.5342, time 4925.79ms 
iter 8414: loss 2.3732, time 5006.45ms 
iter 8415: loss 2.5006, time 5030.79ms 
iter 8416: loss 2.3024, time 5018.38ms 
iter 8417: loss 2.6107, time 5009.97ms 
iter 8418: loss 2.3993, time 4951.91ms 
iter 8419: loss 2.4272, time 4974.31ms 
iter 8420: loss 2.5150, time 5023.25ms 
iter 8421: loss 2.5728, time 4954.98ms 
iter 8422: loss 2.5503, time 4982.99ms 
iter 8423: loss 2.5375, time 4991.71ms 
iter 8424: loss 2.5127, time 5018.85ms 
iter 8425: loss 2.2758, time 5027.30ms 
iter 8426: loss 2.3784, time 5031.36ms 
iter 8427: loss 2.3976, time 5015.77ms 
iter 8428: loss 2.6050, time 5031.08ms 
iter 8429: loss 2.5330, time 4980.02ms 
iter 8430: loss 2.3469, time 4958.82ms 
iter 8431: loss 2.6769, time 5013.03ms 
iter 8432: loss 2.7199, time 5026.01ms 
iter 8433: loss 2.3854, time 5022.21ms 
iter 8434: loss 2.3757, time 5037.22ms 
iter 8435: loss 2.5257, time 5025.92ms 
iter 8436: loss 2.3941, time 5019.03ms 
iter 8437: loss 2.3699, time 4984.24ms 
iter 8438: loss 2.5170, time 4978.64ms 
iter 8439: loss 2.5831, time 4944.11ms 
iter 8440: loss 2.4000, time 5023.99ms 
iter 8441: loss 2.4903, time 5022.59ms 
iter 8442: loss 2.3364, time 5012.32ms 
iter 8443: loss 2.4471, time 5008.12ms 
iter 8444: loss 2.5606, time 4964.73ms 
iter 8445: loss 2.3746, time 4992.60ms 
iter 8446: loss 2.5306, time 4945.23ms 
iter 8447: loss 2.5730, time 4973.75ms 
iter 8448: loss 2.6011, time 5029.00ms 
iter 8449: loss 2.1850, time 5015.53ms 
step 8450: train loss 2.4777, val loss 2.8605
iter 8450: loss 2.3583, time 19728.65ms 
iter 8451: loss 2.4998, time 5026.01ms 
iter 8452: loss 2.5205, time 5037.12ms 
iter 8453: loss 2.4233, time 4989.77ms 
iter 8454: loss 2.2819, time 4999.20ms 
iter 8455: loss 2.3699, time 5027.35ms 
iter 8456: loss 2.4997, time 4973.10ms 
iter 8457: loss 2.2750, time 5023.21ms 
iter 8458: loss 2.5639, time 5024.72ms 
iter 8459: loss 2.5233, time 5008.07ms 
iter 8460: loss 2.5307, time 5028.03ms 
iter 8461: loss 2.4453, time 4976.81ms 
iter 8462: loss 2.6469, time 4982.84ms 
iter 8463: loss 2.5385, time 5025.09ms 
iter 8464: loss 2.4649, time 5029.29ms 
iter 8465: loss 2.3553, time 5024.58ms 
iter 8466: loss 2.6321, time 5026.47ms 
iter 8467: loss 2.4670, time 5027.62ms 
iter 8468: loss 2.5549, time 5031.76ms 
iter 8469: loss 2.5866, time 5032.03ms 
iter 8470: loss 2.3227, time 4982.55ms 
iter 8471: loss 2.5866, time 5020.91ms 
iter 8472: loss 2.5370, time 5030.00ms 
iter 8473: loss 2.6129, time 5026.66ms 
iter 8474: loss 2.5260, time 5027.69ms 
iter 8475: loss 2.5772, time 5024.31ms 
iter 8476: loss 2.5358, time 5016.34ms 
iter 8477: loss 2.3645, time 5017.71ms 
iter 8478: loss 2.4887, time 4985.39ms 
iter 8479: loss 2.2661, time 5033.23ms 
iter 8480: loss 2.4956, time 5031.52ms 
iter 8481: loss 2.4602, time 5026.86ms 
iter 8482: loss 2.5809, time 5033.28ms 
iter 8483: loss 2.6048, time 5031.17ms 
iter 8484: loss 2.3227, time 4993.81ms 
iter 8485: loss 2.6109, time 4934.65ms 
iter 8486: loss 2.8415, time 4917.06ms 
iter 8487: loss 2.4204, time 4988.25ms 
iter 8488: loss 2.5763, time 5016.76ms 
iter 8489: loss 2.3964, time 5017.12ms 
iter 8490: loss 2.3779, time 5026.79ms 
iter 8491: loss 2.4250, time 5025.54ms 
iter 8492: loss 2.4869, time 5025.59ms 
iter 8493: loss 2.3709, time 5024.38ms 
iter 8494: loss 2.3160, time 4971.50ms 
iter 8495: loss 2.2654, time 5000.05ms 
iter 8496: loss 2.5918, time 5023.73ms 
iter 8497: loss 2.8197, time 5020.19ms 
iter 8498: loss 2.6395, time 5022.49ms 
iter 8499: loss 2.5809, time 5019.75ms 
step 8500: train loss 2.4617, val loss 2.8459
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 8500: loss 2.4319, time 20674.75ms 
iter 8501: loss 2.3048, time 5011.09ms 
iter 8502: loss 2.3577, time 5025.53ms 
iter 8503: loss 2.3002, time 5032.24ms 
iter 8504: loss 2.3692, time 5003.46ms 
iter 8505: loss 2.4363, time 4973.70ms 
iter 8506: loss 2.4019, time 5034.40ms 
iter 8507: loss 2.3906, time 4966.19ms 
iter 8508: loss 2.4290, time 4987.83ms 
iter 8509: loss 2.3911, time 5013.54ms 
iter 8510: loss 2.4873, time 5006.00ms 
iter 8511: loss 2.6049, time 4931.64ms 
iter 8512: loss 2.4966, time 5023.62ms 
iter 8513: loss 2.4681, time 5032.28ms 
iter 8514: loss 2.4791, time 5028.21ms 
iter 8515: loss 2.4608, time 4975.63ms 
iter 8516: loss 2.5207, time 4918.83ms 
iter 8517: loss 2.3650, time 4998.22ms 
iter 8518: loss 2.5991, time 5028.23ms 
iter 8519: loss 2.6269, time 5017.16ms 
iter 8520: loss 2.2443, time 5031.59ms 
iter 8521: loss 2.4838, time 5028.98ms 
iter 8522: loss 2.6629, time 5024.35ms 
iter 8523: loss 2.5211, time 5020.79ms 
iter 8524: loss 2.4344, time 4940.80ms 
iter 8525: loss 2.5184, time 5020.08ms 
iter 8526: loss 2.4873, time 5025.63ms 
iter 8527: loss 2.4504, time 5034.24ms 
iter 8528: loss 2.6548, time 5036.43ms 
iter 8529: loss 2.4917, time 5016.04ms 
iter 8530: loss 2.3714, time 5010.96ms 
iter 8531: loss 2.4211, time 5028.93ms 
iter 8532: loss 2.4036, time 4980.51ms 
iter 8533: loss 2.5123, time 4989.62ms 
iter 8534: loss 2.3935, time 5026.33ms 
iter 8535: loss 2.3731, time 5020.12ms 
iter 8536: loss 2.2014, time 5024.06ms 
iter 8537: loss 2.4510, time 5031.41ms 
iter 8538: loss 2.4172, time 5009.76ms 
iter 8539: loss 2.5655, time 5019.21ms 
iter 8540: loss 2.5701, time 4989.55ms 
iter 8541: loss 2.4023, time 4994.23ms 
iter 8542: loss 2.3843, time 5029.74ms 
iter 8543: loss 2.4110, time 4959.44ms 
iter 8544: loss 2.4553, time 4962.37ms 
iter 8545: loss 2.4434, time 4983.06ms 
iter 8546: loss 2.5732, time 4970.46ms 
iter 8547: loss 2.5185, time 5000.19ms 
iter 8548: loss 2.4031, time 5011.33ms 
iter 8549: loss 2.4544, time 4952.79ms 
step 8550: train loss 2.4759, val loss 2.8388
iter 8550: loss 2.4339, time 19675.85ms 
iter 8551: loss 2.4978, time 5017.41ms 
iter 8552: loss 2.3652, time 5017.25ms 
iter 8553: loss 2.4214, time 4972.82ms 
iter 8554: loss 2.4780, time 4933.99ms 
iter 8555: loss 2.4295, time 5029.92ms 
iter 8556: loss 2.6457, time 5026.91ms 
iter 8557: loss 2.3726, time 5025.92ms 
iter 8558: loss 2.3541, time 5025.78ms 
iter 8559: loss 2.4295, time 5026.16ms 
iter 8560: loss 2.4053, time 5027.63ms 
iter 8561: loss 2.7440, time 5021.30ms 
iter 8562: loss 2.7848, time 5001.14ms 
iter 8563: loss 2.5388, time 5028.08ms 
iter 8564: loss 2.3838, time 5031.31ms 
iter 8565: loss 2.4323, time 5026.87ms 
iter 8566: loss 2.4434, time 5033.65ms 
iter 8567: loss 2.4936, time 5023.61ms 
iter 8568: loss 2.6184, time 5027.62ms 
iter 8569: loss 2.4166, time 5027.17ms 
iter 8570: loss 2.4477, time 4971.20ms 
iter 8571: loss 2.5122, time 5026.72ms 
iter 8572: loss 2.3905, time 5016.93ms 
iter 8573: loss 2.6972, time 5029.98ms 
iter 8574: loss 2.5378, time 5020.94ms 
iter 8575: loss 2.4859, time 4990.55ms 
iter 8576: loss 2.5469, time 4955.39ms 
iter 8577: loss 2.4129, time 4929.16ms 
iter 8578: loss 2.3268, time 4946.43ms 
iter 8579: loss 2.5376, time 5028.52ms 
iter 8580: loss 2.4355, time 5025.74ms 
iter 8581: loss 2.5935, time 5026.41ms 
iter 8582: loss 2.5574, time 5027.55ms 
iter 8583: loss 2.5049, time 5023.76ms 
iter 8584: loss 2.4648, time 5028.12ms 
iter 8585: loss 2.3650, time 5026.15ms 
iter 8586: loss 2.6209, time 5030.55ms 
iter 8587: loss 2.3978, time 5026.04ms 
iter 8588: loss 2.3832, time 4986.52ms 
iter 8589: loss 2.5975, time 5024.67ms 
iter 8590: loss 2.3341, time 5027.92ms 
iter 8591: loss 2.4447, time 5009.68ms 
iter 8592: loss 2.4974, time 5008.61ms 
iter 8593: loss 2.4784, time 5027.00ms 
iter 8594: loss 2.5111, time 5028.04ms 
iter 8595: loss 2.5247, time 5027.56ms 
iter 8596: loss 2.3683, time 5025.86ms 
iter 8597: loss 2.4394, time 5027.04ms 
iter 8598: loss 2.4468, time 5031.91ms 
iter 8599: loss 2.5002, time 5027.59ms 
step 8600: train loss 2.4831, val loss 2.8512
iter 8600: loss 2.4325, time 19696.76ms 
iter 8601: loss 2.5065, time 5027.90ms 
iter 8602: loss 2.4346, time 5026.88ms 
iter 8603: loss 2.5827, time 5031.94ms 
iter 8604: loss 2.3212, time 5009.77ms 
iter 8605: loss 2.5619, time 5005.46ms 
iter 8606: loss 2.4989, time 5016.07ms 
iter 8607: loss 2.5808, time 5025.10ms 
iter 8608: loss 2.5278, time 5009.50ms 
iter 8609: loss 2.5887, time 5022.97ms 
iter 8610: loss 2.4481, time 4995.28ms 
iter 8611: loss 2.4619, time 4951.06ms 
iter 8612: loss 2.5042, time 4995.99ms 
iter 8613: loss 2.5706, time 5026.01ms 
iter 8614: loss 2.4191, time 5023.25ms 
iter 8615: loss 2.5443, time 4982.61ms 
iter 8616: loss 2.4675, time 5027.17ms 
iter 8617: loss 2.3311, time 5024.80ms 
iter 8618: loss 2.4251, time 5023.08ms 
iter 8619: loss 2.3415, time 5021.24ms 
iter 8620: loss 2.5625, time 5025.56ms 
iter 8621: loss 2.4452, time 5026.67ms 
iter 8622: loss 2.3184, time 5019.63ms 
iter 8623: loss 2.5486, time 4995.79ms 
iter 8624: loss 2.4138, time 5024.66ms 
iter 8625: loss 2.6809, time 5022.77ms 
iter 8626: loss 2.4310, time 5022.94ms 
iter 8627: loss 2.5395, time 4987.71ms 
iter 8628: loss 2.6191, time 5021.83ms 
iter 8629: loss 2.4638, time 5019.10ms 
iter 8630: loss 2.5211, time 5018.28ms 
iter 8631: loss 2.5293, time 5021.79ms 
iter 8632: loss 2.4901, time 5023.30ms 
iter 8633: loss 2.5461, time 5011.02ms 
iter 8634: loss 2.6032, time 5031.70ms 
iter 8635: loss 2.3890, time 5012.18ms 
iter 8636: loss 2.4460, time 5023.87ms 
iter 8637: loss 2.6510, time 5024.41ms 
iter 8638: loss 2.4056, time 5027.82ms 
iter 8639: loss 2.5913, time 5008.50ms 
iter 8640: loss 2.5164, time 5025.97ms 
iter 8641: loss 2.2664, time 5020.18ms 
iter 8642: loss 2.5124, time 5026.98ms 
iter 8643: loss 2.4903, time 5026.33ms 
iter 8644: loss 2.4467, time 5008.62ms 
iter 8645: loss 2.5450, time 5020.28ms 
iter 8646: loss 2.4419, time 4986.72ms 
iter 8647: loss 2.5772, time 5030.73ms 
iter 8648: loss 2.5018, time 5037.57ms 
iter 8649: loss 2.7086, time 5011.40ms 
step 8650: train loss 2.4676, val loss 2.8516
iter 8650: loss 2.3978, time 19696.45ms 
iter 8651: loss 2.3351, time 5024.08ms 
iter 8652: loss 2.3764, time 5026.31ms 
iter 8653: loss 2.1733, time 5026.96ms 
iter 8654: loss 2.3951, time 5026.72ms 
iter 8655: loss 2.2832, time 5019.04ms 
iter 8656: loss 2.3221, time 5027.87ms 
iter 8657: loss 2.3328, time 5027.77ms 
iter 8658: loss 2.7584, time 5032.41ms 
iter 8659: loss 2.2620, time 5010.67ms 
iter 8660: loss 2.2907, time 4978.45ms 
iter 8661: loss 2.3099, time 5013.91ms 
iter 8662: loss 2.4927, time 4987.35ms 
iter 8663: loss 2.3567, time 4935.79ms 
iter 8664: loss 2.4611, time 5025.56ms 
iter 8665: loss 2.6011, time 5025.84ms 
iter 8666: loss 2.4511, time 5001.50ms 
iter 8667: loss 2.4893, time 4938.30ms 
iter 8668: loss 2.4925, time 4934.40ms 
iter 8669: loss 2.5452, time 4981.17ms 
iter 8670: loss 2.5054, time 5032.67ms 
iter 8671: loss 2.2676, time 4999.41ms 
iter 8672: loss 2.4811, time 5027.43ms 
iter 8673: loss 2.2824, time 5027.73ms 
iter 8674: loss 2.7064, time 5026.44ms 
iter 8675: loss 2.6538, time 5030.18ms 
iter 8676: loss 2.4808, time 5027.85ms 
iter 8677: loss 2.6760, time 5023.65ms 
iter 8678: loss 2.6349, time 5033.17ms 
iter 8679: loss 2.4155, time 5002.51ms 
iter 8680: loss 2.2974, time 5030.62ms 
iter 8681: loss 2.6615, time 5029.97ms 
iter 8682: loss 2.3639, time 5028.71ms 
iter 8683: loss 2.4347, time 5031.70ms 
iter 8684: loss 2.5681, time 5018.61ms 
iter 8685: loss 2.5439, time 5008.47ms 
iter 8686: loss 2.5588, time 4976.43ms 
iter 8687: loss 2.4975, time 4978.29ms 
iter 8688: loss 2.2798, time 5029.64ms 
iter 8689: loss 2.4710, time 5026.97ms 
iter 8690: loss 2.5061, time 5019.18ms 
iter 8691: loss 2.2644, time 5029.27ms 
iter 8692: loss 2.5959, time 5023.04ms 
iter 8693: loss 2.4881, time 5019.50ms 
iter 8694: loss 2.4592, time 4945.51ms 
iter 8695: loss 2.3211, time 4978.11ms 
iter 8696: loss 2.4156, time 4921.04ms 
iter 8697: loss 2.4995, time 4985.74ms 
iter 8698: loss 2.4417, time 5014.10ms 
iter 8699: loss 2.4604, time 4999.04ms 
step 8700: train loss 2.4534, val loss 2.8390
iter 8700: loss 2.5338, time 19730.90ms 
iter 8701: loss 2.5025, time 4993.05ms 
iter 8702: loss 2.4608, time 5002.90ms 
iter 8703: loss 2.6648, time 5001.17ms 
iter 8704: loss 2.4934, time 4952.75ms 
iter 8705: loss 2.4551, time 5037.95ms 
iter 8706: loss 2.6799, time 5020.94ms 
iter 8707: loss 2.4724, time 4975.61ms 
iter 8708: loss 2.5904, time 4937.36ms 
iter 8709: loss 2.3423, time 4995.86ms 
iter 8710: loss 2.5886, time 5021.79ms 
iter 8711: loss 2.4652, time 5005.00ms 
iter 8712: loss 2.3653, time 5021.54ms 
iter 8713: loss 2.5232, time 5024.53ms 
iter 8714: loss 2.3562, time 5000.23ms 
iter 8715: loss 2.3480, time 4943.91ms 
iter 8716: loss 2.5519, time 5003.17ms 
iter 8717: loss 2.4485, time 5025.10ms 
iter 8718: loss 2.6451, time 5025.22ms 
iter 8719: loss 2.5510, time 4971.86ms 
iter 8720: loss 2.7150, time 5015.92ms 
iter 8721: loss 2.3817, time 5012.68ms 
iter 8722: loss 2.5618, time 5033.31ms 
iter 8723: loss 2.3854, time 4924.06ms 
iter 8724: loss 2.4046, time 5016.24ms 
iter 8725: loss 2.3695, time 5018.31ms 
iter 8726: loss 2.2789, time 5027.58ms 
iter 8727: loss 2.7083, time 5029.25ms 
iter 8728: loss 2.4951, time 5024.04ms 
iter 8729: loss 2.6019, time 5025.68ms 
iter 8730: loss 2.2139, time 5008.50ms 
iter 8731: loss 2.5654, time 4977.96ms 
iter 8732: loss 2.5423, time 5006.00ms 
iter 8733: loss 2.7154, time 5012.84ms 
iter 8734: loss 2.4352, time 5035.95ms 
iter 8735: loss 2.6377, time 5034.12ms 
iter 8736: loss 2.4531, time 5032.44ms 
iter 8737: loss 2.5358, time 5030.48ms 
iter 8738: loss 2.3764, time 4937.96ms 
iter 8739: loss 2.4892, time 5022.91ms 
iter 8740: loss 2.4905, time 5028.19ms 
iter 8741: loss 2.4396, time 5025.63ms 
iter 8742: loss 2.2988, time 4977.67ms 
iter 8743: loss 2.3962, time 5028.59ms 
iter 8744: loss 2.2132, time 5022.79ms 
iter 8745: loss 2.7087, time 5033.27ms 
iter 8746: loss 2.4178, time 4944.03ms 
iter 8747: loss 2.7039, time 4975.92ms 
iter 8748: loss 2.4319, time 5026.49ms 
iter 8749: loss 2.6836, time 5027.87ms 
step 8750: train loss 2.4745, val loss 2.8491
iter 8750: loss 2.4367, time 19740.00ms 
iter 8751: loss 2.4948, time 4930.35ms 
iter 8752: loss 2.2636, time 5020.69ms 
iter 8753: loss 2.3845, time 5029.41ms 
iter 8754: loss 2.4189, time 5023.87ms 
iter 8755: loss 2.4955, time 5030.72ms 
iter 8756: loss 2.4199, time 5027.11ms 
iter 8757: loss 2.5193, time 5027.52ms 
iter 8758: loss 2.5448, time 4973.57ms 
iter 8759: loss 2.4141, time 4943.64ms 
iter 8760: loss 2.3015, time 5004.17ms 
iter 8761: loss 2.4748, time 4992.61ms 
iter 8762: loss 2.4913, time 5026.34ms 
iter 8763: loss 2.4426, time 5025.57ms 
iter 8764: loss 2.5714, time 5013.11ms 
iter 8765: loss 2.4548, time 5024.69ms 
iter 8766: loss 2.4418, time 5030.34ms 
iter 8767: loss 2.4642, time 4983.78ms 
iter 8768: loss 2.5427, time 5024.06ms 
iter 8769: loss 2.4935, time 5025.28ms 
iter 8770: loss 2.4113, time 5026.79ms 
iter 8771: loss 2.6256, time 5027.42ms 
iter 8772: loss 2.4752, time 5029.14ms 
iter 8773: loss 2.4088, time 5023.61ms 
iter 8774: loss 2.3487, time 5035.70ms 
iter 8775: loss 2.6053, time 5000.47ms 
iter 8776: loss 2.4245, time 5026.93ms 
iter 8777: loss 2.3981, time 5030.89ms 
iter 8778: loss 2.1606, time 5031.51ms 
iter 8779: loss 2.5515, time 5034.05ms 
iter 8780: loss 2.4890, time 5028.23ms 
iter 8781: loss 2.4217, time 5023.37ms 
iter 8782: loss 2.5915, time 5027.37ms 
iter 8783: loss 2.4768, time 4988.44ms 
iter 8784: loss 2.7512, time 5028.96ms 
iter 8785: loss 2.6061, time 5027.64ms 
iter 8786: loss 2.6950, time 5026.75ms 
iter 8787: loss 2.5340, time 5028.49ms 
iter 8788: loss 2.1232, time 5018.93ms 
iter 8789: loss 2.4124, time 5028.25ms 
iter 8790: loss 2.4909, time 5033.63ms 
iter 8791: loss 2.3043, time 5016.01ms 
iter 8792: loss 2.2712, time 5014.77ms 
iter 8793: loss 2.6746, time 5029.24ms 
iter 8794: loss 2.6355, time 4991.00ms 
iter 8795: loss 2.4896, time 4996.43ms 
iter 8796: loss 2.3164, time 5026.96ms 
iter 8797: loss 2.3154, time 4951.08ms 
iter 8798: loss 2.3175, time 4927.00ms 
iter 8799: loss 2.3779, time 4965.58ms 
step 8800: train loss 2.4528, val loss 2.8562
iter 8800: loss 2.3275, time 19712.83ms 
iter 8801: loss 2.5643, time 5016.41ms 
iter 8802: loss 2.4962, time 5014.16ms 
iter 8803: loss 2.1464, time 5023.83ms 
iter 8804: loss 2.3464, time 5015.27ms 
iter 8805: loss 2.6552, time 5023.69ms 
iter 8806: loss 2.4324, time 5004.48ms 
iter 8807: loss 2.5125, time 5014.57ms 
iter 8808: loss 2.5108, time 5017.37ms 
iter 8809: loss 2.4866, time 5028.24ms 
iter 8810: loss 2.6634, time 5030.20ms 
iter 8811: loss 2.6502, time 5032.02ms 
iter 8812: loss 2.4129, time 4991.57ms 
iter 8813: loss 2.6485, time 5026.56ms 
iter 8814: loss 2.5560, time 5019.27ms 
iter 8815: loss 2.4754, time 5026.85ms 
iter 8816: loss 2.5449, time 5013.83ms 
iter 8817: loss 2.4122, time 5025.55ms 
iter 8818: loss 2.5836, time 4994.39ms 
iter 8819: loss 2.5281, time 4993.98ms 
iter 8820: loss 2.5380, time 4991.07ms 
iter 8821: loss 2.4084, time 5020.86ms 
iter 8822: loss 2.5262, time 5020.38ms 
iter 8823: loss 2.4861, time 5027.04ms 
iter 8824: loss 2.5449, time 5026.64ms 
iter 8825: loss 2.4763, time 5026.37ms 
iter 8826: loss 2.4479, time 5025.35ms 
iter 8827: loss 2.6023, time 5028.85ms 
iter 8828: loss 2.4597, time 4995.91ms 
iter 8829: loss 2.3202, time 5023.89ms 
iter 8830: loss 2.4900, time 5013.69ms 
iter 8831: loss 2.0305, time 5029.50ms 
iter 8832: loss 2.5646, time 5010.93ms 
iter 8833: loss 2.5443, time 4997.61ms 
iter 8834: loss 2.2631, time 5015.98ms 
iter 8835: loss 2.6884, time 4927.77ms 
iter 8836: loss 2.4874, time 4954.76ms 
iter 8837: loss 2.4050, time 5002.95ms 
iter 8838: loss 2.3550, time 5025.39ms 
iter 8839: loss 2.3235, time 5030.73ms 
iter 8840: loss 2.5186, time 5024.82ms 
iter 8841: loss 2.5353, time 5024.99ms 
iter 8842: loss 2.1512, time 5023.82ms 
iter 8843: loss 2.5149, time 4950.13ms 
iter 8844: loss 2.5951, time 4975.66ms 
iter 8845: loss 2.5392, time 5022.80ms 
iter 8846: loss 2.3485, time 5023.15ms 
iter 8847: loss 2.4161, time 5025.23ms 
iter 8848: loss 2.4297, time 5027.61ms 
iter 8849: loss 2.6761, time 5025.47ms 
step 8850: train loss 2.4655, val loss 2.8570
iter 8850: loss 2.4951, time 19697.24ms 
iter 8851: loss 2.5074, time 5000.22ms 
iter 8852: loss 2.2593, time 5019.93ms 
iter 8853: loss 2.4265, time 5029.43ms 
iter 8854: loss 2.1108, time 5025.81ms 
iter 8855: loss 2.4708, time 5016.72ms 
iter 8856: loss 2.5507, time 4923.77ms 
iter 8857: loss 2.5227, time 4994.08ms 
iter 8858: loss 2.5499, time 5023.59ms 
iter 8859: loss 2.4317, time 5022.28ms 
iter 8860: loss 2.4401, time 5020.86ms 
iter 8861: loss 2.6173, time 5026.36ms 
iter 8862: loss 2.5405, time 5027.24ms 
iter 8863: loss 2.3395, time 5023.31ms 
iter 8864: loss 2.5526, time 4971.46ms 
iter 8865: loss 2.7072, time 4975.34ms 
iter 8866: loss 2.5645, time 5001.66ms 
iter 8867: loss 2.4917, time 5020.43ms 
iter 8868: loss 2.6524, time 5022.01ms 
iter 8869: loss 2.2550, time 4970.72ms 
iter 8870: loss 2.3769, time 4947.83ms 
iter 8871: loss 2.5633, time 4917.48ms 
iter 8872: loss 2.4162, time 4921.55ms 
iter 8873: loss 2.4814, time 4978.96ms 
iter 8874: loss 2.6210, time 4930.89ms 
iter 8875: loss 2.3790, time 4913.83ms 
iter 8876: loss 2.3768, time 4956.37ms 
iter 8877: loss 2.4572, time 5018.67ms 
iter 8878: loss 2.5081, time 5026.09ms 
iter 8879: loss 2.5597, time 5025.75ms 
iter 8880: loss 2.5781, time 4972.31ms 
iter 8881: loss 2.6733, time 4979.23ms 
iter 8882: loss 2.5561, time 4971.39ms 
iter 8883: loss 2.6447, time 5029.84ms 
iter 8884: loss 2.3067, time 5034.75ms 
iter 8885: loss 2.5263, time 5029.04ms 
iter 8886: loss 2.5154, time 5031.54ms 
iter 8887: loss 2.5337, time 5028.58ms 
iter 8888: loss 2.6044, time 5016.92ms 
iter 8889: loss 2.6273, time 5031.47ms 
iter 8890: loss 2.5374, time 5036.66ms 
iter 8891: loss 2.5077, time 5033.52ms 
iter 8892: loss 2.5159, time 5026.77ms 
iter 8893: loss 2.6285, time 5025.56ms 
iter 8894: loss 2.1325, time 4992.69ms 
iter 8895: loss 2.4392, time 4957.94ms 
iter 8896: loss 2.5573, time 4964.50ms 
iter 8897: loss 2.5403, time 5028.28ms 
iter 8898: loss 2.4955, time 5019.48ms 
iter 8899: loss 2.6370, time 5030.29ms 
step 8900: train loss 2.4633, val loss 2.8438
iter 8900: loss 2.4092, time 19619.70ms 
iter 8901: loss 2.5313, time 4976.59ms 
iter 8902: loss 2.5501, time 5019.54ms 
iter 8903: loss 2.7208, time 5020.91ms 
iter 8904: loss 2.5185, time 5019.34ms 
iter 8905: loss 2.4100, time 5021.04ms 
iter 8906: loss 2.3639, time 5020.24ms 
iter 8907: loss 2.1132, time 5026.91ms 
iter 8908: loss 2.5237, time 4979.68ms 
iter 8909: loss 2.4262, time 4966.77ms 
iter 8910: loss 2.5500, time 4957.51ms 
iter 8911: loss 2.3419, time 4923.83ms 
iter 8912: loss 2.4756, time 5023.36ms 
iter 8913: loss 2.5627, time 5020.80ms 
iter 8914: loss 2.3648, time 5021.63ms 
iter 8915: loss 2.5408, time 5024.78ms 
iter 8916: loss 2.3965, time 5022.18ms 
iter 8917: loss 2.4772, time 5000.97ms 
iter 8918: loss 2.5812, time 5027.05ms 
iter 8919: loss 2.2013, time 5010.32ms 
iter 8920: loss 2.5342, time 5019.14ms 
iter 8921: loss 2.4431, time 5016.37ms 
iter 8922: loss 2.3265, time 5029.59ms 
iter 8923: loss 2.5620, time 5032.42ms 
iter 8924: loss 2.3586, time 5032.15ms 
iter 8925: loss 2.3244, time 5038.48ms 
iter 8926: loss 2.3723, time 4988.22ms 
iter 8927: loss 2.4192, time 5030.42ms 
iter 8928: loss 2.5437, time 5025.32ms 
iter 8929: loss 2.1520, time 5026.64ms 
iter 8930: loss 2.5589, time 5033.53ms 
iter 8931: loss 2.5869, time 5034.01ms 
iter 8932: loss 2.2950, time 5030.59ms 
iter 8933: loss 2.3966, time 5029.97ms 
iter 8934: loss 2.5884, time 4996.17ms 
iter 8935: loss 2.4727, time 5029.02ms 
iter 8936: loss 2.2411, time 5036.30ms 
iter 8937: loss 2.3946, time 5032.11ms 
iter 8938: loss 2.6292, time 5029.89ms 
iter 8939: loss 2.6586, time 5036.74ms 
iter 8940: loss 2.5117, time 5019.90ms 
iter 8941: loss 2.5279, time 5036.82ms 
iter 8942: loss 2.5397, time 4988.42ms 
iter 8943: loss 2.3421, time 5025.15ms 
iter 8944: loss 2.2719, time 5010.15ms 
iter 8945: loss 2.4299, time 5001.66ms 
iter 8946: loss 2.5378, time 4974.61ms 
iter 8947: loss 2.3302, time 5024.91ms 
iter 8948: loss 2.5642, time 5010.01ms 
iter 8949: loss 2.3939, time 5026.48ms 
step 8950: train loss 2.4719, val loss 2.8645
iter 8950: loss 2.3657, time 19699.92ms 
iter 8951: loss 2.4333, time 5025.34ms 
iter 8952: loss 2.5962, time 5024.30ms 
iter 8953: loss 2.1642, time 5021.94ms 
iter 8954: loss 2.4972, time 5024.85ms 
iter 8955: loss 2.3074, time 4994.26ms 
iter 8956: loss 2.6249, time 5021.24ms 
iter 8957: loss 2.4951, time 5032.69ms 
iter 8958: loss 2.5131, time 5028.39ms 
iter 8959: loss 2.6240, time 5027.43ms 
iter 8960: loss 2.1845, time 5029.06ms 
iter 8961: loss 2.4287, time 5016.98ms 
iter 8962: loss 2.4989, time 4978.06ms 
iter 8963: loss 2.4973, time 4995.30ms 
iter 8964: loss 2.5383, time 5007.85ms 
iter 8965: loss 2.4831, time 5028.75ms 
iter 8966: loss 2.5005, time 5023.60ms 
iter 8967: loss 2.5025, time 4993.86ms 
iter 8968: loss 2.6501, time 5019.30ms 
iter 8969: loss 2.3326, time 5029.73ms 
iter 8970: loss 2.4208, time 5000.49ms 
iter 8971: loss 2.6766, time 4952.80ms 
iter 8972: loss 2.3615, time 5027.24ms 
iter 8973: loss 2.4896, time 5028.72ms 
iter 8974: loss 2.3273, time 5028.56ms 
iter 8975: loss 2.5567, time 5029.13ms 
iter 8976: loss 2.4689, time 5025.18ms 
iter 8977: loss 2.1821, time 5012.77ms 
iter 8978: loss 2.4684, time 4984.03ms 
iter 8979: loss 2.4588, time 5017.04ms 
iter 8980: loss 2.4820, time 5026.41ms 
iter 8981: loss 2.4078, time 5024.26ms 
iter 8982: loss 2.3664, time 5021.35ms 
iter 8983: loss 2.3751, time 5025.78ms 
iter 8984: loss 2.3590, time 5037.70ms 
iter 8985: loss 2.4595, time 5040.08ms 
iter 8986: loss 2.5607, time 4984.35ms 
iter 8987: loss 2.4617, time 5014.75ms 
iter 8988: loss 2.4072, time 5022.85ms 
iter 8989: loss 2.5332, time 5021.35ms 
iter 8990: loss 2.4522, time 5023.50ms 
iter 8991: loss 2.3271, time 5024.05ms 
iter 8992: loss 2.5182, time 5030.07ms 
iter 8993: loss 2.5609, time 5018.10ms 
iter 8994: loss 2.7288, time 4917.08ms 
iter 8995: loss 2.4285, time 4998.82ms 
iter 8996: loss 2.3831, time 4982.92ms 
iter 8997: loss 2.5883, time 5025.30ms 
iter 8998: loss 2.8142, time 5028.31ms 
iter 8999: loss 2.3573, time 5030.04ms 
step 9000: train loss 2.4665, val loss 2.8482
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 9000: loss 2.3805, time 20687.77ms 
iter 9001: loss 2.5621, time 4995.93ms 
iter 9002: loss 2.3480, time 5029.79ms 
iter 9003: loss 2.3610, time 5026.52ms 
iter 9004: loss 2.4921, time 5027.86ms 
iter 9005: loss 2.3549, time 5022.49ms 
iter 9006: loss 2.2124, time 5023.68ms 
iter 9007: loss 2.6706, time 5015.09ms 
iter 9008: loss 2.5696, time 4933.03ms 
iter 9009: loss 2.3255, time 5028.06ms 
iter 9010: loss 2.6409, time 5034.58ms 
iter 9011: loss 2.2084, time 5028.38ms 
iter 9012: loss 2.4024, time 5019.35ms 
iter 9013: loss 2.5416, time 5005.74ms 
iter 9014: loss 2.5027, time 5003.03ms 
iter 9015: loss 2.4819, time 5029.60ms 
iter 9016: loss 2.5775, time 5023.56ms 
iter 9017: loss 2.3483, time 5020.77ms 
iter 9018: loss 2.5076, time 5024.95ms 
iter 9019: loss 2.4456, time 5027.06ms 
iter 9020: loss 2.4299, time 5030.54ms 
iter 9021: loss 2.4771, time 5030.08ms 
iter 9022: loss 2.2741, time 5028.92ms 
iter 9023: loss 2.5795, time 4989.81ms 
iter 9024: loss 2.4823, time 5023.72ms 
iter 9025: loss 2.5864, time 5004.84ms 
iter 9026: loss 2.1808, time 5027.31ms 
iter 9027: loss 2.5564, time 5014.62ms 
iter 9028: loss 2.6075, time 4961.02ms 
iter 9029: loss 2.1268, time 5020.26ms 
iter 9030: loss 2.2415, time 5013.28ms 
iter 9031: loss 2.4418, time 5029.03ms 
iter 9032: loss 2.4336, time 5015.83ms 
iter 9033: loss 2.1484, time 5026.18ms 
iter 9034: loss 2.6940, time 5024.38ms 
iter 9035: loss 2.5421, time 5025.90ms 
iter 9036: loss 2.6518, time 4977.17ms 
iter 9037: loss 2.4634, time 5021.75ms 
iter 9038: loss 2.4322, time 5026.43ms 
iter 9039: loss 2.4399, time 5027.14ms 
iter 9040: loss 2.5726, time 5026.14ms 
iter 9041: loss 2.4785, time 5024.75ms 
iter 9042: loss 2.4783, time 5007.99ms 
iter 9043: loss 2.4932, time 5012.16ms 
iter 9044: loss 2.2257, time 4918.50ms 
iter 9045: loss 2.6125, time 5000.09ms 
iter 9046: loss 2.3816, time 5030.58ms 
iter 9047: loss 2.5492, time 5021.68ms 
iter 9048: loss 2.3738, time 5027.74ms 
iter 9049: loss 2.3261, time 5021.51ms 
step 9050: train loss 2.4595, val loss 2.8376
iter 9050: loss 2.5989, time 19661.41ms 
iter 9051: loss 2.4285, time 5017.02ms 
iter 9052: loss 2.3779, time 5019.30ms 
iter 9053: loss 2.6305, time 5010.79ms 
iter 9054: loss 2.4890, time 5032.06ms 
iter 9055: loss 2.5383, time 5031.66ms 
iter 9056: loss 2.4445, time 5029.70ms 
iter 9057: loss 2.0992, time 5020.46ms 
iter 9058: loss 2.5651, time 4948.48ms 
iter 9059: loss 2.4883, time 4916.71ms 
iter 9060: loss 2.4087, time 4994.18ms 
iter 9061: loss 2.6941, time 5029.02ms 
iter 9062: loss 2.4060, time 5015.36ms 
iter 9063: loss 2.3069, time 5029.60ms 
iter 9064: loss 2.5164, time 5031.57ms 
iter 9065: loss 2.6496, time 5025.71ms 
iter 9066: loss 2.5350, time 5008.62ms 
iter 9067: loss 2.5067, time 4980.73ms 
iter 9068: loss 2.6524, time 4991.87ms 
iter 9069: loss 2.6362, time 5027.85ms 
iter 9070: loss 2.6592, time 5019.36ms 
iter 9071: loss 2.4720, time 5029.96ms 
iter 9072: loss 2.5340, time 5032.08ms 
iter 9073: loss 2.5533, time 5026.55ms 
iter 9074: loss 2.3711, time 5030.24ms 
iter 9075: loss 2.4580, time 4981.98ms 
iter 9076: loss 2.4546, time 4959.70ms 
iter 9077: loss 2.5184, time 5028.15ms 
iter 9078: loss 2.2006, time 5024.70ms 
iter 9079: loss 2.3359, time 5023.96ms 
iter 9080: loss 2.4996, time 5030.53ms 
iter 9081: loss 2.3828, time 4988.24ms 
iter 9082: loss 2.4288, time 5006.16ms 
iter 9083: loss 2.6059, time 5024.11ms 
iter 9084: loss 2.6006, time 4956.67ms 
iter 9085: loss 2.4756, time 4985.78ms 
iter 9086: loss 2.5781, time 4981.46ms 
iter 9087: loss 2.3441, time 5018.49ms 
iter 9088: loss 2.5789, time 5020.94ms 
iter 9089: loss 2.3109, time 5005.93ms 
iter 9090: loss 2.5706, time 5024.67ms 
iter 9091: loss 2.6462, time 5027.92ms 
iter 9092: loss 2.5870, time 4982.80ms 
iter 9093: loss 2.5166, time 4963.54ms 
iter 9094: loss 2.7412, time 5027.10ms 
iter 9095: loss 2.5920, time 5034.48ms 
iter 9096: loss 2.1676, time 5027.09ms 
iter 9097: loss 2.3463, time 5025.51ms 
iter 9098: loss 2.3765, time 5024.69ms 
iter 9099: loss 2.6650, time 5030.49ms 
step 9100: train loss 2.4788, val loss 2.8400
iter 9100: loss 2.6697, time 19674.89ms 
iter 9101: loss 2.2886, time 5020.84ms 
iter 9102: loss 2.5261, time 5020.92ms 
iter 9103: loss 2.3182, time 5023.97ms 
iter 9104: loss 2.4292, time 5027.30ms 
iter 9105: loss 2.5162, time 4994.51ms 
iter 9106: loss 2.6436, time 4924.22ms 
iter 9107: loss 2.6199, time 5008.50ms 
iter 9108: loss 2.3608, time 5025.42ms 
iter 9109: loss 2.6064, time 5020.15ms 
iter 9110: loss 2.5979, time 5011.55ms 
iter 9111: loss 2.4804, time 5010.33ms 
iter 9112: loss 2.6140, time 5028.43ms 
iter 9113: loss 2.6237, time 4941.72ms 
iter 9114: loss 2.5643, time 4920.25ms 
iter 9115: loss 2.4625, time 4996.80ms 
iter 9116: loss 2.6549, time 5014.96ms 
iter 9117: loss 2.3867, time 5022.93ms 
iter 9118: loss 2.4947, time 5018.55ms 
iter 9119: loss 2.4548, time 5028.55ms 
iter 9120: loss 2.1811, time 5025.57ms 
iter 9121: loss 2.5340, time 5028.69ms 
iter 9122: loss 2.3078, time 5013.81ms 
iter 9123: loss 2.4821, time 5008.48ms 
iter 9124: loss 2.2707, time 5020.77ms 
iter 9125: loss 2.4815, time 5024.67ms 
iter 9126: loss 2.3915, time 5002.46ms 
iter 9127: loss 2.5559, time 4981.93ms 
iter 9128: loss 2.3250, time 5019.57ms 
iter 9129: loss 2.5990, time 5020.40ms 
iter 9130: loss 2.4101, time 4916.54ms 
iter 9131: loss 2.2662, time 4983.22ms 
iter 9132: loss 2.4439, time 4995.62ms 
iter 9133: loss 2.4184, time 5014.02ms 
iter 9134: loss 2.3826, time 4959.63ms 
iter 9135: loss 2.6795, time 5016.75ms 
iter 9136: loss 2.3924, time 5026.62ms 
iter 9137: loss 2.5586, time 5027.19ms 
iter 9138: loss 2.3288, time 4972.49ms 
iter 9139: loss 2.4151, time 4914.86ms 
iter 9140: loss 2.3656, time 4913.84ms 
iter 9141: loss 2.5418, time 4914.12ms 
iter 9142: loss 2.5490, time 5014.76ms 
iter 9143: loss 2.6150, time 5049.91ms 
iter 9144: loss 2.5251, time 5032.33ms 
iter 9145: loss 2.4644, time 5032.11ms 
iter 9146: loss 2.3684, time 5042.03ms 
iter 9147: loss 2.5658, time 5027.86ms 
iter 9148: loss 2.3477, time 5031.45ms 
iter 9149: loss 2.2612, time 5024.73ms 
step 9150: train loss 2.4414, val loss 2.8631
iter 9150: loss 2.3291, time 19713.96ms 
iter 9151: loss 2.5870, time 5025.67ms 
iter 9152: loss 2.7311, time 5004.93ms 
iter 9153: loss 2.6063, time 5023.94ms 
iter 9154: loss 2.4097, time 5023.34ms 
iter 9155: loss 2.7395, time 5020.58ms 
iter 9156: loss 2.4362, time 5027.77ms 
iter 9157: loss 2.3461, time 5027.04ms 
iter 9158: loss 2.4322, time 5033.67ms 
iter 9159: loss 2.7680, time 5030.13ms 
iter 9160: loss 2.5659, time 5006.34ms 
iter 9161: loss 2.6127, time 5030.03ms 
iter 9162: loss 2.4293, time 5027.80ms 
iter 9163: loss 2.4773, time 5017.11ms 
iter 9164: loss 2.4688, time 5025.76ms 
iter 9165: loss 2.5690, time 5025.57ms 
iter 9166: loss 2.3114, time 5021.59ms 
iter 9167: loss 2.3942, time 5027.20ms 
iter 9168: loss 2.3566, time 4990.96ms 
iter 9169: loss 2.3524, time 5025.32ms 
iter 9170: loss 2.6334, time 5025.82ms 
iter 9171: loss 2.1871, time 5030.17ms 
iter 9172: loss 2.3736, time 5031.82ms 
iter 9173: loss 2.5389, time 5026.81ms 
iter 9174: loss 2.4538, time 5030.48ms 
iter 9175: loss 2.4824, time 5035.29ms 
iter 9176: loss 2.2761, time 4987.89ms 
iter 9177: loss 2.3952, time 5033.82ms 
iter 9178: loss 2.5729, time 5031.56ms 
iter 9179: loss 2.8110, time 5034.58ms 
iter 9180: loss 2.4101, time 5032.96ms 
iter 9181: loss 2.2820, time 5026.04ms 
iter 9182: loss 2.5191, time 5031.53ms 
iter 9183: loss 2.5501, time 5033.58ms 
iter 9184: loss 2.4155, time 4996.48ms 
iter 9185: loss 2.5365, time 5020.07ms 
iter 9186: loss 2.3611, time 5030.55ms 
iter 9187: loss 2.4820, time 5031.46ms 
iter 9188: loss 2.3077, time 5027.96ms 
iter 9189: loss 2.5678, time 5026.89ms 
iter 9190: loss 2.5751, time 5029.05ms 
iter 9191: loss 2.5752, time 5028.78ms 
iter 9192: loss 2.5503, time 5039.75ms 
iter 9193: loss 2.3897, time 5004.71ms 
iter 9194: loss 2.4791, time 5037.63ms 
iter 9195: loss 2.3546, time 5033.91ms 
iter 9196: loss 2.4203, time 5031.14ms 
iter 9197: loss 2.7073, time 5030.39ms 
iter 9198: loss 2.4015, time 5010.93ms 
iter 9199: loss 2.3246, time 5024.61ms 
step 9200: train loss 2.4719, val loss 2.8555
iter 9200: loss 2.6086, time 19701.48ms 
iter 9201: loss 2.6745, time 5021.08ms 
iter 9202: loss 2.5981, time 5030.72ms 
iter 9203: loss 2.5590, time 5029.93ms 
iter 9204: loss 2.5549, time 5019.31ms 
iter 9205: loss 2.5027, time 5026.75ms 
iter 9206: loss 2.3315, time 4978.50ms 
iter 9207: loss 2.3499, time 4984.82ms 
iter 9208: loss 2.3333, time 5024.72ms 
iter 9209: loss 2.6296, time 5015.95ms 
iter 9210: loss 2.5825, time 5024.94ms 
iter 9211: loss 2.4823, time 5023.78ms 
iter 9212: loss 2.5916, time 5025.64ms 
iter 9213: loss 2.6767, time 5007.41ms 
iter 9214: loss 2.5022, time 4916.95ms 
iter 9215: loss 2.4420, time 4928.69ms 
iter 9216: loss 2.5577, time 5015.20ms 
iter 9217: loss 2.5023, time 5030.15ms 
iter 9218: loss 2.6164, time 5007.71ms 
iter 9219: loss 2.3103, time 5021.57ms 
iter 9220: loss 2.4080, time 5016.68ms 
iter 9221: loss 2.2910, time 5023.53ms 
iter 9222: loss 2.6295, time 5031.35ms 
iter 9223: loss 2.5941, time 5014.94ms 
iter 9224: loss 2.4247, time 5022.38ms 
iter 9225: loss 2.4101, time 5021.38ms 
iter 9226: loss 2.3088, time 5019.29ms 
iter 9227: loss 2.3843, time 5023.16ms 
iter 9228: loss 2.3949, time 5025.27ms 
iter 9229: loss 2.6616, time 5022.75ms 
iter 9230: loss 2.3690, time 4977.06ms 
iter 9231: loss 2.2083, time 4915.22ms 
iter 9232: loss 2.5170, time 4997.43ms 
iter 9233: loss 2.4996, time 5020.95ms 
iter 9234: loss 2.3917, time 5019.47ms 
iter 9235: loss 2.4009, time 5020.91ms 
iter 9236: loss 2.6458, time 5024.73ms 
iter 9237: loss 2.5019, time 5021.16ms 
iter 9238: loss 2.5065, time 5020.43ms 
iter 9239: loss 2.4766, time 4970.26ms 
iter 9240: loss 2.6517, time 4971.37ms 
iter 9241: loss 2.4371, time 5023.04ms 
iter 9242: loss 2.4592, time 5022.32ms 
iter 9243: loss 2.4807, time 5021.75ms 
iter 9244: loss 2.5515, time 5022.62ms 
iter 9245: loss 2.4795, time 5020.72ms 
iter 9246: loss 2.4026, time 5024.58ms 
iter 9247: loss 2.2497, time 4948.16ms 
iter 9248: loss 2.4689, time 4967.53ms 
iter 9249: loss 2.5376, time 5021.08ms 
step 9250: train loss 2.4601, val loss 2.8335
iter 9250: loss 2.2238, time 19675.54ms 
iter 9251: loss 2.6046, time 5026.04ms 
iter 9252: loss 2.5010, time 4973.34ms 
iter 9253: loss 2.3067, time 4934.81ms 
iter 9254: loss 2.5346, time 5028.68ms 
iter 9255: loss 2.2883, time 5030.91ms 
iter 9256: loss 2.4764, time 5027.93ms 
iter 9257: loss 2.3136, time 5029.30ms 
iter 9258: loss 2.3394, time 5027.35ms 
iter 9259: loss 2.4096, time 4994.64ms 
iter 9260: loss 2.4627, time 4993.18ms 
iter 9261: loss 2.6789, time 4922.37ms 
iter 9262: loss 2.6268, time 4999.31ms 
iter 9263: loss 2.3921, time 5027.89ms 
iter 9264: loss 2.4626, time 4996.18ms 
iter 9265: loss 2.2291, time 5026.52ms 
iter 9266: loss 2.2249, time 5030.26ms 
iter 9267: loss 2.6324, time 5023.95ms 
iter 9268: loss 2.4982, time 5024.92ms 
iter 9269: loss 2.5859, time 4973.33ms 
iter 9270: loss 2.1089, time 4928.96ms 
iter 9271: loss 2.3236, time 5024.22ms 
iter 9272: loss 2.6012, time 5023.72ms 
iter 9273: loss 2.3871, time 5026.45ms 
iter 9274: loss 2.5250, time 4995.01ms 
iter 9275: loss 2.5027, time 4937.16ms 
iter 9276: loss 2.5324, time 5006.64ms 
iter 9277: loss 2.3576, time 4995.95ms 
iter 9278: loss 2.2639, time 4918.25ms 
iter 9279: loss 2.4257, time 4962.29ms 
iter 9280: loss 2.5167, time 4998.54ms 
iter 9281: loss 2.5382, time 4992.18ms 
iter 9282: loss 2.4786, time 5028.37ms 
iter 9283: loss 2.7982, time 5029.03ms 
iter 9284: loss 2.4342, time 5026.46ms 
iter 9285: loss 2.7431, time 5026.66ms 
iter 9286: loss 2.6287, time 5006.93ms 
iter 9287: loss 2.3984, time 4980.35ms 
iter 9288: loss 2.4349, time 4995.77ms 
iter 9289: loss 2.5098, time 5029.03ms 
iter 9290: loss 2.5462, time 5027.60ms 
iter 9291: loss 2.6582, time 5028.75ms 
iter 9292: loss 2.3906, time 5014.60ms 
iter 9293: loss 2.2983, time 5030.65ms 
iter 9294: loss 2.5350, time 5027.06ms 
iter 9295: loss 2.5160, time 4970.63ms 
iter 9296: loss 2.5641, time 4974.27ms 
iter 9297: loss 2.5584, time 5024.56ms 
iter 9298: loss 2.4073, time 5021.85ms 
iter 9299: loss 2.4051, time 5023.60ms 
step 9300: train loss 2.4549, val loss 2.8473
iter 9300: loss 2.3637, time 19622.01ms 
iter 9301: loss 2.5006, time 5008.61ms 
iter 9302: loss 2.3660, time 5023.21ms 
iter 9303: loss 2.3962, time 5007.11ms 
iter 9304: loss 2.5062, time 5024.32ms 
iter 9305: loss 2.3449, time 4986.38ms 
iter 9306: loss 2.7711, time 4981.49ms 
iter 9307: loss 2.5020, time 5026.44ms 
iter 9308: loss 2.4454, time 4982.49ms 
iter 9309: loss 2.5079, time 5019.26ms 
iter 9310: loss 2.4328, time 5021.08ms 
iter 9311: loss 2.2836, time 5023.44ms 
iter 9312: loss 2.5222, time 5030.78ms 
iter 9313: loss 2.3228, time 5029.29ms 
iter 9314: loss 2.4482, time 5024.11ms 
iter 9315: loss 2.5933, time 5024.86ms 
iter 9316: loss 2.3970, time 4971.77ms 
iter 9317: loss 2.5691, time 4939.57ms 
iter 9318: loss 2.3179, time 5003.97ms 
iter 9319: loss 2.4195, time 5027.61ms 
iter 9320: loss 2.5122, time 5021.72ms 
iter 9321: loss 2.4867, time 5021.38ms 
iter 9322: loss 2.4337, time 5007.72ms 
iter 9323: loss 2.6112, time 5021.67ms 
iter 9324: loss 2.2258, time 5025.96ms 
iter 9325: loss 2.6051, time 4981.42ms 
iter 9326: loss 2.4780, time 4963.45ms 
iter 9327: loss 2.6122, time 5025.89ms 
iter 9328: loss 2.4470, time 5023.07ms 
iter 9329: loss 2.2903, time 5017.30ms 
iter 9330: loss 2.6290, time 5019.26ms 
iter 9331: loss 2.8853, time 5024.17ms 
iter 9332: loss 2.4828, time 5022.22ms 
iter 9333: loss 2.2826, time 5004.17ms 
iter 9334: loss 2.6212, time 5000.96ms 
iter 9335: loss 2.4486, time 5027.19ms 
iter 9336: loss 2.3414, time 5026.31ms 
iter 9337: loss 2.6387, time 5024.68ms 
iter 9338: loss 2.5435, time 5026.72ms 
iter 9339: loss 2.4023, time 5025.80ms 
iter 9340: loss 2.4885, time 5026.64ms 
iter 9341: loss 2.3003, time 4984.73ms 
iter 9342: loss 2.5846, time 4961.09ms 
iter 9343: loss 2.4140, time 5027.01ms 
iter 9344: loss 2.4115, time 4956.18ms 
iter 9345: loss 2.5496, time 5026.74ms 
iter 9346: loss 2.0768, time 5027.67ms 
iter 9347: loss 2.4509, time 5028.74ms 
iter 9348: loss 2.2160, time 5032.77ms 
iter 9349: loss 2.5294, time 4979.93ms 
step 9350: train loss 2.4479, val loss 2.8612
iter 9350: loss 2.5195, time 19698.07ms 
iter 9351: loss 2.5312, time 5025.84ms 
iter 9352: loss 2.4828, time 5025.66ms 
iter 9353: loss 2.5230, time 5035.10ms 
iter 9354: loss 2.4053, time 4988.05ms 
iter 9355: loss 2.2752, time 4959.11ms 
iter 9356: loss 2.4146, time 4951.61ms 
iter 9357: loss 2.3635, time 5025.63ms 
iter 9358: loss 2.4736, time 5025.07ms 
iter 9359: loss 2.2655, time 5013.64ms 
iter 9360: loss 2.3348, time 5020.04ms 
iter 9361: loss 2.3921, time 5021.74ms 
iter 9362: loss 2.4256, time 5019.71ms 
iter 9363: loss 2.2747, time 5016.72ms 
iter 9364: loss 2.4278, time 4951.82ms 
iter 9365: loss 2.4764, time 5017.35ms 
iter 9366: loss 2.3390, time 5026.72ms 
iter 9367: loss 2.3518, time 5015.50ms 
iter 9368: loss 2.4730, time 5012.57ms 
iter 9369: loss 2.6866, time 5004.26ms 
iter 9370: loss 2.4290, time 5021.23ms 
iter 9371: loss 2.5665, time 5030.99ms 
iter 9372: loss 2.4845, time 4992.48ms 
iter 9373: loss 2.6225, time 5006.59ms 
iter 9374: loss 2.4026, time 5008.64ms 
iter 9375: loss 2.4000, time 5026.32ms 
iter 9376: loss 2.4531, time 5027.53ms 
iter 9377: loss 2.4117, time 5029.59ms 
iter 9378: loss 2.5462, time 5026.54ms 
iter 9379: loss 2.6815, time 5010.65ms 
iter 9380: loss 2.4470, time 4948.73ms 
iter 9381: loss 2.3687, time 5031.99ms 
iter 9382: loss 2.4387, time 5034.83ms 
iter 9383: loss 2.6121, time 5041.20ms 
iter 9384: loss 2.3861, time 5031.51ms 
iter 9385: loss 2.4453, time 5028.08ms 
iter 9386: loss 2.3885, time 5023.34ms 
iter 9387: loss 2.3188, time 5027.94ms 
iter 9388: loss 2.5765, time 4938.45ms 
iter 9389: loss 2.4135, time 4984.51ms 
iter 9390: loss 2.3975, time 5018.14ms 
iter 9391: loss 2.3262, time 5017.47ms 
iter 9392: loss 2.4068, time 5027.23ms 
iter 9393: loss 2.4840, time 5018.16ms 
iter 9394: loss 2.4425, time 5021.46ms 
iter 9395: loss 2.4876, time 5025.26ms 
iter 9396: loss 2.5519, time 5007.37ms 
iter 9397: loss 2.2882, time 5004.90ms 
iter 9398: loss 2.4410, time 5021.56ms 
iter 9399: loss 2.3636, time 4977.77ms 
step 9400: train loss 2.4378, val loss 2.8476
iter 9400: loss 2.4418, time 19695.17ms 
iter 9401: loss 2.2383, time 5028.58ms 
iter 9402: loss 2.6354, time 4995.46ms 
iter 9403: loss 2.6637, time 5026.73ms 
iter 9404: loss 2.6131, time 5003.05ms 
iter 9405: loss 2.3396, time 5012.91ms 
iter 9406: loss 2.5021, time 5021.42ms 
iter 9407: loss 2.4208, time 5020.92ms 
iter 9408: loss 2.7593, time 5008.37ms 
iter 9409: loss 2.5487, time 5026.86ms 
iter 9410: loss 2.3735, time 4995.43ms 
iter 9411: loss 2.3001, time 5023.07ms 
iter 9412: loss 2.4174, time 5009.56ms 
iter 9413: loss 2.4292, time 5014.64ms 
iter 9414: loss 2.3801, time 5029.73ms 
iter 9415: loss 2.2442, time 5028.90ms 
iter 9416: loss 2.5378, time 5020.51ms 
iter 9417: loss 2.3491, time 5048.79ms 
iter 9418: loss 2.4747, time 5015.59ms 
iter 9419: loss 2.5187, time 5034.82ms 
iter 9420: loss 2.2886, time 5030.84ms 
iter 9421: loss 2.4651, time 5027.76ms 
iter 9422: loss 2.4977, time 5026.85ms 
iter 9423: loss 2.4436, time 5024.41ms 
iter 9424: loss 2.6142, time 4976.01ms 
iter 9425: loss 2.3914, time 4971.74ms 
iter 9426: loss 2.4453, time 4915.85ms 
iter 9427: loss 2.4991, time 5000.29ms 
iter 9428: loss 2.6652, time 5032.44ms 
iter 9429: loss 2.3698, time 5032.66ms 
iter 9430: loss 2.4309, time 5036.44ms 
iter 9431: loss 2.4676, time 5028.88ms 
iter 9432: loss 2.5306, time 5031.42ms 
iter 9433: loss 2.5027, time 5036.02ms 
iter 9434: loss 2.5647, time 4978.63ms 
iter 9435: loss 2.5132, time 5022.57ms 
iter 9436: loss 2.3418, time 5021.44ms 
iter 9437: loss 2.3897, time 5021.53ms 
iter 9438: loss 2.4064, time 4982.22ms 
iter 9439: loss 2.3028, time 4914.29ms 
iter 9440: loss 2.6764, time 4933.94ms 
iter 9441: loss 2.7830, time 5020.92ms 
iter 9442: loss 2.4831, time 4984.24ms 
iter 9443: loss 2.4415, time 5004.99ms 
iter 9444: loss 2.3148, time 5023.80ms 
iter 9445: loss 2.2785, time 5023.86ms 
iter 9446: loss 2.5042, time 5022.56ms 
iter 9447: loss 2.5386, time 5022.69ms 
iter 9448: loss 2.5574, time 5023.94ms 
iter 9449: loss 2.5434, time 5023.50ms 
step 9450: train loss 2.4674, val loss 2.8400
iter 9450: loss 2.6250, time 19664.45ms 
iter 9451: loss 2.4431, time 5022.50ms 
iter 9452: loss 2.4797, time 5020.39ms 
iter 9453: loss 2.6805, time 5020.63ms 
iter 9454: loss 2.5943, time 5023.82ms 
iter 9455: loss 2.2708, time 5027.71ms 
iter 9456: loss 2.5443, time 4972.28ms 
iter 9457: loss 2.5129, time 4980.98ms 
iter 9458: loss 2.5898, time 5018.12ms 
iter 9459: loss 2.4140, time 5033.61ms 
iter 9460: loss 2.4846, time 5028.07ms 
iter 9461: loss 2.5366, time 5024.70ms 
iter 9462: loss 2.4553, time 5026.87ms 
iter 9463: loss 2.4106, time 5029.40ms 
iter 9464: loss 2.6223, time 4982.59ms 
iter 9465: loss 2.4383, time 4917.23ms 
iter 9466: loss 2.4094, time 4976.05ms 
iter 9467: loss 2.6350, time 4999.22ms 
iter 9468: loss 2.5173, time 5012.13ms 
iter 9469: loss 2.3183, time 5029.36ms 
iter 9470: loss 2.5036, time 5017.69ms 
iter 9471: loss 2.4782, time 5016.39ms 
iter 9472: loss 2.5730, time 5015.21ms 
iter 9473: loss 2.4105, time 4983.34ms 
iter 9474: loss 2.6078, time 4986.77ms 
iter 9475: loss 2.2173, time 5026.88ms 
iter 9476: loss 2.3610, time 5025.89ms 
iter 9477: loss 2.3990, time 5026.38ms 
iter 9478: loss 2.2601, time 5023.05ms 
iter 9479: loss 2.5799, time 5031.49ms 
iter 9480: loss 2.6117, time 5028.60ms 
iter 9481: loss 2.2135, time 4970.75ms 
iter 9482: loss 2.3574, time 4983.10ms 
iter 9483: loss 2.5249, time 5008.06ms 
iter 9484: loss 2.5376, time 5009.68ms 
iter 9485: loss 2.5075, time 5010.99ms 
iter 9486: loss 2.4955, time 4921.95ms 
iter 9487: loss 2.4424, time 4913.93ms 
iter 9488: loss 2.3508, time 4913.60ms 
iter 9489: loss 2.3682, time 4914.79ms 
iter 9490: loss 2.4546, time 4914.25ms 
iter 9491: loss 2.6673, time 4914.27ms 
iter 9492: loss 2.4442, time 4914.26ms 
iter 9493: loss 2.6188, time 4916.66ms 
iter 9494: loss 2.4713, time 5019.47ms 
iter 9495: loss 2.6623, time 5025.09ms 
iter 9496: loss 2.1314, time 5027.07ms 
iter 9497: loss 2.3099, time 5026.39ms 
iter 9498: loss 2.4523, time 4987.38ms 
iter 9499: loss 2.5338, time 5024.13ms 
step 9500: train loss 2.4420, val loss 2.8529
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 9500: loss 2.5119, time 20727.87ms 
iter 9501: loss 2.4492, time 5021.57ms 
iter 9502: loss 2.1585, time 4976.71ms 
iter 9503: loss 2.5284, time 4960.96ms 
iter 9504: loss 2.5538, time 5020.44ms 
iter 9505: loss 2.5646, time 5027.89ms 
iter 9506: loss 2.3014, time 5029.89ms 
iter 9507: loss 2.5310, time 5026.60ms 
iter 9508: loss 2.3512, time 4982.75ms 
iter 9509: loss 2.4505, time 4982.47ms 
iter 9510: loss 2.5125, time 4960.43ms 
iter 9511: loss 2.1175, time 4927.54ms 
iter 9512: loss 2.6755, time 4969.10ms 
iter 9513: loss 2.4566, time 4996.72ms 
iter 9514: loss 2.6304, time 5008.35ms 
iter 9515: loss 2.4198, time 5023.57ms 
iter 9516: loss 2.2743, time 5023.28ms 
iter 9517: loss 2.3457, time 5017.68ms 
iter 9518: loss 2.4297, time 5017.26ms 
iter 9519: loss 2.4595, time 4983.20ms 
iter 9520: loss 2.1846, time 4966.21ms 
iter 9521: loss 2.4088, time 5026.34ms 
iter 9522: loss 2.2004, time 5024.51ms 
iter 9523: loss 2.5088, time 4998.19ms 
iter 9524: loss 2.7114, time 5032.44ms 
iter 9525: loss 2.3543, time 4989.96ms 
iter 9526: loss 2.5253, time 4957.20ms 
iter 9527: loss 2.4796, time 4915.26ms 
iter 9528: loss 2.5256, time 4957.30ms 
iter 9529: loss 2.1630, time 5017.51ms 
iter 9530: loss 2.4506, time 4947.01ms 
iter 9531: loss 1.9804, time 4959.33ms 
iter 9532: loss 2.7087, time 4950.13ms 
iter 9533: loss 2.4614, time 4970.95ms 
iter 9534: loss 2.5450, time 5000.39ms 
iter 9535: loss 2.5115, time 4986.89ms 
iter 9536: loss 2.2624, time 4947.57ms 
iter 9537: loss 2.6464, time 5025.43ms 
iter 9538: loss 2.4822, time 5025.56ms 
iter 9539: loss 2.5730, time 5021.46ms 
iter 9540: loss 2.3978, time 5027.87ms 
iter 9541: loss 2.4291, time 5026.36ms 
iter 9542: loss 2.5546, time 5027.85ms 
iter 9543: loss 2.2529, time 4991.43ms 
iter 9544: loss 2.4731, time 4917.35ms 
iter 9545: loss 2.6056, time 4992.88ms 
iter 9546: loss 2.3384, time 5019.53ms 
iter 9547: loss 2.4107, time 5025.09ms 
iter 9548: loss 2.3142, time 4999.51ms 
iter 9549: loss 2.4538, time 5015.57ms 
step 9550: train loss 2.4500, val loss 2.8600
iter 9550: loss 2.4934, time 19670.59ms 
iter 9551: loss 2.5821, time 5023.76ms 
iter 9552: loss 2.2697, time 5029.50ms 
iter 9553: loss 2.5397, time 5005.41ms 
iter 9554: loss 2.4457, time 5024.77ms 
iter 9555: loss 2.6865, time 5023.61ms 
iter 9556: loss 2.3726, time 5026.88ms 
iter 9557: loss 2.3467, time 5028.52ms 
iter 9558: loss 2.4618, time 5015.16ms 
iter 9559: loss 2.4816, time 5022.76ms 
iter 9560: loss 2.3680, time 5026.97ms 
iter 9561: loss 2.3634, time 5018.17ms 
iter 9562: loss 2.4181, time 5022.77ms 
iter 9563: loss 2.6206, time 5022.78ms 
iter 9564: loss 2.5229, time 5012.30ms 
iter 9565: loss 2.4167, time 5016.82ms 
iter 9566: loss 2.5854, time 4980.70ms 
iter 9567: loss 2.4111, time 5023.13ms 
iter 9568: loss 2.4119, time 5021.21ms 
iter 9569: loss 2.0492, time 5001.04ms 
iter 9570: loss 2.2980, time 5032.77ms 
iter 9571: loss 2.5540, time 5010.54ms 
iter 9572: loss 2.6345, time 5026.05ms 
iter 9573: loss 2.5525, time 4928.83ms 
iter 9574: loss 2.3385, time 4919.07ms 
iter 9575: loss 2.4175, time 4933.39ms 
iter 9576: loss 2.5595, time 5028.18ms 
iter 9577: loss 2.5513, time 5021.01ms 
iter 9578: loss 2.4574, time 5026.30ms 
iter 9579: loss 2.4623, time 5027.73ms 
iter 9580: loss 2.5698, time 5026.56ms 
iter 9581: loss 2.4611, time 5030.39ms 
iter 9582: loss 2.2391, time 5035.41ms 
iter 9583: loss 2.4650, time 5013.23ms 
iter 9584: loss 2.0273, time 5017.57ms 
iter 9585: loss 2.4521, time 5028.61ms 
iter 9586: loss 2.3511, time 5026.47ms 
iter 9587: loss 2.2511, time 5025.08ms 
iter 9588: loss 2.4620, time 5023.65ms 
iter 9589: loss 2.5494, time 5012.63ms 
iter 9590: loss 2.4181, time 4939.11ms 
iter 9591: loss 2.3810, time 4998.30ms 
iter 9592: loss 2.4902, time 5022.89ms 
iter 9593: loss 2.5940, time 5014.97ms 
iter 9594: loss 2.5096, time 5003.89ms 
iter 9595: loss 2.5585, time 5021.39ms 
iter 9596: loss 2.3021, time 5024.70ms 
iter 9597: loss 2.3197, time 5029.86ms 
iter 9598: loss 2.3155, time 4976.79ms 
iter 9599: loss 2.3542, time 4957.66ms 
step 9600: train loss 2.4440, val loss 2.8516
iter 9600: loss 2.5822, time 19649.05ms 
iter 9601: loss 2.5230, time 5033.11ms 
iter 9602: loss 2.4897, time 5033.96ms 
iter 9603: loss 2.4638, time 5032.62ms 
iter 9604: loss 2.6880, time 5007.89ms 
iter 9605: loss 2.5616, time 4969.50ms 
iter 9606: loss 2.5210, time 4964.15ms 
iter 9607: loss 2.6429, time 5027.76ms 
iter 9608: loss 2.4156, time 5029.78ms 
iter 9609: loss 2.3881, time 5027.13ms 
iter 9610: loss 2.6067, time 5027.88ms 
iter 9611: loss 2.4938, time 4987.45ms 
iter 9612: loss 2.0484, time 5021.68ms 
iter 9613: loss 2.2662, time 5027.60ms 
iter 9614: loss 2.6693, time 5029.54ms 
iter 9615: loss 2.4506, time 5038.25ms 
iter 9616: loss 2.6048, time 5035.21ms 
iter 9617: loss 2.2607, time 5035.14ms 
iter 9618: loss 2.4254, time 5032.46ms 
iter 9619: loss 2.5867, time 5038.77ms 
iter 9620: loss 2.5057, time 4969.25ms 
iter 9621: loss 2.2675, time 5023.10ms 
iter 9622: loss 2.2551, time 5034.66ms 
iter 9623: loss 2.3711, time 5034.35ms 
iter 9624: loss 2.5004, time 5033.43ms 
iter 9625: loss 2.2882, time 5034.10ms 
iter 9626: loss 2.6725, time 5026.27ms 
iter 9627: loss 2.3961, time 5036.61ms 
iter 9628: loss 2.5905, time 4941.79ms 
iter 9629: loss 2.4823, time 4986.17ms 
iter 9630: loss 2.5115, time 5025.85ms 
iter 9631: loss 2.3700, time 5003.15ms 
iter 9632: loss 2.3319, time 5018.00ms 
iter 9633: loss 2.4181, time 5027.95ms 
iter 9634: loss 2.2618, time 5037.72ms 
iter 9635: loss 2.5151, time 5040.19ms 
iter 9636: loss 2.5047, time 4981.45ms 
iter 9637: loss 2.3182, time 4998.07ms 
iter 9638: loss 2.5955, time 5027.19ms 
iter 9639: loss 2.3078, time 5019.90ms 
iter 9640: loss 2.5752, time 5020.76ms 
iter 9641: loss 2.3410, time 5029.20ms 
iter 9642: loss 2.4241, time 5021.18ms 
iter 9643: loss 2.4522, time 5031.66ms 
iter 9644: loss 2.4780, time 4998.55ms 
iter 9645: loss 2.4349, time 5022.36ms 
iter 9646: loss 2.6846, time 5024.03ms 
iter 9647: loss 2.5150, time 5023.97ms 
iter 9648: loss 2.5190, time 5023.37ms 
iter 9649: loss 2.2841, time 5028.31ms 
step 9650: train loss 2.4273, val loss 2.8661
iter 9650: loss 2.4548, time 19706.17ms 
iter 9651: loss 2.4643, time 4988.08ms 
iter 9652: loss 2.2905, time 5016.63ms 
iter 9653: loss 2.4917, time 5004.11ms 
iter 9654: loss 2.5411, time 5021.37ms 
iter 9655: loss 2.2952, time 5023.79ms 
iter 9656: loss 2.4162, time 5030.18ms 
iter 9657: loss 2.3529, time 5001.80ms 
iter 9658: loss 2.5685, time 5014.07ms 
iter 9659: loss 2.3581, time 5024.06ms 
iter 9660: loss 2.4351, time 5029.19ms 
iter 9661: loss 2.5820, time 5019.03ms 
iter 9662: loss 2.3716, time 5027.77ms 
iter 9663: loss 2.5839, time 4985.11ms 
iter 9664: loss 2.4471, time 4921.02ms 
iter 9665: loss 2.4013, time 5007.83ms 
iter 9666: loss 2.3296, time 5028.57ms 
iter 9667: loss 2.7155, time 5017.32ms 
iter 9668: loss 2.3674, time 5024.14ms 
iter 9669: loss 2.4706, time 5028.38ms 
iter 9670: loss 2.2891, time 5029.63ms 
iter 9671: loss 2.3065, time 5011.06ms 
iter 9672: loss 2.4188, time 4939.48ms 
iter 9673: loss 2.3770, time 5026.96ms 
iter 9674: loss 2.6251, time 5032.38ms 
iter 9675: loss 2.1575, time 5025.64ms 
iter 9676: loss 2.4676, time 5031.65ms 
iter 9677: loss 2.3245, time 5024.25ms 
iter 9678: loss 2.4071, time 5026.56ms 
iter 9679: loss 2.6343, time 5031.56ms 
iter 9680: loss 2.4917, time 4980.82ms 
iter 9681: loss 2.3847, time 5027.65ms 
iter 9682: loss 2.4949, time 5020.84ms 
iter 9683: loss 2.4274, time 5024.16ms 
iter 9684: loss 2.4458, time 5023.92ms 
iter 9685: loss 2.3453, time 5021.86ms 
iter 9686: loss 2.4905, time 5027.68ms 
iter 9687: loss 2.3407, time 4976.63ms 
iter 9688: loss 2.3314, time 5019.10ms 
iter 9689: loss 2.5906, time 5029.37ms 
iter 9690: loss 2.2293, time 5029.33ms 
iter 9691: loss 2.4991, time 4991.20ms 
iter 9692: loss 2.2840, time 5001.98ms 
iter 9693: loss 2.4507, time 5023.48ms 
iter 9694: loss 2.4945, time 5028.63ms 
iter 9695: loss 2.7025, time 4931.38ms 
iter 9696: loss 2.1967, time 5014.78ms 
iter 9697: loss 2.2488, time 5025.31ms 
iter 9698: loss 2.4075, time 5023.92ms 
iter 9699: loss 2.3705, time 5024.22ms 
step 9700: train loss 2.4617, val loss 2.8473
iter 9700: loss 2.5076, time 19674.18ms 
iter 9701: loss 2.3964, time 5023.43ms 
iter 9702: loss 2.2769, time 5022.02ms 
iter 9703: loss 2.4928, time 5026.27ms 
iter 9704: loss 2.1291, time 5016.71ms 
iter 9705: loss 2.5525, time 5030.93ms 
iter 9706: loss 2.6332, time 5003.45ms 
iter 9707: loss 2.4256, time 5022.54ms 
iter 9708: loss 2.4942, time 4997.25ms 
iter 9709: loss 2.4088, time 4960.56ms 
iter 9710: loss 2.1398, time 5036.60ms 
iter 9711: loss 2.3787, time 5007.51ms 
iter 9712: loss 2.4486, time 4994.64ms 
iter 9713: loss 2.4606, time 5029.11ms 
iter 9714: loss 2.4355, time 5005.08ms 
iter 9715: loss 2.5635, time 4982.03ms 
iter 9716: loss 2.2536, time 4968.37ms 
iter 9717: loss 2.2093, time 5006.49ms 
iter 9718: loss 2.1992, time 5017.76ms 
iter 9719: loss 2.2525, time 5008.34ms 
iter 9720: loss 2.3852, time 5016.98ms 
iter 9721: loss 2.4956, time 5004.04ms 
iter 9722: loss 2.5670, time 5027.36ms 
iter 9723: loss 2.4372, time 4979.89ms 
iter 9724: loss 2.3581, time 5014.32ms 
iter 9725: loss 2.4992, time 5029.49ms 
iter 9726: loss 2.7154, time 5013.12ms 
iter 9727: loss 2.4239, time 5003.58ms 
iter 9728: loss 2.5443, time 5027.33ms 
iter 9729: loss 2.3726, time 4988.90ms 
iter 9730: loss 2.5568, time 4974.84ms 
iter 9731: loss 2.3270, time 4955.63ms 
iter 9732: loss 2.5352, time 4938.59ms 
iter 9733: loss 2.4519, time 5025.79ms 
iter 9734: loss 2.5288, time 5029.36ms 
iter 9735: loss 2.2525, time 5027.94ms 
iter 9736: loss 2.4026, time 5026.33ms 
iter 9737: loss 2.2634, time 5026.18ms 
iter 9738: loss 2.3416, time 5024.19ms 
iter 9739: loss 2.6268, time 5027.98ms 
iter 9740: loss 2.3718, time 5025.89ms 
iter 9741: loss 2.6335, time 5023.53ms 
iter 9742: loss 2.5473, time 5026.72ms 
iter 9743: loss 2.4850, time 5021.30ms 
iter 9744: loss 1.9976, time 5021.69ms 
iter 9745: loss 2.5855, time 5023.45ms 
iter 9746: loss 2.6533, time 5023.38ms 
iter 9747: loss 2.4515, time 5001.57ms 
iter 9748: loss 2.1676, time 5026.47ms 
iter 9749: loss 2.3983, time 5024.15ms 
step 9750: train loss 2.4391, val loss 2.8555
iter 9750: loss 2.6282, time 19676.19ms 
iter 9751: loss 2.6321, time 4915.76ms 
iter 9752: loss 2.3479, time 4998.80ms 
iter 9753: loss 2.6116, time 5021.78ms 
iter 9754: loss 2.4188, time 5019.62ms 
iter 9755: loss 2.3250, time 5021.48ms 
iter 9756: loss 2.4563, time 5024.79ms 
iter 9757: loss 2.3980, time 5007.10ms 
iter 9758: loss 2.6285, time 5005.74ms 
iter 9759: loss 2.4521, time 4989.06ms 
iter 9760: loss 2.2620, time 5023.25ms 
iter 9761: loss 2.4334, time 5021.77ms 
iter 9762: loss 2.3912, time 5025.27ms 
iter 9763: loss 2.5128, time 5022.40ms 
iter 9764: loss 2.2844, time 5022.17ms 
iter 9765: loss 2.2684, time 5022.99ms 
iter 9766: loss 2.4131, time 4984.68ms 
iter 9767: loss 2.4834, time 5013.90ms 
iter 9768: loss 2.2603, time 5017.65ms 
iter 9769: loss 2.5062, time 5019.61ms 
iter 9770: loss 2.5250, time 5019.04ms 
iter 9771: loss 2.5579, time 5026.84ms 
iter 9772: loss 2.3374, time 5024.13ms 
iter 9773: loss 2.5834, time 5023.84ms 
iter 9774: loss 2.4192, time 4984.72ms 
iter 9775: loss 2.3534, time 5024.34ms 
iter 9776: loss 2.4466, time 5027.39ms 
iter 9777: loss 2.2896, time 5031.69ms 
iter 9778: loss 2.4704, time 5035.96ms 
iter 9779: loss 2.5141, time 4983.06ms 
iter 9780: loss 2.5470, time 4984.02ms 
iter 9781: loss 2.3774, time 5010.99ms 
iter 9782: loss 2.5736, time 5024.18ms 
iter 9783: loss 2.4022, time 4913.50ms 
iter 9784: loss 2.6598, time 4913.13ms 
iter 9785: loss 2.3677, time 4913.49ms 
iter 9786: loss 2.6398, time 4914.59ms 
iter 9787: loss 2.2948, time 4914.66ms 
iter 9788: loss 2.5251, time 4913.74ms 
iter 9789: loss 2.2209, time 4947.71ms 
iter 9790: loss 2.4362, time 5027.03ms 
iter 9791: loss 2.3744, time 5027.77ms 
iter 9792: loss 2.6189, time 5033.72ms 
iter 9793: loss 2.7028, time 5030.16ms 
iter 9794: loss 2.4395, time 5024.24ms 
iter 9795: loss 2.3765, time 5025.23ms 
iter 9796: loss 2.2595, time 5016.73ms 
iter 9797: loss 2.4759, time 5025.51ms 
iter 9798: loss 2.5028, time 5026.47ms 
iter 9799: loss 2.5296, time 5025.49ms 
step 9800: train loss 2.4514, val loss 2.8365
iter 9800: loss 2.3905, time 19636.81ms 
iter 9801: loss 2.4849, time 4973.31ms 
iter 9802: loss 2.4483, time 5029.74ms 
iter 9803: loss 2.4094, time 5034.69ms 
iter 9804: loss 2.4919, time 5006.85ms 
iter 9805: loss 2.3245, time 5024.59ms 
iter 9806: loss 2.3113, time 5025.43ms 
iter 9807: loss 2.3909, time 4978.34ms 
iter 9808: loss 2.3771, time 4990.25ms 
iter 9809: loss 2.3779, time 5005.40ms 
iter 9810: loss 2.4703, time 4988.46ms 
iter 9811: loss 2.4620, time 4984.29ms 
iter 9812: loss 2.4771, time 5017.37ms 
iter 9813: loss 2.5984, time 5024.76ms 
iter 9814: loss 2.3641, time 5028.02ms 
iter 9815: loss 2.3749, time 4997.28ms 
iter 9816: loss 2.3894, time 5023.47ms 
iter 9817: loss 2.4576, time 5013.67ms 
iter 9818: loss 2.4443, time 5028.60ms 
iter 9819: loss 2.6183, time 5033.36ms 
iter 9820: loss 2.2081, time 5026.71ms 
iter 9821: loss 2.5897, time 5028.96ms 
iter 9822: loss 2.4709, time 4990.98ms 
iter 9823: loss 2.4222, time 4942.37ms 
iter 9824: loss 2.4220, time 5017.28ms 
iter 9825: loss 2.4474, time 5025.51ms 
iter 9826: loss 2.2787, time 5023.80ms 
iter 9827: loss 2.4345, time 5020.67ms 
iter 9828: loss 2.4084, time 5020.78ms 
iter 9829: loss 2.3727, time 5019.39ms 
iter 9830: loss 2.2697, time 4975.05ms 
iter 9831: loss 2.4082, time 5008.14ms 
iter 9832: loss 2.5524, time 5022.36ms 
iter 9833: loss 2.4526, time 5019.43ms 
iter 9834: loss 2.5901, time 5013.14ms 
iter 9835: loss 2.4904, time 5023.79ms 
iter 9836: loss 2.4545, time 5015.81ms 
iter 9837: loss 2.6228, time 5029.80ms 
iter 9838: loss 2.5729, time 5001.93ms 
iter 9839: loss 2.5466, time 5009.48ms 
iter 9840: loss 2.3557, time 5024.82ms 
iter 9841: loss 2.4599, time 5025.33ms 
iter 9842: loss 2.2777, time 5024.42ms 
iter 9843: loss 2.4416, time 5024.73ms 
iter 9844: loss 2.4394, time 5025.47ms 
iter 9845: loss 2.3441, time 5025.66ms 
iter 9846: loss 2.7007, time 5017.87ms 
iter 9847: loss 2.5842, time 5023.61ms 
iter 9848: loss 2.2586, time 5009.52ms 
iter 9849: loss 2.4877, time 4996.04ms 
step 9850: train loss 2.4348, val loss 2.8485
iter 9850: loss 2.4932, time 19654.55ms 
iter 9851: loss 2.2897, time 5016.93ms 
iter 9852: loss 2.4135, time 5012.09ms 
iter 9853: loss 2.4000, time 5019.77ms 
iter 9854: loss 2.3361, time 5024.12ms 
iter 9855: loss 2.3851, time 5022.16ms 
iter 9856: loss 2.5083, time 5025.81ms 
iter 9857: loss 2.3218, time 5028.94ms 
iter 9858: loss 2.5461, time 4976.53ms 
iter 9859: loss 2.3170, time 4996.17ms 
iter 9860: loss 2.2723, time 5027.39ms 
iter 9861: loss 2.3328, time 5029.16ms 
iter 9862: loss 2.1894, time 5016.00ms 
iter 9863: loss 2.3856, time 5022.57ms 
iter 9864: loss 2.4290, time 5028.98ms 
iter 9865: loss 2.5502, time 4997.81ms 
iter 9866: loss 2.2530, time 5012.87ms 
iter 9867: loss 2.4692, time 5012.76ms 
iter 9868: loss 2.4943, time 5023.63ms 
iter 9869: loss 2.5608, time 5018.81ms 
iter 9870: loss 2.3775, time 5000.33ms 
iter 9871: loss 2.7179, time 5026.61ms 
iter 9872: loss 2.0517, time 4929.88ms 
iter 9873: loss 2.5299, time 5021.68ms 
iter 9874: loss 2.3855, time 5017.36ms 
iter 9875: loss 2.4487, time 4932.36ms 
iter 9876: loss 2.6733, time 5017.26ms 
iter 9877: loss 2.6003, time 5018.67ms 
iter 9878: loss 2.5326, time 5019.43ms 
iter 9879: loss 2.7412, time 5003.62ms 
iter 9880: loss 2.2685, time 4940.64ms 
iter 9881: loss 2.3486, time 5020.56ms 
iter 9882: loss 2.3269, time 4995.23ms 
iter 9883: loss 2.5929, time 4978.43ms 
iter 9884: loss 2.4490, time 5027.86ms 
iter 9885: loss 2.5934, time 5030.94ms 
iter 9886: loss 2.4956, time 4994.88ms 
iter 9887: loss 2.4912, time 5009.86ms 
iter 9888: loss 2.5109, time 5007.71ms 
iter 9889: loss 2.5081, time 5028.64ms 
iter 9890: loss 2.5471, time 4973.80ms 
iter 9891: loss 2.5177, time 4998.93ms 
iter 9892: loss 2.5226, time 4997.99ms 
iter 9893: loss 2.1866, time 5027.33ms 
iter 9894: loss 2.5397, time 4985.29ms 
iter 9895: loss 2.5771, time 5025.45ms 
iter 9896: loss 2.6045, time 5024.53ms 
iter 9897: loss 2.4581, time 5020.24ms 
iter 9898: loss 2.3194, time 5025.34ms 
iter 9899: loss 2.4198, time 5017.56ms 
step 9900: train loss 2.4448, val loss 2.8507
iter 9900: loss 2.4580, time 19703.95ms 
iter 9901: loss 2.3702, time 5007.94ms 
iter 9902: loss 2.6136, time 5016.70ms 
iter 9903: loss 2.3133, time 4985.23ms 
iter 9904: loss 2.5578, time 5024.92ms 
iter 9905: loss 2.5869, time 5009.60ms 
iter 9906: loss 2.4940, time 5013.58ms 
iter 9907: loss 2.6044, time 5008.19ms 
iter 9908: loss 2.3693, time 5023.42ms 
iter 9909: loss 2.5263, time 5013.29ms 
iter 9910: loss 2.3912, time 4948.17ms 
iter 9911: loss 2.2868, time 4953.03ms 
iter 9912: loss 2.4096, time 4988.09ms 
iter 9913: loss 2.4896, time 5023.14ms 
iter 9914: loss 2.4348, time 5006.88ms 
iter 9915: loss 2.3074, time 5015.42ms 
iter 9916: loss 2.5594, time 5023.72ms 
iter 9917: loss 2.3147, time 5026.11ms 
iter 9918: loss 2.3670, time 5028.19ms 
iter 9919: loss 2.5563, time 4993.65ms 
iter 9920: loss 2.4627, time 4996.69ms 
iter 9921: loss 2.5675, time 5039.23ms 
iter 9922: loss 2.5738, time 5028.35ms 
iter 9923: loss 2.3056, time 5017.95ms 
iter 9924: loss 2.3948, time 5024.53ms 
iter 9925: loss 2.4052, time 5025.21ms 
iter 9926: loss 2.5107, time 5026.55ms 
iter 9927: loss 2.5297, time 4983.24ms 
iter 9928: loss 2.5187, time 5002.12ms 
iter 9929: loss 2.2603, time 5024.26ms 
iter 9930: loss 2.4266, time 5022.40ms 
iter 9931: loss 2.2033, time 5018.08ms 
iter 9932: loss 2.4397, time 5018.37ms 
iter 9933: loss 2.5026, time 5017.28ms 
iter 9934: loss 2.6969, time 5024.19ms 
iter 9935: loss 2.5240, time 4983.93ms 
iter 9936: loss 2.4598, time 4990.43ms 
iter 9937: loss 2.1521, time 5027.78ms 
iter 9938: loss 2.5400, time 5009.35ms 
iter 9939: loss 2.2529, time 5003.75ms 
iter 9940: loss 2.3480, time 5018.66ms 
iter 9941: loss 2.7601, time 5020.62ms 
iter 9942: loss 2.3489, time 4984.90ms 
iter 9943: loss 2.4041, time 4917.57ms 
iter 9944: loss 2.4540, time 4981.19ms 
iter 9945: loss 2.3253, time 5013.98ms 
iter 9946: loss 2.3634, time 5029.43ms 
iter 9947: loss 2.3565, time 5029.90ms 
iter 9948: loss 2.4612, time 5026.17ms 
iter 9949: loss 2.3308, time 5021.50ms 
step 9950: train loss 2.4393, val loss 2.8614
iter 9950: loss 2.4016, time 19601.00ms 
iter 9951: loss 2.3997, time 4914.38ms 
iter 9952: loss 2.3829, time 4968.72ms 
iter 9953: loss 2.5303, time 5021.91ms 
iter 9954: loss 2.2743, time 4999.18ms 
iter 9955: loss 2.5693, time 5020.87ms 
iter 9956: loss 2.3242, time 4971.56ms 
iter 9957: loss 2.6333, time 4990.74ms 
iter 9958: loss 2.4934, time 5027.39ms 
iter 9959: loss 2.4654, time 5026.44ms 
iter 9960: loss 2.5276, time 5021.03ms 
iter 9961: loss 2.4748, time 5026.09ms 
iter 9962: loss 2.5184, time 4938.83ms 
iter 9963: loss 2.4624, time 5010.94ms 
iter 9964: loss 2.5141, time 4973.85ms 
iter 9965: loss 2.4137, time 4992.27ms 
iter 9966: loss 2.5147, time 5024.32ms 
iter 9967: loss 2.3542, time 5026.67ms 
iter 9968: loss 2.3792, time 5027.80ms 
iter 9969: loss 2.5476, time 5023.40ms 
iter 9970: loss 2.4359, time 5025.75ms 
iter 9971: loss 2.4784, time 5027.36ms 
iter 9972: loss 2.4495, time 5026.71ms 
iter 9973: loss 2.4903, time 5020.00ms 
iter 9974: loss 2.5291, time 5027.94ms 
iter 9975: loss 2.2194, time 5024.96ms 
iter 9976: loss 2.5808, time 5029.59ms 
iter 9977: loss 2.8121, time 5011.25ms 
iter 9978: loss 2.4701, time 5029.21ms 
iter 9979: loss 2.2767, time 4968.17ms 
iter 9980: loss 2.4280, time 4914.84ms 
iter 9981: loss 2.4816, time 4975.55ms 
iter 9982: loss 2.4777, time 4975.83ms 
iter 9983: loss 2.3758, time 5033.74ms 
iter 9984: loss 2.4063, time 5027.09ms 
iter 9985: loss 2.4646, time 5018.06ms 
iter 9986: loss 2.3240, time 5024.58ms 
iter 9987: loss 2.4116, time 5029.08ms 
iter 9988: loss 2.2341, time 5018.82ms 
iter 9989: loss 2.3874, time 5026.35ms 
iter 9990: loss 2.2676, time 5030.08ms 
iter 9991: loss 2.7077, time 5020.17ms 
iter 9992: loss 2.4285, time 5021.93ms 
iter 9993: loss 2.4104, time 5015.62ms 
iter 9994: loss 2.1770, time 5025.11ms 
iter 9995: loss 2.4076, time 5017.02ms 
iter 9996: loss 2.3875, time 4999.27ms 
iter 9997: loss 2.3831, time 5011.18ms 
iter 9998: loss 2.2769, time 5022.02ms 
iter 9999: loss 2.3814, time 4996.18ms 
step 10000: train loss 2.4498, val loss 2.8588
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 10000: loss 2.4859, time 20723.36ms 
iter 10001: loss 2.2766, time 4990.76ms 
iter 10002: loss 2.3267, time 5018.70ms 
iter 10003: loss 2.5351, time 5030.35ms 
iter 10004: loss 2.4980, time 5029.84ms 
iter 10005: loss 2.4033, time 5030.22ms 
iter 10006: loss 2.4079, time 5027.91ms 
iter 10007: loss 2.5968, time 5028.00ms 
iter 10008: loss 2.6650, time 5029.27ms 
iter 10009: loss 2.4627, time 5000.76ms 
iter 10010: loss 2.5289, time 5012.41ms 
iter 10011: loss 2.4589, time 5012.06ms 
iter 10012: loss 2.2839, time 5025.02ms 
iter 10013: loss 2.2141, time 5024.83ms 
iter 10014: loss 2.3092, time 5023.69ms 
iter 10015: loss 2.3452, time 5025.61ms 
iter 10016: loss 2.3345, time 4978.29ms 
iter 10017: loss 2.4078, time 4945.03ms 
iter 10018: loss 2.3328, time 5023.45ms 
iter 10019: loss 2.4658, time 5025.33ms 
iter 10020: loss 2.5240, time 5024.60ms 
iter 10021: loss 2.3698, time 5022.11ms 
iter 10022: loss 2.6995, time 5021.52ms 
iter 10023: loss 2.6192, time 5022.38ms 
iter 10024: loss 2.3382, time 5024.50ms 
iter 10025: loss 2.3324, time 5001.78ms 
iter 10026: loss 2.5034, time 5025.73ms 
iter 10027: loss 2.4408, time 5029.91ms 
iter 10028: loss 2.5185, time 5027.82ms 
iter 10029: loss 2.5066, time 5024.61ms 
iter 10030: loss 2.3836, time 5028.68ms 
iter 10031: loss 2.6015, time 5007.92ms 
iter 10032: loss 2.6694, time 4970.09ms 
iter 10033: loss 2.4407, time 5006.58ms 
iter 10034: loss 2.5233, time 5024.84ms 
iter 10035: loss 2.0500, time 5022.59ms 
iter 10036: loss 2.3447, time 4972.03ms 
iter 10037: loss 2.4624, time 5015.17ms 
iter 10038: loss 2.6231, time 5029.25ms 
iter 10039: loss 2.0529, time 5024.35ms 
iter 10040: loss 2.3422, time 5024.82ms 
iter 10041: loss 2.3762, time 5018.72ms 
iter 10042: loss 2.4239, time 5014.62ms 
iter 10043: loss 2.6025, time 5002.03ms 
iter 10044: loss 2.4229, time 4980.41ms 
iter 10045: loss 2.5208, time 5023.39ms 
iter 10046: loss 2.1679, time 5025.25ms 
iter 10047: loss 2.5986, time 5025.72ms 
iter 10048: loss 2.2673, time 5023.05ms 
iter 10049: loss 2.3973, time 5020.83ms 
step 10050: train loss 2.4376, val loss 2.8663
iter 10050: loss 2.3663, time 19686.27ms 
iter 10051: loss 2.4654, time 5016.89ms 
iter 10052: loss 2.6582, time 5015.67ms 
iter 10053: loss 2.4193, time 4973.35ms 
iter 10054: loss 2.6446, time 5023.02ms 
iter 10055: loss 2.5577, time 5020.01ms 
iter 10056: loss 2.3527, time 5026.49ms 
iter 10057: loss 2.3225, time 5006.96ms 
iter 10058: loss 2.2435, time 5027.71ms 
iter 10059: loss 2.5068, time 5025.37ms 
iter 10060: loss 2.6540, time 5023.45ms 
iter 10061: loss 2.7612, time 5026.40ms 
iter 10062: loss 2.7169, time 5028.15ms 
iter 10063: loss 2.4525, time 5023.11ms 
iter 10064: loss 2.4414, time 4979.89ms 
iter 10065: loss 2.5460, time 5020.15ms 
iter 10066: loss 2.2435, time 5018.04ms 
iter 10067: loss 2.3268, time 5024.79ms 
iter 10068: loss 2.4075, time 5006.00ms 
iter 10069: loss 2.5378, time 5022.44ms 
iter 10070: loss 2.5967, time 5021.49ms 
iter 10071: loss 2.3028, time 5023.95ms 
iter 10072: loss 2.5251, time 5014.01ms 
iter 10073: loss 2.4142, time 4987.44ms 
iter 10074: loss 2.3503, time 5027.15ms 
iter 10075: loss 2.2138, time 5023.45ms 
iter 10076: loss 2.2702, time 5025.35ms 
iter 10077: loss 2.5108, time 5019.77ms 
iter 10078: loss 2.6131, time 4999.91ms 
iter 10079: loss 2.4554, time 4964.79ms 
iter 10080: loss 2.5538, time 5023.88ms 
iter 10081: loss 2.3139, time 5024.46ms 
iter 10082: loss 2.1710, time 5023.78ms 
iter 10083: loss 2.4165, time 5030.59ms 
iter 10084: loss 2.5487, time 5028.86ms 
iter 10085: loss 2.5171, time 5027.99ms 
iter 10086: loss 2.7750, time 4981.93ms 
iter 10087: loss 2.1888, time 5007.43ms 
iter 10088: loss 2.5405, time 5023.41ms 
iter 10089: loss 2.4780, time 5026.06ms 
iter 10090: loss 2.4739, time 5024.68ms 
iter 10091: loss 2.1469, time 5029.19ms 
iter 10092: loss 2.2297, time 5017.24ms 
iter 10093: loss 2.6740, time 5026.26ms 
iter 10094: loss 2.4016, time 4971.89ms 
iter 10095: loss 2.3459, time 4995.24ms 
iter 10096: loss 2.4963, time 5022.06ms 
iter 10097: loss 2.4292, time 5022.99ms 
iter 10098: loss 2.3049, time 5023.01ms 
iter 10099: loss 2.2231, time 5020.69ms 
step 10100: train loss 2.4423, val loss 2.8673
iter 10100: loss 2.4574, time 19668.36ms 
iter 10101: loss 2.5105, time 4969.47ms 
iter 10102: loss 2.3839, time 5024.09ms 
iter 10103: loss 2.7413, time 5023.33ms 
iter 10104: loss 2.5351, time 5021.28ms 
iter 10105: loss 2.6732, time 5022.94ms 
iter 10106: loss 2.4003, time 5023.63ms 
iter 10107: loss 2.5912, time 4982.60ms 
iter 10108: loss 2.4054, time 5022.67ms 
iter 10109: loss 2.4748, time 5017.38ms 
iter 10110: loss 2.4088, time 5023.77ms 
iter 10111: loss 2.3867, time 5023.42ms 
iter 10112: loss 2.4128, time 5021.76ms 
iter 10113: loss 2.3177, time 5023.92ms 
iter 10114: loss 2.4868, time 5026.76ms 
iter 10115: loss 2.2386, time 4987.06ms 
iter 10116: loss 2.2635, time 5023.59ms 
iter 10117: loss 2.4476, time 5025.33ms 
iter 10118: loss 2.5708, time 5028.66ms 
iter 10119: loss 2.1471, time 5028.62ms 
iter 10120: loss 2.2488, time 4975.72ms 
iter 10121: loss 2.4132, time 4989.08ms 
iter 10122: loss 2.5839, time 5016.48ms 
iter 10123: loss 2.5293, time 5021.57ms 
iter 10124: loss 2.4463, time 5021.95ms 
iter 10125: loss 2.2980, time 5021.72ms 
iter 10126: loss 2.3342, time 5027.44ms 
iter 10127: loss 2.4756, time 5024.75ms 
iter 10128: loss 2.5711, time 4979.05ms 
iter 10129: loss 2.4011, time 5015.30ms 
iter 10130: loss 2.2725, time 5026.75ms 
iter 10131: loss 2.4672, time 5025.26ms 
iter 10132: loss 2.2066, time 5027.31ms 
iter 10133: loss 2.2700, time 5030.12ms 
iter 10134: loss 2.5548, time 5023.11ms 
iter 10135: loss 2.5886, time 5021.70ms 
iter 10136: loss 2.5174, time 4975.73ms 
iter 10137: loss 2.3684, time 5024.78ms 
iter 10138: loss 2.3941, time 5024.34ms 
iter 10139: loss 2.5005, time 5019.78ms 
iter 10140: loss 2.3098, time 5019.30ms 
iter 10141: loss 2.3978, time 5020.84ms 
iter 10142: loss 2.4555, time 5026.90ms 
iter 10143: loss 2.5289, time 5024.22ms 
iter 10144: loss 2.4112, time 4977.32ms 
iter 10145: loss 2.4967, time 5014.83ms 
iter 10146: loss 2.2384, time 5025.66ms 
iter 10147: loss 2.3876, time 5022.67ms 
iter 10148: loss 2.2318, time 5027.64ms 
iter 10149: loss 2.5738, time 5020.21ms 
step 10150: train loss 2.4450, val loss 2.8592
iter 10150: loss 2.8538, time 19673.86ms 
iter 10151: loss 2.5877, time 5022.85ms 
iter 10152: loss 2.1802, time 5023.75ms 
iter 10153: loss 2.2975, time 4980.71ms 
iter 10154: loss 2.4601, time 5022.76ms 
iter 10155: loss 2.3938, time 5014.49ms 
iter 10156: loss 2.2856, time 5020.28ms 
iter 10157: loss 2.6147, time 5019.70ms 
iter 10158: loss 2.5682, time 5032.20ms 
iter 10159: loss 2.5645, time 5033.09ms 
iter 10160: loss 2.4674, time 4956.32ms 
iter 10161: loss 2.6104, time 4992.27ms 
iter 10162: loss 2.4166, time 5023.38ms 
iter 10163: loss 2.3438, time 5023.79ms 
iter 10164: loss 2.4526, time 5009.05ms 
iter 10165: loss 2.3754, time 5015.27ms 
iter 10166: loss 2.5491, time 4986.99ms 
iter 10167: loss 2.1927, time 5022.87ms 
iter 10168: loss 2.6526, time 5028.36ms 
iter 10169: loss 2.3833, time 5028.63ms 
iter 10170: loss 2.3932, time 5025.04ms 
iter 10171: loss 2.4813, time 5028.84ms 
iter 10172: loss 2.3204, time 5006.05ms 
iter 10173: loss 2.5509, time 5032.21ms 
iter 10174: loss 2.6797, time 4987.03ms 
iter 10175: loss 2.6176, time 4996.44ms 
iter 10176: loss 2.5331, time 5017.99ms 
iter 10177: loss 2.4185, time 5022.68ms 
iter 10178: loss 2.4454, time 5016.60ms 
iter 10179: loss 2.2558, time 5019.21ms 
iter 10180: loss 2.3754, time 5026.90ms 
iter 10181: loss 2.3126, time 4984.72ms 
iter 10182: loss 2.5874, time 5025.21ms 
iter 10183: loss 2.3252, time 5024.45ms 
iter 10184: loss 2.4225, time 5013.49ms 
iter 10185: loss 2.4522, time 5017.45ms 
iter 10186: loss 2.3976, time 5021.44ms 
iter 10187: loss 2.6177, time 5009.38ms 
iter 10188: loss 2.5563, time 4958.01ms 
iter 10189: loss 2.3704, time 4940.65ms 
iter 10190: loss 2.2717, time 5028.95ms 
iter 10191: loss 2.3001, time 5027.73ms 
iter 10192: loss 2.6144, time 5026.68ms 
iter 10193: loss 2.1883, time 5025.63ms 
iter 10194: loss 2.4120, time 5027.01ms 
iter 10195: loss 2.2692, time 5025.41ms 
iter 10196: loss 2.5926, time 5030.63ms 
iter 10197: loss 2.3067, time 4991.80ms 
iter 10198: loss 2.4441, time 5027.97ms 
iter 10199: loss 2.3718, time 4929.07ms 
step 10200: train loss 2.4426, val loss 2.8535
iter 10200: loss 2.4206, time 19687.79ms 
iter 10201: loss 2.4292, time 5027.27ms 
iter 10202: loss 2.4517, time 4980.42ms 
iter 10203: loss 2.5351, time 5023.96ms 
iter 10204: loss 2.4689, time 5025.46ms 
iter 10205: loss 2.1622, time 5028.22ms 
iter 10206: loss 2.4877, time 5027.78ms 
iter 10207: loss 2.4099, time 5027.82ms 
iter 10208: loss 2.2503, time 5026.00ms 
iter 10209: loss 2.4576, time 5025.32ms 
iter 10210: loss 2.6369, time 4980.10ms 
iter 10211: loss 2.6584, time 5021.06ms 
iter 10212: loss 2.4970, time 5027.93ms 
iter 10213: loss 2.5498, time 5032.13ms 
iter 10214: loss 2.3643, time 5028.03ms 
iter 10215: loss 2.4848, time 5028.69ms 
iter 10216: loss 2.5985, time 5035.78ms 
iter 10217: loss 2.4499, time 5029.19ms 
iter 10218: loss 2.3210, time 4950.84ms 
iter 10219: loss 2.3659, time 5019.41ms 
iter 10220: loss 2.3599, time 5018.42ms 
iter 10221: loss 2.2951, time 5003.65ms 
iter 10222: loss 2.5111, time 5021.92ms 
iter 10223: loss 2.3765, time 5023.81ms 
iter 10224: loss 2.5048, time 5020.56ms 
iter 10225: loss 2.4990, time 5004.96ms 
iter 10226: loss 2.5420, time 4927.73ms 
iter 10227: loss 2.3745, time 5021.09ms 
iter 10228: loss 2.4789, time 5019.77ms 
iter 10229: loss 2.4083, time 5019.15ms 
iter 10230: loss 2.6688, time 5010.48ms 
iter 10231: loss 2.4055, time 4983.66ms 
iter 10232: loss 2.4151, time 5019.37ms 
iter 10233: loss 2.3499, time 5022.02ms 
iter 10234: loss 2.4467, time 4933.48ms 
iter 10235: loss 2.3405, time 5019.07ms 
iter 10236: loss 2.3875, time 5018.50ms 
iter 10237: loss 2.5141, time 5020.04ms 
iter 10238: loss 2.4069, time 5018.25ms 
iter 10239: loss 2.2725, time 5021.21ms 
iter 10240: loss 2.3214, time 5021.15ms 
iter 10241: loss 2.6053, time 5021.25ms 
iter 10242: loss 2.3407, time 4973.75ms 
iter 10243: loss 2.5137, time 5021.83ms 
iter 10244: loss 2.5741, time 5027.17ms 
iter 10245: loss 2.3618, time 5024.48ms 
iter 10246: loss 2.4811, time 5015.75ms 
iter 10247: loss 2.4535, time 5030.30ms 
iter 10248: loss 2.4336, time 5003.54ms 
iter 10249: loss 2.4190, time 5029.01ms 
step 10250: train loss 2.4510, val loss 2.8583
iter 10250: loss 2.2332, time 19707.76ms 
iter 10251: loss 2.5345, time 5023.61ms 
iter 10252: loss 2.4633, time 5024.29ms 
iter 10253: loss 2.4146, time 5025.32ms 
iter 10254: loss 2.5640, time 5026.84ms 
iter 10255: loss 2.3405, time 4957.58ms 
iter 10256: loss 2.5526, time 4916.17ms 
iter 10257: loss 2.2668, time 4934.92ms 
iter 10258: loss 2.5435, time 4932.76ms 
iter 10259: loss 2.5487, time 4943.69ms 
iter 10260: loss 2.5867, time 4945.11ms 
iter 10261: loss 2.4071, time 4919.91ms 
iter 10262: loss 2.6185, time 4921.55ms 
iter 10263: loss 2.4306, time 4912.31ms 
iter 10264: loss 2.4311, time 4911.14ms 
iter 10265: loss 2.6586, time 4928.59ms 
iter 10266: loss 2.6148, time 5000.73ms 
iter 10267: loss 2.5085, time 5031.95ms 
iter 10268: loss 2.4950, time 5015.92ms 
iter 10269: loss 2.3203, time 5028.39ms 
iter 10270: loss 2.6566, time 5009.31ms 
iter 10271: loss 2.1841, time 5025.72ms 
iter 10272: loss 2.3910, time 5001.86ms 
iter 10273: loss 2.3171, time 5005.85ms 
iter 10274: loss 2.2600, time 5024.29ms 
iter 10275: loss 2.5088, time 5026.03ms 
iter 10276: loss 2.6370, time 5031.15ms 
iter 10277: loss 2.4775, time 4968.31ms 
iter 10278: loss 2.5441, time 5011.08ms 
iter 10279: loss 2.5481, time 5014.31ms 
iter 10280: loss 2.5351, time 4974.99ms 
iter 10281: loss 2.4190, time 5002.99ms 
iter 10282: loss 2.3093, time 5029.96ms 
iter 10283: loss 2.2502, time 5030.55ms 
iter 10284: loss 2.2117, time 5032.69ms 
iter 10285: loss 2.5824, time 5040.24ms 
iter 10286: loss 2.2805, time 5031.62ms 
iter 10287: loss 2.4130, time 5013.19ms 
iter 10288: loss 2.5937, time 4914.78ms 
iter 10289: loss 2.4473, time 4985.24ms 
iter 10290: loss 2.4905, time 5020.13ms 
iter 10291: loss 2.4886, time 5026.99ms 
iter 10292: loss 2.6393, time 5022.69ms 
iter 10293: loss 2.4465, time 5024.11ms 
iter 10294: loss 2.6270, time 5026.18ms 
iter 10295: loss 2.2381, time 5015.70ms 
iter 10296: loss 2.5055, time 4939.14ms 
iter 10297: loss 2.4024, time 4997.77ms 
iter 10298: loss 2.2671, time 5007.75ms 
iter 10299: loss 2.4574, time 5005.22ms 
step 10300: train loss 2.4306, val loss 2.8539
iter 10300: loss 2.6705, time 19687.93ms 
iter 10301: loss 2.5194, time 4972.18ms 
iter 10302: loss 2.3229, time 4923.08ms 
iter 10303: loss 2.3057, time 5026.10ms 
iter 10304: loss 2.5065, time 5020.21ms 
iter 10305: loss 2.3255, time 5027.27ms 
iter 10306: loss 2.4681, time 5024.92ms 
iter 10307: loss 2.5568, time 5029.11ms 
iter 10308: loss 2.4395, time 4990.08ms 
iter 10309: loss 2.4805, time 4951.62ms 
iter 10310: loss 2.4578, time 4930.89ms 
iter 10311: loss 2.4118, time 5028.98ms 
iter 10312: loss 2.2548, time 5026.20ms 
iter 10313: loss 2.3605, time 5006.36ms 
iter 10314: loss 2.3714, time 5027.69ms 
iter 10315: loss 2.6867, time 5027.61ms 
iter 10316: loss 2.7300, time 5027.58ms 
iter 10317: loss 2.5485, time 5030.51ms 
iter 10318: loss 2.6753, time 4986.98ms 
iter 10319: loss 2.5200, time 5023.54ms 
iter 10320: loss 2.2631, time 5027.30ms 
iter 10321: loss 2.8136, time 5020.56ms 
iter 10322: loss 2.2683, time 5020.73ms 
iter 10323: loss 2.3967, time 5019.21ms 
iter 10324: loss 2.4236, time 5019.10ms 
iter 10325: loss 2.3587, time 5004.04ms 
iter 10326: loss 2.4882, time 4974.07ms 
iter 10327: loss 2.6671, time 5029.65ms 
iter 10328: loss 2.3136, time 5026.67ms 
iter 10329: loss 2.5096, time 5028.59ms 
iter 10330: loss 2.3740, time 5031.09ms 
iter 10331: loss 2.4126, time 5026.14ms 
iter 10332: loss 2.4564, time 5025.87ms 
iter 10333: loss 2.6409, time 5040.59ms 
iter 10334: loss 2.3407, time 5028.02ms 
iter 10335: loss 2.5553, time 5024.21ms 
iter 10336: loss 2.4395, time 5021.38ms 
iter 10337: loss 2.2360, time 5022.60ms 
iter 10338: loss 2.4863, time 5007.42ms 
iter 10339: loss 2.5696, time 5007.44ms 
iter 10340: loss 2.3900, time 5017.83ms 
iter 10341: loss 2.5368, time 4942.98ms 
iter 10342: loss 2.6211, time 4914.51ms 
iter 10343: loss 2.1787, time 4944.66ms 
iter 10344: loss 2.3839, time 4950.72ms 
iter 10345: loss 2.4580, time 4976.10ms 
iter 10346: loss 2.4326, time 4924.84ms 
iter 10347: loss 2.4463, time 4917.17ms 
iter 10348: loss 2.5983, time 4995.43ms 
iter 10349: loss 2.4129, time 5010.96ms 
step 10350: train loss 2.4323, val loss 2.8564
iter 10350: loss 2.5446, time 19729.70ms 
iter 10351: loss 2.2417, time 5025.56ms 
iter 10352: loss 2.4979, time 4997.38ms 
iter 10353: loss 2.4551, time 5027.61ms 
iter 10354: loss 2.3899, time 5016.75ms 
iter 10355: loss 2.5072, time 4958.86ms 
iter 10356: loss 2.5046, time 5027.66ms 
iter 10357: loss 2.3737, time 5027.71ms 
iter 10358: loss 2.6495, time 5028.77ms 
iter 10359: loss 2.2908, time 5038.88ms 
iter 10360: loss 2.1906, time 5024.94ms 
iter 10361: loss 2.4701, time 5023.17ms 
iter 10362: loss 2.4014, time 4971.26ms 
iter 10363: loss 2.1613, time 4955.18ms 
iter 10364: loss 2.4489, time 5021.65ms 
iter 10365: loss 2.4625, time 5021.99ms 
iter 10366: loss 2.5201, time 5022.10ms 
iter 10367: loss 2.2925, time 5022.37ms 
iter 10368: loss 2.4760, time 5024.21ms 
iter 10369: loss 2.5513, time 5024.76ms 
iter 10370: loss 2.4317, time 5017.59ms 
iter 10371: loss 2.5161, time 4982.07ms 
iter 10372: loss 2.5033, time 5021.71ms 
iter 10373: loss 2.6301, time 5020.21ms 
iter 10374: loss 2.4748, time 5021.11ms 
iter 10375: loss 2.5239, time 5021.55ms 
iter 10376: loss 2.4367, time 5026.64ms 
iter 10377: loss 2.5842, time 5020.13ms 
iter 10378: loss 2.3644, time 5023.34ms 
iter 10379: loss 2.4085, time 4972.11ms 
iter 10380: loss 2.4376, time 4976.66ms 
iter 10381: loss 2.7204, time 5006.02ms 
iter 10382: loss 2.5446, time 5020.60ms 
iter 10383: loss 2.3273, time 5017.55ms 
iter 10384: loss 2.3897, time 5012.59ms 
iter 10385: loss 2.3588, time 5025.35ms 
iter 10386: loss 2.6618, time 5029.15ms 
iter 10387: loss 2.4638, time 4981.13ms 
iter 10388: loss 2.6442, time 4989.61ms 
iter 10389: loss 2.5191, time 5029.45ms 
iter 10390: loss 2.3078, time 5040.33ms 
iter 10391: loss 2.4166, time 5014.45ms 
iter 10392: loss 2.4840, time 5016.26ms 
iter 10393: loss 2.5156, time 5010.59ms 
iter 10394: loss 2.3545, time 5024.30ms 
iter 10395: loss 2.3388, time 4945.55ms 
iter 10396: loss 2.5066, time 4956.61ms 
iter 10397: loss 2.5966, time 5021.73ms 
iter 10398: loss 2.2923, time 5022.45ms 
iter 10399: loss 2.4584, time 5022.70ms 
step 10400: train loss 2.4270, val loss 2.8660
iter 10400: loss 2.5116, time 19651.60ms 
iter 10401: loss 2.4670, time 4961.50ms 
iter 10402: loss 2.4826, time 5029.17ms 
iter 10403: loss 2.5461, time 5019.28ms 
iter 10404: loss 2.5565, time 5022.57ms 
iter 10405: loss 2.4011, time 5021.33ms 
iter 10406: loss 2.3506, time 5018.83ms 
iter 10407: loss 2.4964, time 5020.93ms 
iter 10408: loss 2.5764, time 5009.09ms 
iter 10409: loss 2.5022, time 4996.16ms 
iter 10410: loss 2.4581, time 4959.74ms 
iter 10411: loss 2.6961, time 4970.09ms 
iter 10412: loss 2.4558, time 4965.95ms 
iter 10413: loss 2.3007, time 5030.26ms 
iter 10414: loss 2.4197, time 5028.98ms 
iter 10415: loss 2.5947, time 5031.82ms 
iter 10416: loss 2.6268, time 4967.60ms 
iter 10417: loss 2.3856, time 5008.60ms 
iter 10418: loss 2.5158, time 5030.70ms 
iter 10419: loss 2.5251, time 5028.27ms 
iter 10420: loss 2.3971, time 5012.58ms 
iter 10421: loss 2.5465, time 5031.64ms 
iter 10422: loss 2.5217, time 4978.78ms 
iter 10423: loss 2.4932, time 5025.98ms 
iter 10424: loss 2.4965, time 4990.29ms 
iter 10425: loss 2.3730, time 5028.00ms 
iter 10426: loss 2.2276, time 5042.84ms 
iter 10427: loss 2.4220, time 5011.15ms 
iter 10428: loss 2.3178, time 5029.55ms 
iter 10429: loss 2.6459, time 5012.23ms 
iter 10430: loss 2.5227, time 5023.01ms 
iter 10431: loss 2.3040, time 4999.30ms 
iter 10432: loss 2.4640, time 5001.44ms 
iter 10433: loss 2.3005, time 5028.19ms 
iter 10434: loss 2.5483, time 5024.40ms 
iter 10435: loss 2.3366, time 5021.05ms 
iter 10436: loss 2.2464, time 5026.58ms 
iter 10437: loss 2.1475, time 5022.15ms 
iter 10438: loss 2.3838, time 5025.03ms 
iter 10439: loss 2.7339, time 5037.51ms 
iter 10440: loss 2.3833, time 4974.73ms 
iter 10441: loss 2.3834, time 5028.84ms 
iter 10442: loss 2.2973, time 5028.39ms 
iter 10443: loss 2.4167, time 5022.26ms 
iter 10444: loss 2.5636, time 5029.17ms 
iter 10445: loss 2.4741, time 5017.99ms 
iter 10446: loss 2.7537, time 5024.88ms 
iter 10447: loss 2.4047, time 4976.15ms 
iter 10448: loss 2.4099, time 4991.99ms 
iter 10449: loss 2.3587, time 5013.85ms 
step 10450: train loss 2.4428, val loss 2.8427
iter 10450: loss 2.6290, time 19682.96ms 
iter 10451: loss 2.4653, time 5019.77ms 
iter 10452: loss 2.8162, time 5024.73ms 
iter 10453: loss 2.4521, time 5020.60ms 
iter 10454: loss 2.4424, time 5020.78ms 
iter 10455: loss 2.4127, time 5021.32ms 
iter 10456: loss 2.4450, time 4971.53ms 
iter 10457: loss 2.3671, time 5022.11ms 
iter 10458: loss 2.4771, time 5023.34ms 
iter 10459: loss 2.5661, time 5007.23ms 
iter 10460: loss 2.3729, time 5001.31ms 
iter 10461: loss 2.3498, time 5028.94ms 
iter 10462: loss 2.5200, time 4990.83ms 
iter 10463: loss 2.5724, time 4920.08ms 
iter 10464: loss 2.2384, time 4920.17ms 
iter 10465: loss 2.5509, time 4922.99ms 
iter 10466: loss 2.4236, time 4962.82ms 
iter 10467: loss 2.6275, time 5022.41ms 
iter 10468: loss 2.3996, time 5021.15ms 
iter 10469: loss 2.2697, time 5021.89ms 
iter 10470: loss 2.3555, time 5022.88ms 
iter 10471: loss 2.5601, time 5002.69ms 
iter 10472: loss 2.5538, time 5005.90ms 
iter 10473: loss 2.7600, time 5020.69ms 
iter 10474: loss 2.5443, time 5007.38ms 
iter 10475: loss 2.1567, time 5007.60ms 
iter 10476: loss 2.2941, time 4913.36ms 
iter 10477: loss 2.5341, time 4913.43ms 
iter 10478: loss 2.5796, time 4913.25ms 
iter 10479: loss 2.5706, time 4913.91ms 
iter 10480: loss 2.4767, time 4913.56ms 
iter 10481: loss 2.2829, time 4913.90ms 
iter 10482: loss 2.4934, time 4967.87ms 
iter 10483: loss 2.3451, time 4985.06ms 
iter 10484: loss 2.4719, time 5028.00ms 
iter 10485: loss 2.4009, time 5028.92ms 
iter 10486: loss 2.4489, time 5026.85ms 
iter 10487: loss 2.2359, time 4978.97ms 
iter 10488: loss 1.9499, time 5026.33ms 
iter 10489: loss 2.3136, time 5026.87ms 
iter 10490: loss 2.2379, time 5034.13ms 
iter 10491: loss 2.5707, time 5026.77ms 
iter 10492: loss 2.4032, time 5027.27ms 
iter 10493: loss 2.4103, time 5030.44ms 
iter 10494: loss 2.1838, time 5027.39ms 
iter 10495: loss 2.3816, time 4990.85ms 
iter 10496: loss 2.4985, time 5026.87ms 
iter 10497: loss 2.2978, time 5008.98ms 
iter 10498: loss 2.4130, time 5031.56ms 
iter 10499: loss 2.2744, time 5029.25ms 
step 10500: train loss 2.4296, val loss 2.8452
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 10500: loss 2.5986, time 20650.13ms 
iter 10501: loss 2.2771, time 4990.42ms 
iter 10502: loss 2.4951, time 5025.51ms 
iter 10503: loss 2.3207, time 5023.48ms 
iter 10504: loss 2.6536, time 5019.65ms 
iter 10505: loss 2.1470, time 5022.95ms 
iter 10506: loss 2.3936, time 4953.46ms 
iter 10507: loss 2.4540, time 5020.51ms 
iter 10508: loss 2.5522, time 5023.19ms 
iter 10509: loss 2.4832, time 5019.97ms 
iter 10510: loss 2.2469, time 5027.41ms 
iter 10511: loss 2.5718, time 5005.09ms 
iter 10512: loss 2.5048, time 5019.83ms 
iter 10513: loss 2.5381, time 5024.03ms 
iter 10514: loss 2.2742, time 4982.92ms 
iter 10515: loss 2.7252, time 5020.48ms 
iter 10516: loss 2.5842, time 4972.83ms 
iter 10517: loss 2.6256, time 5024.82ms 
iter 10518: loss 2.2730, time 5022.48ms 
iter 10519: loss 2.6128, time 5020.62ms 
iter 10520: loss 2.2839, time 5021.08ms 
iter 10521: loss 2.5605, time 4973.83ms 
iter 10522: loss 2.4652, time 4998.33ms 
iter 10523: loss 2.4753, time 5023.06ms 
iter 10524: loss 2.6220, time 5023.62ms 
iter 10525: loss 2.4319, time 4989.21ms 
iter 10526: loss 2.3124, time 4997.00ms 
iter 10527: loss 2.4645, time 4980.64ms 
iter 10528: loss 2.3829, time 5024.66ms 
iter 10529: loss 2.3837, time 4983.50ms 
iter 10530: loss 2.6784, time 5016.19ms 
iter 10531: loss 2.4542, time 5024.29ms 
iter 10532: loss 2.4799, time 5021.11ms 
iter 10533: loss 2.4262, time 5021.30ms 
iter 10534: loss 2.5664, time 5020.15ms 
iter 10535: loss 2.6089, time 5023.90ms 
iter 10536: loss 2.3052, time 5026.23ms 
iter 10537: loss 2.5679, time 5016.84ms 
iter 10538: loss 2.4534, time 5023.20ms 
iter 10539: loss 2.5237, time 5021.12ms 
iter 10540: loss 2.2412, time 5026.78ms 
iter 10541: loss 2.6401, time 4996.80ms 
iter 10542: loss 2.4446, time 5012.22ms 
iter 10543: loss 2.3222, time 5028.14ms 
iter 10544: loss 2.2068, time 4979.19ms 
iter 10545: loss 2.4146, time 5009.35ms 
iter 10546: loss 2.5123, time 5016.87ms 
iter 10547: loss 2.4211, time 5024.48ms 
iter 10548: loss 2.2995, time 5026.48ms 
iter 10549: loss 2.6334, time 4991.15ms 
step 10550: train loss 2.4345, val loss 2.8633
iter 10550: loss 2.4915, time 19712.95ms 
iter 10551: loss 2.5060, time 5021.91ms 
iter 10552: loss 2.3752, time 5038.40ms 
iter 10553: loss 2.2736, time 5036.21ms 
iter 10554: loss 2.4549, time 5035.62ms 
iter 10555: loss 2.6123, time 5019.46ms 
iter 10556: loss 2.3847, time 5022.56ms 
iter 10557: loss 2.4993, time 4965.56ms 
iter 10558: loss 2.4198, time 5013.74ms 
iter 10559: loss 2.3672, time 5019.23ms 
iter 10560: loss 2.4141, time 5014.34ms 
iter 10561: loss 2.6585, time 5019.49ms 
iter 10562: loss 2.3691, time 5018.26ms 
iter 10563: loss 2.3820, time 5024.67ms 
iter 10564: loss 2.5582, time 4990.21ms 
iter 10565: loss 2.3946, time 4943.39ms 
iter 10566: loss 2.4564, time 4996.32ms 
iter 10567: loss 2.3303, time 5019.37ms 
iter 10568: loss 2.3373, time 5007.20ms 
iter 10569: loss 2.3303, time 5020.49ms 
iter 10570: loss 2.4363, time 5022.50ms 
iter 10571: loss 2.4181, time 5025.85ms 
iter 10572: loss 2.2110, time 5032.14ms 
iter 10573: loss 2.4384, time 4984.42ms 
iter 10574: loss 2.5122, time 4961.77ms 
iter 10575: loss 2.5150, time 4952.11ms 
iter 10576: loss 2.2740, time 4970.30ms 
iter 10577: loss 2.5283, time 4986.65ms 
iter 10578: loss 2.3059, time 5016.88ms 
iter 10579: loss 2.4250, time 5015.37ms 
iter 10580: loss 2.3433, time 5020.14ms 
iter 10581: loss 2.1288, time 5008.58ms 
iter 10582: loss 2.3627, time 4999.86ms 
iter 10583: loss 2.4269, time 5018.16ms 
iter 10584: loss 2.4133, time 4968.39ms 
iter 10585: loss 2.3064, time 4984.06ms 
iter 10586: loss 2.5311, time 5022.09ms 
iter 10587: loss 2.2343, time 5023.77ms 
iter 10588: loss 2.2765, time 5029.74ms 
iter 10589: loss 2.5491, time 4972.21ms 
iter 10590: loss 2.3805, time 5034.74ms 
iter 10591: loss 2.6654, time 5031.83ms 
iter 10592: loss 2.4573, time 4998.40ms 
iter 10593: loss 2.4720, time 5004.97ms 
iter 10594: loss 2.5956, time 5021.85ms 
iter 10595: loss 2.4540, time 4994.05ms 
iter 10596: loss 2.4723, time 5015.04ms 
iter 10597: loss 2.5863, time 5020.26ms 
iter 10598: loss 2.2658, time 5021.51ms 
iter 10599: loss 2.4624, time 5027.23ms 
step 10600: train loss 2.4299, val loss 2.8683
iter 10600: loss 2.4108, time 19716.46ms 
iter 10601: loss 2.5982, time 5028.88ms 
iter 10602: loss 2.1768, time 5030.42ms 
iter 10603: loss 2.6700, time 5028.20ms 
iter 10604: loss 2.5401, time 5024.54ms 
iter 10605: loss 2.3985, time 4981.09ms 
iter 10606: loss 2.4626, time 5005.15ms 
iter 10607: loss 2.3592, time 5035.73ms 
iter 10608: loss 2.1593, time 4995.83ms 
iter 10609: loss 2.4865, time 5032.77ms 
iter 10610: loss 2.4523, time 5022.33ms 
iter 10611: loss 2.5121, time 5029.07ms 
iter 10612: loss 2.2426, time 5000.09ms 
iter 10613: loss 2.3712, time 4990.37ms 
iter 10614: loss 2.5723, time 5033.59ms 
iter 10615: loss 2.2509, time 5031.96ms 
iter 10616: loss 2.2913, time 5026.40ms 
iter 10617: loss 2.6319, time 5009.01ms 
iter 10618: loss 2.4154, time 5028.93ms 
iter 10619: loss 2.2041, time 5028.86ms 
iter 10620: loss 2.4807, time 4978.76ms 
iter 10621: loss 2.3939, time 5027.95ms 
iter 10622: loss 2.2976, time 5028.79ms 
iter 10623: loss 2.7254, time 5027.79ms 
iter 10624: loss 2.6404, time 5029.33ms 
iter 10625: loss 2.2991, time 4986.27ms 
iter 10626: loss 2.4159, time 5031.28ms 
iter 10627: loss 2.4235, time 5029.15ms 
iter 10628: loss 2.3243, time 4995.57ms 
iter 10629: loss 2.5868, time 5002.09ms 
iter 10630: loss 2.3380, time 5018.35ms 
iter 10631: loss 2.6188, time 5032.14ms 
iter 10632: loss 2.4250, time 5034.57ms 
iter 10633: loss 2.3459, time 5031.22ms 
iter 10634: loss 2.4816, time 5004.24ms 
iter 10635: loss 2.2834, time 4999.28ms 
iter 10636: loss 2.5738, time 5029.81ms 
iter 10637: loss 2.5472, time 5029.94ms 
iter 10638: loss 2.2350, time 5030.49ms 
iter 10639: loss 2.5866, time 5019.72ms 
iter 10640: loss 2.6023, time 5026.17ms 
iter 10641: loss 2.3174, time 5025.65ms 
iter 10642: loss 2.5896, time 5007.57ms 
iter 10643: loss 2.3025, time 4949.00ms 
iter 10644: loss 2.3798, time 5014.98ms 
iter 10645: loss 2.3321, time 5020.68ms 
iter 10646: loss 2.2000, time 5030.47ms 
iter 10647: loss 2.2020, time 5026.59ms 
iter 10648: loss 2.3604, time 5028.62ms 
iter 10649: loss 2.2902, time 5029.12ms 
step 10650: train loss 2.4259, val loss 2.8757
iter 10650: loss 2.4171, time 19654.77ms 
iter 10651: loss 2.3932, time 4927.93ms 
iter 10652: loss 2.4422, time 4985.81ms 
iter 10653: loss 2.4148, time 5021.11ms 
iter 10654: loss 2.4057, time 5023.70ms 
iter 10655: loss 2.3765, time 5023.39ms 
iter 10656: loss 2.2647, time 4936.41ms 
iter 10657: loss 2.7692, time 5011.74ms 
iter 10658: loss 2.0730, time 5028.76ms 
iter 10659: loss 2.4791, time 5027.21ms 
iter 10660: loss 2.4722, time 5025.58ms 
iter 10661: loss 2.3538, time 5024.71ms 
iter 10662: loss 2.4802, time 5013.06ms 
iter 10663: loss 2.5934, time 5025.23ms 
iter 10664: loss 2.4209, time 5015.37ms 
iter 10665: loss 2.4174, time 5033.14ms 
iter 10666: loss 2.6133, time 5022.05ms 
iter 10667: loss 2.2777, time 5020.16ms 
iter 10668: loss 2.6169, time 5028.93ms 
iter 10669: loss 2.4163, time 5019.54ms 
iter 10670: loss 2.3580, time 5022.24ms 
iter 10671: loss 2.4805, time 5018.43ms 
iter 10672: loss 2.3601, time 4980.04ms 
iter 10673: loss 2.2111, time 5018.63ms 
iter 10674: loss 2.3497, time 5027.62ms 
iter 10675: loss 2.4604, time 5015.66ms 
iter 10676: loss 2.4740, time 5022.27ms 
iter 10677: loss 2.1783, time 5008.43ms 
iter 10678: loss 2.2494, time 5024.12ms 
iter 10679: loss 2.3675, time 4972.06ms 
iter 10680: loss 2.3772, time 4979.37ms 
iter 10681: loss 2.4953, time 5020.49ms 
iter 10682: loss 2.4795, time 5020.98ms 
iter 10683: loss 2.6401, time 5022.43ms 
iter 10684: loss 2.4257, time 5021.79ms 
iter 10685: loss 2.4972, time 5025.14ms 
iter 10686: loss 2.4043, time 5029.39ms 
iter 10687: loss 2.2985, time 4932.27ms 
iter 10688: loss 2.4950, time 4990.39ms 
iter 10689: loss 2.5060, time 5027.79ms 
iter 10690: loss 2.4661, time 5027.88ms 
iter 10691: loss 2.3158, time 5003.22ms 
iter 10692: loss 2.4977, time 5000.76ms 
iter 10693: loss 2.5286, time 5024.63ms 
iter 10694: loss 2.4522, time 5021.70ms 
iter 10695: loss 2.3126, time 4965.61ms 
iter 10696: loss 2.2883, time 4948.79ms 
iter 10697: loss 2.5327, time 5028.97ms 
iter 10698: loss 2.4728, time 4947.41ms 
iter 10699: loss 2.4147, time 5001.23ms 
step 10700: train loss 2.4260, val loss 2.8444
iter 10700: loss 2.4781, time 19710.57ms 
iter 10701: loss 2.3545, time 5031.50ms 
iter 10702: loss 2.4026, time 5008.16ms 
iter 10703: loss 2.2924, time 5027.60ms 
iter 10704: loss 2.4011, time 5012.22ms 
iter 10705: loss 2.3521, time 4959.61ms 
iter 10706: loss 2.6940, time 4981.31ms 
iter 10707: loss 2.3539, time 5004.95ms 
iter 10708: loss 2.4828, time 5030.62ms 
iter 10709: loss 2.3978, time 4958.99ms 
iter 10710: loss 2.2551, time 5035.37ms 
iter 10711: loss 2.6984, time 5031.19ms 
iter 10712: loss 2.6140, time 4958.61ms 
iter 10713: loss 2.5495, time 4961.26ms 
iter 10714: loss 2.4531, time 5022.76ms 
iter 10715: loss 2.4789, time 5024.62ms 
iter 10716: loss 2.3689, time 5028.95ms 
iter 10717: loss 2.4472, time 4964.79ms 
iter 10718: loss 2.4787, time 5021.16ms 
iter 10719: loss 2.5908, time 5025.87ms 
iter 10720: loss 2.6333, time 5026.45ms 
iter 10721: loss 2.3403, time 5023.50ms 
iter 10722: loss 2.4031, time 5025.69ms 
iter 10723: loss 2.5302, time 5011.36ms 
iter 10724: loss 2.3262, time 4945.00ms 
iter 10725: loss 2.4725, time 4916.61ms 
iter 10726: loss 2.5067, time 5005.88ms 
iter 10727: loss 2.4825, time 5011.22ms 
iter 10728: loss 2.2569, time 5027.24ms 
iter 10729: loss 2.5650, time 5029.83ms 
iter 10730: loss 2.5070, time 5013.36ms 
iter 10731: loss 2.4399, time 5029.16ms 
iter 10732: loss 2.2912, time 5031.12ms 
iter 10733: loss 2.3381, time 4972.89ms 
iter 10734: loss 2.3159, time 5021.05ms 
iter 10735: loss 2.3720, time 4933.77ms 
iter 10736: loss 2.3855, time 4987.95ms 
iter 10737: loss 2.5617, time 5027.02ms 
iter 10738: loss 2.5192, time 5023.45ms 
iter 10739: loss 2.3388, time 5021.52ms 
iter 10740: loss 2.6634, time 5030.06ms 
iter 10741: loss 2.4080, time 5020.91ms 
iter 10742: loss 2.3616, time 5023.50ms 
iter 10743: loss 2.2700, time 5023.26ms 
iter 10744: loss 2.2556, time 5022.86ms 
iter 10745: loss 2.3672, time 5022.25ms 
iter 10746: loss 2.3729, time 5021.87ms 
iter 10747: loss 2.4218, time 5023.95ms 
iter 10748: loss 2.2017, time 5032.29ms 
iter 10749: loss 2.2489, time 4976.38ms 
step 10750: train loss 2.4318, val loss 2.8449
iter 10750: loss 2.3477, time 19673.63ms 
iter 10751: loss 2.2671, time 5025.07ms 
iter 10752: loss 2.3658, time 5028.09ms 
iter 10753: loss 2.4305, time 5027.92ms 
iter 10754: loss 2.6615, time 4985.00ms 
iter 10755: loss 2.6482, time 5013.23ms 
iter 10756: loss 2.5078, time 5022.14ms 
iter 10757: loss 2.3886, time 5024.65ms 
iter 10758: loss 2.2720, time 5021.88ms 
iter 10759: loss 2.4929, time 5021.87ms 
iter 10760: loss 2.2809, time 5022.29ms 
iter 10761: loss 2.1165, time 5023.15ms 
iter 10762: loss 2.4490, time 4973.04ms 
iter 10763: loss 2.3772, time 4973.07ms 
iter 10764: loss 2.3450, time 5034.71ms 
iter 10765: loss 2.5671, time 5028.72ms 
iter 10766: loss 2.3195, time 5031.79ms 
iter 10767: loss 2.4658, time 5036.12ms 
iter 10768: loss 2.4430, time 5027.17ms 
iter 10769: loss 2.1690, time 5019.12ms 
iter 10770: loss 2.2014, time 4976.55ms 
iter 10771: loss 2.5136, time 4915.89ms 
iter 10772: loss 2.7132, time 4982.51ms 
iter 10773: loss 2.2993, time 5023.59ms 
iter 10774: loss 2.3212, time 5020.82ms 
iter 10775: loss 2.3897, time 5022.18ms 
iter 10776: loss 2.5374, time 5023.24ms 
iter 10777: loss 2.3541, time 5022.97ms 
iter 10778: loss 2.4969, time 5024.96ms 
iter 10779: loss 2.0634, time 5030.24ms 
iter 10780: loss 2.5113, time 5006.30ms 
iter 10781: loss 2.3400, time 4934.69ms 
iter 10782: loss 2.4801, time 4987.00ms 
iter 10783: loss 2.5671, time 5021.91ms 
iter 10784: loss 2.3636, time 5021.60ms 
iter 10785: loss 2.6029, time 5022.74ms 
iter 10786: loss 2.4801, time 5011.32ms 
iter 10787: loss 2.5580, time 4919.67ms 
iter 10788: loss 2.4049, time 4937.82ms 
iter 10789: loss 2.4064, time 4993.64ms 
iter 10790: loss 2.4085, time 5009.09ms 
iter 10791: loss 2.4746, time 5027.24ms 
iter 10792: loss 2.3646, time 5028.52ms 
iter 10793: loss 2.3024, time 5031.23ms 
iter 10794: loss 2.2777, time 5007.04ms 
iter 10795: loss 2.4902, time 4973.43ms 
iter 10796: loss 2.4032, time 4924.36ms 
iter 10797: loss 2.4073, time 4962.19ms 
iter 10798: loss 2.4547, time 5018.33ms 
iter 10799: loss 2.2405, time 5027.49ms 
step 10800: train loss 2.4260, val loss 2.8359
iter 10800: loss 2.4316, time 19731.97ms 
iter 10801: loss 2.6036, time 5016.61ms 
iter 10802: loss 2.3322, time 5025.17ms 
iter 10803: loss 2.6693, time 4994.24ms 
iter 10804: loss 2.3328, time 5030.74ms 
iter 10805: loss 2.5181, time 5017.17ms 
iter 10806: loss 2.2230, time 5025.41ms 
iter 10807: loss 2.3350, time 5026.18ms 
iter 10808: loss 2.3000, time 5027.85ms 
iter 10809: loss 2.4477, time 5000.98ms 
iter 10810: loss 2.3385, time 4973.47ms 
iter 10811: loss 2.2496, time 4970.47ms 
iter 10812: loss 2.7472, time 4981.14ms 
iter 10813: loss 2.3309, time 4977.06ms 
iter 10814: loss 2.5404, time 4981.34ms 
iter 10815: loss 2.5758, time 5010.97ms 
iter 10816: loss 2.5039, time 5025.92ms 
iter 10817: loss 2.3793, time 5020.57ms 
iter 10818: loss 2.5910, time 4961.72ms 
iter 10819: loss 2.4357, time 4915.10ms 
iter 10820: loss 2.6260, time 4913.68ms 
iter 10821: loss 2.5257, time 4913.75ms 
iter 10822: loss 2.1847, time 4914.35ms 
iter 10823: loss 2.5866, time 4966.88ms 
iter 10824: loss 2.3047, time 5019.55ms 
iter 10825: loss 2.5755, time 5022.69ms 
iter 10826: loss 2.2720, time 5032.83ms 
iter 10827: loss 2.5297, time 4919.36ms 
iter 10828: loss 2.3549, time 4960.96ms 
iter 10829: loss 2.5536, time 5023.51ms 
iter 10830: loss 2.3310, time 5027.07ms 
iter 10831: loss 2.5036, time 5018.36ms 
iter 10832: loss 2.4741, time 5025.32ms 
iter 10833: loss 2.4497, time 5028.48ms 
iter 10834: loss 2.4649, time 5022.27ms 
iter 10835: loss 2.4903, time 5025.96ms 
iter 10836: loss 2.5520, time 4987.05ms 
iter 10837: loss 2.4490, time 5025.55ms 
iter 10838: loss 2.4615, time 5027.70ms 
iter 10839: loss 2.2883, time 5020.98ms 
iter 10840: loss 2.6557, time 5016.35ms 
iter 10841: loss 2.4798, time 5010.67ms 
iter 10842: loss 2.4628, time 5007.61ms 
iter 10843: loss 2.3268, time 5007.42ms 
iter 10844: loss 2.3114, time 4915.60ms 
iter 10845: loss 2.4958, time 4992.68ms 
iter 10846: loss 2.5682, time 5029.38ms 
iter 10847: loss 2.5357, time 5027.41ms 
iter 10848: loss 2.3115, time 5029.98ms 
iter 10849: loss 2.5038, time 5025.07ms 
step 10850: train loss 2.4289, val loss 2.8555
iter 10850: loss 2.3826, time 19688.01ms 
iter 10851: loss 2.4453, time 5024.17ms 
iter 10852: loss 2.4658, time 5025.82ms 
iter 10853: loss 2.2895, time 5025.41ms 
iter 10854: loss 2.4081, time 5027.38ms 
iter 10855: loss 2.5250, time 5027.19ms 
iter 10856: loss 2.3231, time 5027.51ms 
iter 10857: loss 2.5790, time 5024.37ms 
iter 10858: loss 2.3636, time 4981.45ms 
iter 10859: loss 2.5319, time 4988.76ms 
iter 10860: loss 2.2714, time 5029.19ms 
iter 10861: loss 2.4247, time 5025.81ms 
iter 10862: loss 2.2731, time 5025.02ms 
iter 10863: loss 2.2968, time 5026.30ms 
iter 10864: loss 2.2536, time 5028.78ms 
iter 10865: loss 2.4630, time 5023.37ms 
iter 10866: loss 2.3599, time 4975.14ms 
iter 10867: loss 2.4902, time 4987.68ms 
iter 10868: loss 2.4989, time 5025.10ms 
iter 10869: loss 2.6157, time 5026.42ms 
iter 10870: loss 2.4277, time 5026.03ms 
iter 10871: loss 2.4066, time 5014.02ms 
iter 10872: loss 2.4297, time 5024.86ms 
iter 10873: loss 2.5653, time 5022.13ms 
iter 10874: loss 2.5057, time 4986.79ms 
iter 10875: loss 2.4717, time 4938.55ms 
iter 10876: loss 2.5624, time 5021.28ms 
iter 10877: loss 2.4133, time 5018.38ms 
iter 10878: loss 2.4266, time 5021.62ms 
iter 10879: loss 2.5537, time 5027.45ms 
iter 10880: loss 2.7577, time 5022.26ms 
iter 10881: loss 2.3985, time 5025.41ms 
iter 10882: loss 2.3344, time 5031.31ms 
iter 10883: loss 2.4801, time 4980.95ms 
iter 10884: loss 2.2942, time 4986.87ms 
iter 10885: loss 2.6238, time 5025.29ms 
iter 10886: loss 2.2325, time 5022.88ms 
iter 10887: loss 2.4882, time 5012.23ms 
iter 10888: loss 2.3791, time 5025.21ms 
iter 10889: loss 2.5935, time 5024.48ms 
iter 10890: loss 2.3073, time 5024.15ms 
iter 10891: loss 2.3511, time 4965.61ms 
iter 10892: loss 2.4560, time 4981.68ms 
iter 10893: loss 2.4693, time 5028.41ms 
iter 10894: loss 2.5010, time 5028.35ms 
iter 10895: loss 2.6452, time 5024.66ms 
iter 10896: loss 2.4380, time 5025.74ms 
iter 10897: loss 2.5215, time 5022.44ms 
iter 10898: loss 2.3575, time 5023.82ms 
iter 10899: loss 2.7447, time 4977.54ms 
step 10900: train loss 2.4342, val loss 2.8590
iter 10900: loss 2.6534, time 19693.07ms 
iter 10901: loss 2.5356, time 5024.01ms 
iter 10902: loss 2.3289, time 5027.55ms 
iter 10903: loss 2.4736, time 5025.27ms 
iter 10904: loss 2.4163, time 4983.41ms 
iter 10905: loss 2.4705, time 4935.29ms 
iter 10906: loss 2.1600, time 5012.36ms 
iter 10907: loss 2.7449, time 5015.76ms 
iter 10908: loss 2.3703, time 5022.24ms 
iter 10909: loss 2.4446, time 5025.86ms 
iter 10910: loss 2.3513, time 5016.59ms 
iter 10911: loss 2.4011, time 5018.34ms 
iter 10912: loss 2.6210, time 5024.60ms 
iter 10913: loss 2.4188, time 5021.05ms 
iter 10914: loss 2.4507, time 5031.82ms 
iter 10915: loss 2.4059, time 5018.82ms 
iter 10916: loss 2.4575, time 5030.36ms 
iter 10917: loss 2.3298, time 5021.10ms 
iter 10918: loss 2.3778, time 5020.92ms 
iter 10919: loss 2.5374, time 5020.76ms 
iter 10920: loss 2.0874, time 5025.80ms 
iter 10921: loss 2.3527, time 4975.95ms 
iter 10922: loss 2.4209, time 5000.36ms 
iter 10923: loss 2.6770, time 5025.28ms 
iter 10924: loss 2.4067, time 5024.71ms 
iter 10925: loss 2.2004, time 5025.32ms 
iter 10926: loss 2.5527, time 5022.28ms 
iter 10927: loss 2.3046, time 5021.65ms 
iter 10928: loss 2.5118, time 5025.49ms 
iter 10929: loss 2.5848, time 4972.87ms 
iter 10930: loss 2.4880, time 4957.27ms 
iter 10931: loss 2.4470, time 5022.07ms 
iter 10932: loss 2.4418, time 5023.18ms 
iter 10933: loss 2.4821, time 5022.89ms 
iter 10934: loss 2.4612, time 5023.61ms 
iter 10935: loss 2.4833, time 5026.65ms 
iter 10936: loss 2.2868, time 5020.94ms 
iter 10937: loss 2.1082, time 5024.98ms 
iter 10938: loss 2.5123, time 4973.25ms 
iter 10939: loss 2.5234, time 4993.39ms 
iter 10940: loss 2.4759, time 5023.22ms 
iter 10941: loss 2.4014, time 5022.02ms 
iter 10942: loss 2.5272, time 5027.70ms 
iter 10943: loss 2.3394, time 5005.71ms 
iter 10944: loss 2.5183, time 5023.57ms 
iter 10945: loss 2.5089, time 5028.42ms 
iter 10946: loss 2.4488, time 4973.52ms 
iter 10947: loss 2.4548, time 4954.09ms 
iter 10948: loss 2.3431, time 5023.31ms 
iter 10949: loss 2.4742, time 5023.90ms 
step 10950: train loss 2.4294, val loss 2.8544
iter 10950: loss 2.4621, time 19686.82ms 
iter 10951: loss 2.3488, time 5029.60ms 
iter 10952: loss 2.4502, time 4982.54ms 
iter 10953: loss 2.3259, time 5024.91ms 
iter 10954: loss 2.3085, time 5030.45ms 
iter 10955: loss 2.3426, time 5029.59ms 
iter 10956: loss 2.4283, time 5024.83ms 
iter 10957: loss 2.3340, time 5025.25ms 
iter 10958: loss 2.4269, time 5028.26ms 
iter 10959: loss 2.4341, time 5029.81ms 
iter 10960: loss 2.4183, time 5003.57ms 
iter 10961: loss 2.2374, time 4997.83ms 
iter 10962: loss 2.4097, time 5026.86ms 
iter 10963: loss 2.3951, time 5023.14ms 
iter 10964: loss 2.1852, time 5009.13ms 
iter 10965: loss 2.3844, time 5027.60ms 
iter 10966: loss 2.5131, time 5013.73ms 
iter 10967: loss 2.3585, time 4957.31ms 
iter 10968: loss 2.1560, time 4940.67ms 
iter 10969: loss 2.5263, time 5024.94ms 
iter 10970: loss 2.4799, time 5023.86ms 
iter 10971: loss 2.3687, time 5041.25ms 
iter 10972: loss 2.5841, time 5023.31ms 
iter 10973: loss 2.4174, time 5016.86ms 
iter 10974: loss 2.3105, time 5029.86ms 
iter 10975: loss 2.3815, time 5024.93ms 
iter 10976: loss 2.4871, time 5003.88ms 
iter 10977: loss 2.3092, time 5011.45ms 
iter 10978: loss 2.6566, time 5012.70ms 
iter 10979: loss 2.4501, time 5025.68ms 
iter 10980: loss 2.4414, time 5020.03ms 
iter 10981: loss 2.5366, time 5025.31ms 
iter 10982: loss 2.7119, time 5027.29ms 
iter 10983: loss 2.4543, time 4977.84ms 
iter 10984: loss 2.5322, time 4976.30ms 
iter 10985: loss 2.4535, time 5015.51ms 
iter 10986: loss 2.5124, time 5007.06ms 
iter 10987: loss 2.1574, time 5008.04ms 
iter 10988: loss 2.6382, time 4994.31ms 
iter 10989: loss 2.4774, time 5029.70ms 
iter 10990: loss 2.5040, time 5033.76ms 
iter 10991: loss 2.0942, time 5017.57ms 
iter 10992: loss 2.5976, time 5001.71ms 
iter 10993: loss 2.4140, time 5005.89ms 
iter 10994: loss 2.4074, time 5009.50ms 
iter 10995: loss 2.6230, time 5021.10ms 
iter 10996: loss 2.2301, time 5023.56ms 
iter 10997: loss 2.4914, time 5023.99ms 
iter 10998: loss 2.2481, time 5023.09ms 
iter 10999: loss 2.4493, time 4994.00ms 
step 11000: train loss 2.4207, val loss 2.8356
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 11000: loss 2.4210, time 20716.93ms 
iter 11001: loss 2.5825, time 4996.89ms 
iter 11002: loss 2.3070, time 5024.44ms 
iter 11003: loss 2.6005, time 5025.01ms 
iter 11004: loss 2.2992, time 4984.87ms 
iter 11005: loss 2.5586, time 5033.88ms 
iter 11006: loss 2.3679, time 5018.79ms 
iter 11007: loss 2.5776, time 5020.87ms 
iter 11008: loss 1.9804, time 5022.27ms 
iter 11009: loss 2.6946, time 5020.09ms 
iter 11010: loss 2.6438, time 5027.49ms 
iter 11011: loss 2.2697, time 5028.60ms 
iter 11012: loss 2.5104, time 4978.19ms 
iter 11013: loss 2.6818, time 5014.01ms 
iter 11014: loss 2.2890, time 5026.60ms 
iter 11015: loss 2.4754, time 5022.67ms 
iter 11016: loss 2.3784, time 5031.93ms 
iter 11017: loss 2.4783, time 5026.02ms 
iter 11018: loss 2.4829, time 4995.23ms 
iter 11019: loss 2.6098, time 5029.72ms 
iter 11020: loss 2.2521, time 4979.70ms 
iter 11021: loss 2.5201, time 5005.96ms 
iter 11022: loss 2.6163, time 5028.60ms 
iter 11023: loss 2.4295, time 5028.38ms 
iter 11024: loss 2.4046, time 5024.03ms 
iter 11025: loss 2.3715, time 5029.58ms 
iter 11026: loss 2.6457, time 5024.38ms 
iter 11027: loss 2.7587, time 5024.84ms 
iter 11028: loss 2.3199, time 4991.10ms 
iter 11029: loss 2.6297, time 4976.86ms 
iter 11030: loss 2.3459, time 5027.35ms 
iter 11031: loss 2.4298, time 5016.79ms 
iter 11032: loss 2.5389, time 4991.70ms 
iter 11033: loss 2.4860, time 4946.35ms 
iter 11034: loss 2.4294, time 4929.49ms 
iter 11035: loss 2.2708, time 4980.40ms 
iter 11036: loss 2.2686, time 5017.03ms 
iter 11037: loss 2.5080, time 4998.26ms 
iter 11038: loss 2.5590, time 5021.51ms 
iter 11039: loss 2.4317, time 5002.35ms 
iter 11040: loss 2.3687, time 5039.50ms 
iter 11041: loss 2.5193, time 5027.29ms 
iter 11042: loss 2.4795, time 4987.42ms 
iter 11043: loss 2.5505, time 4985.34ms 
iter 11044: loss 2.5047, time 4929.84ms 
iter 11045: loss 2.4322, time 5004.78ms 
iter 11046: loss 2.5419, time 5029.73ms 
iter 11047: loss 2.2833, time 5029.30ms 
iter 11048: loss 2.2425, time 5032.79ms 
iter 11049: loss 2.3717, time 5032.96ms 
step 11050: train loss 2.4475, val loss 2.8475
iter 11050: loss 2.7858, time 19700.49ms 
iter 11051: loss 2.4635, time 5005.21ms 
iter 11052: loss 2.5210, time 5027.51ms 
iter 11053: loss 2.2817, time 5023.44ms 
iter 11054: loss 2.5204, time 5028.29ms 
iter 11055: loss 2.4420, time 5030.64ms 
iter 11056: loss 2.5437, time 5032.07ms 
iter 11057: loss 2.5258, time 4976.74ms 
iter 11058: loss 2.3294, time 5010.43ms 
iter 11059: loss 2.5735, time 4991.31ms 
iter 11060: loss 2.6618, time 5000.13ms 
iter 11061: loss 2.4785, time 5017.45ms 
iter 11062: loss 2.2994, time 5022.66ms 
iter 11063: loss 2.3741, time 4972.69ms 
iter 11064: loss 2.1563, time 5003.46ms 
iter 11065: loss 2.4296, time 5014.35ms 
iter 11066: loss 2.6643, time 5019.35ms 
iter 11067: loss 2.5006, time 5031.99ms 
iter 11068: loss 2.4481, time 5029.22ms 
iter 11069: loss 2.3915, time 4993.76ms 
iter 11070: loss 2.4549, time 5003.40ms 
iter 11071: loss 2.4014, time 5014.85ms 
iter 11072: loss 2.4897, time 5025.41ms 
iter 11073: loss 2.4057, time 4976.92ms 
iter 11074: loss 2.4241, time 4919.86ms 
iter 11075: loss 2.6046, time 4922.25ms 
iter 11076: loss 2.4874, time 5000.95ms 
iter 11077: loss 2.5802, time 5026.70ms 
iter 11078: loss 2.4552, time 5028.61ms 
iter 11079: loss 2.3383, time 5015.54ms 
iter 11080: loss 2.6236, time 5013.48ms 
iter 11081: loss 2.4799, time 5006.33ms 
iter 11082: loss 2.3132, time 4997.99ms 
iter 11083: loss 2.3525, time 4918.31ms 
iter 11084: loss 2.4410, time 4916.97ms 
iter 11085: loss 2.4739, time 4988.04ms 
iter 11086: loss 2.4841, time 5025.22ms 
iter 11087: loss 2.4178, time 5027.96ms 
iter 11088: loss 2.7511, time 5014.56ms 
iter 11089: loss 2.4807, time 5025.86ms 
iter 11090: loss 2.6599, time 5013.16ms 
iter 11091: loss 2.0618, time 5027.04ms 
iter 11092: loss 2.4341, time 4977.48ms 
iter 11093: loss 2.3999, time 4920.85ms 
iter 11094: loss 2.5185, time 5018.08ms 
iter 11095: loss 2.5170, time 5023.04ms 
iter 11096: loss 2.5652, time 5019.82ms 
iter 11097: loss 2.4768, time 5022.41ms 
iter 11098: loss 2.7155, time 5023.49ms 
iter 11099: loss 2.5218, time 5021.36ms 
step 11100: train loss 2.4247, val loss 2.8522
iter 11100: loss 2.3794, time 19678.09ms 
iter 11101: loss 2.5435, time 5025.57ms 
iter 11102: loss 2.2844, time 5024.08ms 
iter 11103: loss 2.2532, time 5022.57ms 
iter 11104: loss 2.4120, time 5022.09ms 
iter 11105: loss 2.2771, time 5022.92ms 
iter 11106: loss 2.3778, time 4997.63ms 
iter 11107: loss 2.3018, time 4970.80ms 
iter 11108: loss 2.4282, time 4959.85ms 
iter 11109: loss 2.5359, time 5026.94ms 
iter 11110: loss 2.4520, time 5022.95ms 
iter 11111: loss 2.4974, time 5024.10ms 
iter 11112: loss 2.2882, time 5023.67ms 
iter 11113: loss 2.3324, time 5024.60ms 
iter 11114: loss 2.3769, time 5028.40ms 
iter 11115: loss 2.2478, time 5021.90ms 
iter 11116: loss 2.4837, time 4988.82ms 
iter 11117: loss 2.5660, time 4945.19ms 
iter 11118: loss 2.4431, time 5024.43ms 
iter 11119: loss 2.3216, time 5025.60ms 
iter 11120: loss 2.4790, time 5028.56ms 
iter 11121: loss 2.3659, time 5028.96ms 
iter 11122: loss 2.2343, time 5014.07ms 
iter 11123: loss 2.0670, time 5028.61ms 
iter 11124: loss 2.4023, time 5030.93ms 
iter 11125: loss 2.3436, time 4979.48ms 
iter 11126: loss 2.2505, time 5011.77ms 
iter 11127: loss 2.6591, time 5025.03ms 
iter 11128: loss 2.3612, time 5024.92ms 
iter 11129: loss 2.4457, time 5025.12ms 
iter 11130: loss 2.3164, time 5026.32ms 
iter 11131: loss 2.2245, time 5024.01ms 
iter 11132: loss 2.2380, time 5025.73ms 
iter 11133: loss 2.4346, time 4980.97ms 
iter 11134: loss 2.3452, time 4917.04ms 
iter 11135: loss 2.3136, time 5004.00ms 
iter 11136: loss 2.2584, time 5024.89ms 
iter 11137: loss 2.4123, time 5027.55ms 
iter 11138: loss 2.5569, time 5029.28ms 
iter 11139: loss 2.4411, time 5031.33ms 
iter 11140: loss 2.5358, time 5029.98ms 
iter 11141: loss 2.4490, time 4986.90ms 
iter 11142: loss 2.4194, time 4915.75ms 
iter 11143: loss 2.4991, time 4970.80ms 
iter 11144: loss 2.6768, time 5027.80ms 
iter 11145: loss 2.3290, time 5024.45ms 
iter 11146: loss 2.5429, time 5025.83ms 
iter 11147: loss 2.5491, time 5029.60ms 
iter 11148: loss 2.5087, time 5023.14ms 
iter 11149: loss 2.2042, time 5022.16ms 
step 11150: train loss 2.4406, val loss 2.8499
iter 11150: loss 2.6001, time 19677.57ms 
iter 11151: loss 2.4751, time 5020.02ms 
iter 11152: loss 2.1586, time 5022.55ms 
iter 11153: loss 2.5144, time 5021.28ms 
iter 11154: loss 2.1622, time 5020.29ms 
iter 11155: loss 2.5604, time 5004.03ms 
iter 11156: loss 2.4060, time 4916.49ms 
iter 11157: loss 2.3795, time 4940.12ms 
iter 11158: loss 2.4532, time 5016.90ms 
iter 11159: loss 2.5455, time 5020.63ms 
iter 11160: loss 2.4123, time 5024.30ms 
iter 11161: loss 2.0409, time 5024.83ms 
iter 11162: loss 2.5737, time 5021.15ms 
iter 11163: loss 2.2715, time 5022.50ms 
iter 11164: loss 2.4839, time 5025.83ms 
iter 11165: loss 2.4692, time 4971.40ms 
iter 11166: loss 2.3748, time 4997.65ms 
iter 11167: loss 2.3369, time 5022.19ms 
iter 11168: loss 2.5148, time 5022.44ms 
iter 11169: loss 2.5636, time 5021.76ms 
iter 11170: loss 2.5491, time 5023.02ms 
iter 11171: loss 2.3442, time 5024.48ms 
iter 11172: loss 2.4482, time 5024.37ms 
iter 11173: loss 2.5945, time 4971.42ms 
iter 11174: loss 2.1206, time 4966.21ms 
iter 11175: loss 2.5135, time 5005.74ms 
iter 11176: loss 2.4033, time 5026.63ms 
iter 11177: loss 2.4692, time 5020.28ms 
iter 11178: loss 2.2938, time 5021.37ms 
iter 11179: loss 2.3642, time 4996.01ms 
iter 11180: loss 2.5414, time 5023.98ms 
iter 11181: loss 2.3583, time 4963.10ms 
iter 11182: loss 2.5210, time 4915.82ms 
iter 11183: loss 2.4936, time 5000.11ms 
iter 11184: loss 2.3867, time 5027.49ms 
iter 11185: loss 2.5028, time 5027.05ms 
iter 11186: loss 2.4928, time 5025.37ms 
iter 11187: loss 2.7673, time 5028.42ms 
iter 11188: loss 2.3906, time 5028.52ms 
iter 11189: loss 2.6282, time 5028.16ms 
iter 11190: loss 2.5884, time 4979.12ms 
iter 11191: loss 2.4222, time 4959.34ms 
iter 11192: loss 2.1316, time 5022.13ms 
iter 11193: loss 2.3811, time 5024.27ms 
iter 11194: loss 2.5679, time 5029.75ms 
iter 11195: loss 2.4111, time 5030.76ms 
iter 11196: loss 2.4380, time 5041.04ms 
iter 11197: loss 2.4892, time 5039.59ms 
iter 11198: loss 2.4838, time 4959.52ms 
iter 11199: loss 2.2111, time 4944.27ms 
step 11200: train loss 2.4285, val loss 2.8568
iter 11200: loss 2.2473, time 19749.52ms 
iter 11201: loss 2.4503, time 5016.10ms 
iter 11202: loss 2.6262, time 5002.18ms 
iter 11203: loss 2.4429, time 5003.41ms 
iter 11204: loss 2.1901, time 4995.59ms 
iter 11205: loss 2.3927, time 4975.36ms 
iter 11206: loss 2.2390, time 4968.02ms 
iter 11207: loss 2.3705, time 4939.85ms 
iter 11208: loss 2.4984, time 4996.11ms 
iter 11209: loss 2.5749, time 5022.49ms 
iter 11210: loss 2.2989, time 5021.98ms 
iter 11211: loss 2.5838, time 5023.63ms 
iter 11212: loss 2.3635, time 4997.83ms 
iter 11213: loss 2.3738, time 4942.24ms 
iter 11214: loss 2.3514, time 4949.11ms 
iter 11215: loss 2.2307, time 5020.76ms 
iter 11216: loss 2.4485, time 5020.82ms 
iter 11217: loss 2.2902, time 5021.70ms 
iter 11218: loss 2.2434, time 5024.20ms 
iter 11219: loss 2.5692, time 5024.04ms 
iter 11220: loss 2.4666, time 5023.40ms 
iter 11221: loss 2.5821, time 5023.60ms 
iter 11222: loss 2.5606, time 4972.14ms 
iter 11223: loss 2.5255, time 4981.15ms 
iter 11224: loss 2.4509, time 5021.76ms 
iter 11225: loss 2.5445, time 5023.16ms 
iter 11226: loss 2.4715, time 5022.96ms 
iter 11227: loss 2.5324, time 5021.79ms 
iter 11228: loss 2.5647, time 5020.72ms 
iter 11229: loss 2.5558, time 5023.27ms 
iter 11230: loss 2.4240, time 4971.94ms 
iter 11231: loss 2.2721, time 4987.28ms 
iter 11232: loss 2.4119, time 5021.44ms 
iter 11233: loss 2.3236, time 5024.32ms 
iter 11234: loss 2.3799, time 5024.15ms 
iter 11235: loss 2.3027, time 5020.16ms 
iter 11236: loss 2.5098, time 5024.04ms 
iter 11237: loss 2.5372, time 5029.39ms 
iter 11238: loss 2.2313, time 4979.23ms 
iter 11239: loss 2.4825, time 4920.51ms 
iter 11240: loss 2.3700, time 5013.64ms 
iter 11241: loss 2.4499, time 5033.95ms 
iter 11242: loss 2.3561, time 5028.78ms 
iter 11243: loss 2.2904, time 5019.84ms 
iter 11244: loss 2.4726, time 5031.01ms 
iter 11245: loss 2.1956, time 5021.97ms 
iter 11246: loss 2.3021, time 5030.18ms 
iter 11247: loss 2.3099, time 4980.56ms 
iter 11248: loss 2.4754, time 4952.58ms 
iter 11249: loss 2.7210, time 5043.81ms 
step 11250: train loss 2.4185, val loss 2.8657
iter 11250: loss 2.6078, time 19689.83ms 
iter 11251: loss 2.3906, time 5026.00ms 
iter 11252: loss 2.1505, time 5032.44ms 
iter 11253: loss 2.5052, time 4966.89ms 
iter 11254: loss 2.4820, time 4955.01ms 
iter 11255: loss 2.4771, time 5023.40ms 
iter 11256: loss 2.5414, time 5026.02ms 
iter 11257: loss 2.4981, time 5009.43ms 
iter 11258: loss 2.5415, time 5025.23ms 
iter 11259: loss 2.4502, time 5018.01ms 
iter 11260: loss 2.4468, time 5015.54ms 
iter 11261: loss 2.3535, time 5032.56ms 
iter 11262: loss 2.3497, time 4984.97ms 
iter 11263: loss 2.3568, time 5025.22ms 
iter 11264: loss 2.2647, time 5026.54ms 
iter 11265: loss 2.6133, time 5024.73ms 
iter 11266: loss 2.4720, time 5014.23ms 
iter 11267: loss 2.1135, time 5024.73ms 
iter 11268: loss 2.1781, time 5027.48ms 
iter 11269: loss 2.4272, time 4975.12ms 
iter 11270: loss 2.3955, time 4989.64ms 
iter 11271: loss 2.3991, time 5023.91ms 
iter 11272: loss 2.3922, time 5019.74ms 
iter 11273: loss 2.5560, time 5008.75ms 
iter 11274: loss 2.3856, time 5005.91ms 
iter 11275: loss 2.4857, time 5024.52ms 
iter 11276: loss 2.4871, time 5013.05ms 
iter 11277: loss 2.5612, time 5025.97ms 
iter 11278: loss 2.3705, time 4957.06ms 
iter 11279: loss 2.4926, time 4950.87ms 
iter 11280: loss 2.3726, time 4989.71ms 
iter 11281: loss 2.6919, time 5024.93ms 
iter 11282: loss 2.3181, time 5022.22ms 
iter 11283: loss 2.3843, time 5023.71ms 
iter 11284: loss 2.5770, time 5022.62ms 
iter 11285: loss 2.3524, time 5021.40ms 
iter 11286: loss 2.4896, time 5025.81ms 
iter 11287: loss 2.4151, time 4975.87ms 
iter 11288: loss 2.3148, time 4917.05ms 
iter 11289: loss 2.3049, time 4929.87ms 
iter 11290: loss 2.4571, time 5023.12ms 
iter 11291: loss 2.1171, time 5023.63ms 
iter 11292: loss 2.2808, time 5024.89ms 
iter 11293: loss 1.9740, time 5025.61ms 
iter 11294: loss 2.3631, time 5024.92ms 
iter 11295: loss 2.6640, time 5030.02ms 
iter 11296: loss 2.7206, time 4989.43ms 
iter 11297: loss 2.6703, time 5023.06ms 
iter 11298: loss 2.5962, time 4994.16ms 
iter 11299: loss 2.2943, time 4960.79ms 
step 11300: train loss 2.4185, val loss 2.8620
iter 11300: loss 2.3777, time 19654.76ms 
iter 11301: loss 2.2143, time 4961.88ms 
iter 11302: loss 2.5609, time 4964.08ms 
iter 11303: loss 2.3550, time 4996.32ms 
iter 11304: loss 2.3528, time 4987.71ms 
iter 11305: loss 2.1953, time 5022.28ms 
iter 11306: loss 2.0555, time 4980.97ms 
iter 11307: loss 2.4297, time 5003.73ms 
iter 11308: loss 2.4241, time 5015.10ms 
iter 11309: loss 2.4320, time 5024.60ms 
iter 11310: loss 2.3796, time 5017.08ms 
iter 11311: loss 2.7481, time 5023.85ms 
iter 11312: loss 2.4047, time 5025.36ms 
iter 11313: loss 2.1882, time 5026.20ms 
iter 11314: loss 2.4704, time 5023.32ms 
iter 11315: loss 2.4279, time 5023.64ms 
iter 11316: loss 2.3884, time 5015.40ms 
iter 11317: loss 2.4404, time 4953.58ms 
iter 11318: loss 2.3059, time 5024.25ms 
iter 11319: loss 2.0724, time 5017.47ms 
iter 11320: loss 2.4272, time 5019.97ms 
iter 11321: loss 2.1959, time 5021.20ms 
iter 11322: loss 2.3441, time 5022.72ms 
iter 11323: loss 2.4399, time 5022.33ms 
iter 11324: loss 2.4282, time 5023.12ms 
iter 11325: loss 2.2636, time 5026.23ms 
iter 11326: loss 2.3300, time 5009.10ms 
iter 11327: loss 2.3932, time 5028.40ms 
iter 11328: loss 2.3256, time 5022.62ms 
iter 11329: loss 2.4189, time 4987.24ms 
iter 11330: loss 2.3133, time 4999.52ms 
iter 11331: loss 2.5868, time 5022.29ms 
iter 11332: loss 2.4125, time 5027.49ms 
iter 11333: loss 2.2512, time 5026.46ms 
iter 11334: loss 2.2796, time 5018.54ms 
iter 11335: loss 2.4049, time 5022.10ms 
iter 11336: loss 2.2984, time 5014.78ms 
iter 11337: loss 2.4734, time 5024.55ms 
iter 11338: loss 2.3332, time 5022.32ms 
iter 11339: loss 2.4769, time 5013.55ms 
iter 11340: loss 2.3477, time 5026.58ms 
iter 11341: loss 2.3499, time 5007.11ms 
iter 11342: loss 2.4696, time 5023.69ms 
iter 11343: loss 2.5863, time 5002.35ms 
iter 11344: loss 2.6052, time 5022.49ms 
iter 11345: loss 2.2618, time 5027.87ms 
iter 11346: loss 2.5949, time 4971.67ms 
iter 11347: loss 2.7319, time 4940.80ms 
iter 11348: loss 2.3190, time 5012.80ms 
iter 11349: loss 2.2846, time 5028.42ms 
step 11350: train loss 2.4203, val loss 2.8696
iter 11350: loss 2.3622, time 19701.24ms 
iter 11351: loss 2.3545, time 5020.75ms 
iter 11352: loss 2.4267, time 5002.68ms 
iter 11353: loss 2.5079, time 5010.30ms 
iter 11354: loss 2.3868, time 4984.97ms 
iter 11355: loss 2.3360, time 5042.98ms 
iter 11356: loss 2.5791, time 5032.43ms 
iter 11357: loss 2.5808, time 5030.55ms 
iter 11358: loss 2.4524, time 5016.65ms 
iter 11359: loss 2.3631, time 5032.74ms 
iter 11360: loss 2.3109, time 5034.78ms 
iter 11361: loss 2.6888, time 5036.29ms 
iter 11362: loss 2.3517, time 5019.97ms 
iter 11363: loss 2.5147, time 5033.24ms 
iter 11364: loss 2.1114, time 5025.71ms 
iter 11365: loss 2.3968, time 5027.43ms 
iter 11366: loss 2.4864, time 5024.21ms 
iter 11367: loss 2.3730, time 5010.14ms 
iter 11368: loss 2.0857, time 5027.75ms 
iter 11369: loss 2.3262, time 5021.46ms 
iter 11370: loss 2.4964, time 5025.33ms 
iter 11371: loss 2.3171, time 5024.10ms 
iter 11372: loss 2.3937, time 5027.80ms 
iter 11373: loss 2.5449, time 5032.68ms 
iter 11374: loss 2.3983, time 5010.84ms 
iter 11375: loss 2.4688, time 5024.62ms 
iter 11376: loss 2.4349, time 5016.48ms 
iter 11377: loss 2.3229, time 5031.65ms 
iter 11378: loss 2.4253, time 5019.22ms 
iter 11379: loss 2.3128, time 5028.24ms 
iter 11380: loss 2.4934, time 5029.94ms 
iter 11381: loss 2.5372, time 5017.65ms 
iter 11382: loss 2.1808, time 4958.75ms 
iter 11383: loss 2.4659, time 5031.18ms 
iter 11384: loss 2.4864, time 5030.94ms 
iter 11385: loss 2.4193, time 5036.52ms 
iter 11386: loss 2.6928, time 5031.71ms 
iter 11387: loss 2.6234, time 5036.86ms 
iter 11388: loss 2.4488, time 5025.09ms 
iter 11389: loss 2.3376, time 5024.08ms 
iter 11390: loss 2.2759, time 5027.64ms 
iter 11391: loss 2.1993, time 5018.85ms 
iter 11392: loss 2.2235, time 5027.71ms 
iter 11393: loss 2.2164, time 5030.93ms 
iter 11394: loss 2.3225, time 5013.75ms 
iter 11395: loss 2.5151, time 4974.00ms 
iter 11396: loss 2.4287, time 4992.46ms 
iter 11397: loss 2.3043, time 5018.72ms 
iter 11398: loss 2.2904, time 5016.60ms 
iter 11399: loss 2.3248, time 5028.60ms 
step 11400: train loss 2.4168, val loss 2.8530
iter 11400: loss 2.3101, time 19700.42ms 
iter 11401: loss 2.6960, time 5022.52ms 
iter 11402: loss 2.2965, time 5029.46ms 
iter 11403: loss 2.4664, time 5026.85ms 
iter 11404: loss 2.4990, time 5012.15ms 
iter 11405: loss 2.4760, time 5025.11ms 
iter 11406: loss 2.4850, time 5025.43ms 
iter 11407: loss 2.5748, time 5005.93ms 
iter 11408: loss 2.3301, time 5023.79ms 
iter 11409: loss 2.3795, time 5022.43ms 
iter 11410: loss 2.3620, time 5021.72ms 
iter 11411: loss 2.5420, time 5029.62ms 
iter 11412: loss 2.3178, time 5025.07ms 
iter 11413: loss 2.5244, time 5025.13ms 
iter 11414: loss 2.4685, time 5030.96ms 
iter 11415: loss 2.3549, time 5032.14ms 
iter 11416: loss 2.3623, time 5025.50ms 
iter 11417: loss 2.5018, time 5023.75ms 
iter 11418: loss 2.3955, time 5025.41ms 
iter 11419: loss 2.5133, time 5024.62ms 
iter 11420: loss 2.2000, time 5027.51ms 
iter 11421: loss 2.3379, time 5027.21ms 
iter 11422: loss 2.3417, time 5000.53ms 
iter 11423: loss 2.4992, time 5004.61ms 
iter 11424: loss 2.3266, time 5020.32ms 
iter 11425: loss 2.3209, time 5014.85ms 
iter 11426: loss 2.1906, time 5031.18ms 
iter 11427: loss 2.4751, time 5029.40ms 
iter 11428: loss 2.5142, time 5024.09ms 
iter 11429: loss 2.5553, time 5023.91ms 
iter 11430: loss 2.3639, time 5000.74ms 
iter 11431: loss 2.4313, time 5024.41ms 
iter 11432: loss 2.6068, time 5025.81ms 
iter 11433: loss 2.2807, time 5006.42ms 
iter 11434: loss 2.4668, time 5022.42ms 
iter 11435: loss 2.4197, time 5022.94ms 
iter 11436: loss 2.4995, time 5030.44ms 
iter 11437: loss 2.3808, time 5010.76ms 
iter 11438: loss 2.2960, time 5025.86ms 
iter 11439: loss 2.2718, time 5029.14ms 
iter 11440: loss 2.4748, time 5018.59ms 
iter 11441: loss 2.4554, time 5015.59ms 
iter 11442: loss 2.5248, time 5002.04ms 
iter 11443: loss 2.5474, time 4982.13ms 
iter 11444: loss 2.4233, time 5001.19ms 
iter 11445: loss 2.3166, time 5024.17ms 
iter 11446: loss 2.4094, time 5023.79ms 
iter 11447: loss 2.1217, time 5022.72ms 
iter 11448: loss 2.3861, time 5021.84ms 
iter 11449: loss 2.3189, time 5019.30ms 
step 11450: train loss 2.4280, val loss 2.8535
iter 11450: loss 2.1564, time 19684.56ms 
iter 11451: loss 2.2654, time 5024.63ms 
iter 11452: loss 2.5847, time 5023.02ms 
iter 11453: loss 2.4512, time 5023.16ms 
iter 11454: loss 2.6086, time 4998.08ms 
iter 11455: loss 2.5378, time 5030.24ms 
iter 11456: loss 2.3133, time 5020.36ms 
iter 11457: loss 2.5884, time 5028.05ms 
iter 11458: loss 2.2335, time 5026.23ms 
iter 11459: loss 2.5108, time 5027.35ms 
iter 11460: loss 2.5852, time 5007.68ms 
iter 11461: loss 2.3728, time 5024.44ms 
iter 11462: loss 2.4636, time 5027.02ms 
iter 11463: loss 2.3088, time 5015.63ms 
iter 11464: loss 2.2496, time 5029.10ms 
iter 11465: loss 2.6156, time 5022.71ms 
iter 11466: loss 2.6338, time 5021.87ms 
iter 11467: loss 2.3178, time 5024.12ms 
iter 11468: loss 2.4129, time 5023.94ms 
iter 11469: loss 2.6311, time 5003.68ms 
iter 11470: loss 2.3830, time 4982.68ms 
iter 11471: loss 2.5763, time 5001.08ms 
iter 11472: loss 2.3080, time 5014.26ms 
iter 11473: loss 2.4592, time 5020.94ms 
iter 11474: loss 2.5179, time 5023.79ms 
iter 11475: loss 2.4725, time 5022.30ms 
iter 11476: loss 2.3206, time 5022.79ms 
iter 11477: loss 2.3298, time 5029.90ms 
iter 11478: loss 2.3289, time 5004.57ms 
iter 11479: loss 2.5660, time 5021.09ms 
iter 11480: loss 2.5275, time 5022.53ms 
iter 11481: loss 2.3742, time 5018.83ms 
iter 11482: loss 2.4365, time 5021.84ms 
iter 11483: loss 2.2866, time 5022.45ms 
iter 11484: loss 2.4752, time 5019.67ms 
iter 11485: loss 2.3873, time 5024.25ms 
iter 11486: loss 2.5681, time 5022.73ms 
iter 11487: loss 2.3274, time 5024.47ms 
iter 11488: loss 2.2574, time 5022.14ms 
iter 11489: loss 2.0995, time 5023.09ms 
iter 11490: loss 2.5946, time 5022.70ms 
iter 11491: loss 2.2924, time 5024.72ms 
iter 11492: loss 2.1411, time 5027.32ms 
iter 11493: loss 2.4358, time 5013.73ms 
iter 11494: loss 2.5457, time 5023.22ms 
iter 11495: loss 2.5756, time 4982.41ms 
iter 11496: loss 2.6477, time 4923.25ms 
iter 11497: loss 2.3909, time 5023.67ms 
iter 11498: loss 2.3154, time 5024.51ms 
iter 11499: loss 2.4476, time 5026.04ms 
step 11500: train loss 2.4232, val loss 2.8692
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 11500: loss 2.4622, time 20654.43ms 
iter 11501: loss 2.1259, time 5024.45ms 
iter 11502: loss 2.6306, time 5022.82ms 
iter 11503: loss 2.4392, time 5025.51ms 
iter 11504: loss 2.5544, time 4989.87ms 
iter 11505: loss 2.4984, time 5019.72ms 
iter 11506: loss 2.4395, time 5011.92ms 
iter 11507: loss 2.1729, time 5022.99ms 
iter 11508: loss 2.4794, time 5026.50ms 
iter 11509: loss 2.3651, time 5023.29ms 
iter 11510: loss 2.5743, time 5022.70ms 
iter 11511: loss 2.2290, time 5027.85ms 
iter 11512: loss 2.3620, time 4975.19ms 
iter 11513: loss 2.4612, time 5027.23ms 
iter 11514: loss 2.1877, time 5026.96ms 
iter 11515: loss 2.4517, time 5032.44ms 
iter 11516: loss 2.4041, time 4993.25ms 
iter 11517: loss 2.3051, time 5028.53ms 
iter 11518: loss 2.4416, time 5036.30ms 
iter 11519: loss 2.4207, time 5033.43ms 
iter 11520: loss 2.3035, time 5021.48ms 
iter 11521: loss 2.3372, time 5026.52ms 
iter 11522: loss 2.5540, time 5033.11ms 
iter 11523: loss 2.4660, time 5040.74ms 
iter 11524: loss 2.6407, time 4989.90ms 
iter 11525: loss 2.4377, time 5020.63ms 
iter 11526: loss 2.4893, time 5029.27ms 
iter 11527: loss 2.6387, time 5027.43ms 
iter 11528: loss 2.3506, time 5025.52ms 
iter 11529: loss 2.6938, time 5027.15ms 
iter 11530: loss 2.4462, time 5032.27ms 
iter 11531: loss 2.5505, time 4984.05ms 
iter 11532: loss 2.4496, time 5076.50ms 
iter 11533: loss 2.3830, time 5036.70ms 
iter 11534: loss 2.4456, time 5015.65ms 
iter 11535: loss 2.5795, time 4979.82ms 
iter 11536: loss 2.2401, time 4920.17ms 
iter 11537: loss 2.4769, time 4918.31ms 
iter 11538: loss 2.4679, time 4918.79ms 
iter 11539: loss 2.4700, time 4922.04ms 
iter 11540: loss 2.5725, time 4921.19ms 
iter 11541: loss 2.5716, time 4929.92ms 
iter 11542: loss 2.2933, time 4947.25ms 
iter 11543: loss 2.4458, time 4969.15ms 
iter 11544: loss 2.4077, time 4916.13ms 
iter 11545: loss 2.7093, time 4915.31ms 
iter 11546: loss 2.4559, time 4967.81ms 
iter 11547: loss 2.5268, time 5021.94ms 
iter 11548: loss 2.5896, time 4966.74ms 
iter 11549: loss 2.5524, time 4974.50ms 
step 11550: train loss 2.4239, val loss 2.8509
iter 11550: loss 2.5102, time 19665.62ms 
iter 11551: loss 2.4139, time 5026.73ms 
iter 11552: loss 2.2188, time 5025.42ms 
iter 11553: loss 2.3241, time 5028.34ms 
iter 11554: loss 2.6286, time 5010.32ms 
iter 11555: loss 2.3266, time 4980.64ms 
iter 11556: loss 2.4172, time 4964.70ms 
iter 11557: loss 2.5474, time 5026.29ms 
iter 11558: loss 2.5237, time 5023.81ms 
iter 11559: loss 2.3795, time 5027.13ms 
iter 11560: loss 2.3911, time 5025.42ms 
iter 11561: loss 2.6073, time 5046.32ms 
iter 11562: loss 2.4975, time 5039.28ms 
iter 11563: loss 2.3095, time 4938.59ms 
iter 11564: loss 2.3742, time 4919.55ms 
iter 11565: loss 2.5509, time 4922.23ms 
iter 11566: loss 2.5974, time 4922.69ms 
iter 11567: loss 2.1084, time 4919.92ms 
iter 11568: loss 2.5555, time 4920.59ms 
iter 11569: loss 2.4352, time 4992.92ms 
iter 11570: loss 2.5281, time 5038.04ms 
iter 11571: loss 2.7380, time 5021.21ms 
iter 11572: loss 2.4792, time 5018.96ms 
iter 11573: loss 2.5166, time 5016.48ms 
iter 11574: loss 2.2187, time 5002.27ms 
iter 11575: loss 2.5708, time 5006.80ms 
iter 11576: loss 2.3066, time 4979.70ms 
iter 11577: loss 2.3732, time 4917.18ms 
iter 11578: loss 2.3593, time 4914.33ms 
iter 11579: loss 2.4838, time 5005.54ms 
iter 11580: loss 2.5177, time 5002.10ms 
iter 11581: loss 2.4956, time 5024.06ms 
iter 11582: loss 2.5534, time 5023.88ms 
iter 11583: loss 2.3734, time 5024.40ms 
iter 11584: loss 2.3556, time 5024.71ms 
iter 11585: loss 2.4520, time 4998.04ms 
iter 11586: loss 2.2377, time 4914.72ms 
iter 11587: loss 2.3761, time 4938.76ms 
iter 11588: loss 2.3831, time 4944.27ms 
iter 11589: loss 2.3937, time 5023.86ms 
iter 11590: loss 2.5845, time 5018.45ms 
iter 11591: loss 2.3248, time 5021.67ms 
iter 11592: loss 2.5986, time 5020.70ms 
iter 11593: loss 2.4593, time 5017.44ms 
iter 11594: loss 2.5052, time 5021.94ms 
iter 11595: loss 2.3552, time 5012.80ms 
iter 11596: loss 2.3526, time 4913.39ms 
iter 11597: loss 2.3599, time 4912.79ms 
iter 11598: loss 2.3257, time 4954.39ms 
iter 11599: loss 2.3915, time 4969.35ms 
step 11600: train loss 2.4151, val loss 2.8771
iter 11600: loss 2.4241, time 19616.38ms 
iter 11601: loss 2.3818, time 4917.92ms 
iter 11602: loss 2.4727, time 5012.47ms 
iter 11603: loss 2.5263, time 5019.38ms 
iter 11604: loss 2.3320, time 5012.98ms 
iter 11605: loss 2.4002, time 5011.34ms 
iter 11606: loss 2.2919, time 5020.32ms 
iter 11607: loss 2.3473, time 5019.37ms 
iter 11608: loss 2.5217, time 5023.65ms 
iter 11609: loss 2.4733, time 4969.68ms 
iter 11610: loss 2.5875, time 4916.41ms 
iter 11611: loss 2.4241, time 4916.05ms 
iter 11612: loss 2.2371, time 4914.61ms 
iter 11613: loss 2.4355, time 4912.14ms 
iter 11614: loss 2.2839, time 4997.06ms 
iter 11615: loss 2.4871, time 5006.10ms 
iter 11616: loss 2.5562, time 4999.58ms 
iter 11617: loss 2.6575, time 5020.26ms 
iter 11618: loss 2.4492, time 5010.99ms 
iter 11619: loss 2.4197, time 5031.26ms 
iter 11620: loss 2.3644, time 5031.65ms 
iter 11621: loss 2.2385, time 4977.92ms 
iter 11622: loss 2.2070, time 4917.62ms 
iter 11623: loss 2.3924, time 4946.27ms 
iter 11624: loss 2.4251, time 4920.05ms 
iter 11625: loss 2.4680, time 4978.72ms 
iter 11626: loss 2.5525, time 4996.71ms 
iter 11627: loss 2.4188, time 5025.71ms 
iter 11628: loss 2.6419, time 5026.61ms 
iter 11629: loss 2.6253, time 4992.53ms 
iter 11630: loss 2.4648, time 5003.26ms 
iter 11631: loss 2.0710, time 4997.98ms 
iter 11632: loss 2.3661, time 4946.53ms 
iter 11633: loss 2.3371, time 4959.96ms 
iter 11634: loss 2.4595, time 4952.20ms 
iter 11635: loss 2.4068, time 4944.66ms 
iter 11636: loss 2.5173, time 4921.19ms 
iter 11637: loss 2.6201, time 4926.59ms 
iter 11638: loss 2.3791, time 4927.79ms 
iter 11639: loss 2.2044, time 4943.67ms 
iter 11640: loss 2.4304, time 4964.35ms 
iter 11641: loss 2.4404, time 5021.36ms 
iter 11642: loss 2.4663, time 4999.45ms 
iter 11643: loss 2.4487, time 5031.16ms 
iter 11644: loss 2.3097, time 5032.08ms 
iter 11645: loss 2.6831, time 5023.54ms 
iter 11646: loss 2.4103, time 4964.08ms 
iter 11647: loss 2.2775, time 4916.86ms 
iter 11648: loss 2.4054, time 4915.95ms 
iter 11649: loss 2.2085, time 4918.42ms 
step 11650: train loss 2.4111, val loss 2.8625
iter 11650: loss 2.4141, time 19707.74ms 
iter 11651: loss 2.5456, time 4989.15ms 
iter 11652: loss 2.5278, time 4989.00ms 
iter 11653: loss 2.4768, time 5003.92ms 
iter 11654: loss 2.4056, time 4968.04ms 
iter 11655: loss 2.6198, time 4917.97ms 
iter 11656: loss 2.3896, time 4915.20ms 
iter 11657: loss 2.2691, time 4922.75ms 
iter 11658: loss 2.2339, time 4942.34ms 
iter 11659: loss 2.4027, time 4975.22ms 
iter 11660: loss 2.5030, time 4989.87ms 
iter 11661: loss 2.3704, time 4990.07ms 
iter 11662: loss 2.0577, time 4997.41ms 
iter 11663: loss 2.4913, time 5009.70ms 
iter 11664: loss 2.1973, time 5004.80ms 
iter 11665: loss 2.1044, time 4993.60ms 
iter 11666: loss 2.4948, time 4935.92ms 
iter 11667: loss 2.3614, time 4922.01ms 
iter 11668: loss 2.5406, time 4934.09ms 
iter 11669: loss 2.3392, time 4930.76ms 
iter 11670: loss 2.4481, time 4920.06ms 
iter 11671: loss 2.4571, time 4917.99ms 
iter 11672: loss 2.4876, time 4996.95ms 
iter 11673: loss 2.6335, time 4978.62ms 
iter 11674: loss 2.5089, time 5034.62ms 
iter 11675: loss 2.2279, time 5009.92ms 
iter 11676: loss 2.2856, time 5018.74ms 
iter 11677: loss 2.4208, time 5032.07ms 
iter 11678: loss 2.5328, time 5024.97ms 
iter 11679: loss 2.7268, time 4915.43ms 
iter 11680: loss 2.3792, time 4916.25ms 
iter 11681: loss 2.4068, time 4923.14ms 
iter 11682: loss 2.3988, time 4926.31ms 
iter 11683: loss 2.2534, time 4919.24ms 
iter 11684: loss 2.3211, time 4938.72ms 
iter 11685: loss 2.2241, time 5026.60ms 
iter 11686: loss 2.2810, time 5016.94ms 
iter 11687: loss 2.4314, time 5011.48ms 
iter 11688: loss 2.1045, time 5012.09ms 
iter 11689: loss 2.3178, time 5025.27ms 
iter 11690: loss 2.3104, time 5038.89ms 
iter 11691: loss 2.1497, time 5038.83ms 
iter 11692: loss 2.5943, time 4979.72ms 
iter 11693: loss 2.2941, time 4922.05ms 
iter 11694: loss 2.4550, time 4916.80ms 
iter 11695: loss 2.1133, time 4939.90ms 
iter 11696: loss 2.5846, time 5034.97ms 
iter 11697: loss 2.3473, time 5029.12ms 
iter 11698: loss 2.3233, time 5015.58ms 
iter 11699: loss 2.3822, time 5022.21ms 
step 11700: train loss 2.4055, val loss 2.8483
iter 11700: loss 2.4311, time 19633.36ms 
iter 11701: loss 2.6329, time 4923.96ms 
iter 11702: loss 2.4672, time 4918.66ms 
iter 11703: loss 2.4556, time 4918.87ms 
iter 11704: loss 2.5357, time 4918.07ms 
iter 11705: loss 2.4604, time 4989.29ms 
iter 11706: loss 2.6045, time 5043.87ms 
iter 11707: loss 2.4926, time 5020.87ms 
iter 11708: loss 2.5712, time 5009.24ms 
iter 11709: loss 2.7072, time 5021.84ms 
iter 11710: loss 2.5020, time 5026.19ms 
iter 11711: loss 2.7940, time 5029.30ms 
iter 11712: loss 2.3737, time 4977.34ms 
iter 11713: loss 2.3031, time 4921.71ms 
iter 11714: loss 2.4077, time 4930.78ms 
iter 11715: loss 2.5235, time 4916.68ms 
iter 11716: loss 2.5863, time 4945.73ms 
iter 11717: loss 2.4285, time 5012.81ms 
iter 11718: loss 2.4149, time 5032.53ms 
iter 11719: loss 2.5600, time 5029.50ms 
iter 11720: loss 2.7063, time 5027.53ms 
iter 11721: loss 2.5258, time 5033.84ms 
iter 11722: loss 2.4458, time 5044.29ms 
iter 11723: loss 2.4391, time 5029.65ms 
iter 11724: loss 2.2938, time 4985.33ms 
iter 11725: loss 2.3045, time 4920.56ms 
iter 11726: loss 2.3464, time 4920.51ms 
iter 11727: loss 2.3809, time 4919.89ms 
iter 11728: loss 2.2089, time 4984.15ms 
iter 11729: loss 2.6359, time 5010.99ms 
iter 11730: loss 2.4746, time 5036.72ms 
iter 11731: loss 2.4608, time 5025.14ms 
iter 11732: loss 2.3486, time 5022.39ms 
iter 11733: loss 2.4795, time 5028.53ms 
iter 11734: loss 2.2096, time 5032.28ms 
iter 11735: loss 2.4498, time 4980.87ms 
iter 11736: loss 2.2220, time 4919.13ms 
iter 11737: loss 2.3905, time 4916.80ms 
iter 11738: loss 2.3730, time 4916.82ms 
iter 11739: loss 2.7564, time 4930.47ms 
iter 11740: loss 2.4201, time 4916.27ms 
iter 11741: loss 2.4411, time 5013.35ms 
iter 11742: loss 2.2436, time 5004.16ms 
iter 11743: loss 2.4888, time 5001.18ms 
iter 11744: loss 2.6210, time 5039.56ms 
iter 11745: loss 2.4964, time 5010.44ms 
iter 11746: loss 2.5390, time 5011.58ms 
iter 11747: loss 2.5337, time 5027.06ms 
iter 11748: loss 1.9847, time 4988.83ms 
iter 11749: loss 2.5199, time 4923.35ms 
step 11750: train loss 2.4043, val loss 2.8715
iter 11750: loss 2.5877, time 19704.78ms 
iter 11751: loss 2.3170, time 4942.81ms 
iter 11752: loss 2.5027, time 4960.84ms 
iter 11753: loss 2.4879, time 4954.34ms 
iter 11754: loss 2.2815, time 4993.86ms 
iter 11755: loss 2.3475, time 5024.06ms 
iter 11756: loss 2.5692, time 5019.29ms 
iter 11757: loss 2.3231, time 5021.68ms 
iter 11758: loss 2.7278, time 4961.58ms 
iter 11759: loss 2.3499, time 4979.35ms 
iter 11760: loss 2.2743, time 4933.82ms 
iter 11761: loss 2.4330, time 4920.04ms 
iter 11762: loss 2.4701, time 4922.24ms 
iter 11763: loss 2.5144, time 4920.06ms 
iter 11764: loss 2.4262, time 4916.95ms 
iter 11765: loss 2.1496, time 4916.24ms 
iter 11766: loss 2.1645, time 4950.27ms 
iter 11767: loss 2.2651, time 4993.50ms 
iter 11768: loss 2.4090, time 5018.87ms 
iter 11769: loss 2.3183, time 5017.49ms 
iter 11770: loss 2.5772, time 4981.63ms 
iter 11771: loss 2.2092, time 4981.32ms 
iter 11772: loss 2.4505, time 4915.01ms 
iter 11773: loss 2.3236, time 4915.26ms 
iter 11774: loss 2.5283, time 4916.29ms 
iter 11775: loss 2.5317, time 4915.86ms 
iter 11776: loss 2.1950, time 4915.25ms 
iter 11777: loss 2.3870, time 4918.04ms 
iter 11778: loss 2.4133, time 4922.33ms 
iter 11779: loss 2.5704, time 5032.88ms 
iter 11780: loss 2.5939, time 5042.26ms 
iter 11781: loss 2.3182, time 4960.89ms 
iter 11782: loss 2.4493, time 4959.32ms 
iter 11783: loss 2.5603, time 4915.27ms 
iter 11784: loss 2.3240, time 4987.47ms 
iter 11785: loss 2.3404, time 5012.29ms 
iter 11786: loss 2.4477, time 4944.71ms 
iter 11787: loss 2.2049, time 4954.27ms 
iter 11788: loss 2.6237, time 4937.18ms 
iter 11789: loss 2.3220, time 4924.53ms 
iter 11790: loss 2.4886, time 4920.02ms 
iter 11791: loss 2.5358, time 4980.82ms 
iter 11792: loss 2.5603, time 4969.92ms 
iter 11793: loss 2.4541, time 4951.72ms 
iter 11794: loss 2.6180, time 5021.45ms 
iter 11795: loss 2.4594, time 5026.80ms 
iter 11796: loss 2.2561, time 5010.25ms 
iter 11797: loss 2.3336, time 4997.98ms 
iter 11798: loss 2.3183, time 4940.22ms 
iter 11799: loss 2.3772, time 4925.32ms 
step 11800: train loss 2.4234, val loss 2.8589
iter 11800: loss 2.5237, time 19759.48ms 
iter 11801: loss 2.3879, time 5021.01ms 
iter 11802: loss 2.2095, time 5044.53ms 
iter 11803: loss 2.6304, time 4982.83ms 
iter 11804: loss 2.2725, time 5012.71ms 
iter 11805: loss 2.3861, time 4998.90ms 
iter 11806: loss 2.5234, time 5002.88ms 
iter 11807: loss 2.6527, time 4975.83ms 
iter 11808: loss 2.3282, time 4919.36ms 
iter 11809: loss 2.3815, time 4917.93ms 
iter 11810: loss 2.4150, time 4927.87ms 
iter 11811: loss 2.2116, time 5036.69ms 
iter 11812: loss 2.6403, time 5032.08ms 
iter 11813: loss 2.3745, time 5030.30ms 
iter 11814: loss 2.4563, time 5016.40ms 
iter 11815: loss 2.4691, time 5032.69ms 
iter 11816: loss 2.3165, time 5007.92ms 
iter 11817: loss 2.4334, time 5016.27ms 
iter 11818: loss 2.1942, time 4924.37ms 
iter 11819: loss 2.0452, time 4917.12ms 
iter 11820: loss 2.6199, time 4915.52ms 
iter 11821: loss 2.4983, time 4917.13ms 
iter 11822: loss 2.5453, time 4916.06ms 
iter 11823: loss 2.6028, time 4917.27ms 
iter 11824: loss 1.9554, time 4936.23ms 
iter 11825: loss 2.4015, time 4933.29ms 
iter 11826: loss 2.5981, time 4917.28ms 
iter 11827: loss 2.7066, time 4916.55ms 
iter 11828: loss 2.5594, time 4918.35ms 
iter 11829: loss 2.4666, time 4956.00ms 
iter 11830: loss 2.4418, time 5026.65ms 
iter 11831: loss 2.3876, time 5020.63ms 
iter 11832: loss 2.3367, time 5025.22ms 
iter 11833: loss 2.6333, time 5031.92ms 
iter 11834: loss 2.4244, time 5031.51ms 
iter 11835: loss 2.6650, time 5010.74ms 
iter 11836: loss 2.5908, time 4995.05ms 
iter 11837: loss 2.5000, time 4954.59ms 
iter 11838: loss 2.3503, time 4954.38ms 
iter 11839: loss 2.3827, time 4920.95ms 
iter 11840: loss 2.3846, time 4997.65ms 
iter 11841: loss 2.2553, time 5006.83ms 
iter 11842: loss 2.3495, time 5036.10ms 
iter 11843: loss 2.4106, time 5028.15ms 
iter 11844: loss 2.3204, time 5028.83ms 
iter 11845: loss 2.3954, time 5016.59ms 
iter 11846: loss 2.3424, time 5003.48ms 
iter 11847: loss 2.4032, time 5023.30ms 
iter 11848: loss 2.2499, time 4979.36ms 
iter 11849: loss 2.4688, time 4918.25ms 
step 11850: train loss 2.4166, val loss 2.8664
iter 11850: loss 2.4545, time 19679.92ms 
iter 11851: loss 2.4406, time 5030.53ms 
iter 11852: loss 2.3045, time 5025.86ms 
iter 11853: loss 2.6856, time 5005.64ms 
iter 11854: loss 2.4926, time 5024.78ms 
iter 11855: loss 2.5513, time 4971.75ms 
iter 11856: loss 2.3822, time 4916.15ms 
iter 11857: loss 2.4637, time 4915.65ms 
iter 11858: loss 2.3674, time 4922.29ms 
iter 11859: loss 2.0558, time 4980.59ms 
iter 11860: loss 2.2679, time 5013.01ms 
iter 11861: loss 2.3419, time 5031.56ms 
iter 11862: loss 2.1415, time 5029.34ms 
iter 11863: loss 2.4434, time 5021.28ms 
iter 11864: loss 2.6506, time 5015.78ms 
iter 11865: loss 2.2181, time 5016.98ms 
iter 11866: loss 2.3982, time 4919.11ms 
iter 11867: loss 2.3364, time 4915.82ms 
iter 11868: loss 2.3941, time 4916.76ms 
iter 11869: loss 2.4963, time 4930.24ms 
iter 11870: loss 2.5397, time 4969.60ms 
iter 11871: loss 2.6141, time 5033.44ms 
iter 11872: loss 2.5180, time 5028.57ms 
iter 11873: loss 2.5625, time 5020.67ms 
iter 11874: loss 2.2922, time 5027.16ms 
iter 11875: loss 2.2774, time 5031.85ms 
iter 11876: loss 2.1322, time 4984.12ms 
iter 11877: loss 2.2812, time 4924.45ms 
iter 11878: loss 2.4857, time 4917.72ms 
iter 11879: loss 2.4892, time 4916.17ms 
iter 11880: loss 2.5930, time 4919.81ms 
iter 11881: loss 2.3296, time 5023.97ms 
iter 11882: loss 2.5595, time 5032.77ms 
iter 11883: loss 2.3118, time 5031.69ms 
iter 11884: loss 2.4684, time 5037.34ms 
iter 11885: loss 2.2487, time 5033.34ms 
iter 11886: loss 2.6285, time 5032.70ms 
iter 11887: loss 2.4888, time 5035.55ms 
iter 11888: loss 2.3850, time 4984.34ms 
iter 11889: loss 2.3541, time 4916.93ms 
iter 11890: loss 2.2019, time 4915.84ms 
iter 11891: loss 2.5169, time 5020.55ms 
iter 11892: loss 2.4350, time 5027.37ms 
iter 11893: loss 2.4997, time 5029.60ms 
iter 11894: loss 2.3927, time 5026.84ms 
iter 11895: loss 2.6019, time 5027.09ms 
iter 11896: loss 2.4064, time 5028.94ms 
iter 11897: loss 2.5861, time 5032.99ms 
iter 11898: loss 2.4501, time 4980.06ms 
iter 11899: loss 2.2570, time 4948.98ms 
step 11900: train loss 2.4256, val loss 2.8701
iter 11900: loss 2.4610, time 19688.91ms 
iter 11901: loss 2.2954, time 4917.57ms 
iter 11902: loss 2.4586, time 4988.98ms 
iter 11903: loss 2.5464, time 5025.68ms 
iter 11904: loss 2.4875, time 5026.47ms 
iter 11905: loss 2.5794, time 5020.26ms 
iter 11906: loss 2.6196, time 5027.63ms 
iter 11907: loss 2.4640, time 5017.26ms 
iter 11908: loss 2.4442, time 4997.05ms 
iter 11909: loss 2.5633, time 4972.42ms 
iter 11910: loss 2.4929, time 4915.64ms 
iter 11911: loss 2.5610, time 4914.84ms 
iter 11912: loss 2.5398, time 4984.22ms 
iter 11913: loss 2.2932, time 5025.83ms 
iter 11914: loss 2.5610, time 5032.32ms 
iter 11915: loss 2.4033, time 5001.39ms 
iter 11916: loss 2.1773, time 5018.18ms 
iter 11917: loss 2.3143, time 5030.58ms 
iter 11918: loss 2.3305, time 5024.80ms 
iter 11919: loss 2.3040, time 4975.67ms 
iter 11920: loss 2.2929, time 4917.59ms 
iter 11921: loss 2.4898, time 4934.39ms 
iter 11922: loss 2.5719, time 5030.08ms 
iter 11923: loss 2.5815, time 5029.55ms 
iter 11924: loss 2.3892, time 5028.91ms 
iter 11925: loss 2.4228, time 5029.91ms 
iter 11926: loss 2.3948, time 4998.43ms 
iter 11927: loss 2.5586, time 5000.98ms 
iter 11928: loss 2.3563, time 5028.55ms 
iter 11929: loss 2.5529, time 4972.64ms 
iter 11930: loss 2.3203, time 4916.04ms 
iter 11931: loss 2.4492, time 4915.77ms 
iter 11932: loss 2.2812, time 4966.74ms 
iter 11933: loss 2.5960, time 5012.26ms 
iter 11934: loss 2.2886, time 5018.68ms 
iter 11935: loss 2.3718, time 5027.48ms 
iter 11936: loss 2.4594, time 5001.48ms 
iter 11937: loss 2.4260, time 5023.97ms 
iter 11938: loss 2.4375, time 5007.69ms 
iter 11939: loss 2.3973, time 4996.08ms 
iter 11940: loss 2.4216, time 4968.32ms 
iter 11941: loss 2.4372, time 4964.20ms 
iter 11942: loss 2.3192, time 4993.34ms 
iter 11943: loss 2.4465, time 5011.20ms 
iter 11944: loss 2.3057, time 5003.06ms 
iter 11945: loss 2.1365, time 4961.21ms 
iter 11946: loss 2.3170, time 4915.42ms 
iter 11947: loss 2.2994, time 4934.85ms 
iter 11948: loss 2.6751, time 5006.56ms 
iter 11949: loss 2.3600, time 4954.19ms 
step 11950: train loss 2.4113, val loss 2.8593
iter 11950: loss 2.1549, time 19683.08ms 
iter 11951: loss 2.4222, time 5009.34ms 
iter 11952: loss 2.4963, time 4942.10ms 
iter 11953: loss 2.1378, time 4976.60ms 
iter 11954: loss 2.3042, time 5014.14ms 
iter 11955: loss 2.2557, time 5023.58ms 
iter 11956: loss 2.2664, time 4970.28ms 
iter 11957: loss 2.2518, time 4916.26ms 
iter 11958: loss 2.1939, time 4965.42ms 
iter 11959: loss 2.4209, time 5013.76ms 
iter 11960: loss 2.5512, time 5014.84ms 
iter 11961: loss 2.5695, time 5001.25ms 
iter 11962: loss 2.5572, time 5019.50ms 
iter 11963: loss 2.2070, time 5003.81ms 
iter 11964: loss 2.2966, time 5010.88ms 
iter 11965: loss 2.3331, time 4970.26ms 
iter 11966: loss 2.6309, time 4916.15ms 
iter 11967: loss 2.1280, time 4927.97ms 
iter 11968: loss 2.2078, time 5025.03ms 
iter 11969: loss 2.1797, time 5017.95ms 
iter 11970: loss 2.5605, time 4992.33ms 
iter 11971: loss 2.4475, time 5031.13ms 
iter 11972: loss 2.5688, time 5011.95ms 
iter 11973: loss 2.5780, time 4962.34ms 
iter 11974: loss 2.4890, time 4914.97ms 
iter 11975: loss 2.2654, time 4914.67ms 
iter 11976: loss 2.5316, time 4917.51ms 
iter 11977: loss 2.4431, time 4921.47ms 
iter 11978: loss 2.3453, time 5015.90ms 
iter 11979: loss 2.5525, time 5028.27ms 
iter 11980: loss 2.6170, time 5030.38ms 
iter 11981: loss 2.1982, time 5029.82ms 
iter 11982: loss 2.5695, time 5027.79ms 
iter 11983: loss 2.3588, time 5028.10ms 
iter 11984: loss 2.6174, time 4979.27ms 
iter 11985: loss 2.3477, time 4960.47ms 
iter 11986: loss 2.3171, time 4954.23ms 
iter 11987: loss 2.4993, time 4919.39ms 
iter 11988: loss 2.3862, time 5008.78ms 
iter 11989: loss 2.4247, time 5033.13ms 
iter 11990: loss 2.3863, time 5031.02ms 
iter 11991: loss 2.4527, time 5032.51ms 
iter 11992: loss 2.4547, time 5033.13ms 
iter 11993: loss 2.4609, time 5025.02ms 
iter 11994: loss 2.2791, time 5017.31ms 
iter 11995: loss 2.5662, time 4983.46ms 
iter 11996: loss 2.5974, time 4923.16ms 
iter 11997: loss 2.3339, time 4918.79ms 
iter 11998: loss 2.4765, time 4959.82ms 
iter 11999: loss 2.2511, time 4920.74ms 
step 12000: train loss 2.4241, val loss 2.8702
saving checkpoint to /root/autodl-tmp/openelm_train/output_data
iter 12000: loss 2.4593, time 20925.57ms 
iter 12001: loss 2.6513, time 5006.98ms 
iter 12002: loss 2.5561, time 5027.03ms 
iter 12003: loss 2.5455, time 4982.94ms 
iter 12004: loss 2.1995, time 4917.46ms 
iter 12005: loss 2.4826, time 4916.50ms 
iter 12006: loss 2.5933, time 4931.73ms 
iter 12007: loss 2.2187, time 4946.88ms 
iter 12008: loss 2.4044, time 4941.55ms 
iter 12009: loss 2.3505, time 5019.80ms 
iter 12010: loss 2.3091, time 5025.20ms 
iter 12011: loss 2.4688, time 5022.28ms 
iter 12012: loss 2.4488, time 5023.97ms 
iter 12013: loss 2.3843, time 5025.40ms 
iter 12014: loss 2.8034, time 4995.63ms 
iter 12015: loss 2.3997, time 4973.11ms 
iter 12016: loss 2.4727, time 4917.09ms 
iter 12017: loss 2.3441, time 4916.99ms 
iter 12018: loss 2.3750, time 4936.79ms 
iter 12019: loss 2.4974, time 5023.74ms 
iter 12020: loss 2.3659, time 5028.16ms 
iter 12021: loss 2.7978, time 5009.80ms 
iter 12022: loss 2.3021, time 5026.00ms 
iter 12023: loss 2.3739, time 5026.61ms 
iter 12024: loss 2.5533, time 5023.97ms 
iter 12025: loss 2.4638, time 5031.63ms 
iter 12026: loss 2.3793, time 4978.13ms 
iter 12027: loss 2.4524, time 4915.89ms 
iter 12028: loss 2.3674, time 4914.69ms 
iter 12029: loss 2.4249, time 4912.89ms 
iter 12030: loss 2.5699, time 4937.95ms 
iter 12031: loss 2.4694, time 5030.20ms 
iter 12032: loss 2.2366, time 5031.49ms 
iter 12033: loss 2.3730, time 5002.89ms 
iter 12034: loss 2.3039, time 5012.91ms 
iter 12035: loss 2.3731, time 5033.06ms 
iter 12036: loss 2.4229, time 4966.00ms 
iter 12037: loss 2.4392, time 5022.40ms 
iter 12038: loss 2.5157, time 4924.56ms 
iter 12039: loss 2.4125, time 4936.15ms 
iter 12040: loss 2.5150, time 4976.15ms 
iter 12041: loss 2.7322, time 5035.03ms 
iter 12042: loss 2.3728, time 5033.36ms 
iter 12043: loss 2.3926, time 5039.25ms 
iter 12044: loss 2.4296, time 5035.70ms 
iter 12045: loss 2.3021, time 5026.79ms 
iter 12046: loss 2.3483, time 5003.97ms 
iter 12047: loss 2.3990, time 4916.75ms 
iter 12048: loss 2.5582, time 4937.64ms 
iter 12049: loss 2.3470, time 4934.97ms 
step 12050: train loss 2.4038, val loss 2.8704
iter 12050: loss 2.5157, time 19742.34ms 
iter 12051: loss 2.1803, time 5032.33ms 
iter 12052: loss 2.4952, time 5026.98ms 
iter 12053: loss 2.7215, time 5024.07ms 
iter 12054: loss 2.2935, time 4932.72ms 
iter 12055: loss 2.6429, time 4918.46ms 
iter 12056: loss 2.4084, time 4918.01ms 
iter 12057: loss 2.4558, time 4953.71ms 
iter 12058: loss 2.5962, time 5027.42ms 
iter 12059: loss 2.3275, time 5033.25ms 
iter 12060: loss 2.2543, time 5005.56ms 
iter 12061: loss 2.4893, time 5031.15ms 
iter 12062: loss 2.5172, time 5034.12ms 
iter 12063: loss 2.3549, time 5039.51ms 
iter 12064: loss 2.3966, time 4980.08ms 
iter 12065: loss 2.4009, time 4919.56ms 
iter 12066: loss 2.4037, time 4931.39ms 
iter 12067: loss 2.3198, time 4927.90ms 
iter 12068: loss 2.6432, time 4953.31ms 
iter 12069: loss 2.3315, time 4951.93ms 
iter 12070: loss 2.4046, time 4949.81ms 
iter 12071: loss 2.3091, time 4955.80ms 
