step 0: train loss 11.0922, val loss 11.0822, B_tok: 0.000000
step 1000: train loss 5.4785, val loss 5.5794, B_tok: 0.000061
step 2000: train loss 4.4936, val loss 4.5229, B_tok: 0.000061
step 3000: train loss 4.0280, val loss 4.0503, B_tok: 0.000061
step 4000: train loss 3.8222, val loss 3.8569, B_tok: 0.000061
step 5000: train loss 3.7342, val loss 3.7566, B_tok: 0.000061
step 6000: train loss 3.6724, val loss 3.7056, B_tok: 0.000061
step 7000: train loss 3.6052, val loss 3.6305, B_tok: 0.000061
step 8000: train loss 3.5540, val loss 3.5938, B_tok: 0.000061
step 9000: train loss 3.5262, val loss 3.5696, B_tok: 0.000061
step 10000: train loss 3.5221, val loss 3.5561, B_tok: 0.000061
step 11000: train loss 3.4859, val loss 3.5140, B_tok: 0.000061
step 12000: train loss 3.4612, val loss 3.5018, B_tok: 0.000061
step 13000: train loss 3.4538, val loss 3.4983, B_tok: 0.000061
step 14000: train loss 3.4649, val loss 3.5120, B_tok: 0.000061
step 15000: train loss 3.4515, val loss 3.4802, B_tok: 0.000061
step 16000: train loss 3.4490, val loss 3.4901, B_tok: 0.000061
step 17000: train loss 3.4574, val loss 3.4936, B_tok: 0.000061
step 18000: train loss 3.4615, val loss 3.4985, B_tok: 0.000061
step 19000: train loss 3.4416, val loss 3.4699, B_tok: 0.000061
step 20000: train loss 3.4331, val loss 3.4783, B_tok: 0.000061
step 21000: train loss 3.4809, val loss 3.5294, B_tok: 0.000061
step 22000: train loss 3.4583, val loss 3.5238, B_tok: 0.000061
step 23000: train loss 3.4550, val loss 3.4788, B_tok: 0.000061
step 24000: train loss 3.4328, val loss 3.4708, B_tok: 0.000061
step 25000: train loss 3.4479, val loss 3.5178, B_tok: 0.000061
step 26000: train loss 3.4518, val loss 3.4941, B_tok: 0.000061
step 27000: train loss 3.4401, val loss 3.4772, B_tok: 0.000061
step 28000: train loss 3.4375, val loss 3.4774, B_tok: 0.000061
step 29000: train loss 3.4352, val loss 3.4719, B_tok: 0.000061
step 30000: train loss 3.4598, val loss 3.5151, B_tok: 0.000061
step 31000: train loss 3.4672, val loss 3.5042, B_tok: 0.000061
step 32000: train loss 3.4355, val loss 3.4925, B_tok: 0.000061
step 33000: train loss 3.4204, val loss 3.4665, B_tok: 0.000061
step 34000: train loss 3.4307, val loss 3.4789, B_tok: 0.000061
step 35000: train loss 3.4308, val loss 3.4858, B_tok: 0.000061
step 36000: train loss 3.4203, val loss 3.4614, B_tok: 0.000061
step 37000: train loss 3.4147, val loss 3.4569, B_tok: 0.000061
step 38000: train loss 3.4171, val loss 3.4577, B_tok: 0.000061
step 39000: train loss 3.4205, val loss 3.4944, B_tok: 0.000061
step 40000: train loss 3.4128, val loss 3.4534, B_tok: 0.000061
step 41000: train loss 3.4054, val loss 3.4761, B_tok: 0.000061
step 42000: train loss 3.4099, val loss 3.4540, B_tok: 0.000061
step 43000: train loss 3.4038, val loss 3.4528, B_tok: 0.000061
step 44000: train loss 3.3921, val loss 3.4269, B_tok: 0.000061
step 45000: train loss 3.4042, val loss 3.4427, B_tok: 0.000061
step 46000: train loss 3.3863, val loss 3.4255, B_tok: 0.000061
step 47000: train loss 3.3894, val loss 3.4284, B_tok: 0.000061
step 48000: train loss 3.3815, val loss 3.4104, B_tok: 0.000061
step 49000: train loss 3.3731, val loss 3.4097, B_tok: 0.000061
step 50000: train loss 3.3741, val loss 3.4189, B_tok: 0.000061
step 51000: train loss 3.3736, val loss 3.4160, B_tok: 0.000061
step 52000: train loss 3.3637, val loss 3.3912, B_tok: 0.000061
step 53000: train loss 3.3479, val loss 3.3864, B_tok: 0.000061
step 54000: train loss 3.3530, val loss 3.3941, B_tok: 0.000061
step 55000: train loss 3.3470, val loss 3.3876, B_tok: 0.000061
step 56000: train loss 3.3417, val loss 3.3768, B_tok: 0.000061
step 57000: train loss 3.3363, val loss 3.3745, B_tok: 0.000061
step 58000: train loss 3.3337, val loss 3.3727, B_tok: 0.000061
step 59000: train loss 3.3363, val loss 3.3733, B_tok: 0.000061
step 60000: train loss 3.3243, val loss 3.3553, B_tok: 0.000061
step 61000: train loss 3.3216, val loss 3.3539, B_tok: 0.000061
step 62000: train loss 3.3239, val loss 3.3582, B_tok: 0.000061
step 63000: train loss 3.3214, val loss 3.3582, B_tok: 0.000061
step 64000: train loss 3.3158, val loss 3.3410, B_tok: 0.000061
step 65000: train loss 3.3027, val loss 3.3346, B_tok: 0.000061
step 66000: train loss 3.3001, val loss 3.3292, B_tok: 0.000061
step 67000: train loss 3.3036, val loss 3.3318, B_tok: 0.000061
step 68000: train loss 3.2929, val loss 3.3251, B_tok: 0.000061
step 69000: train loss 3.2809, val loss 3.3161, B_tok: 0.000061
step 70000: train loss 3.2801, val loss 3.3152, B_tok: 0.000061
step 71000: train loss 3.2772, val loss 3.3170, B_tok: 0.000061
step 72000: train loss 3.2792, val loss 3.3062, B_tok: 0.000061
step 73000: train loss 3.2676, val loss 3.2997, B_tok: 0.000061
step 74000: train loss 3.2587, val loss 3.2941, B_tok: 0.000061
step 75000: train loss 3.2625, val loss 3.2965, B_tok: 0.000061
step 76000: train loss 3.2608, val loss 3.2952, B_tok: 0.000061
step 77000: train loss 3.2557, val loss 3.2830, B_tok: 0.000061
step 78000: train loss 3.2447, val loss 3.2813, B_tok: 0.000061
step 79000: train loss 3.2462, val loss 3.2820, B_tok: 0.000061
step 80000: train loss 3.2460, val loss 3.2760, B_tok: 0.000061
step 81000: train loss 3.2351, val loss 3.2694, B_tok: 0.000061
step 82000: train loss 3.2281, val loss 3.2683, B_tok: 0.000061
step 83000: train loss 3.2283, val loss 3.2660, B_tok: 0.000061
step 84000: train loss 3.2297, val loss 3.2612, B_tok: 0.000061
step 85000: train loss 3.2207, val loss 3.2502, B_tok: 0.000061
step 86000: train loss 3.2147, val loss 3.2522, B_tok: 0.000061
step 87000: train loss 3.2155, val loss 3.2503, B_tok: 0.000061
step 88000: train loss 3.2172, val loss 3.2519, B_tok: 0.000061
step 89000: train loss 3.2104, val loss 3.2383, B_tok: 0.000061
step 90000: train loss 3.2057, val loss 3.2379, B_tok: 0.000061
step 91000: train loss 3.2039, val loss 3.2347, B_tok: 0.000061
step 92000: train loss 3.2066, val loss 3.2380, B_tok: 0.000061
step 93000: train loss 3.2004, val loss 3.2276, B_tok: 0.000061
step 94000: train loss 3.1948, val loss 3.2290, B_tok: 0.000061
step 95000: train loss 3.1958, val loss 3.2275, B_tok: 0.000061
step 96000: train loss 3.2013, val loss 3.2295, B_tok: 0.000061
step 97000: train loss 3.1968, val loss 3.2172, B_tok: 0.000061
step 98000: train loss 3.1906, val loss 3.2186, B_tok: 0.000061
step 99000: train loss 3.1898, val loss 3.2196, B_tok: 0.000061
step 100000: train loss 3.1936, val loss 3.2271, B_tok: 0.000061
