step 0: train loss 11.0224, val loss 11.0113, B_tok: 0.000000
step 1000: train loss 5.4737, val loss 5.5723, B_tok: 0.000061
step 2000: train loss 4.4871, val loss 4.5253, B_tok: 0.000061
step 3000: train loss 4.0238, val loss 4.0347, B_tok: 0.000061
step 4000: train loss 3.8010, val loss 3.8327, B_tok: 0.000061
step 5000: train loss 3.7022, val loss 3.7319, B_tok: 0.000061
step 6000: train loss 3.6357, val loss 3.6702, B_tok: 0.000061
step 7000: train loss 3.5696, val loss 3.5956, B_tok: 0.000061
step 8000: train loss 3.5204, val loss 3.5634, B_tok: 0.000061
step 9000: train loss 3.4948, val loss 3.5357, B_tok: 0.000061
step 10000: train loss 3.4963, val loss 3.5329, B_tok: 0.000061
step 11000: train loss 3.4666, val loss 3.4907, B_tok: 0.000061
step 12000: train loss 3.4451, val loss 3.4754, B_tok: 0.000061
step 13000: train loss 3.4373, val loss 3.4790, B_tok: 0.000061
step 14000: train loss 3.4517, val loss 3.5016, B_tok: 0.000061
step 15000: train loss 3.4414, val loss 3.4589, B_tok: 0.000061
step 16000: train loss 3.4561, val loss 3.4905, B_tok: 0.000061
step 17000: train loss 3.4421, val loss 3.4816, B_tok: 0.000061
step 18000: train loss 3.4809, val loss 3.5213, B_tok: 0.000061
step 19000: train loss 3.4708, val loss 3.4829, B_tok: 0.000061
step 20000: train loss 3.4449, val loss 3.4863, B_tok: 0.000061
step 21000: train loss 3.4758, val loss 3.4996, B_tok: 0.000061
step 22000: train loss 3.4486, val loss 3.4910, B_tok: 0.000061
step 23000: train loss 3.4674, val loss 3.5290, B_tok: 0.000061
step 24000: train loss 3.4757, val loss 3.5382, B_tok: 0.000061
step 25000: train loss 3.4408, val loss 3.4737, B_tok: 0.000061
step 26000: train loss 3.4642, val loss 3.5041, B_tok: 0.000061
step 27000: train loss 3.4566, val loss 3.5032, B_tok: 0.000061
step 28000: train loss 3.4963, val loss 3.5068, B_tok: 0.000061
step 29000: train loss 3.4389, val loss 3.4769, B_tok: 0.000061
step 30000: train loss 3.4435, val loss 3.4794, B_tok: 0.000061
step 31000: train loss 3.4497, val loss 3.4853, B_tok: 0.000061
step 32000: train loss 3.4598, val loss 3.4770, B_tok: 0.000061
step 33000: train loss 3.4287, val loss 3.5539, B_tok: 0.000061
step 34000: train loss 3.4677, val loss 3.4818, B_tok: 0.000061
step 35000: train loss 3.4080, val loss 3.4472, B_tok: 0.000061
step 36000: train loss 3.3933, val loss 3.4315, B_tok: 0.000061
step 37000: train loss 3.3876, val loss 3.4393, B_tok: 0.000061
step 38000: train loss 3.3971, val loss 3.4358, B_tok: 0.000061
step 39000: train loss 3.4000, val loss 3.4407, B_tok: 0.000061
step 40000: train loss 3.3923, val loss 3.4225, B_tok: 0.000061
step 41000: train loss 3.3891, val loss 3.4246, B_tok: 0.000061
step 42000: train loss 3.3813, val loss 3.4388, B_tok: 0.000061
step 43000: train loss 3.3746, val loss 3.4229, B_tok: 0.000061
step 44000: train loss 3.3882, val loss 3.4451, B_tok: 0.000061
step 45000: train loss 3.3690, val loss 3.4060, B_tok: 0.000061
step 46000: train loss 3.3510, val loss 3.3938, B_tok: 0.000061
step 47000: train loss 3.3611, val loss 3.3995, B_tok: 0.000061
step 48000: train loss 3.3423, val loss 3.3760, B_tok: 0.000061
step 49000: train loss 3.3285, val loss 3.3735, B_tok: 0.000061
step 50000: train loss 3.3331, val loss 3.3759, B_tok: 0.000061
step 51000: train loss 3.3331, val loss 3.3768, B_tok: 0.000061
step 52000: train loss 3.3113, val loss 3.3508, B_tok: 0.000061
step 53000: train loss 3.3042, val loss 3.3429, B_tok: 0.000061
step 54000: train loss 3.3033, val loss 3.3425, B_tok: 0.000061
step 55000: train loss 3.3019, val loss 3.3463, B_tok: 0.000061
step 56000: train loss 3.2880, val loss 3.3308, B_tok: 0.000061
step 57000: train loss 3.2858, val loss 3.3197, B_tok: 0.000061
step 58000: train loss 3.2815, val loss 3.3263, B_tok: 0.000061
step 59000: train loss 3.2802, val loss 3.3238, B_tok: 0.000061
step 60000: train loss 3.2667, val loss 3.3065, B_tok: 0.000061
step 61000: train loss 3.2596, val loss 3.3007, B_tok: 0.000061
step 62000: train loss 3.2556, val loss 3.2948, B_tok: 0.000061
step 63000: train loss 3.2622, val loss 3.2987, B_tok: 0.000061
step 64000: train loss 3.2543, val loss 3.2790, B_tok: 0.000061
step 65000: train loss 3.2411, val loss 3.2770, B_tok: 0.000061
step 66000: train loss 3.2369, val loss 3.2740, B_tok: 0.000061
step 67000: train loss 3.2372, val loss 3.2753, B_tok: 0.000061
step 68000: train loss 3.2306, val loss 3.2621, B_tok: 0.000061
step 69000: train loss 3.2187, val loss 3.2537, B_tok: 0.000061
step 70000: train loss 3.2144, val loss 3.2520, B_tok: 0.000061
step 71000: train loss 3.2142, val loss 3.2581, B_tok: 0.000061
step 72000: train loss 3.2053, val loss 3.2437, B_tok: 0.000061
step 73000: train loss 3.2010, val loss 3.2333, B_tok: 0.000061
step 74000: train loss 3.1929, val loss 3.2329, B_tok: 0.000061
step 75000: train loss 3.1939, val loss 3.2342, B_tok: 0.000061
step 76000: train loss 3.1905, val loss 3.2338, B_tok: 0.000061
step 77000: train loss 3.1864, val loss 3.2214, B_tok: 0.000061
step 78000: train loss 3.1776, val loss 3.2193, B_tok: 0.000061
step 79000: train loss 3.1754, val loss 3.2226, B_tok: 0.000061
step 80000: train loss 3.1735, val loss 3.2152, B_tok: 0.000061
step 81000: train loss 3.1659, val loss 3.2037, B_tok: 0.000061
step 82000: train loss 3.1602, val loss 3.2024, B_tok: 0.000061
step 83000: train loss 3.1608, val loss 3.2015, B_tok: 0.000061
step 84000: train loss 3.1582, val loss 3.1935, B_tok: 0.000061
step 85000: train loss 3.1521, val loss 3.1837, B_tok: 0.000061
step 86000: train loss 3.1427, val loss 3.1819, B_tok: 0.000061
step 87000: train loss 3.1456, val loss 3.1847, B_tok: 0.000061
step 88000: train loss 3.1460, val loss 3.1834, B_tok: 0.000061
step 89000: train loss 3.1400, val loss 3.1729, B_tok: 0.000061
step 90000: train loss 3.1364, val loss 3.1733, B_tok: 0.000061
step 91000: train loss 3.1356, val loss 3.1747, B_tok: 0.000061
step 92000: train loss 3.1364, val loss 3.1734, B_tok: 0.000061
step 93000: train loss 3.1328, val loss 3.1638, B_tok: 0.000061
step 94000: train loss 3.1274, val loss 3.1640, B_tok: 0.000061
step 95000: train loss 3.1260, val loss 3.1572, B_tok: 0.000061
step 96000: train loss 3.1286, val loss 3.1591, B_tok: 0.000061
step 97000: train loss 3.1244, val loss 3.1453, B_tok: 0.000061
step 98000: train loss 3.1182, val loss 3.1460, B_tok: 0.000061
step 99000: train loss 3.1153, val loss 3.1469, B_tok: 0.000061
step 100000: train loss 3.1217, val loss 3.1519, B_tok: 0.000061
