step 0: train loss 11.0866, val loss 11.0853, B_tok: 0.000000
step 1000: train loss 5.5673, val loss 5.6755, B_tok: 0.000061
step 2000: train loss 4.5910, val loss 4.6361, B_tok: 0.000061
step 3000: train loss 4.0649, val loss 4.0857, B_tok: 0.000061
step 4000: train loss 3.8466, val loss 3.8778, B_tok: 0.000061
step 5000: train loss 3.7324, val loss 3.7685, B_tok: 0.000061
step 6000: train loss 3.6749, val loss 3.7109, B_tok: 0.000061
step 7000: train loss 3.6001, val loss 3.6285, B_tok: 0.000061
step 8000: train loss 3.5483, val loss 3.5948, B_tok: 0.000061
step 9000: train loss 3.5082, val loss 3.5639, B_tok: 0.000061
step 10000: train loss 3.5090, val loss 3.5559, B_tok: 0.000061
step 11000: train loss 3.4872, val loss 3.5190, B_tok: 0.000061
step 12000: train loss 3.4595, val loss 3.5043, B_tok: 0.000061
step 13000: train loss 3.4491, val loss 3.5011, B_tok: 0.000061
step 14000: train loss 3.4505, val loss 3.5151, B_tok: 0.000061
step 15000: train loss 3.4509, val loss 3.4898, B_tok: 0.000061
step 16000: train loss 3.4320, val loss 3.4895, B_tok: 0.000061
step 17000: train loss 3.4578, val loss 3.5147, B_tok: 0.000061
step 18000: train loss 3.4657, val loss 3.5194, B_tok: 0.000061
step 19000: train loss 3.4434, val loss 3.4838, B_tok: 0.000061
step 20000: train loss 3.4661, val loss 3.5503, B_tok: 0.000061
step 21000: train loss 3.4403, val loss 3.5017, B_tok: 0.000061
step 22000: train loss 3.4430, val loss 3.4999, B_tok: 0.000061
step 23000: train loss 3.4299, val loss 3.4893, B_tok: 0.000061
step 24000: train loss 3.4481, val loss 3.4906, B_tok: 0.000061
step 25000: train loss 3.4414, val loss 3.4972, B_tok: 0.000061
step 26000: train loss 3.4774, val loss 3.5091, B_tok: 0.000061
step 27000: train loss 3.4650, val loss 3.5126, B_tok: 0.000061
step 28000: train loss 3.4251, val loss 3.4758, B_tok: 0.000061
step 29000: train loss 3.4169, val loss 3.4709, B_tok: 0.000061
step 30000: train loss 3.4264, val loss 3.4767, B_tok: 0.000061
step 31000: train loss 3.4463, val loss 3.5061, B_tok: 0.000061
step 32000: train loss 3.4351, val loss 3.4933, B_tok: 0.000061
step 33000: train loss 3.4210, val loss 3.4951, B_tok: 0.000061
step 34000: train loss 3.4605, val loss 3.5182, B_tok: 0.000061
step 35000: train loss 3.4107, val loss 3.4724, B_tok: 0.000061
step 36000: train loss 3.3857, val loss 3.4361, B_tok: 0.000061
step 37000: train loss 3.3821, val loss 3.4367, B_tok: 0.000061
step 38000: train loss 3.3786, val loss 3.4242, B_tok: 0.000061
step 39000: train loss 3.3884, val loss 3.4351, B_tok: 0.000061
step 40000: train loss 3.3789, val loss 3.4170, B_tok: 0.000061
step 41000: train loss 3.3617, val loss 3.4028, B_tok: 0.000061
step 42000: train loss 3.4343, val loss 3.4151, B_tok: 0.000061
step 43000: train loss 3.3569, val loss 3.4105, B_tok: 0.000061
step 44000: train loss 3.3407, val loss 3.3872, B_tok: 0.000061
step 45000: train loss 3.3348, val loss 3.3818, B_tok: 0.000061
step 46000: train loss 3.3313, val loss 3.3805, B_tok: 0.000061
step 47000: train loss 3.3247, val loss 3.3712, B_tok: 0.000061
step 48000: train loss 3.3168, val loss 3.3507, B_tok: 0.000061
step 49000: train loss 3.2997, val loss 3.3472, B_tok: 0.000061
step 50000: train loss 3.2984, val loss 3.3430, B_tok: 0.000061
step 51000: train loss 3.2978, val loss 3.3607, B_tok: 0.000061
step 52000: train loss 3.2856, val loss 3.3221, B_tok: 0.000061
step 53000: train loss 3.2721, val loss 3.3148, B_tok: 0.000061
step 54000: train loss 3.2702, val loss 3.3122, B_tok: 0.000061
step 55000: train loss 3.2646, val loss 3.3064, B_tok: 0.000061
step 56000: train loss 3.2537, val loss 3.2972, B_tok: 0.000061
step 57000: train loss 3.2492, val loss 3.2922, B_tok: 0.000061
step 58000: train loss 3.2422, val loss 3.2901, B_tok: 0.000061
step 59000: train loss 3.2398, val loss 3.2873, B_tok: 0.000061
step 60000: train loss 3.2285, val loss 3.2710, B_tok: 0.000061
step 61000: train loss 3.2262, val loss 3.2651, B_tok: 0.000061
step 62000: train loss 3.2226, val loss 3.2611, B_tok: 0.000061
step 63000: train loss 3.2244, val loss 3.2622, B_tok: 0.000061
step 64000: train loss 3.2148, val loss 3.2479, B_tok: 0.000061
step 65000: train loss 3.2043, val loss 3.2386, B_tok: 0.000061
step 66000: train loss 3.2012, val loss 3.2371, B_tok: 0.000061
step 67000: train loss 3.1991, val loss 3.2380, B_tok: 0.000061
step 68000: train loss 3.1909, val loss 3.2256, B_tok: 0.000061
step 69000: train loss 3.1814, val loss 3.2183, B_tok: 0.000061
step 70000: train loss 3.1824, val loss 3.2205, B_tok: 0.000061
step 71000: train loss 3.1754, val loss 3.2161, B_tok: 0.000061
step 72000: train loss 3.1702, val loss 3.2052, B_tok: 0.000061
step 73000: train loss 3.1653, val loss 3.1975, B_tok: 0.000061
step 74000: train loss 3.1537, val loss 3.1926, B_tok: 0.000061
step 75000: train loss 3.1528, val loss 3.1938, B_tok: 0.000061
step 76000: train loss 3.1513, val loss 3.1900, B_tok: 0.000061
step 77000: train loss 3.1480, val loss 3.1786, B_tok: 0.000061
step 78000: train loss 3.1379, val loss 3.1751, B_tok: 0.000061
step 79000: train loss 3.1354, val loss 3.1787, B_tok: 0.000061
step 80000: train loss 3.1347, val loss 3.1732, B_tok: 0.000061
step 81000: train loss 3.1304, val loss 3.1642, B_tok: 0.000061
step 82000: train loss 3.1244, val loss 3.1638, B_tok: 0.000061
step 83000: train loss 3.1231, val loss 3.1635, B_tok: 0.000061
step 84000: train loss 3.1236, val loss 3.1553, B_tok: 0.000061
step 85000: train loss 3.1168, val loss 3.1479, B_tok: 0.000061
step 86000: train loss 3.1094, val loss 3.1456, B_tok: 0.000061
step 87000: train loss 3.1102, val loss 3.1450, B_tok: 0.000061
step 88000: train loss 3.1098, val loss 3.1453, B_tok: 0.000061
step 89000: train loss 3.1046, val loss 3.1364, B_tok: 0.000061
step 90000: train loss 3.1025, val loss 3.1354, B_tok: 0.000061
step 91000: train loss 3.0999, val loss 3.1361, B_tok: 0.000061
step 92000: train loss 3.1008, val loss 3.1353, B_tok: 0.000061
step 93000: train loss 3.0980, val loss 3.1291, B_tok: 0.000061
step 94000: train loss 3.0942, val loss 3.1286, B_tok: 0.000061
step 95000: train loss 3.0953, val loss 3.1282, B_tok: 0.000061
step 96000: train loss 3.0966, val loss 3.1293, B_tok: 0.000061
step 97000: train loss 3.0928, val loss 3.1197, B_tok: 0.000061
step 98000: train loss 3.0898, val loss 3.1199, B_tok: 0.000061
step 99000: train loss 3.0888, val loss 3.1199, B_tok: 0.000061
step 100000: train loss 3.0900, val loss 3.1226, B_tok: 0.000061
