step 0: train loss 7.2362, val loss 7.3885, B_tok: 0.000000
step 1000: train loss 3.3227, val loss 3.3609, B_tok: 0.000061
step 2000: train loss 3.3264, val loss 3.3705, B_tok: 0.000061
step 3000: train loss 3.3094, val loss 3.3448, B_tok: 0.000061
step 4000: train loss 3.3187, val loss 3.3467, B_tok: 0.000061
step 5000: train loss 3.3366, val loss 3.3864, B_tok: 0.000061
step 6000: train loss 3.3352, val loss 3.3968, B_tok: 0.000061
step 7000: train loss 3.3288, val loss 3.3687, B_tok: 0.000061
step 8000: train loss 3.3206, val loss 3.3740, B_tok: 0.000061
step 9000: train loss 3.3260, val loss 3.3895, B_tok: 0.000061
step 10000: train loss 3.3453, val loss 3.3891, B_tok: 0.000061
step 11000: train loss 3.3242, val loss 3.3676, B_tok: 0.000061
step 12000: train loss 3.3289, val loss 3.3715, B_tok: 0.000061
step 13000: train loss 3.3350, val loss 3.3888, B_tok: 0.000061
step 14000: train loss 3.3467, val loss 3.4062, B_tok: 0.000061
step 15000: train loss 3.3363, val loss 3.3882, B_tok: 0.000061
step 16000: train loss 3.3316, val loss 3.3753, B_tok: 0.000061
step 17000: train loss 3.3431, val loss 3.3729, B_tok: 0.000061
step 18000: train loss 3.3291, val loss 3.3683, B_tok: 0.000061
step 19000: train loss 3.3135, val loss 3.3546, B_tok: 0.000061
step 20000: train loss 3.3124, val loss 3.3552, B_tok: 0.000061
step 21000: train loss 3.3107, val loss 3.3621, B_tok: 0.000061
step 22000: train loss 3.3178, val loss 3.3556, B_tok: 0.000061
step 23000: train loss 3.3046, val loss 3.3449, B_tok: 0.000061
step 24000: train loss 3.2926, val loss 3.3420, B_tok: 0.000061
step 25000: train loss 3.3546, val loss 3.3480, B_tok: 0.000061
step 26000: train loss 3.2960, val loss 3.3358, B_tok: 0.000061
step 27000: train loss 3.2953, val loss 3.3367, B_tok: 0.000061
step 28000: train loss 3.2797, val loss 3.3156, B_tok: 0.000061
step 29000: train loss 3.2753, val loss 3.3144, B_tok: 0.000061
step 29000: train loss 3.2753, val loss 3.3144, B_tok: 0.000000
step 30000: train loss 3.2589, val loss 3.3147, B_tok: 0.000061
step 31000: train loss 3.2642, val loss 3.3177, B_tok: 0.000061
step 32000: train loss 3.2619, val loss 3.2974, B_tok: 0.000061
step 33000: train loss 3.2509, val loss 3.2958, B_tok: 0.000061
step 34000: train loss 3.2490, val loss 3.2883, B_tok: 0.000061
step 35000: train loss 3.2521, val loss 3.3011, B_tok: 0.000061
step 36000: train loss 3.2325, val loss 3.2780, B_tok: 0.000061
step 37000: train loss 3.2217, val loss 3.2749, B_tok: 0.000061
step 38000: train loss 3.2159, val loss 3.2665, B_tok: 0.000061
step 39000: train loss 3.2235, val loss 3.2711, B_tok: 0.000061
step 40000: train loss 3.2080, val loss 3.2559, B_tok: 0.000061
step 41000: train loss 3.1975, val loss 3.2414, B_tok: 0.000061
step 42000: train loss 3.1980, val loss 3.2403, B_tok: 0.000061
step 43000: train loss 3.2080, val loss 3.2541, B_tok: 0.000061
step 44000: train loss 3.1947, val loss 3.2382, B_tok: 0.000061
step 45000: train loss 3.1826, val loss 3.2246, B_tok: 0.000061
step 46000: train loss 3.1830, val loss 3.2268, B_tok: 0.000061
step 47000: train loss 3.1862, val loss 3.2299, B_tok: 0.000061
step 48000: train loss 3.1738, val loss 3.2148, B_tok: 0.000061
step 49000: train loss 3.1613, val loss 3.2066, B_tok: 0.000061
step 50000: train loss 3.1633, val loss 3.2051, B_tok: 0.000061
step 51000: train loss 3.1610, val loss 3.2060, B_tok: 0.000061
step 52000: train loss 3.1523, val loss 3.1891, B_tok: 0.000061
step 53000: train loss 3.1435, val loss 3.1808, B_tok: 0.000061
step 54000: train loss 3.1444, val loss 3.1887, B_tok: 0.000061
step 55000: train loss 3.1409, val loss 3.1842, B_tok: 0.000061
step 56000: train loss 3.1404, val loss 3.1798, B_tok: 0.000061
step 57000: train loss 3.1325, val loss 3.1686, B_tok: 0.000061
step 58000: train loss 3.1320, val loss 3.1700, B_tok: 0.000061
step 59000: train loss 3.1298, val loss 3.1681, B_tok: 0.000061
step 60000: train loss 3.1219, val loss 3.1542, B_tok: 0.000061
step 61000: train loss 3.1145, val loss 3.1470, B_tok: 0.000061
step 62000: train loss 3.1077, val loss 3.1473, B_tok: 0.000061
step 63000: train loss 3.1089, val loss 3.1451, B_tok: 0.000061
step 64000: train loss 3.0990, val loss 3.1345, B_tok: 0.000061
step 65000: train loss 3.0970, val loss 3.1254, B_tok: 0.000061
step 66000: train loss 3.0911, val loss 3.1252, B_tok: 0.000061
step 67000: train loss 3.0900, val loss 3.1255, B_tok: 0.000061
step 68000: train loss 3.0878, val loss 3.1251, B_tok: 0.000061
step 69000: train loss 3.0814, val loss 3.1132, B_tok: 0.000061
step 70000: train loss 3.0759, val loss 3.1086, B_tok: 0.000061
step 71000: train loss 3.0741, val loss 3.1043, B_tok: 0.000061
step 72000: train loss 3.0744, val loss 3.1092, B_tok: 0.000061
step 73000: train loss 3.0617, val loss 3.0923, B_tok: 0.000061
step 74000: train loss 3.0626, val loss 3.0935, B_tok: 0.000061
step 75000: train loss 3.0579, val loss 3.0915, B_tok: 0.000061
step 76000: train loss 3.0554, val loss 3.0897, B_tok: 0.000061
step 77000: train loss 3.0507, val loss 3.0769, B_tok: 0.000061
step 78000: train loss 3.0450, val loss 3.0772, B_tok: 0.000061
step 79000: train loss 3.0449, val loss 3.0735, B_tok: 0.000061
step 80000: train loss 3.0448, val loss 3.0772, B_tok: 0.000061
step 81000: train loss 3.0368, val loss 3.0657, B_tok: 0.000061
step 82000: train loss 3.0318, val loss 3.0607, B_tok: 0.000061
step 83000: train loss 3.0272, val loss 3.0589, B_tok: 0.000061
step 84000: train loss 3.0287, val loss 3.0614, B_tok: 0.000061
step 85000: train loss 3.0196, val loss 3.0502, B_tok: 0.000061
step 86000: train loss 3.0182, val loss 3.0462, B_tok: 0.000061
step 87000: train loss 3.0170, val loss 3.0478, B_tok: 0.000061
step 88000: train loss 3.0191, val loss 3.0491, B_tok: 0.000061
step 89000: train loss 3.0122, val loss 3.0390, B_tok: 0.000061
step 90000: train loss 3.0101, val loss 3.0372, B_tok: 0.000061
step 91000: train loss 3.0105, val loss 3.0347, B_tok: 0.000061
step 92000: train loss 3.0108, val loss 3.0360, B_tok: 0.000061
step 93000: train loss 3.0080, val loss 3.0289, B_tok: 0.000061
step 94000: train loss 3.0030, val loss 3.0261, B_tok: 0.000061
step 95000: train loss 3.0012, val loss 3.0261, B_tok: 0.000061
step 96000: train loss 3.0042, val loss 3.0290, B_tok: 0.000061
step 97000: train loss 2.9996, val loss 3.0239, B_tok: 0.000061
step 98000: train loss 2.9969, val loss 3.0214, B_tok: 0.000061
step 99000: train loss 2.9966, val loss 3.0222, B_tok: 0.000061
step 100000: train loss 2.9972, val loss 3.0248, B_tok: 0.000061
