step 0: train loss 7.3437, val loss 7.5558, B_tok: 0.000000
step 1000: train loss 3.3175, val loss 3.3520, B_tok: 0.000061
step 2000: train loss 3.3699, val loss 3.4206, B_tok: 0.000061
step 3000: train loss 3.3543, val loss 3.3886, B_tok: 0.000061
step 4000: train loss 3.3737, val loss 3.4152, B_tok: 0.000061
step 5000: train loss 3.3807, val loss 3.4678, B_tok: 0.000061
step 6000: train loss 3.3888, val loss 3.4560, B_tok: 0.000061
step 7000: train loss 3.3897, val loss 3.4444, B_tok: 0.000061
step 8000: train loss 3.3746, val loss 3.4503, B_tok: 0.000061
step 9000: train loss 3.3753, val loss 3.4424, B_tok: 0.000061
step 10000: train loss 3.4004, val loss 3.4554, B_tok: 0.000061
step 11000: train loss 3.4099, val loss 3.4654, B_tok: 0.000061
step 12000: train loss 3.3753, val loss 3.4272, B_tok: 0.000061
step 13000: train loss 3.3771, val loss 3.4203, B_tok: 0.000061
step 14000: train loss 3.3863, val loss 3.4505, B_tok: 0.000061
step 15000: train loss 3.4103, val loss 3.4393, B_tok: 0.000061
step 16000: train loss 3.3755, val loss 3.4240, B_tok: 0.000061
step 17000: train loss 3.3830, val loss 3.4444, B_tok: 0.000061
step 18000: train loss 3.3898, val loss 3.4703, B_tok: 0.000061
step 19000: train loss 3.3733, val loss 3.4274, B_tok: 0.000061
step 20000: train loss 3.3732, val loss 3.4327, B_tok: 0.000061
step 21000: train loss 3.3743, val loss 3.4200, B_tok: 0.000061
step 22000: train loss 3.3719, val loss 3.4271, B_tok: 0.000061
step 23000: train loss 3.3704, val loss 3.4106, B_tok: 0.000061
step 24000: train loss 3.3522, val loss 3.3991, B_tok: 0.000061
step 25000: train loss 3.3615, val loss 3.5060, B_tok: 0.000061
step 26000: train loss 3.3554, val loss 3.4126, B_tok: 0.000061
step 27000: train loss 3.3569, val loss 3.4059, B_tok: 0.000061
step 28000: train loss 3.3532, val loss 3.4086, B_tok: 0.000061
step 29000: train loss 3.3501, val loss 3.4083, B_tok: 0.000061
step 30000: train loss 3.3533, val loss 3.3966, B_tok: 0.000061
step 31000: train loss 3.3472, val loss 3.3873, B_tok: 0.000061
step 32000: train loss 3.3398, val loss 3.3893, B_tok: 0.000061
step 33000: train loss 3.3419, val loss 3.3946, B_tok: 0.000061
step 34000: train loss 3.3384, val loss 3.3914, B_tok: 0.000061
step 35000: train loss 3.3389, val loss 3.3814, B_tok: 0.000061
step 36000: train loss 3.3330, val loss 3.3877, B_tok: 0.000061
step 37000: train loss 3.3266, val loss 3.4087, B_tok: 0.000061
step 38000: train loss 3.3277, val loss 3.3868, B_tok: 0.000061
step 39000: train loss 3.3265, val loss 3.3961, B_tok: 0.000061
step 40000: train loss 3.3197, val loss 3.4031, B_tok: 0.000061
step 41000: train loss 3.3123, val loss 3.3631, B_tok: 0.000061
step 42000: train loss 3.3129, val loss 3.3582, B_tok: 0.000061
step 43000: train loss 3.3211, val loss 3.3664, B_tok: 0.000061
step 44000: train loss 3.2972, val loss 3.3433, B_tok: 0.000061
step 45000: train loss 3.3050, val loss 3.3487, B_tok: 0.000061
step 46000: train loss 3.2967, val loss 3.3495, B_tok: 0.000061
step 47000: train loss 3.2993, val loss 3.3456, B_tok: 0.000061
step 48000: train loss 3.2904, val loss 3.3317, B_tok: 0.000061
step 49000: train loss 3.2805, val loss 3.3271, B_tok: 0.000061
step 50000: train loss 3.2826, val loss 3.3278, B_tok: 0.000061
step 51000: train loss 3.2844, val loss 3.3288, B_tok: 0.000061
step 52000: train loss 3.2742, val loss 3.3068, B_tok: 0.000061
step 53000: train loss 3.2601, val loss 3.3070, B_tok: 0.000061
step 54000: train loss 3.2663, val loss 3.3099, B_tok: 0.000061
step 55000: train loss 3.2654, val loss 3.3115, B_tok: 0.000061
step 56000: train loss 3.2580, val loss 3.2978, B_tok: 0.000061
step 57000: train loss 3.2610, val loss 3.2933, B_tok: 0.000061
step 58000: train loss 3.2518, val loss 3.2946, B_tok: 0.000061
step 59000: train loss 3.2539, val loss 3.2971, B_tok: 0.000061
step 60000: train loss 3.2408, val loss 3.2798, B_tok: 0.000061
step 61000: train loss 3.2373, val loss 3.2852, B_tok: 0.000061
step 62000: train loss 3.2394, val loss 3.2751, B_tok: 0.000061
step 63000: train loss 3.2385, val loss 3.2762, B_tok: 0.000061
step 64000: train loss 3.2384, val loss 3.2622, B_tok: 0.000061
step 65000: train loss 3.2229, val loss 3.2621, B_tok: 0.000061
step 66000: train loss 3.2213, val loss 3.2599, B_tok: 0.000061
step 67000: train loss 3.2254, val loss 3.2612, B_tok: 0.000061
step 68000: train loss 3.2163, val loss 3.2513, B_tok: 0.000061
step 69000: train loss 3.2089, val loss 3.2503, B_tok: 0.000061
step 70000: train loss 3.2095, val loss 3.2499, B_tok: 0.000061
step 71000: train loss 3.2055, val loss 3.2523, B_tok: 0.000061
step 72000: train loss 3.2021, val loss 3.2363, B_tok: 0.000061
step 73000: train loss 3.1977, val loss 3.2314, B_tok: 0.000061
step 74000: train loss 3.1889, val loss 3.2300, B_tok: 0.000061
step 75000: train loss 3.1861, val loss 3.2308, B_tok: 0.000061
step 76000: train loss 3.1902, val loss 3.2294, B_tok: 0.000061
step 77000: train loss 3.1819, val loss 3.2174, B_tok: 0.000061
step 78000: train loss 3.1747, val loss 3.2175, B_tok: 0.000061
step 79000: train loss 3.1758, val loss 3.2200, B_tok: 0.000061
step 80000: train loss 3.1723, val loss 3.2140, B_tok: 0.000061
step 81000: train loss 3.1705, val loss 3.2150, B_tok: 0.000061
step 82000: train loss 3.1624, val loss 3.2314, B_tok: 0.000061
step 83000: train loss 3.1643, val loss 3.2066, B_tok: 0.000061
step 84000: train loss 3.1619, val loss 3.1980, B_tok: 0.000061
step 85000: train loss 3.1542, val loss 3.3316, B_tok: 0.000061
step 86000: train loss 3.1472, val loss 3.3072, B_tok: 0.000061
step 87000: train loss 3.1482, val loss 3.3538, B_tok: 0.000061
step 88000: train loss 3.1487, val loss 3.1951, B_tok: 0.000061
step 89000: train loss 3.1416, val loss 3.1816, B_tok: 0.000061
step 90000: train loss 3.1397, val loss 3.3418, B_tok: 0.000061
step 91000: train loss 3.1349, val loss 3.2439, B_tok: 0.000061
step 92000: train loss 3.1378, val loss 3.2671, B_tok: 0.000061
step 93000: train loss 3.1357, val loss 3.3367, B_tok: 0.000061
step 94000: train loss 3.1298, val loss 3.2619, B_tok: 0.000061
step 95000: train loss 3.1319, val loss 3.3340, B_tok: 0.000061
step 96000: train loss 3.1348, val loss 3.1775, B_tok: 0.000061
step 97000: train loss 3.1280, val loss 3.1609, B_tok: 0.000061
step 98000: train loss 3.1232, val loss 3.1954, B_tok: 0.000061
step 99000: train loss 3.1231, val loss 3.1602, B_tok: 0.000061
step 100000: train loss 3.1278, val loss 3.1675, B_tok: 0.000061
