always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 141836288
cola_params: 6536448
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 128
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 28672
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 12288
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 3
neurons: 0
non_emb_flops: 140263424
non_emb_params: 6495488
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 16
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe16k2_all_but_last_l3-dm128-de-1-h-1-dh64-ttr1-172251
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_172251
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe16k2_all_but_last_l3-dm128-de-1-h-1-dh64-ttr1-172251
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2935 | P 3.609e+00 | Lt 1.3189 | Pt 3.701e+00 | H 62.5222 | ETA 3.23h
I 5000 | L 1.2306 | P 3.391e+00 | Lt 1.2580 | Pt 3.484e+00 | H 168.9630 | ETA 3.17h
I 7500 | L 1.2102 | P 3.323e+00 | Lt 1.2310 | Pt 3.392e+00 | H 256.3053 | ETA 3.10h
I 10000 | L 1.1896 | P 3.256e+00 | Lt 1.2133 | Pt 3.333e+00 | H 326.8162 | ETA 3.00h
I 12500 | L 1.1849 | P 3.240e+00 | Lt 1.2035 | Pt 3.301e+00 | H 377.0918 | ETA 2.92h
I 15000 | L 1.1764 | P 3.213e+00 | Lt 1.2009 | Pt 3.292e+00 | H 437.9836 | ETA 2.84h
I 17500 | L 1.1714 | P 3.197e+00 | Lt 1.1871 | Pt 3.247e+00 | H 472.2478 | ETA 2.75h
I 20000 | L 1.1619 | P 3.167e+00 | Lt 1.1855 | Pt 3.242e+00 | H 529.4953 | ETA 2.67h
I 22500 | L 1.1588 | P 3.158e+00 | Lt 1.1838 | Pt 3.237e+00 | H 566.0190 | ETA 2.58h
I 25000 | L 1.1557 | P 3.148e+00 | Lt 1.1723 | Pt 3.200e+00 | H 618.9257 | ETA 2.49h
I 27500 | L 1.1532 | P 3.140e+00 | Lt 1.1735 | Pt 3.204e+00 | H 666.6024 | ETA 2.40h
I 30000 | L 1.1484 | P 3.125e+00 | Lt 1.1688 | Pt 3.189e+00 | H 709.8266 | ETA 2.32h
I 32500 | L 1.1495 | P 3.129e+00 | Lt 1.1683 | Pt 3.188e+00 | H 759.3861 | ETA 2.23h
I 35000 | L 1.1444 | P 3.113e+00 | Lt 1.1652 | Pt 3.178e+00 | H 805.6197 | ETA 2.14h
I 37500 | L 1.1440 | P 3.112e+00 | Lt 1.1653 | Pt 3.178e+00 | H 842.1443 | ETA 2.06h
I 40000 | L 1.1395 | P 3.098e+00 | Lt 1.1590 | Pt 3.158e+00 | H 901.0889 | ETA 1.97h
I 42500 | L 1.1426 | P 3.107e+00 | Lt 1.1619 | Pt 3.167e+00 | H 933.7614 | ETA 1.89h
I 45000 | L 1.1335 | P 3.079e+00 | Lt 1.1555 | Pt 3.147e+00 | H 988.0626 | ETA 1.81h
I 47500 | L 1.1367 | P 3.089e+00 | Lt 1.1552 | Pt 3.146e+00 | H 1032.7378 | ETA 1.73h
I 50000 | L 1.1375 | P 3.091e+00 | Lt 1.1597 | Pt 3.160e+00 | H 1080.6186 | ETA 1.65h
I 52500 | L 1.1330 | P 3.078e+00 | Lt 1.1545 | Pt 3.144e+00 | H 1137.3434 | ETA 1.57h
I 55000 | L 1.1331 | P 3.078e+00 | Lt 1.1543 | Pt 3.144e+00 | H 1190.3761 | ETA 1.49h
I 57500 | L 1.1337 | P 3.080e+00 | Lt 1.1516 | Pt 3.135e+00 | H 1229.3665 | ETA 1.41h
I 60000 | L 1.1323 | P 3.076e+00 | Lt 1.1498 | Pt 3.130e+00 | H 1299.3094 | ETA 1.33h
I 62500 | L 1.1288 | P 3.065e+00 | Lt 1.1510 | Pt 3.133e+00 | H 1343.2204 | ETA 1.24h
I 65000 | L 1.1288 | P 3.065e+00 | Lt 1.1493 | Pt 3.128e+00 | H 1408.9394 | ETA 1.16h
I 67500 | L 1.1296 | P 3.067e+00 | Lt 1.1484 | Pt 3.125e+00 | H 1434.8386 | ETA 1.07h
I 70000 | L 1.1229 | P 3.047e+00 | Lt 1.1467 | Pt 3.120e+00 | H 1477.1424 | ETA 0.99h
I 72500 | L 1.1278 | P 3.062e+00 | Lt 1.1461 | Pt 3.118e+00 | H 1540.7984 | ETA 0.91h
I 75000 | L 1.1282 | P 3.063e+00 | Lt 1.1479 | Pt 3.124e+00 | H 1611.9756 | ETA 0.82h
I 77500 | L 1.1250 | P 3.054e+00 | Lt 1.1433 | Pt 3.109e+00 | H 1647.2463 | ETA 0.74h
I 80000 | L 1.1207 | P 3.041e+00 | Lt 1.1436 | Pt 3.110e+00 | H 1705.2612 | ETA 0.66h
I 82500 | L 1.1222 | P 3.045e+00 | Lt 1.1421 | Pt 3.106e+00 | H 1757.4765 | ETA 0.58h
I 85000 | L 1.1213 | P 3.042e+00 | Lt 1.1422 | Pt 3.106e+00 | H 1811.8751 | ETA 0.49h
I 87500 | L 1.1215 | P 3.043e+00 | Lt 1.1427 | Pt 3.108e+00 | H 1842.4515 | ETA 0.41h
I 90000 | L 1.1231 | P 3.048e+00 | Lt 1.1457 | Pt 3.117e+00 | H 1906.9206 | ETA 0.33h
I 92500 | L 1.1200 | P 3.038e+00 | Lt 1.1431 | Pt 3.109e+00 | H 1968.1881 | ETA 0.25h
I 95000 | L 1.1210 | P 3.041e+00 | Lt 1.1414 | Pt 3.103e+00 | H 2013.1550 | ETA 0.16h
I 97500 | L 1.1196 | P 3.037e+00 | Lt 1.1338 | Pt 3.080e+00 | H 2084.4322 | ETA 0.08h
I 100000 | L 1.1161 | P 3.027e+00 | Lt 1.1390 | Pt 3.096e+00 | H 2137.6248 | ETA 0.00h
I 100001 | L 1.1161 | P 3.027e+00 | Lt 1.1390 | Pt 3.096e+00
Finished training!
