always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 535330816
cola_params: 26048640
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 256
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 57344
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 24576
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 3
neurons: 0
non_emb_flops: 532185088
non_emb_params: 25966720
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 16
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe16k2_all_but_last_l3-dm256-de-1-h-1-dh64-ttr1-172301
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_172301
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe16k2_all_but_last_l3-dm256-de-1-h-1-dh64-ttr1-172301
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2264 | P 3.377e+00 | Lt 1.2457 | Pt 3.442e+00 | H 44.4199 | ETA 3.26h
I 5000 | L 1.1619 | P 3.167e+00 | Lt 1.1761 | Pt 3.212e+00 | H 111.9409 | ETA 3.18h
I 7500 | L 1.1344 | P 3.082e+00 | Lt 1.1517 | Pt 3.136e+00 | H 160.6966 | ETA 3.10h
I 10000 | L 1.1155 | P 3.025e+00 | Lt 1.1355 | Pt 3.085e+00 | H 202.2086 | ETA 3.02h
I 12500 | L 1.1125 | P 3.016e+00 | Lt 1.1294 | Pt 3.067e+00 | H 244.9967 | ETA 2.94h
I 15000 | L 1.0987 | P 2.975e+00 | Lt 1.1152 | Pt 3.024e+00 | H 290.2000 | ETA 2.85h
I 17500 | L 1.0929 | P 2.958e+00 | Lt 1.1102 | Pt 3.009e+00 | H 328.8677 | ETA 2.77h
I 20000 | L 1.0877 | P 2.942e+00 | Lt 1.1078 | Pt 3.002e+00 | H 368.9585 | ETA 2.69h
I 22500 | L 1.0821 | P 2.926e+00 | Lt 1.1003 | Pt 2.979e+00 | H 408.8800 | ETA 2.61h
I 25000 | L 1.0782 | P 2.915e+00 | Lt 1.0996 | Pt 2.977e+00 | H 433.9582 | ETA 2.52h
I 27500 | L 1.0737 | P 2.902e+00 | Lt 1.0993 | Pt 2.977e+00 | H 476.9999 | ETA 2.44h
I 30000 | L 1.0708 | P 2.893e+00 | Lt 1.0875 | Pt 2.942e+00 | H 520.8684 | ETA 2.35h
I 32500 | L 1.0723 | P 2.898e+00 | Lt 1.0872 | Pt 2.941e+00 | H 551.7876 | ETA 2.26h
I 35000 | L 1.0647 | P 2.876e+00 | Lt 1.0890 | Pt 2.946e+00 | H 599.6961 | ETA 2.18h
I 37500 | L 1.0677 | P 2.885e+00 | Lt 1.0804 | Pt 2.921e+00 | H 639.3753 | ETA 2.10h
I 40000 | L 1.0629 | P 2.871e+00 | Lt 1.0818 | Pt 2.925e+00 | H 671.0829 | ETA 2.02h
I 42500 | L 1.0648 | P 2.876e+00 | Lt 1.0789 | Pt 2.917e+00 | H 711.8750 | ETA 1.94h
I 45000 | L 1.0586 | P 2.859e+00 | Lt 1.0775 | Pt 2.913e+00 | H 752.1427 | ETA 1.86h
I 47500 | L 1.0560 | P 2.851e+00 | Lt 1.0737 | Pt 2.902e+00 | H 782.1673 | ETA 1.78h
I 50000 | L 1.0570 | P 2.854e+00 | Lt 1.0697 | Pt 2.890e+00 | H 812.7983 | ETA 1.70h
I 52500 | L 1.0542 | P 2.846e+00 | Lt 1.0715 | Pt 2.896e+00 | H 869.7891 | ETA 1.61h
I 55000 | L 1.0535 | P 2.844e+00 | Lt 1.0721 | Pt 2.897e+00 | H 911.1165 | ETA 1.53h
I 57500 | L 1.0539 | P 2.846e+00 | Lt 1.0659 | Pt 2.879e+00 | H 942.1524 | ETA 1.44h
I 60000 | L 1.0503 | P 2.835e+00 | Lt 1.0682 | Pt 2.886e+00 | H 1007.1431 | ETA 1.36h
I 62500 | L 1.0506 | P 2.836e+00 | Lt 1.0646 | Pt 2.876e+00 | H 1027.1256 | ETA 1.27h
I 65000 | L 1.0506 | P 2.836e+00 | Lt 1.0614 | Pt 2.867e+00 | H 1081.7634 | ETA 1.19h
I 67500 | L 1.0503 | P 2.835e+00 | Lt 1.0609 | Pt 2.865e+00 | H 1125.1719 | ETA 1.10h
I 70000 | L 1.0480 | P 2.829e+00 | Lt 1.0601 | Pt 2.863e+00 | H 1146.4561 | ETA 1.02h
I 72500 | L 1.0460 | P 2.823e+00 | Lt 1.0635 | Pt 2.873e+00 | H 1196.9263 | ETA 0.93h
I 75000 | L 1.0471 | P 2.826e+00 | Lt 1.0654 | Pt 2.878e+00 | H 1237.1720 | ETA 0.85h
I 77500 | L 1.0478 | P 2.828e+00 | Lt 1.0616 | Pt 2.867e+00 | H 1274.7904 | ETA 0.76h
I 80000 | L 1.0420 | P 2.812e+00 | Lt 1.0608 | Pt 2.865e+00 | H 1316.2424 | ETA 0.68h
I 82500 | L 1.0460 | P 2.823e+00 | Lt 1.0605 | Pt 2.864e+00 | H 1369.9732 | ETA 0.59h
I 85000 | L 1.0430 | P 2.815e+00 | Lt 1.0593 | Pt 2.861e+00 | H 1419.6801 | ETA 0.51h
I 87500 | L 1.0410 | P 2.809e+00 | Lt 1.0566 | Pt 2.853e+00 | H 1452.6499 | ETA 0.42h
I 90000 | L 1.0422 | P 2.813e+00 | Lt 1.0569 | Pt 2.854e+00 | H 1474.1954 | ETA 0.34h
I 92500 | L 1.0404 | P 2.808e+00 | Lt 1.0536 | Pt 2.845e+00 | H 1537.2280 | ETA 0.25h
I 95000 | L 1.0417 | P 2.811e+00 | Lt 1.0544 | Pt 2.847e+00 | H 1590.9534 | ETA 0.17h
I 97500 | L 1.0357 | P 2.794e+00 | Lt 1.0538 | Pt 2.845e+00 | H 1638.1901 | ETA 0.08h
I 100000 | L 1.0406 | P 2.808e+00 | Lt 1.0528 | Pt 2.842e+00 | H 1684.9778 | ETA 0.00h
I 100001 | L 1.0406 | P 2.808e+00 | Lt 1.0528 | Pt 2.842e+00
Finished training!
