always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 421019648
cola_params: 10080768
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 128
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 28672
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 12288
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 9
neurons: 0
non_emb_flops: 419446784
non_emb_params: 10039808
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 8
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe8k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-173928
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_173928
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe8k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-173928
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2449 | P 3.439e+00 | Lt 1.2654 | Pt 3.510e+00 | H 64.5473 | ETA 6.07h
I 5000 | L 1.1802 | P 3.225e+00 | Lt 1.1972 | Pt 3.280e+00 | H 186.2914 | ETA 5.83h
I 7500 | L 1.1554 | P 3.147e+00 | Lt 1.1726 | Pt 3.201e+00 | H 277.5433 | ETA 5.63h
I 10000 | L 1.1426 | P 3.107e+00 | Lt 1.1571 | Pt 3.152e+00 | H 363.2086 | ETA 5.45h
I 12500 | L 1.1294 | P 3.067e+00 | Lt 1.1494 | Pt 3.128e+00 | H 435.7034 | ETA 5.23h
I 15000 | L 1.1193 | P 3.036e+00 | Lt 1.1386 | Pt 3.095e+00 | H 502.4487 | ETA 5.05h
I 17500 | L 1.1123 | P 3.015e+00 | Lt 1.1292 | Pt 3.066e+00 | H 566.8581 | ETA 4.87h
I 20000 | L 1.1068 | P 2.999e+00 | Lt 1.1274 | Pt 3.061e+00 | H 629.0913 | ETA 4.72h
I 22500 | L 1.1010 | P 2.982e+00 | Lt 1.1279 | Pt 3.062e+00 | H 693.3439 | ETA 4.58h
I 25000 | L 1.0986 | P 2.974e+00 | Lt 1.1192 | Pt 3.036e+00 | H 749.1483 | ETA 4.43h
I 27500 | L 1.0947 | P 2.963e+00 | Lt 1.1169 | Pt 3.029e+00 | H 813.5738 | ETA 4.28h
I 30000 | L 1.0967 | P 2.969e+00 | Lt 1.1136 | Pt 3.019e+00 | H 876.8727 | ETA 4.14h
I 32500 | L 1.0907 | P 2.951e+00 | Lt 1.1122 | Pt 3.015e+00 | H 931.1103 | ETA 3.99h
I 35000 | L 1.0882 | P 2.944e+00 | Lt 1.1042 | Pt 2.991e+00 | H 1000.3733 | ETA 3.84h
I 37500 | L 1.0867 | P 2.940e+00 | Lt 1.1068 | Pt 2.999e+00 | H 1057.5061 | ETA 3.70h
I 40000 | L 1.0845 | P 2.933e+00 | Lt 1.1021 | Pt 2.985e+00 | H 1121.1621 | ETA 3.55h
I 42500 | L 1.0812 | P 2.924e+00 | Lt 1.1024 | Pt 2.986e+00 | H 1182.0894 | ETA 3.40h
I 45000 | L 1.0800 | P 2.920e+00 | Lt 1.0981 | Pt 2.973e+00 | H 1265.0973 | ETA 3.25h
I 47500 | L 1.0777 | P 2.914e+00 | Lt 1.0994 | Pt 2.977e+00 | H 1308.8928 | ETA 3.10h
I 50000 | L 1.0759 | P 2.908e+00 | Lt 1.0968 | Pt 2.969e+00 | H 1400.1157 | ETA 2.96h
I 52500 | L 1.0770 | P 2.911e+00 | Lt 1.0971 | Pt 2.970e+00 | H 1444.6335 | ETA 2.82h
I 55000 | L 1.0712 | P 2.895e+00 | Lt 1.0933 | Pt 2.959e+00 | H 1528.7823 | ETA 2.67h
I 57500 | L 1.0746 | P 2.904e+00 | Lt 1.0943 | Pt 2.962e+00 | H 1599.6656 | ETA 2.52h
I 60000 | L 1.0712 | P 2.895e+00 | Lt 1.0917 | Pt 2.954e+00 | H 1663.8990 | ETA 2.37h
I 62500 | L 1.0718 | P 2.897e+00 | Lt 1.0919 | Pt 2.955e+00 | H 1732.2488 | ETA 2.22h
I 65000 | L 1.0723 | P 2.898e+00 | Lt 1.0879 | Pt 2.943e+00 | H 1796.1815 | ETA 2.07h
I 67500 | L 1.0702 | P 2.892e+00 | Lt 1.0882 | Pt 2.944e+00 | H 1870.5905 | ETA 1.92h
I 70000 | L 1.0682 | P 2.886e+00 | Lt 1.0891 | Pt 2.947e+00 | H 1932.6169 | ETA 1.77h
I 72500 | L 1.0691 | P 2.889e+00 | Lt 1.0883 | Pt 2.944e+00 | H 2007.2403 | ETA 1.63h
I 75000 | L 1.0711 | P 2.894e+00 | Lt 1.0910 | Pt 2.952e+00 | H 2125.2146 | ETA 1.48h
I 77500 | L 1.0708 | P 2.894e+00 | Lt 1.0895 | Pt 2.948e+00 | H 2227.5202 | ETA 1.33h
I 80000 | L 1.0700 | P 2.891e+00 | Lt 1.0902 | Pt 2.950e+00 | H 2312.1001 | ETA 1.18h
I 82500 | L 1.0671 | P 2.883e+00 | Lt 1.0866 | Pt 2.939e+00 | H 2380.0450 | ETA 1.03h
I 85000 | L 1.0709 | P 2.894e+00 | Lt 1.0873 | Pt 2.941e+00 | H 2461.4149 | ETA 0.89h
I 87500 | L 1.0594 | P 2.861e+00 | Lt 1.0832 | Pt 2.929e+00 | H 2524.0695 | ETA 0.74h
I 90000 | L 1.0651 | P 2.877e+00 | Lt 1.0831 | Pt 2.929e+00 | H 2587.8928 | ETA 0.59h
I 92500 | L 1.0637 | P 2.873e+00 | Lt 1.0848 | Pt 2.934e+00 | H 2685.8938 | ETA 0.44h
I 95000 | L 1.0631 | P 2.872e+00 | Lt 1.0829 | Pt 2.928e+00 | H 2778.2113 | ETA 0.30h
I 97500 | L 1.0600 | P 2.863e+00 | Lt 1.0852 | Pt 2.935e+00 | H 2832.7914 | ETA 0.15h
I 100000 | L 1.0629 | P 2.871e+00 | Lt 1.0813 | Pt 2.924e+00 | H 2924.3779 | ETA 0.00h
I 100001 | L 1.0629 | P 2.871e+00 | Lt 1.0813 | Pt 2.924e+00
Finished training!
