## Requirements

```bash
pip3 install torch torchvision torchaudio
pip install transformers==4.31.0
pip install tqdm wandb
```

## Usage

### Example: Training LLaMA-130M

```bash
torchrun --standalone --nproc_per_node 2 torchrun_main.py \
    --model_config configs/llama_130m.json \
    --lr 8e-4 \
    --rank 1.0 \
    --update_proj_gap 500 \
    --batch_size 128 \
    --total_batch_size 512 \
    --num_training_steps 20000 \
    --warmup_steps 1000 \
    --weight_decay 0 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --threshold 5000 \
    --save_dir "$save_dir" \
    --optimizer SPAM \
    --updating_mask_method random \
    --warmup_epoch 130 \
    --init_mask random \
    --grad_accu_steps 20 \
    --spike_clip   # enables spike gradient clipping; if --m_replace is set, spike gradients are replaced with the first moment m
```
    
### Example: Training LLaMA-350M

```bash
torchrun --standalone --nproc_per_node 2 torchrun_main.py \
    --model_config configs/llama_350m.json \
    --lr 4e-4 \
    --rank 1.0 \
    --update_proj_gap 500 \
    --batch_size 128 \
    --total_batch_size 512 \
    --num_training_steps 20000 \
    --warmup_steps 1000 \
    --weight_decay 0 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --threshold 5000 \
    --save_dir "$save_dir" \
    --optimizer SPAM \
    --updating_mask_method random \
    --warmup_epoch 130 \
    --init_mask random \
    --grad_accu_steps 20 \
    --spike_clip   # enables spike gradient clipping; if --m_replace is set, spike gradients are replaced with the first moment m
```
