#!/bin/bash
# LLaMA-3B
# Launches torchrun_main.py through torchrun (standalone, 4 processes per node)
# with the configs/llama_3b.json model config and the hyperparameters below.
# Exits with torchrun's status on failure; prints 'finish!' only on success.

# Best-effort, as before: a failed module load does not abort the run, but it
# is now reported on stderr instead of passing silently.
module load cuda/11.8 ||
    printf 'warning: "module load cuda/11.8" failed; continuing with the current environment\n' >&2

# Hyperparameters. They stay exported (as in the original) so that any child
# process can still see them in its environment.
export level=2
export scale=0.25
export optimizer=foam
export lr=5e-3
export seed=0

# Arguments for torchrun_main.py, kept in an array so every expansion is
# passed as exactly one word regardless of its contents.
train_args=(
    --model_config configs/llama_3b.json
    --lr "$lr"
    --scale "$scale"
    --activation_checkpointing
    --batch_size 32
    --total_batch_size 512
    --num_training_steps 120000
    --warmup_ratio 0.1
    --dtype bfloat16
    --eval_every 1000
    --save_every 100000
    --level "$level"
    --seed "$seed"
    --optimizer "$optimizer"
)

# torchrun runs in the foreground, so no `wait` is needed afterwards (the
# original `wait` had no background jobs to wait for).
torchrun --standalone --nproc_per_node 4 torchrun_main.py "${train_args[@]}"
status=$?

# Propagate a failed run to the caller instead of reporting success.
if (( status != 0 )); then
    printf 'torchrun exited with status %d\n' "$status" >&2
    exit "$status"
fi

echo 'finish!'