#!/bin/bash
# Launch a single-node, 4-process torchrun job running torchrun_main.py and
# block until it finishes. All stdout/stderr of the job goes to a per-run log
# file under logs/gpt2_xl/<optimizer>/.
#
# GPT-XL, 1.5B
# NOTE(review): the label above and the gpt2_xl/ output directories do not
# match the model actually requested below (configs/gemma_2b.json with
# --model_type gemma). Confirm which one is intended.
#
# Strict mode (set -eu) is intentionally not enabled: `module` is an
# externally provided shell function and its behaviour under -e/-u is not
# known from this script. Critical steps are checked explicitly instead.
module load cuda/11.8

export optimizer=frost
export lr=1.0e-2
export seed=0
export scale=5.0e-2
# Previously this variable was referenced in the save directory name but never
# defined, so the "+wd_" suffix was always empty. It now also feeds
# --weight_decay so the path and the actual setting cannot drift apart.
export weight_decay=0

# The '*' is a literal part of the run name. It must stay quoted everywhere,
# otherwise the shell treats it as a glob and may expand it against existing
# files (changing the argument, or making the redirect "ambiguous").
run_tag="${lr}*${scale}"
save_dir="gpt2_xl/${optimizer}/${run_tag}+wd_${weight_decay}"
log_dir="logs/gpt2_xl/${optimizer}"
log_file="${log_dir}/seed_${seed}+${run_tag}.out"

# The output redirection below fails (and torchrun never starts) if the log
# directory does not exist yet.
if ! mkdir -p -- "$log_dir"; then
    printf 'cannot create log directory: %s\n' "$log_dir" >&2
    exit 1
fi

torchrun --standalone --nproc_per_node 4 torchrun_main.py \
    --model_config configs/gemma_2b.json \
    --model_type gemma \
    --lr "$lr" \
    --scale "$scale" \
    --batch_size 32 \
    --activation_checkpointing \
    --total_batch_size 512 \
    --num_training_steps 120000 \
    --warmup_ratio 0.1 \
    --weight_decay "$weight_decay" \
    --dtype bfloat16 \
    --eval_every 1000 \
    --save_every 100000 \
    --seed "$seed" \
    --save_dir "$save_dir" \
    --optimizer "$optimizer" > "$log_file" 2>&1 &
train_pid=$!

# Wait on the specific job so its exit status is available; a bare `wait`
# always returns 0 and would hide a crashed run.
wait "$train_pid"
train_status=$?
if [ "$train_status" -ne 0 ]; then
    printf 'training exited with status %d; see %s\n' "$train_status" "$log_file" >&2
    exit "$train_status"
fi

echo 'finish!'