#!/bin/bash
# GPT-XL, 1.5B
#
# Launches torchrun_main.py under torchrun (single node, 4 processes) with the
# model config in configs/gpt_xl.json. Combined stdout/stderr of the job goes
# to a log file under logs/gpt2_xl/<optimizer>/; the script exits with the
# job's exit status.
module load cuda/11.8

# Run settings. Exported so that child processes inherit them.
# export scale=2.5e-1
export optimizer=frost
export lr=1.0e-2
export seed=0
export scale=5.0e-2
# Defined here so that --weight_decay and the "+wd_<value>" suffix of the save
# directory always agree (the suffix previously expanded an unset variable).
export weight_decay=0

# The run tag contains a literal '*'. Every use below is double-quoted so the
# shell never treats it as a filename pattern.
run_tag="${lr}*${scale}"
save_dir="gpt2_xl/${optimizer}/${run_tag}+wd_${weight_decay}"
log_dir="logs/gpt2_xl/${optimizer}"
log_file="${log_dir}/seed_${seed}+${run_tag}.out"

# The output redirection below fails (and the job never starts) if the log
# directory is missing, so create it up front.
mkdir -p -- "$log_dir" || exit 1

torchrun --standalone --nproc_per_node 4 torchrun_main.py \
    --model_config configs/gpt_xl.json \
    --model_type gpt2 \
    --lr "$lr" \
    --scale "$scale" \
    --batch_size 32 \
    --activation_checkpointing \
    --total_batch_size 512 \
    --num_training_steps 120000 \
    --warmup_ratio 0.1 \
    --weight_decay "$weight_decay" \
    --dtype bfloat16 \
    --eval_every 1000 \
    --save_every 100000 \
    --seed "$seed" \
    --save_dir "$save_dir" \
    --optimizer "$optimizer" > "$log_file" 2>&1 &
train_pid=$!

# Wait on the specific PID: a bare `wait` discards the job's exit status.
status=0
wait "$train_pid" || status=$?
if (( status != 0 )); then
    printf 'training exited with status %d; see %s\n' "$status" "$log_file" >&2
    exit "$status"
fi

echo 'finish!'