#!/bin/bash

# the same as scripts/run_gpt2_124M.sh but with PyTorch

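# if you wish to train on just a single GPU, simply skip the torchrun part, i.e.
# python train_gpt2.py ... (all the other arguments the same).
# NOTE: this script already does that: every run below is launched with plain
# `python3 train_gpt2_overtrain.py` (no torchrun).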
# ---------------------------------------------------------------------------
# Hyper-parameter sweep.
#
# For every combination of model size (k), residual scale (j), optimizer
# (opt), lwlr mode (lrmode), t_ablation value (t), t_iter value (iter) and
# learning rate (lr), launch one training run and tee its output into a log
# file named after that combination.
#
# Per-configuration values (batch size, betas, learning rates) come from the
# lookup functions below. Each of them fails on an unknown key, so extending
# one of the loop lists without adding a matching entry skips that
# configuration with a warning instead of silently reusing the values of the
# previous loop iteration.
# ---------------------------------------------------------------------------

# Print the value passed as --batch_size for model size $1.
# Returns 1 (printing nothing) when the model size has no entry.
batch_size_for() {
  case "$1" in
    d12) echo "16" ;;
    d24) echo "8" ;;
    *) return 1 ;;
  esac
}

# Print the two values passed to --betas (space separated) for optimizer $1.
# Returns 1 (printing nothing) when the optimizer has no entry.
betas_for() {
  case "$1" in
    adamw) echo "0.9 0.95" ;;
    lion) echo "0.95 0.98" ;;
    *) return 1 ;;
  esac
}

# Print the space-separated learning rates to sweep.
# Arguments: $1 - model size, $2 - optimizer, $3 - lwlr mode
# Returns 1 (printing nothing) when the combination has no entry.
lrs_for() {
  case "$1/$2/$3" in
    d12/adamw/fanin) echo "0.0024" ;;
    d12/adamw/none) echo "0.0012" ;;
    d12/lion/fanin) echo "0.00024" ;;
    d12/lion/none) echo "0.00012" ;;
    # d24 uses the same learning-rate grid for every lwlr mode.
    d24/adamw/*) echo "0.00015 0.0003 0.0006 0.0012 0.0024 0.0048" ;;
    d24/lion/*) echo "0.000015 0.00003 0.00006 0.00012 0.00024 0.00048" ;;
    *) return 1 ;;
  esac
}

# Training entry point; identical for every model size.
runcommand=(python3 train_gpt2_overtrain.py)

#for k in "d12" "d24"; do
for k in "d12"; do
  if ! BS=$(batch_size_for "${k}"); then
    printf 'no batch size configured for model %s; skipping\n' "${k}" >&2
    continue
  fi

  for j in "layerdepth"; do
    for opt in "adamw" "lion"; do
      if ! betas_str=$(betas_for "${opt}"); then
        printf 'no betas configured for optimizer %s; skipping\n' "${opt}" >&2
        continue
      fi
      # --betas takes two separate arguments, so keep them as array elements.
      IFS=' ' read -r -a betas <<< "${betas_str}"

      for lrmode in fanin; do
        if ! lr_str=$(lrs_for "${k}" "${opt}" "${lrmode}"); then
          printf 'no learning rates configured for %s/%s/%s; skipping\n' \
            "${k}" "${opt}" "${lrmode}" >&2
          continue
        fi
        IFS=' ' read -r -a lrlist <<< "${lr_str}"

        for t in -1 100; do
          for iter in 0 1 2 3 4; do
            for lr in "${lrlist[@]}"; do
              logfile="outputovertrain${k}${opt}${j}${lrmode}${lr}t${t}iter${iter}.txt"

              # The glob in --input_bin is quoted on purpose: it is passed
              # through literally rather than expanded by the shell.
              "${runcommand[@]}" \
                --input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \
                --sample_every 0 \
                --write_tensors 0 \
                --model "${k}" \
                --residual_scale "${j}" \
                --weight_init fanin \
                --lwlr_mode "${lrmode}" \
                --batch_size "${BS}" \
                --sequence_length 1024 \
                --total_batch_size 524288 \
                --overfit_batch_number 10 \
                --dtype bfloat16 \
                --compile 1 \
                --tensorcores 1 \
                --flash 1 \
                --num_iterations 1000 \
                --weight_decay 0.0 \
                --betas "${betas[@]}" \
                --zero_stage 0 \
                --learning_rate "${lr}" \
                --t_ablation "${t}" \
                --t_iter "${iter}" \
                --opt_type "${opt}" | tee "${logfile}"

              # The pipeline's own status is tee's, so read the training
              # command's status explicitly. A failed run is reported but the
              # sweep carries on with the remaining configurations.
              run_status=${PIPESTATUS[0]}
              if (( run_status != 0 )); then
                printf 'run failed (exit %d), see %s\n' \
                  "${run_status}" "${logfile}" >&2
              fi
            done
          done
        done
      done
    done
  done
done
