#!/bin/bash

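# Based on scripts/run_gpt2_124M.sh, but using PyTorch; this variant launches printlwlr.py
# (the original comment referred to train_gpt2.py).

# The runs below are launched directly with python3 (no torchrun), i.e. on a single GPU.
# To launch across several GPUs, prefix the command with torchrun and keep all the other
# arguments the same, e.g.
# torchrun --standalone --nproc_per_node=8 printlwlr.py ...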

for k in "d12"; do
if [ $k = "d12" ]
then
BS=1
runcommand="python3 printlwlr.py"
fi
if [ $k = "d24" ]
then
BS=1
runcommand="python3 printlwlr.py"
#runcommand="torchrun --standalone --nproc_per_node=2 printlwlr.py"
fi
for a in "fanin"; do
for j in "layerdepth"; do
    ${runcommand} \
    --input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \
    --model ${k} \
    --residual_scale ${j} \
    --init_weight fanin \
    --lwlr_mode ${a} \
    --batch_size ${BS} \
    --sequence_length 1024 \
    --total_batch_size 1024 \
    --dtype bfloat16 \
    --compile 1 \
    --tensorcores 1 \
    --flash 1 \
    --num_iterations 100 \
    --num_ablation 5 \
    --zero_stage 0 \
    --overfit_single_batch 0 | tee lwlrd12t100bs1.txt
done
done
done

for k in "d12"; do
if [ $k = "d12" ]
then
BS=16
runcommand="python3 printlwlr.py"
fi
for a in "fanin"; do
for j in "layerdepth"; do
for t in 1 10 100; do
    ${runcommand} \
    --input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \
    --model ${k} \
    --residual_scale ${j} \
    --init_weight fanin \
    --lwlr_mode ${a} \
    --batch_size ${BS} \
    --sequence_length 1024 \
    --total_batch_size 524288 \
    --dtype bfloat16 \
    --compile 1 \
    --tensorcores 1 \
    --flash 1 \
    --num_iterations ${t} \
    --num_ablation 5 \
    --zero_stage 0 \
    --overfit_single_batch 0 | tee lwlrd12t${t}bs512.txt
done
done
done
done
