### do "conda activate gpt" before running this script
# Weights & Biases settings inherited by every training process started below.
export WANDB_MODE=online

# SECURITY: the API key must never be hard-coded in a tracked script. It is
# taken from the caller's environment instead (e.g. `export WANDB_API_KEY=...`
# in an untracked file, or credentials stored by `wandb login`).
# NOTE(review): a literal key was previously committed on this line; treat it
# as leaked and revoke/rotate it in the W&B account settings.
if [ -z "${WANDB_API_KEY:-}" ]; then
  # Warn instead of aborting: this script cannot tell whether stored
  # `wandb login` credentials are available to the training processes.
  printf '%s\n' \
    "warning: WANDB_API_KEY is not set; export it or run 'wandb login' before training" >&2
else
  # Re-export so the key reaches child processes even if it was only set as
  # a plain (non-exported) shell variable.
  export WANDB_API_KEY
fi

# NOTE(review): presumably the number of seconds wandb waits for its
# background service to come up -- confirm against the wandb documentation.
export WANDB__SERVICE_WAIT=300

# ---------------------------------------------------------------------------
# d_model sweep: starts five train_gpt.py runs in parallel, one per GPU group,
# then waits for all of them and reports failure through the exit status.
#
# Written in POSIX sh (no arrays / `local`) because the file has no shebang
# and may be run with either `sh` or `bash`.
# ---------------------------------------------------------------------------

# Paths and experiment-tracking project passed through to train_gpt.py.
DATA_DIR=./open_small
OUT_DIR=./
WANDB_PROJ=moe_gpt

# Forwarded verbatim as --max_iters / --block_size.
# NOTE(review): "100_000" is passed as a literal string including the
# underscore; this assumes train_gpt.py parses it like a Python int literal.
MAX_ITERS=100_000
BLOCK_SIZE=128

# Token batch size: 512 * 128 = 65_536
# Short seq regime: d >> N / 6 ~= 20

# Hyperparameters shared by every run in the sweep.
# NOTE(review): this section used to be labelled "Dense", but every run uses
# --struct=btt_norm_moe with num_experts=8 -- confirm which label is intended.
lr=3e-3
num_experts=8

# Space-separated PIDs of the background runs, collected for the final wait.
run_pids=""

#######################################
# Start one training run in the background.
# Globals:
#   read:    BLOCK_SIZE MAX_ITERS DATA_DIR OUT_DIR BATCH_SIZE GRAD_ACCUM
#            lr num_experts WANDB_PROJ
#   written: d_model (set to $3, as the original top-level assignments did),
#            run_pids (PID of the new run appended),
#            _run_gpus / _run_nproc (scratch)
# Arguments:
#   $1 - value for CUDA_VISIBLE_DEVICES, e.g. "0" or "3,4"
#   $2 - number of processes; 1 launches plain `python`, >1 launches
#        `torchrun --nproc_per_node=$2` on a random port in 49152-65535
#   $3 - d_model for this run
#######################################
launch_run() {
  _run_gpus=$1
  _run_nproc=$2
  d_model=$3

  # Reuse the positional parameters as the launcher prefix; POSIX sh has no
  # arrays, and "$@" keeps each launcher word as a separate argument.
  if [ "${_run_nproc}" -gt 1 ]; then
    set -- torchrun "--nproc_per_node=${_run_nproc}" \
      "--master_port=$(shuf -i 49152-65535 -n 1)"
  else
    set -- python
  fi

  CUDA_VISIBLE_DEVICES="${_run_gpus}" "$@" \
    train_gpt.py config/train_open_small.py \
    "--block_size=${BLOCK_SIZE}" \
    --struct=btt_norm_moe \
    "--num_experts=${num_experts}" \
    --layers=all_but_last \
    "--d_model=${d_model}" \
    --n_layer=3 \
    --n_head=-1 \
    --d_head=64 \
    "--max_iters=${MAX_ITERS}" \
    "--data_dir=${DATA_DIR}" \
    "--out_dir=${OUT_DIR}" \
    "--batch_size=${BATCH_SIZE}" \
    "--gradient_accumulation_steps=${GRAD_ACCUM}" \
    "--init_lr=${lr}" \
    "--wandb_project=${WANDB_PROJ}" &
  run_pids="${run_pids} $!"
}

# Single-GPU runs: the whole 512-sequence batch fits in one step.
BATCH_SIZE=512
GRAD_ACCUM=1
launch_run 0 1 256
launch_run 1 1 512
launch_run 2 1 1024

# Two GPUs: 256 * 2 = 512 sequences.
BATCH_SIZE=256
GRAD_ACCUM=2
launch_run 3,4 2 2048

# Three GPUs: 171 * 3 = 513 sequences, the closest to 512 that splits three
# ways. NOTE(review): whether the factor of 3 comes from the GPUs or from
# accumulation depends on how train_gpt.py handles
# gradient_accumulation_steps under DDP -- confirm.
BATCH_SIZE=171
GRAD_ACCUM=3
launch_run 5,6,7 3 4096

# Block until every run has finished so that failures are not silently lost.
# To detach from the terminal, run this script itself under nohup/tmux.
run_status=0
# Intentional word splitting: run_pids is a space-separated list of PIDs.
for _run_pid in ${run_pids}; do
  if ! wait "${_run_pid}"; then
    printf 'error: training run with PID %s exited with a failure\n' \
      "${_run_pid}" >&2
    run_status=1
  fi
done

# Expose the aggregate result as this section's status without calling
# `exit`, so the script stays safe to source or to extend below.
[ "${run_status}" -eq 0 ]
