### Run "conda activate gpt" before running this script.
###
### Weights & Biases configuration for every training job launched below.
### Runs are recorded in offline mode, so no API key is needed to launch them.
export WANDB_MODE=offline
# SECURITY: an API key used to be hard-coded on this line. Secrets must not
# live in source control; treat that key as leaked and revoke it. If a key is
# needed (e.g. for a later sync), set WANDB_API_KEY in the calling environment;
# it is re-exported here only when the caller actually provided one.
if [ -n "${WANDB_API_KEY:-}" ]; then
  export WANDB_API_KEY
fi
export WANDB__SERVICE_WAIT=300

# --- Experiment tracking and filesystem locations -------------------------
WANDB_PROJ="ein_gpt"        # forwarded as --wandb_project
DATA_DIR="./open_small"     # forwarded as --data_dir
OUT_DIR="./"                # forwarded as --out_dir

# --- Optimisation budget (shared by every run in the sweep) ---------------
BLOCK_SIZE="128"            # forwarded as --block_size
BATCH_SIZE="512"            # forwarded as --batch_size
GRAD_ACCUM="1"              # forwarded as --gradient_accumulation_steps
MAX_ITERS="100_000"         # forwarded verbatim (underscore included) as --max_iters

# Token batch size: 512 * 128 = 65_536
# Short seq regime: d >> N / 6 ~= 20


# Candidate values for the --expr flag of train_gpt.py. Each one is a
# parenthesised, dash-separated list of seven numbers, stored as a literal
# string (single-quoted: nothing in them should be expanded by the shell).
# NOTE(review): the "r" in the group labels appears to be the 4th field
# (0 for v1-v4, 0.25 for v5-v7) -- confirm against train_gpt.py.

# Group: r=0, slsl
v1='(0-0.5-0.5-0-0.5-0-0.5)'
v2='(0-0.33-0.67-0-0.33-0-0.67)'
v3='(0-0.2-0.8-0-0.2-0-0.8)'

# Group: r=0, lsls
v4='(0-0.67-0.33-0-0.67-0-0.33)'

# Group: r=0.25
v5='(0-0.5-0.5-0.25-0.5-0-0.5)'
v6='(0-0.33-0.67-0.25-0.33-0-0.67)'
v7='(0-0.2-0.8-0.25-0.2-0-0.8)'

# BTT
# Launches one background training job per (d_model, expression vector)
# pair: 2 model widths x 7 vectors = 14 jobs, all with the same learning rate.
# Jobs are started with '&' and are not waited on, so the script returns
# while training is still running.
#
# Relies on names that are not defined in this part of the file:
#   get_free_gpu  - command/function whose stdout is used as the GPU id
#   ALLOWED_GPUs  - passed to get_free_gpu as its only argument
lr=3e-3
for d_model in 512 1024; do
  for vec in "${v1}" "${v2}" "${v3}" "${v4}" "${v5}" "${v6}" "${v7}"; do
    gpu_id=$(get_free_gpu "${ALLOWED_GPUs}")
    # An empty id (helper missing, or it printed nothing) would export an
    # empty CUDA_VISIBLE_DEVICES and hide every GPU from the job. Fail loudly
    # instead of silently launching the rest of the sweep without a GPU.
    # Jobs that were already started keep running.
    if [ -z "${gpu_id}" ]; then
      printf 'error: get_free_gpu returned no GPU id (d_model=%s, expr=%s); aborting sweep\n' \
        "${d_model}" "${vec}" >&2
      exit 1
    fi

    CUDA_VISIBLE_DEVICES="${gpu_id}" python train_gpt.py config/train_open_small.py \
      --block_size="${BLOCK_SIZE}" \
      --struct=simple_ein_vec_norm \
      --expr="${vec}" \
      --layers=all_but_last \
      --d_model="${d_model}" \
      --n_layer=3 \
      --n_head=-1 \
      --d_head=64 \
      --max_iters="${MAX_ITERS}" \
      --data_dir="${DATA_DIR}" \
      --out_dir="${OUT_DIR}" \
      --batch_size="${BATCH_SIZE}" \
      --gradient_accumulation_steps="${GRAD_ACCUM}" \
      --init_lr="${lr}" \
      --wandb_project="${WANDB_PROJ}" &

    # Stagger launches. NOTE(review): presumably this gives the job time to
    # occupy its GPU before get_free_gpu is queried again -- confirm.
    sleep 20
  done
done