# ---------------------------------------------------------------------------
# Experiment configuration shared by the runs launched further down.
# ---------------------------------------------------------------------------

# Backbone and delta-tuning method; both are used to build the config path
# (examples_seq2seq/configs/$delta/$model/$optim/$dataset.json) and the
# log-file names.
model=t5-3b
delta=lora

# Per-device batch sizes. Only the training value is passed on the command
# lines visible in this script; the eval value is kept for reference.
per_device_train_batch_size=16
per_device_eval_batch_size=16

# Hyperparameters for the K-FAC runs. They are passed to run_seq2seq.py as
# command-line flags of the same name, except `damp`, which is passed as
# --damp2.
optim=kfac
damp=1e-3
alpha=0.9
gamma=0.9
update_freq=500
momentum=0.9
weight_decay=1e-4
pre_clipping=0

# Directory exported to the training processes as WANDB_DIR. `mkdir -p` keeps
# the script re-runnable: a plain `mkdir` prints an error on every invocation
# after the first because the directory already exists.
export WANDB_DIR=wandb_t53b
mkdir -p -- "$WANDB_DIR"

# Launch the fine-tuning runs for every dataset: first the K-FAC sweep, then
# the AdamW baseline. Each run is started in the background under nohup and
# followed by `wait`, so the runs execute strictly one after another.

# Every run redirects its stdout into this directory. The shell opens a
# redirection target before it starts the command, so a missing directory
# would keep the training process from being launched at all; create it first.
log_dir=logs_t53b/rte_lora
mkdir -p -- "$log_dir" || {
  echo "cannot create log directory: $log_dir" >&2
  exit 1
}

for dataset in rte; do

  # ---- K-FAC sweep over lr x eps x max_grad_norm -------------------------
  optim=kfac
  for lr in 0.1; do
    for eps in 1e-2; do
      for max_grad_norm in 10.0; do
        echo "$dataset"

        # Two GPUs are exposed, matching --nproc_per_node=2 below.
        device=3,7
        echo "GPU" "$device"

        log_file="${log_dir}/${dataset}-${model}-lr_${lr}-gamma_${gamma}-update_freq_${update_freq}-momentum_${momentum}-weight_decay_${weight_decay}-eps_${eps}-max_grad_norm_${max_grad_norm}-bs_${per_device_train_batch_size}-pre_clipping_${pre_clipping}-${optim}.log"

        # NOTE(review): the damping value is passed as --damp2 although the
        # variable is named `damp` - confirm against run_seq2seq.py.
        CUDA_VISIBLE_DEVICES="$device" TOKENIZERS_PARALLELISM=false nohup torchrun --nproc_per_node=2 --master_port 29503 run_seq2seq.py \
          --config "examples_seq2seq/configs/${delta}/${model}/${optim}/${dataset}.json" \
          --lr "$lr" --damp2 "$damp" --alpha "$alpha" --gamma "$gamma" --update_freq "$update_freq" \
          --momentum "$momentum" --weight_decay "$weight_decay" --eps "$eps" \
          --max_grad_norm "$max_grad_norm" --pre_clipping "$pre_clipping" \
          --per_device_train_batch_size "$per_device_train_batch_size" \
          > "$log_file" &
        # Block until this run has finished before starting the next one.
        wait
      done
    done
  done

  # ---- AdamW baseline ------------------------------------------------------
  # Runs in a subshell so that its overrides (notably pre_clipping=1) do not
  # leak into the K-FAC sweep of the next dataset iteration, which relies on
  # the values configured at the top of the script.
  (
    optim=adamw
    weight_decay=1e-4
    max_grad_norm=0.1
    pre_clipping=1

    for lr in 1e-4; do
      device=4,7
      echo "GPU" "$device"

      log_file="${log_dir}/${dataset}-${model}-lr_${lr}-weight_decay_${weight_decay}-max_grad_norm_${max_grad_norm}-bs_${per_device_train_batch_size}-pre_clipping_${pre_clipping}-${optim}.log"

      CUDA_VISIBLE_DEVICES="$device" TOKENIZERS_PARALLELISM=false nohup python -u run_newoptim_cls.py \
        --config "examples_seq2seq/configs/${delta}/${model}/${optim}/${dataset}.json" \
        --lr "$lr" \
        --weight_decay "$weight_decay" \
        --max_grad_norm "$max_grad_norm" --pre_clipping "$pre_clipping" \
        --per_device_train_batch_size "$per_device_train_batch_size" \
        > "$log_file" &
      # The job was started inside this subshell, so `wait` here reaps it.
      wait
      echo "$delta based on $model on $dataset with $optim"
    done
  )
done