#!/usr/bin/env bash
# Launcher for run_newoptim_cls.py: one run with optim=kfac followed by one
# run with optim=adamw. The array assignment below is Bash-only syntax, hence
# the explicit bash shebang.

# Settings shared by every run in this script. `model` and `delta` are used to
# build the config path and the log file names.
model=roberta-large
delta=adapter

# NOTE(review): `datasets` is not referenced anywhere in the visible part of
# this script (each section sets `dataset` itself) — confirm it is still needed.
datasets=(mrpc rte cola mnli)

# Passed to the training script via --per_device_train_batch_size.
per_device_train_batch_size=128
# NOTE(review): the eval batch size is defined but not passed to any command
# in the visible part of this script — confirm whether that is intended.
per_device_eval_batch_size=128

# ---------------------------------------------------------------------------
# KFAC section: runs run_newoptim_cls.py with optim=kfac on the RTE dataset,
# once per (lr, eps, max_grad_norm) combination, one run at a time.
# Reads `model`, `delta` and `per_device_train_batch_size` set earlier in the
# script. All output of each run goes to a file under logs/.
#
# The "# << EOF" / "# EOF" lines are leftover toggle markers: uncommenting
# both turns everything between them into an unused here-document, which
# disables this section.
# ---------------------------------------------------------------------------
# << EOF
optim=kfac
damp=1e-3
alpha=0.9
gamma=0.9
update_freq=500
momentum=0.9
weight_decay=1e-4
pre_clipping=0
dataset=rte

# The output redirection below fails (and the run never starts) when the
# target directory does not exist, so create it up front.
mkdir -p logs

# To sweep the update frequency as well, wrap the loops below in:
#   for update_freq in 1 5 10 50 100 200; do ...; done
# A multi-GPU variant that was used previously:
#   TOKENIZERS_PARALLELISM=false nohup torchrun --nproc_per_node=8 --master_port 29514 ...
for lr in 0.05; do
  for eps in 1e-3; do
    for max_grad_norm in 1.5; do
      # The batch-size tag uses the per-device batch size: the run is pinned
      # to a single GPU, and the former `total_bs` variable is no longer set
      # anywhere (it used to leave the tag empty: "...-bs_-pre_clipping_...").
      log_file="logs/${dataset}-${model}-lr_${lr}-gamma_${gamma}-update_freq_${update_freq}-momentum_${momentum}-weight_decay_${weight_decay}-eps_${eps}-max_grad_norm_${max_grad_norm}-bs_${per_device_train_batch_size}-pre_clipping_${pre_clipping}-${optim}.log"

      # NOTE(review): the value of `damp` is passed as --damp2 — confirm that
      # this is the flag name the training script expects.
      CUDA_VISIBLE_DEVICES=0 python -u run_newoptim_cls.py \
        --config "examples_text_classification/configs/${delta}/${model}/${optim}/${dataset}_epochtime.json" \
        --lr "$lr" --damp2 "$damp" --alpha "$alpha" --gamma "$gamma" --update_freq "$update_freq" \
        --momentum "$momentum" --weight_decay "$weight_decay" --eps "$eps" \
        --max_grad_norm "$max_grad_norm" --pre_clipping "$pre_clipping" \
        --per_device_train_batch_size "$per_device_train_batch_size" \
        > "$log_file" 2>&1 &
      run_pid=$!

      echo "$delta based on $model on $dataset with $optim"

      # Block until this run has finished before starting the next one, and
      # surface a failure instead of silently moving on.
      wait "$run_pid" || echo "run failed, see ${log_file}" >&2
    done
  done
done
# EOF
# Barrier: make sure nothing from this section is still running.
wait

# ---------------------------------------------------------------------------
# AdamW section: runs run_newoptim_cls.py with optim=adamw on the RTE
# dataset, once per learning rate, one run at a time.
# Reads `model`, `delta` and `per_device_train_batch_size` set earlier in the
# script. WANDB_DIR points at wandb_time/, and the run's output is written to
# a log file in that same directory.
# ---------------------------------------------------------------------------
optim=adamw
weight_decay=1e-4
max_grad_norm=0.1
pre_clipping=1
dataset=rte

# GPU index exported through CUDA_VISIBLE_DEVICES. It previously printed
# "GPU 2" while the command hard-coded device 0; the value is now 0 so the
# same GPU is used as before and the message matches it.
device=0

out_dir=wandb_time
# The output redirection below fails when the directory does not exist.
mkdir -p "$out_dir"

for lr in 1e-4; do
  echo "GPU" "$device"

  log_file="${out_dir}/${dataset}-${model}-lr_${lr}-weight_decay_${weight_decay}-max_grad_norm_${max_grad_norm}-bs_${per_device_train_batch_size}-pre_clipping_${pre_clipping}-${optim}.log"

  # stderr is redirected explicitly: nohup only folds stderr into stdout when
  # stderr is a terminal, so without 2>&1 errors could miss the log file.
  WANDB_DIR="$out_dir" \
  WANDB_PROJECT=epochtime \
  CUDA_VISIBLE_DEVICES="$device" \
  TOKENIZERS_PARALLELISM=false \
    nohup python -u run_newoptim_cls.py \
      --config "examples_text_classification/configs/${delta}/${model}/${optim}/${dataset}_epochtime.json" \
      --lr "$lr" \
      --weight_decay "$weight_decay" \
      --max_grad_norm "$max_grad_norm" --pre_clipping "$pre_clipping" \
      --per_device_train_batch_size "$per_device_train_batch_size" \
      > "$log_file" 2>&1 &
  run_pid=$!

  echo "$delta based on $model on $dataset with $optim"

  # Block until this run has finished before starting the next one, and
  # surface a failure instead of silently moving on.
  wait "$run_pid" || echo "run failed, see ${log_file}" >&2
done

# done
# # EOF
# wait