# Directory the script was launched from (not referenced elsewhere in this file).
path_prefix=$(pwd)
###
 # @Author: pengjie pengjieb@mail.ustc.edu.cn
 # @Date: 2023-09-28 16:52:54
 # @LastEditors: pengjie pengjieb@mail.ustc.edu.cn
 # @LastEditTime: 2023-11-21 17:58:07
 # @FilePath: /iclr24_sm4/iclr24/mmoe_three_tasks_with_pretrain.sh
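 # @Description: Runs three_tasks_with_pretrain_iclr24.py once with --only-pretrain 1, then once per seed with --only-pretrain 0 and --pretrain-model-path set to the pretrain checkpoint.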
 # 
 # Copyright (c) 2023 by ${git_name_email}, All Rights Reserved. 
### 


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# GPU index handed to the trainer through CUDA_VISIBLE_DEVICES.
gpu_id=0
# Training entry point launched by both stages below.
py_file=private_test_scripts/mmoe_transformer/three_tasks_with_pretrain_iclr24.py
# 1 selects the short debug settings; any other value selects the full run.
debug=1
# Both modes use the same --is-train value.
is_train=1

# task_name is embedded in the checkpoint and image paths of both stages;
# seeds is a space-separated list iterated by the per-seed loop.
if [ "$debug" -eq 1 ]; then
    task_name=debug
    seeds=123
    max_epochs=3
    gb_epochs=1
else
    task_name=iclr_three_task
    seeds='123 321 132'
    max_epochs=100
    gb_epochs=10
fi

# ---------------------------------------------------------------------------
# Stage 1: pretrain
# ---------------------------------------------------------------------------
# Launches $py_file once with --only-pretrain 1. The --model-path given here
# is the same path the per-seed stage later passes as --pretrain-model-path.
# Note that --seed (123) and --epochs (30) are fixed for this stage and do not
# follow $seeds / $max_epochs.
args="--model-path private_test_scripts/model/three_medium_tasks_pretrain_${task_name}.pth
    --seed 123
    --epochs 30
    --is-train $is_train
    --lr 0.001
    --img-path log/img/three_medium_tasks_${task_name}
    --unlimited-capacity-on-mlp 0
    --co-input 1
    --seperate-qkv 1
    --gate-type NoisyVMoEGate
    --debug $debug
    --modality-gating-merge 1
    --training-weight 10. 0.8 10.
    --cross-modality-attn 1
    --cross-depth 1
    --grad-clip 1
    --push-cut-into 1
    --gentle-push-batch-size 32
    --tune-gate-weight 1
    --moe-gate-weight 0.1
    --gradient-blending 0
    --gradient-blending-epoch $gb_epochs
    --capacity-ratio 1.0
    --capacity-ratios 1.0 1.0 1.0
    --dynamic-reweight 0
    --outter-task-loss 1
    --num-latent 12
    --num-experts 32
    --grad-clip-value 1.
    --lr-schedular CosineAnnealingLR
    --weight-decay 0.0
    --push-seq-length 16
    --mlp-top-k 4
    --only-pretrain 1
    --push-without-valid 0"

# $args is left unquoted on purpose: it has to be word-split into separate
# command-line arguments.
# Abort on failure so the per-seed runs never start from a missing or stale
# pretrain checkpoint.
# shellcheck disable=SC2086
CUDA_VISIBLE_DEVICES=$gpu_id python -u "$py_file" $args || {
    rc=$?
    printf 'pretraining failed (exit status %s); skipping the per-seed runs\n' "$rc" >&2
    exit "$rc"
}

# ---------------------------------------------------------------------------
# Stage 2: one run per seed
# ---------------------------------------------------------------------------
# For every entry in the space-separated $seeds list, launches $py_file with
# --only-pretrain 0, a seed-specific --model-path, and --pretrain-model-path
# pointing at the stage-1 checkpoint path.
# A failing seed does not stop the remaining seeds; failures are collected and
# reported at the end, and the script then exits non-zero.
failed_seeds=
for seed in $seeds; do
    args="--model-path private_test_scripts/model/three_medium_tasks_${task_name}${seed}.pth
        --pretrain-model-path private_test_scripts/model/three_medium_tasks_pretrain_${task_name}.pth
        --seed $seed
        --epochs $max_epochs
        --is-train $is_train
        --lr 0.001
        --img-path log/img/three_medium_tasks_${task_name}
        --unlimited-capacity-on-mlp 0
        --co-input 1
        --seperate-qkv 1
        --gate-type NoisyVMoEGate
        --debug $debug
        --modality-gating-merge 1
        --training-weight 10. 0.8 10.
        --cross-modality-attn 1
        --cross-depth 1
        --grad-clip 1
        --push-cut-into 1
        --gentle-push-batch-size 32
        --tune-gate-weight 1
        --moe-gate-weight 0.1
        --gradient-blending 0
        --gradient-blending-epoch $gb_epochs
        --capacity-ratio 1.0
        --capacity-ratios 1.0 1.0 1.0
        --dynamic-reweight 0
        --outter-task-loss 1
        --num-latent 12
        --num-experts 32
        --grad-clip-value 1.
        --lr-schedular CosineAnnealingLR
        --weight-decay 0.0
        --push-seq-length 16
        --mlp-top-k 4
        --only-pretrain 0
        --push-without-valid 0" # reweight gating weight

    # $args is left unquoted on purpose: it has to be word-split into
    # separate command-line arguments.
    # shellcheck disable=SC2086
    CUDA_VISIBLE_DEVICES=$gpu_id python -u "$py_file" $args || {
        printf 'run for seed %s failed (exit status %s)\n' "$seed" "$?" >&2
        failed_seeds="$failed_seeds $seed"
    }
done

# Surface any per-seed failure in the script's exit status instead of letting
# only the last seed decide it.
if [ -n "$failed_seeds" ]; then
    printf 'failed seeds:%s\n' "$failed_seeds" >&2
    exit 1
fi