#!/usr/bin/env bash
#
# Launches ./scripts/train_dyngen.py with one fixed hyperparameter setting.
#
# Usage: run from the directory that contains ./scripts/ (the script path
# below is relative to the current working directory).
#
# The values listed after each assignment are the alternatives noted by the
# original author; entries tagged [@*Important*@] were marked by the author
# as the ones that matter most.

# Abort on a failed command and on any unset variable, so a misspelled
# hyperparameter name fails loudly instead of passing an empty argument.
set -eu

# Expose only the CUDA device with index 2 to the training process.
export CUDA_VISIBLE_DEVICES=2

batch_size=64           # 16 32 64
hidden_dim=64           # 64 128 256
train_epochs=100        # 20 50 100
lr=1e-4                 # [@*Important*@] 1e-3, 1e-4
n_patches=8             # 8 16
n_transformers=1        # [@*Important*@] 1 2 3 4
n_head=4                # 1 2 3 4
n_routers=1             # 1 2 4
n_experts=8             # [@*Important*@] 8 16 24 32
gate_loss_weight=1e-1   # [@*Important*@] 1 5e-1 1e-1 5e-2 1e-2
decoder_loss_weight=1   # 1 5e-1 1e-1 5e-2 1e-2
# NOTE(review): top_k is assigned here but is NOT forwarded to the training
# command below, so changing it currently has no effect. If train_dyngen.py
# accepts a --top_k option (not verifiable from this file), add
# `--top_k "$top_k"` to the command; otherwise remove this variable.
# shellcheck disable=SC2034
top_k=2                 # 2 4
patch=True              # [@*Important*@] True False

# Every expansion is double-quoted so that an edited value containing
# whitespace or glob characters is still passed as exactly one argument.
python ./scripts/train_dyngen.py \
    --batch_size "$batch_size" \
    --hidden_dim "$hidden_dim" \
    --train_epochs "$train_epochs" \
    --lr "$lr" \
    --n_patches "$n_patches" \
    --n_transformers "$n_transformers" \
    --n_head "$n_head" \
    --n_routers "$n_routers" \
    --n_experts "$n_experts" \
    --gate_loss_weight "$gate_loss_weight" \
    --decoder_loss_weight "$decoder_loss_weight" \
    --patch "$patch"