# Name of the method; this can be sparse_clip_weight_ensembling_moe.
# `???` is the OmegaConf/Hydra marker for a mandatory value, so it has to be
# supplied by whoever composes or overrides this config.
name: ???
# The path for loading the model weights. If specified, the test-time
# adaptation training is skipped. `false` leaves it unspecified.
# Example:
# checkpoint: /home/enneng/fusion_bench/outputs/sparse_we_moe/shared_gate/1routerlayer_clip-vit-base-patch32_TA8_sparse_weight_ensembling_moe_checkpoint_0_0.ckpt
checkpoint: false
# The path for saving the model weights.
# NOTE(review): `false` presumably disables saving; confirm against the method code.
save_checkpoint: false
# router
# NOTE(review): presumably the number of hidden layers in the router network;
# confirm against the method implementation.
router_hidden_layers: 2
# NOTE(review): looks like the initial value of the merging coefficient; confirm.
init_lambda: 0.3
batch_reduce: true
# sparse task vectors
# NOTE(review): presumably the fraction of task-vector entries that is pruned; confirm.
tv_prune_ratio: 0.9
# sparse gate module
post_sparse_gate: false
gate_prune_ratio: 0.0
# shared gate
shared_gate: true
position_encoding: false
# NOTE(review): presumably only relevant when `position_encoding` is true; confirm.
position_encoding_dim: 8
# tta (test-time adaptation) learning rate
# Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-4` as a string instead of a float.
lr: 1.0e-4
optimizer: adam
# this is overridden by `fabric.devices` if launched from the `fusion_bench` CLI.
devices: 1
batch_size: 16
num_workers: 16
max_steps: 1000
# if true, we will use the gradient accumulation across tasks to save memory
use_grad_accumulate: false
cache_dir: outputs
# Interpolation: resolves to the value of the root-level `fast_dev_run` key of
# the composed config.
fast_dev_run: ${fast_dev_run}
