#!/usr/bin/env bash
#
# Merge experts of a DeepSeek-MoE checkpoint with mergemoe/merge-moe.py,
# evaluate the merged model with eval_dclm/eval_openlm_ckpt.py, then delete
# the merged checkpoint so it does not linger on disk.
#
# Intended to be executed (not sourced) from the repository root: every
# script, YAML and results path below is relative to the working directory.

# Abort on the first failing step so that a broken merge is never evaluated
# and the script's exit status reflects the step that failed.
set -euo pipefail

readonly task="winogrande"
readonly checkpoint="/root/model/deepseek-moe-16b-base"
readonly merged_root="results/${task}/merged-deepseek"
# NOTE(review): the merge script presumably writes the model into a
# sub-directory named after the merging strategy ("ours") — confirm.
readonly merged_model_dir="${merged_root}/ours"
readonly results_file="results/ours/deepseek/deepseek_${task}_results.json"

# Remove the merged checkpoint on every exit path (success or failure).
# ${merged_root:?} makes the expansion fail instead of yielding "/" should the
# variable ever be empty.
cleanup() {
  rm -rf -- "${merged_root:?}/"
}
trap cleanup EXIT

# Step 1: merge. The CUDA_VISIBLE_DEVICES prefix applies to this command only;
# the evaluation step below is launched without it.
CUDA_VISIBLE_DEVICES=0 \
  python mergemoe/merge-moe.py \
  --task="${task}" \
  --num_samples_for_merging=128 \
  --num_groups=24 \
  --merging_layers="1,2,3,4,5,6,7,8,9,10" \
  --merging_strategy="ours" \
  --model_type="deepseek" \
  --output_dir="${merged_root}/" \
  --checkpoint="${checkpoint}"

# Step 2: place the original checkpoint's modeling_deepseek.py next to the
# merged weights before the merged directory is loaded as --hf-model.
cp -- "${checkpoint}/modeling_deepseek.py" "${merged_model_dir}/"

# Step 3: evaluate. Create the results directory first in case the evaluation
# script does not create it itself.
mkdir -p -- "${results_file%/*}"
torchrun --nproc_per_node 2 \
  eval_dclm/eval_openlm_ckpt.py \
  --hf-model "${merged_model_dir}" \
  --tokenizer "${checkpoint}" \
  --eval-yaml "static/${task}.yaml" \
  --output-file "${results_file}" \
  --donot-compute-perplexity
