#!/usr/bin/env bash
#
# Launch a fairseq-train run on data-bin/iwslt14_deen_jointdict_distill
# (source language "de", target language "en") using the cmlm_distill
# architecture and the teacher checkpoint given by --teacher-path below.
#
# Usage: <this-script> LR
#   LR  Value passed to --lr. It is also embedded in the name of the
#       directory used for both checkpoints and TensorBoard logs, so runs
#       with different learning rates do not overwrite each other.
#
# Any additional positional arguments are ignored.
set -euo pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s LR\n' "${0##*/}" >&2
  exit 2
fi

readonly lr=$1

# Single output location shared by --save-dir and --tensorboard-logdir.
# The braces around ${lr} are required: "$lr_soft_label" would be parsed as
# a different (unset) variable.
readonly run_dir="./results/distillation/checkpoints/IWSLTdeen_distill_CMLM_benchmark_lr=${lr}_soft_label/"

# Arguments are kept in an array so each one stays a single word regardless
# of its content, and so comments can sit next to the options they describe.
train_args=(
  data-bin/iwslt14_deen_jointdict_distill
  --arch cmlm_distill
  -s de
  -t en

  # Optimisation / loss.
  --optimizer adam
  --adam-betas '(0.9,0.98)'
  --criterion nat_loss
  --task translation_lev
  --label-smoothing 0.
  --noise random_mask
  --lr-scheduler inverse_sqrt
  --warmup-init-lr '1e-07'
  --lr "$lr"
  --warmup-updates 1
  --dropout 0.3
  --weight-decay 0.01

  # Model options.
  --decoder-learned-pos
  --encoder-learned-pos
  --apply-bert-init
  --share-all-embeddings

  # Batching, run length, precision.
  --max-tokens 8192
  --max-epoch 250
  --fixed-validation-seed 7
  --fp16

  # Checkpointing, validation and logging.
  --save-dir "$run_dir"
  --batch-size-valid 2048
  --validate-interval 1
  --num-workers 10
  --no-epoch-checkpoints
  --keep-best-checkpoints 1
  --tensorboard-logdir "$run_dir"
  --teacher-path results/checkpoints/IWSLTdeen_distill_CMLM_benchmark/checkpoint_best.pt

  # BLEU-based validation; the best checkpoint is the one with highest BLEU.
  --eval-bleu
  --eval-bleu-args '{"iter_decode_max_iter": 1}'
  --eval-bleu-remove-bpe
  --best-checkpoint-metric bleu --maximize-best-checkpoint-metric
  --eval-bleu-detok moses
  --patience 50

  # Step-related options. --step-count used to be passed twice with the
  # same value (2); it is now passed once.
  --step-count 2
  --mid-mask-policy discrete
  --step-weight-update 0.01
  --step-weight-temp 0.
)

# Flags that were commented out in the original command, kept for reference.
# To enable one, add it as an element of train_args above.
#   --revealed-loss
#   --teacher-ema
#   --teacher-ema-decay 0.9997
#   --step-weight-temp 10.

fairseq-train "${train_args[@]}"
