# config for 2 A100-80G
# GPUs made visible to the training process.
export CUDA_VISIBLE_DEVICES=0,1

# Source / target language codes; used in the data and checkpoint directory
# names and passed to the trainer via -s / -t.
src_lang=en
tgt_lang=de

# Root directories for the binarized data and for the checkpoints.
# Fill these in here, or supply them through the environment, e.g.:
#   path_to_data=/data path_to_ckpt=/ckpt sh <this script>
path_to_data=${path_to_data:-}
path_to_ckpt=${path_to_ckpt:-}

# Fail fast when a root is still empty: otherwise the paths below would
# resolve to /data-bin/... and /checkpoints/... at the filesystem root.
: "${path_to_data:?path_to_data is empty - set it to the directory that contains data-bin/}"
: "${path_to_ckpt:?path_to_ckpt is empty - set it to the directory that will hold checkpoints/}"

# Active dataset ("wmt_only" variant); the commented line is the alternative.
DATA_DIR=${path_to_data}/data-bin/wmt_only_spm_${src_lang}_spm_${tgt_lang}
#DATA_DIR=${path_to_data}/data-bin/wmt_spm_${src_lang}_spm_${tgt_lang}

# Checkpoint / log directory matching the dataset selected above.
SAVE_DIR=${path_to_ckpt}/checkpoints/spm_${src_lang}_spm_${tgt_lang}_wmt_only_nmt_enc6
#SAVE_DIR=${path_to_ckpt}/checkpoints/spm_${src_lang}_spm_${tgt_lang}_nmt_enc6

# The training log is redirected into SAVE_DIR, so it must exist up front.
mkdir -p -- "${SAVE_DIR}" || exit 1

# Launch training (flags look like fairseq's train.py - confirm against the
# checkout this script lives in): transformer architecture on the translation
# task for ${src_lang} -> ${tgt_lang}, reading data from ${DATA_DIR} and
# writing checkpoints to ${SAVE_DIR}.
# The best checkpoint is selected by maximizing the BLEU metric.
# stdout and stderr both go to ${SAVE_DIR}/nmt.log; `>` truncates the file,
# so a re-run overwrites the previous log.
# Expansions are quoted so paths with spaces or glob characters stay intact.
python train.py \
  "${DATA_DIR}" \
  --task translation \
  --arch transformer \
  -s "${src_lang}" -t "${tgt_lang}" \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 10.0 \
  --lr 0.0007 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 --max-update 400000 \
  --dropout 0.1 --weight-decay 0.0 \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
  --save-dir "${SAVE_DIR}" \
  --num-workers 8 --max-tokens 50000 \
  --eval-bleu --eval-bleu-args '{"beam": 4, "lenpen": 0.6, "max_len_a": 1, "max_len_b": 50}' \
  --eval-bleu-detok moses --eval-bleu-remove-bpe sentencepiece \
  --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
  --no-progress-bar --save-interval-updates 1000 --keep-interval-updates 10 \
  --keep-last-epochs 10 \
  --encoder-normalize-before --decoder-normalize-before --share-decoder-input-output-embed \
  --seed 123 --log-interval 200 \
  --update-freq 1 > "${SAVE_DIR}/nmt.log" 2>&1
 
