#!/usr/bin/env bash
# Launch `train.py` on data-bin/wmt16_en_de_bpe32k using the
# `transformer_wmt_en_de` architecture; checkpoints are written under
# checkpoints/wmt16-en-de/booster. The script's exit status is that of the
# python command, since it is the last command run.

# NOTE(review): KEY_DIM and COLAB are assigned here but are not referenced by
# the command below. Presumably they were meant to be forwarded to train.py as
# options -- confirm the intended flag names before wiring them in.
# shellcheck disable=SC2034
KEY_DIM=512
# shellcheck disable=SC2034
COLAB="encoder_cross_decoder"

# Must be exported: a bare `CUDA_VISIBLE_DEVICES=0` on its own line only sets
# a shell variable, which the python child process does not inherit unless the
# variable already came from the environment.
export CUDA_VISIBLE_DEVICES=0

# The adam-betas value stays single-quoted so the parentheses and the space
# reach train.py as one literal argument.
python train.py data-bin/wmt16_en_de_bpe32k \
    --arch transformer_wmt_en_de \
    --save-dir checkpoints/wmt16-en-de/booster \
    --share-all-embeddings \
    --optimizer adam \
    --adam-betas '(0.9, 0.98)' \
    --clip-norm 0.0 \
    --lr 0.0007 \
    --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 \
    --warmup-init-lr 1e-07 \
    --dropout 0.1 \
    --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy \
    --label-smoothing 0.1 \
    --max-tokens 3584 \
    --update-freq 2