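# Pre-processing prerequisites (one-time setup, intentionally left commented out).
# The Moses scripts are expected under ~/modules/mosesdecoder (see SCRIPTS below).
# echo 'Cloning Moses github repository (for tokenization scripts)...'
# git clone https://github.com/moses-smt/mosesdecoder.git

# Subword NMT is expected under ~/modules/subword-nmt (see BPEROOT below).
# echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
# git clone https://github.com/rsennrich/subword-nmt.git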

# Tool locations and BPE size. Both external toolkits are looked up under
# ~/modules; edit the two root paths if they are installed elsewhere.
BPE_TOKENS=10000    # handed to learn_bpe.py through its -s option
BPEROOT=~/modules/subword-nmt/subword_nmt
SCRIPTS=~/modules/mosesdecoder/scripts

# Individual Moses scripts used by the steps below.
CLEAN="${SCRIPTS}/training/clean-corpus-n.perl"
LC="${SCRIPTS}/tokenizer/lowercase.perl"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"

# URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz"
# GZ=de-en.tgz

# if [ ! -d "$SCRIPTS" ]; then
#     echo "Please set SCRIPTS variable correctly to point to Moses scripts."
#     exit
# fi

# Language pair.
src=de
tgt=en
lang=de-en

# Working directories, all relative to the current directory:
#   $prep - receives the BPE-encoded files and the BPE code file
#   $tmp  - intermediate tokenized / cleaned / lowercased files
#   $orig - created here; not otherwise used in this script
orig=orig
prep=multi30k.tokenized.de-en
tmp="${prep}/tmp"

# -p creates missing parents and is a no-op for directories that already exist.
mkdir -p "$prep" "$tmp" "$orig"

# Tokenize every split (train, val, test) in both languages with the Moses
# tokenizer. Input:  $part.$l in the current directory.
#            Output: $tmp/$part.tok.$l
echo "pre-processing train, val and test data..."
for part in train val test; do
    for l in "$src" "$tgt"; do
        f=$part.$l
        tok=$part.tok.$l

        # Fail early with a clear message instead of letting the later
        # cleaning and BPE steps run on missing data.
        if [ ! -f "$f" ]; then
            echo "Input file not found: $f" >&2
            exit 1
        fi

        # -l is set to the language code of the file being processed
        # (de or en), so each side is tokenized with its own language setting.
        perl "$TOKENIZER" -threads 8 -l "$l" < "$f" > "$tmp/$tok" || {
            echo "Tokenization failed for $f" >&2
            exit 1
        }
        echo ""
    done
done

# Corpus cleaning: drops empty sentence pairs and pairs whose length falls
# outside MIN..MAX.
# Usage: perl clean-corpus-n.perl CORPUS L1 L2 OUT MIN MAX
#
# Training data gets the strict settings: -ratio 1.5 and lengths 1..175.
perl "$CLEAN" -ratio 1.5 "$tmp/train.tok" "$src" "$tgt" "$tmp/train.clean" 1 175
# Test and validation data are cleaned without -ratio and with an upper bound
# of 100000 instead of the 175 used for training.
perl "$CLEAN" "$tmp/test.tok" "$src" "$tgt" "$tmp/test.clean" 1 100000
perl "$CLEAN" "$tmp/val.tok" "$src" "$tgt" "$tmp/val.clean" 1 100000

# Lowercase every cleaned file:
#   $tmp/<split>.clean.<lang>  ->  $tmp/<split>.<lang>
for part in train val test; do
    for l in "$src" "$tgt"; do
        perl "$LC" < "$tmp/$part.clean.$l" > "$tmp/$part.$l"
    done
done



# Joint training text for BPE: the lowercased source-side training file
# followed by the target-side one, written to a single file.
BPE_CODE="${prep}/code"
TRAIN="${tmp}/train.en-de"

# Remove any file left by a previous run before appending both sides.
rm -f "$TRAIN"
cat "$tmp/train.$src" "$tmp/train.$tgt" >> "$TRAIN"

# Learn one BPE code file on the combined training text, then apply it to
# every split in both languages. Output files land in $prep under the same
# names they have in $tmp.
echo "learn_bpe.py on ${TRAIN}..."
# Stop on failure: every apply_bpe.py call below reads $BPE_CODE, so there is
# no point continuing with a missing or partially written code file.
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE" || {
    echo "learn_bpe.py failed on ${TRAIN}" >&2
    exit 1
}

for L in "$src" "$tgt"; do
    for f in "train.$L" "val.$L" "test.$L"; do
        echo "apply_bpe.py to ${f}..."
        python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f" || {
            echo "apply_bpe.py failed on ${f}" >&2
            exit 1
        }
    done
done

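# Next step (not run by this script): binarize the BPE output with fairseq.
# NOTE(review): before enabling the command below, check its paths. This
# script writes val.de / val.en (not valid.*), and the --destdir name still
# refers to iwslt14.
#TEXT=Multi30K/multi30k.tokenized.de-en
#fairseq-preprocess --source-lang de --target-lang en \
#    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
#    --destdir data-bin/iwslt14.tokenized.de-en \
#    --workers 20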