#! /usr/bin/env bash

#exec >preprocessing.out
#exec 2>$1

# Directory that receives every downloaded and generated file. Taken from the
# first command-line argument; defaults to the current directory.
DATA_DIR=${1:-.}

# The path is quoted so that directories containing whitespace or glob
# characters work. Abort if the directory cannot be created or entered:
# everything below writes relative to the current directory, so continuing
# would scatter files in the wrong place.
mkdir -p -- "$DATA_DIR" || exit 1
cd -- "$DATA_DIR" || exit 1

#mkdir -p raw
#cd raw
# Fetch the WMT14 English-German corpus files that are not already present in
# the current directory.
#
# Each file is downloaded under a ".part" name and renamed to its final name
# only after wget reports success. This way an interrupted transfer is never
# mistaken for a complete file by the "-f" check on a later run; instead the
# leftover ".part" file is resumed by "wget -c".
for corpus_file in \
    train.en train.de \
    newstest2012.en newstest2012.de \
    newstest2013.en newstest2013.de \
    newstest2014.en newstest2014.de \
    newstest2015.en newstest2015.de; do
    # Already fully downloaded by an earlier run.
    [ -f "$corpus_file" ] && continue

    if ! wget -c -O "$corpus_file.part" \
        "http://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/$corpus_file"; then
        echo "error: failed to download $corpus_file" >&2
        exit 1
    fi
    mv -- "$corpus_file.part" "$corpus_file" || exit 1
done
#cd ..

# Size of the shared (English + German) BPE vocabulary and the name of the
# model file trained with it.
VOCAB_SIZE=37000
BPE_MODEL=bpe.share.$VOCAB_SIZE

# Outputs derived from the BPE model. They are only valid for the model that
# produced them, so they are discarded whenever the model is (re)trained.
BPE_OUTPUTS="vocab.tsv train.en.id train.de.id val.en.id val.de.id test.en.id test.de.id"

# Train the BPE model on the concatenation of both training sides, unless a
# model file already exists.
if [ ! -f "$BPE_MODEL" ]; then
    # Unique scratch file in the data directory (instead of a fixed name such
    # as "tmp") so an unrelated file is never overwritten or deleted.
    bpe_corpus=$(mktemp ./bpe-corpus.XXXXXX) || exit 1
    if ! cat train.en train.de >"$bpe_corpus" ||
        ! yttm bpe --data "$bpe_corpus" --model "$BPE_MODEL" --vocab_size "$VOCAB_SIZE"; then
        # Remove the scratch file, and the model in case a partial one was
        # written, so the next run starts training from scratch.
        rm -f -- "$bpe_corpus" "$BPE_MODEL"
        echo "error: BPE training failed" >&2
        exit 1
    fi
    rm -f -- "$bpe_corpus"

    # Intentionally unquoted: BPE_OUTPUTS is a space-separated list of fixed
    # file names.
    # shellcheck disable=SC2086
    rm -f -- $BPE_OUTPUTS
fi

# Run "yttm $2..." with stdin from file $1 and stdout to file $2's target
# (see callers), skipping the work when the target already exists.
#   $1 - input file (use /dev/null when the command reads no input)
#   $2 - output file
#   $3... - yttm sub-command and its arguments
# Output is written to "<target>.partial" and renamed on success, so a failed
# or interrupted run never leaves a truncated file under the final name.
# Returns non-zero if yttm or the rename fails.
yttm_to_file() {
    local input=$1 target=$2
    shift 2
    [ -f "$target" ] && return 0
    if yttm "$@" <"$input" >"$target.partial"; then
        mv -- "$target.partial" "$target"
    else
        rm -f -- "$target.partial"
        return 1
    fi
}

# Generate whatever is still missing. Doing this outside the training branch
# lets a run that was interrupted after training pick up where it stopped.
yttm_to_file /dev/null vocab.tsv vocab --model "$BPE_MODEL" || exit 1
yttm_to_file train.en train.en.id encode --model "$BPE_MODEL" --output_type id || exit 1
yttm_to_file train.de train.de.id encode --model "$BPE_MODEL" --output_type id || exit 1
# newstest2013 is used as the validation set, newstest2014 as the test set.
yttm_to_file newstest2013.en val.en.id encode --model "$BPE_MODEL" --output_type id || exit 1
yttm_to_file newstest2013.de val.de.id encode --model "$BPE_MODEL" --output_type id || exit 1
yttm_to_file newstest2014.en test.en.id encode --model "$BPE_MODEL" --output_type id || exit 1
yttm_to_file newstest2014.de test.de.id encode --model "$BPE_MODEL" --output_type id || exit 1
