#!/usr/bin/env bash
# Preprocess for TNF datasets.
#
# Usage: <this-script> DATA_DIR LEFT_PERCENT RIGHT_PERCENT
#
# Runs debpe.py with the given data directory and vocabulary percentages,
# then binarizes the TNF and subword-mask datasets with fairseq-preprocess.

# Abort on the first failing command or unset variable, so that the
# binarization steps never run after debpe.py has failed.
set -eu

if [ "$#" -lt 3 ]; then
  printf 'Usage: %s DATA_DIR LEFT_PERCENT RIGHT_PERCENT\n' "${0##*/}" >&2
  exit 2
fi

DATA_DIR=$1         # /path/to/data_dir
LEFT_PERCENT=$2     # Left percentage of TNF vocabulary
RIGHT_PERCENT=$3    # Right percentage of TNF vocabulary

# Expansions are quoted so a data directory containing spaces or glob
# characters is passed through as a single argument.
python debpe.py \
  --data-dir "$DATA_DIR" \
  --left-percent "$LEFT_PERCENT" \
  --right-percent "$RIGHT_PERCENT"

# Binarize the TNF dataset (source side only), using the dictionary file
# dict.debpe.filtered from the data directory.
# Output goes to binary-datasets/data-tnf-<LEFT_PERCENT>-<RIGHT_PERCENT>.
# Paths are quoted so a DATA_DIR containing spaces stays one argument, and
# the final option carries no trailing line continuation.
fairseq-preprocess \
  --only-source \
  --trainpref "$DATA_DIR/corpus.train.tok.debpe" \
  --validpref "$DATA_DIR/corpus.valid.tok.debpe" \
  --destdir "$DATA_DIR/binary-datasets/data-tnf-$LEFT_PERCENT-$RIGHT_PERCENT" \
  --workers 24 \
  --srcdict "$DATA_DIR/dict.debpe.filtered"

# Binarize the subword mask dataset (source side only).
# No --srcdict is passed here; --nwordssrc 32768 is given instead
# (presumably capping the size of the dictionary fairseq builds from the
# training file -- confirm against the fairseq-preprocess docs).
# Output goes to binary-datasets/data-sbmask-<LEFT_PERCENT>-<RIGHT_PERCENT>.
# Paths are quoted so a DATA_DIR containing spaces stays one argument.
# NOTE(review): the last option line ends with a line-continuation
# backslash; it is kept as-is in case further options follow it. If nothing
# follows, drop the trailing backslash.
fairseq-preprocess \
  --only-source \
  --nwordssrc 32768 \
  --trainpref "$DATA_DIR/corpus.train.tok.sbmask" \
  --validpref "$DATA_DIR/corpus.valid.tok.sbmask" \
  --destdir "$DATA_DIR/binary-datasets/data-sbmask-$LEFT_PERCENT-$RIGHT_PERCENT" \
  --workers 24 \