# BPE for raw data.
#
# Usage: <this script> DATA_DIR
#   DATA_DIR  directory that holds the `bpe-code` file; the intermediate
#             corpus.<split>.tok.tmp and the final corpus.<split>.tok.bpe
#             files are written there.
#
# For each split (train, valid) the tokenized corpus is piped through
# concat_short_sentences.py and length_filter_by_char.py, and the result is
# BPE-encoded with fastbpe.
#
# The helper scripts and the fastbpe binary are referenced by relative path,
# so the script has to be started from the directory where `./fastbpe`,
# `../pretrain` and `../common` resolve.

# Stop on the first failing command and on use of an unset variable, so that
# fastbpe is never run on a missing or partially written tmp file.
set -eu
# pipefail is not available in every /bin/sh; enable it only where supported
# so that a failure in the first pipeline stage is not masked by the second.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# No spaces around '=': `DATA_DIR = $1` would run a command named DATA_DIR
# instead of assigning the variable.
DATA_DIR=${1:?usage: $0 /path/to/data_dir}   # /path/to/data_dir

for split in train valid; do
  # NOTE(review): the input corpus is read from the current directory, as in
  # the original script; confirm whether it should live in DATA_DIR as well.
  src="corpus.$split.tok"
  # The filtered text must be written to the same path fastbpe reads below.
  tmp="$DATA_DIR/corpus.$split.tok.tmp"

  # 20 / 1000000 are presumably the min / max length in characters accepted
  # by the filter -- verify against length_filter_by_char.py.
  python ../pretrain/concat_short_sentences.py < "$src" \
    | python ../common/length_filter_by_char.py 20 1000000 > "$tmp"

  # Arguments in order: the .bpe path to produce, the filtered text, and the
  # bpe-code file.
  ./fastbpe applybpe \
    "$DATA_DIR/corpus.$split.tok.bpe" \
    "$tmp" \
    "$DATA_DIR/bpe-code"
done