#!/usr/bin/env bash

# -e: abort on the first failing command; -u: unset variables are errors;
# -x: trace every command as it is executed.
set -eux

# Usage: <script> SRCLANG TGTLANG
# Both language identifiers are mandatory. They select the
# "<SRCLANG>-<TGTLANG>" sub-directory used for input and output below, so an
# empty or missing value is rejected up front with a usage message.
SRCLANG=${1:?usage: $0 SRCLANG TGTLANG}
TGTLANG=${2:?usage: $0 SRCLANG TGTLANG}

# NOTE: CURRENT_DIR is not referenced anywhere else in this script; it is kept
# only so that the set of defined variables is unchanged.
CURRENT_DIR=$(pwd)
# HOME_DIR is the parent of the *current working directory* (not of the
# script's location), so the script must be launched from the directory
# whose parent holds data/ and sentencepiece/.
HOME_DIR=$(realpath ../)
DATA_DIR="${HOME_DIR}/data/codenet-jsonl-processed/${SRCLANG}-${TGTLANG}"
OUTPUT_DIR="${HOME_DIR}/data/codenet-plbart/${SRCLANG}-${TGTLANG}"
SPM_DIR="${HOME_DIR}/sentencepiece"

# Quoted so that a path containing whitespace creates one directory, not many.
mkdir -p "${OUTPUT_DIR}"

#######################################
# Run encode.py with the SentencePiece model once per dataset split.
# Globals (read):
#   SPM_DIR, DATA_DIR, OUTPUT_DIR, SRCLANG, TGTLANG
# Arguments:
#   Optional list of split names. When none are given the splits
#   "train" and "val" are processed (the historical behaviour).
# Notes:
#   - Each split is read from ${DATA_DIR}/<split>.jsonl.
#   - The split named "test" gets --max_len 9999; every other split gets 512.
#   - encode.py is resolved relative to the current working directory.
# Returns:
#   Non-zero as soon as an encode.py run fails (the script runs under set -e).
#######################################
spm_preprocess() {
    local split max_len

    # Default split list when the caller passes nothing.
    if (( $# == 0 )); then
        set -- train val
    fi

    for split in "$@"; do
        if [[ "${split}" == 'test' ]]; then
            max_len=9999 # we do not truncate test sequences
        else
            max_len=512
        fi
        # All expansions are quoted so paths with whitespace stay one argument.
        python encode.py \
            --model-file "${SPM_DIR}/sentencepiece.bpe.model" \
            --data_file "${DATA_DIR}/${split}.jsonl" \
            --output_dir "${OUTPUT_DIR}" \
            --src_lang "${SRCLANG}" \
            --tgt_lang "${TGTLANG}" \
            --pref "${split}" \
            --max_len "${max_len}" \
            --workers 60
    done
}

#######################################
# Binarize the encoded splits with fairseq-preprocess.
# Globals (read):
#   SRCLANG, TGTLANG, OUTPUT_DIR, SPM_DIR
# Arguments:
#   None.
# Notes:
#   - Reads the prefixes ${OUTPUT_DIR}/train.spm and ${OUTPUT_DIR}/val.spm
#     and writes to ${OUTPUT_DIR}/data-bin.
#   - The same dictionary (${SPM_DIR}/dict.txt) is used for source and target.
#   - NOTE(review): --testpref points at the *validation* prefix, so the
#     "test" set produced here is a copy of validation data. This matches the
#     fact that only train/val are encoded by default - confirm it is intended.
# Returns:
#   The exit status of fairseq-preprocess.
#######################################
binarize() {
    # All expansions are quoted so paths with whitespace stay one argument.
    fairseq-preprocess \
        --source-lang "${SRCLANG}" \
        --target-lang "${TGTLANG}" \
        --trainpref "${OUTPUT_DIR}/train.spm" \
        --validpref "${OUTPUT_DIR}/val.spm" \
        --testpref "${OUTPUT_DIR}/val.spm" \
        --destdir "${OUTPUT_DIR}/data-bin" \
        --thresholdtgt 0 \
        --thresholdsrc 0 \
        --workers 60 \
        --srcdict "${SPM_DIR}/dict.txt" \
        --tgtdict "${SPM_DIR}/dict.txt"
}

# Pipeline entry point. Order matters: spm_preprocess hands OUTPUT_DIR to
# encode.py as its output directory, and binarize then reads the train/val
# prefixes from that same directory. The calls are kept as plain statements
# so that `set -e` aborts the script if the first step fails.
spm_preprocess
binarize
