#!/bin/bash

# Settings exported into the environment of every child process started by
# this script.
# NOTE(review): presumably read by save2cache.py / the motif tokenizer --
# confirm against the consumer.
VOCAB_PATH='/hg38/vocab_dedup.txt'
# File-name component of VOCAB_PATH (evaluates to "vocab_dedup.txt").
VOCAB_NAME="${VOCAB_PATH##*/}"
POSITIONAL_EMBEDDINGS_SIZE=512
export VOCAB_PATH VOCAB_NAME POSITIONAL_EMBEDDINGS_SIZE

# Install the DNABERT fork in editable mode together with the requirements of
# its examples/ directory, then switch to $WORK_PATH, from where the rest of
# the script runs save2cache.py.
#
# Every step is checked and aborts the script on failure: an unchecked `cd`
# would otherwise let pip install whatever happens to be in the current
# directory.

# Reuse an existing checkout so that a re-run does not fail with git's
# "destination path already exists" error.
if [[ ! -d DNABERT ]]; then
    git clone https://github.com/jerryji1993/DNABERT || {
        echo "error: git clone of DNABERT failed" >&2
        exit 1
    }
fi

cd DNABERT || {
    echo "error: cannot enter the DNABERT checkout" >&2
    exit 1
}
python3 -m pip install --editable . || {
    echo "error: editable install of DNABERT failed" >&2
    exit 1
}

cd examples || {
    echo "error: cannot enter DNABERT/examples" >&2
    exit 1
}
python3 -m pip install -r requirements.txt || {
    echo "error: installing DNABERT example requirements failed" >&2
    exit 1
}

# An unset or empty WORK_PATH would turn the cd below into a bare `cd`, which
# silently moves to $HOME; refuse to continue instead.
: "${WORK_PATH:?WORK_PATH must be set to the directory save2cache.py is run from}"
cd "$WORK_PATH" || {
    echo "error: cannot enter WORK_PATH: ${WORK_PATH}" >&2
    exit 1
}

# Directory holding the tokenized data and directory holding the model config.
data_dir=/hg38
config_dir=/pretrain/hg38/configs

# For each split (train, val):
#   1. skip the first line of all_tokenized_<split>.tsv (presumably a header
#      row -- confirm) and keep only the first tab-separated column, written
#      to all_tokenized_<split>.txt;
#   2. cut that .txt into shards of 200000 lines each, named
#      all_tokenized_<split>_NN.txt (numeric suffixes starting at 00).
# A missing input or a failing step aborts the script instead of leaving
# empty or partial shard files for the later steps to trip over.
for split_name in train val; do
    tsv_file="${data_dir}/all_tokenized_${split_name}.tsv"
    txt_file="${data_dir}/all_tokenized_${split_name}.txt"

    if [[ ! -r "${tsv_file}" ]]; then
        echo "error: input file not found or unreadable: ${tsv_file}" >&2
        exit 1
    fi

    tail -n +2 "${tsv_file}" | cut -f1 > "${txt_file}" || {
        echo "error: failed to write ${txt_file}" >&2
        exit 1
    }

    # --additional-suffix is a GNU split option.
    split -l 200000 -d --additional-suffix=.txt \
        "${txt_file}" "${data_dir}/all_tokenized_${split_name}_" || {
        echo "error: failed to split ${txt_file}" >&2
        exit 1
    }
done


# Run save2cache.py once per validation shard, for shard suffixes 00 and 01.
# The upper bound is hard-coded: it has to match the number of
# all_tokenized_val_NN.txt shards that `split -l 200000` produced.
for ((val_idx = 0; val_idx <= 1; val_idx++)); do
    # Zero-pad to two digits to match the shard file names.
    printf -v val_suffix '%02d' "${val_idx}"
    val_shard="all_tokenized_val_${val_suffix}.txt"
    echo "Processing file ${val_shard}"

    val_args=(
        --model_type=motifBert
        --tokenizer_name=motif
        --config_name="${config_dir}/config.json"
        --eval_data_file="${data_dir}/${val_shard}"
        # Passes the literal string "None".
        # NOTE(review): presumably save2cache.py treats this as "no training
        # data" -- confirm against its argument handling.
        --train_data_file=None
        --line_by_line
        --block_size 512
        --n_process 64
    )
    python save2cache.py "${val_args[@]}"
done


# Run save2cache.py once per training shard, for shard suffixes 00 to 08.
# The upper bound is hard-coded: it has to match the number of
# all_tokenized_train_NN.txt shards that `split -l 200000` produced.
for ((train_idx = 0; train_idx <= 8; train_idx++)); do
    # Zero-pad to two digits to match the shard file names.
    printf -v train_suffix '%02d' "${train_idx}"
    train_shard="all_tokenized_train_${train_suffix}.txt"
    echo "Processing file ${train_shard}"

    train_args=(
        --model_type=motifBert
        --tokenizer_name=motif
        --config_name="${config_dir}/config.json"
        --train_data_file="${data_dir}/${train_shard}"
        --line_by_line
        --block_size 512
        --n_process 64
    )
    python save2cache.py "${train_args[@]}"
done
