#!/bin/bash

# Vocabulary and embedding settings, exported so that every child process
# started by this script inherits them.
# NOTE(review): presumably consumed by the tokenizer/model code of the training
# script launched below -- confirm against that code.
VOCAB_NAME='vocab_dedup.txt'
VOCAB_PATH="/hg38/vocab_dedup.txt"
POSITIONAL_EMBEDDINGS_SIZE=512
export VOCAB_PATH VOCAB_NAME POSITIONAL_EMBEDDINGS_SIZE

# Fetch the DNABERT sources, install the package in editable mode and install
# the requirements listed in its examples/ directory.
#
# Every step aborts the script on failure: continuing after a failed `cd`
# would run the following pip commands in the wrong directory.
# The clone is skipped when a DNABERT directory already exists, so the script
# can be re-run without git failing on the existing checkout.
if [[ ! -d DNABERT ]]; then
  git clone https://github.com/jerryji1993/DNABERT || exit 1
fi
cd DNABERT || exit 1
python3 -m pip install --editable . || exit 1
cd examples || exit 1
python3 -m pip install -r requirements.txt || exit 1

# WORK_PATH is not set anywhere in this script, so it has to come from the
# caller's environment. Fail loudly when it is missing: an unquoted, empty
# expansion would turn this into a bare `cd`, which silently switches to $HOME.
cd "${WORK_PATH:?WORK_PATH must be set to the directory to run the training from}" || exit 1

# Fixed directories used by the training invocation further down.
work_dir=/pretrain/hg38
data_dir=/hg38
config_dir=/pretrain/hg38/configs

# Run index; it selects the model_<runnum> output directory.
runnum=1

output="${work_dir}/models/model_${runnum}"
# -p creates the intermediate models/ directory as well and does not fail when
# the directories already exist (e.g. when the script is re-run).
mkdir -p -- "${output}" || exit 1

# Launch run_pretrain_nocache_wandb.py from the current directory.
#
# The arguments are collected in an array so that each option stays on its own
# line without backslash continuations and path expansions are quoted.
# python3 is used to match the interpreter the dependencies were installed
# with (`python3 -m pip ...`) earlier in this script.
train_args=(
  --output_dir "${output}"
  --model_type=motifBert
  --tokenizer_name=motif
  --config_name="${config_dir}/config.json"
  --project_name=DNAMotifTokenizer

  # Training data.
  # NOTE(review): this passes the literal string "None"; presumably a
  # placeholder because the data is located via path + prefix -- confirm
  # against the training script's argument parser.
  --do_train
  --train_data_file=None
  --train_data_path="${data_dir}"
  --train_data_prefix=all_tokenized_train_

  # Evaluation data.
  --do_eval
  --eval_data_file="${data_dir}/all_tokenized_val_00.txt"

  # Objective and batching.
  --mlm
  --gradient_accumulation_steps 1
  --per_gpu_train_batch_size 96
  --per_gpu_eval_batch_size 96

  # Checkpointing, logging and schedule length.
  --save_steps 1000
  --save_total_limit 10
  --max_steps 200000
  --evaluate_during_training
  --logging_steps 1000

  # Input handling and optimizer settings.
  --line_by_line
  --learning_rate 4e-5
  --block_size 512
  --adam_epsilon 1e-6
  --weight_decay 0.01
  --beta1 0.9
  --beta2 0.98
  --mlm_probability 0.15
  --warmup_steps 10000
  --n_process 8
  --overwrite_output_dir
)

python3 run_pretrain_nocache_wandb.py "${train_args[@]}"
