#!/bin/bash
#
# Runs train.py with the DNAMotifTokenizer once per (seed, dataset) pair,
# starting from the pretrained checkpoint defined in `model` below.
#
# Usage: <script> <data_path> <lr> <output_path> <project_name>
#   data_path     base path; each dataset is read from <data_path>/<dataset>
#   lr            forwarded to train.py as --learning_rate (also part of --run_name)
#   output_path   forwarded to train.py as --output_dir
#   project_name  forwarded to train.py as --project_name
#
# Exports VOCAB_PATH, VOCAB_NAME and POSITIONAL_EMBEDDINGS_SIZE to child
# processes (presumably consumed by train.py / the tokenizer -- confirm).
#
# Exit status: 2 on bad usage, 1 if any training run failed, 0 otherwise.

# Fail loudly on typos in variable names instead of expanding them to "".
set -u

if (( $# < 4 )); then
    printf 'Usage: %s <data_path> <lr> <output_path> <project_name>\n' "${0##*/}" >&2
    exit 2
fi

data_path=$1
lr=$2
output_path=$3
project_name=$4

# Single source of truth for the checkpoint and its vocabulary file, so the
# exported path and the --vocab_file flag cannot drift apart.
model=/pretrain/hg38/models/model_1/checkpoint-200000
export VOCAB_NAME='vocab_dedup.txt'
export VOCAB_PATH="${model}/${VOCAB_NAME}"
export POSITIONAL_EMBEDDINGS_SIZE=128

# Value passed as --kmer; defined here so the log line below reports the value
# that is actually used rather than an unset variable.
kmer=-1

seeds=(42)
datasets=(CA-CTCF pELS CA CA-H3K4me3 CA-TF TF PLS dELS)

echo "The provided kmer is: ${kmer}, data_path is ${data_path}"

# NOTE(review): every run uses the same --output_dir together with
# --overwrite_output_dir True, so later datasets may overwrite the outputs of
# earlier ones -- confirm this is intended.
failed=()
for seed in "${seeds[@]}"; do
    for data in "${datasets[@]}"; do
        # Build the command line as an array so values containing spaces or
        # glob characters reach train.py as single, unmodified arguments.
        train_args=(
            --model_name_or_path "${model}"
            --vocab_file "${VOCAB_PATH}"
            --data_path "${data_path}/${data}"
            --customized_tokenizer DNAMotifTokenizer
            --kmer "${kmer}"
            --run_name "DNAMotifTokenizer_${lr}_${data}_seed${seed}"
            --model_max_length 200
            --per_device_train_batch_size 128
            --per_device_eval_batch_size 128
            --gradient_accumulation_steps 1
            --learning_rate "${lr}"
            --num_train_epochs 3
            --fp16
            --save_steps 200
            --output_dir "${output_path}"
            --evaluation_strategy steps
            --eval_steps 200
            --warmup_steps 30
            --logging_steps 100000
            --overwrite_output_dir True
            --log_level info
            --seed "${seed}"
            --find_unused_parameters False
            --project_name "${project_name}"
        )

        # Keep going after a failure (as the original loop did), but remember
        # it so the script's exit status reflects the whole sweep.
        if ! python train.py "${train_args[@]}"; then
            printf 'train.py failed for dataset %s (seed %s)\n' "${data}" "${seed}" >&2
            failed+=("${data}/seed${seed}")
        fi
    done
done

if (( ${#failed[@]} > 0 )); then
    printf 'Failed runs (%d): %s\n' "${#failed[@]}" "${failed[*]}" >&2
    exit 1
fi