#!/bin/bash

#SBATCH --time=03:00:00
#SBATCH --mem-per-cpu=8G
#SBATCH --cpus-per-task=1

# Build one labelled positive/negative FASTA dataset plus background Markov
# files from the BED files under inputs/40_ENCODE_TF_FILES/.
#
# Usage: <this script> RESULTS_LOCATION ITER NB_SAMPLES
#   RESULTS_LOCATION  directory under which DATASET_<ITER> is created
#   ITER              dataset identifier, used as the DATASET_ suffix
#   NB_SAMPLES        total sample count; half of it (integer division) is
#                     passed to the subsampling script as the positive count
#
# The scripts/ and inputs/ paths are relative, so the job must be started
# from the directory that contains them.

# Stop at the first failing step: otherwise a failed conversion would still
# be followed by the removal of its intermediate data and by later steps
# running on missing or partial input.
set -euo pipefail

if (( $# < 3 )); then
  printf 'Usage: %s RESULTS_LOCATION ITER NB_SAMPLES\n' "${0##*/}" >&2
  exit 2
fi

RESULTS_LOCATION=$1
iter=$2
nb_samples=$3

if [[ ! "$nb_samples" =~ ^[0-9]+$ ]]; then
  printf 'NB_SAMPLES must be a non-negative integer, got: %s\n' "$nb_samples" >&2
  exit 2
fi
# 10# forces base 10 so a value with leading zeros is not parsed as octal.
nb_positives=$(( 10#$nb_samples / 2 ))

processed_files=$RESULTS_LOCATION/DATASET_$iter
fasta_dir=$processed_files/40_ENCODE_TF_FILES_FASTA
identified_dir=$processed_files/40_ENCODE_TF_FILES_FASTA_IDENTIFIED
pos_dir=$processed_files/40_ENCODE_TF_FILES_FASTA_POS
neg_dir=$processed_files/40_ENCODE_TF_FILES_FASTA_NEG
combined_dir=$processed_files/40_ENCODE_TF_FILES_FASTA_COMBINED
markov_dir=$processed_files/markov_files

# -p everywhere so that re-running the job for an existing dataset does not
# fail on directories left by a previous run.
mkdir -p -- "$fasta_dir" "$identified_dir" "$pos_dir" "$neg_dir" \
  "$combined_dir" "$markov_dir"

# BED -> FASTA, then rewrite the sequence identifiers; the raw FASTA output
# is only an intermediate and is removed once the identified copy exists.
python scripts/bed2fasta.py inputs/40_ENCODE_TF_FILES/ "$fasta_dir/"
python scripts/unify_sequences_identifiers.py "$fasta_dir/" "$identified_dir/"
rm -r -- "${fasta_dir:?}"

# Positive set: subsample with nb_positives as the requested count.
python scripts/subsample_encode_file.py "$identified_dir/" "$pos_dir/" "$nb_positives"

# Negative set: one shuffled file per positive file (-kmer 3), written under
# the same file name. The output path is built from the basename rather than
# by substituting "POS" in the full path, which would rewrite the wrong
# component whenever RESULTS_LOCATION itself contains "POS".
for f in "$pos_dir"/*; do
  [[ -e "$f" ]] || continue   # unmatched glob: directory is empty
  fasta-shuffle-letters -kmer 3 "$f" "$neg_dir/${f##*/}"
done

python scripts/combine_and_label.py "$pos_dir/" "$neg_dir/" "$combined_dir/"

# One background model per negative file (-m 2 -dna), captured from stdout
# into markov_files/<name>.markov.
for f in "$neg_dir"/*; do
  [[ -e "$f" ]] || continue   # unmatched glob: directory is empty
  fasta-get-markov -m 2 -dna "$f" > "$markov_dir/${f##*/}.markov"
done

