#!/bin/bash

# usage : bible.sh $languages $data_type
# data_type : mono or para or mono,para
# Download and process the Bible data.
# Transform (tokenize, lowercase and remove accents, load codes and vocab, learn and apply BPE tokenization,
# binarize...) the data contained in the text files into a pth file understandable by the framework.
# The processing time grows with the dataset size, nCodes and shuf_n_samples.

# Stop at the first command that fails.
set -e

# Languages to process (first positional argument).
lgs="$1"

# Directory that contains the csvs folder.
# zip_file_link (csvs) = https://drive.google.com/file/d/1NuSJ-NT_BsU1qopLu6avq6SzUEf6nVkk/view?usp=sharing
# Download the archive and unzip it in $csv_path.
csv_path="/content"

# Directory where the extracted txt files are written.
output_dir="/content/data"

# Directory where the processed files are written.
OUTPATH="/content/processed"

# Preprocess the parallel data (when it is available).
PARA="True"
# Preprocess the monolingual data. If no monolingual data is available and
# this stays True, the parallel data is used to build the monolingual data.
MONO="True"
# Folder containing the parallel data.
PARA_PATH="$output_dir"
# Folder containing the monolingual data.
MONO_PATH="$output_dir"
# Whether all languages share the same vocabulary (leave it to True).
SAME_VOCAB="True"
# When the monolingual and parallel data are independent, setting this to True
# adds the parallel data to the monolingual data. It is set to False here.
add_para_data_to_mono_data="False"

# Number of BPE codes to learn on the training data.
nCodes="20000"
# Number of randomly shuffled training samples used to learn the BPE codes.
shuf_n_samples="10000"
# Number of threads for the tokenizer; a power of two is recommended.
threads_for_tokenizer="16"
# Percentage of the data used as test data (%).
test_size="10"
# Percentage of the data used as validation data (%).
val_size="10"

# Paths of the external tools.
TOOLS_PATH="tools"
TOKENIZE="$TOOLS_PATH/tokenizer_our.sh"
LOWER_REMOVE_ACCENT="$TOOLS_PATH/lowercase_and_remove_accent.py"
FASTBPE="$TOOLS_PATH/fastBPE/fast"
#PROCESSED_FILE=../scripts/build_meta_data_multixlm.sh

# n_samples is optional: when it is -1 or larger than the dataset size, the
# whole dataset is used.
n_samples="-1"
# Without other data to fine-tune the model on a specific sub-task, give for
# each sub-task the percentage of its metadata to use, or -1 to ignore it.
#sub_tasks=en-fr:10,de-en:-1,de-fr:-1
# Use three dots to build the sub-tasks from every pair of the given languages.
sub_tasks="..."
tasks_n_samples="-1"

##############################################

 
#######################################
# Map a language name to the short code used to name the sub-tasks.
# A few names have a fixed code; any other name is reduced to its first
# four characters (names of four characters or fewer are kept unchanged).
# Globals:   result (written) - receives the short code
# Arguments: $1 - language name
#######################################
abrev() {
    local name=${1-}
    case "$name" in
        Francais)
            result="fr"
            ;;
        Anglais)
            result="en"
            ;;
        KALATA_KO_SC_Gbaya | KALATA_KO_DC_Gbaya)
            result="Gbay"
            ;;
        BIBALDA_TA_PELDETTA)
            result="MASS"
            ;;
        MKPAMAN_AMVOE_Ewondo)
            result="Ewon"
            ;;
        *)
            # Substring expansion keeps the name literal (no globbing or
            # word splitting) and needs no external process.
            result=${name:0:4}
            ;;
    esac
}

# Build the list of sub-tasks.
# The placeholder "..." asks for one sub-task per unordered pair of the
# languages in $lgs, each written <code_a>-<code_b>:<tasks_n_samples> and joined
# with commas. Any other value of $sub_tasks is left untouched.
# The operator needs spaces around it: without them the test receives a single
# non-empty word and always succeeds.
if [[ "$sub_tasks" == "..." ]]; then
    sub_tasks=""
    # The languages may be separated by commas and/or spaces.
    IFS=', ' read -r -a langs_array <<< "$lgs"
    # TODO: sort the array in alphabetical order
    array_length=${#langs_array[@]}
    for (( i = 0; i < array_length; ++i )); do
        for (( j = i + 1; j < array_length; ++j )); do
            # abrev hands its answer back through the global $result.
            abrev "${langs_array[i]}"
            a=$result
            abrev "${langs_array[j]}"
            b=$result
            sub_tasks="$sub_tasks,$a-$b:$tasks_n_samples"
        done
    done
    # Remove the leading comma.
    sub_tasks=${sub_tasks#,}
fi

echo "$sub_tasks"

# Make sure the output directory exists.
mkdir -p $OUTPATH
# Make the tools executable to avoid permission errors.
chmod +x $FASTBPE
chmod +x $TOOLS_PATH/mosesdecoder/scripts/tokenizer/*.perl

printf '%s\n' "=======================" "Extract texts files" "======================="

# The second argument is a comma-separated list of data types (mono,para when
# it is not given); bible.py is run once per data type.
for data_type in $(echo ${2-mono,para} | tr ',' ' '); do
    python ../scripts/bible.py --csv_path $csv_path --output_dir $output_dir --data_type $data_type --languages $lgs
done

printf '%s\n' "=======================" "Processed" "======================="

# The processing script is sourced, so it runs in this shell and sees the
# variables set above.
# NOTE(review): an empty $sub_tasks disappears here and shifts the remaining
# arguments - confirm how the sourced script handles that.
chmod +x ../scripts/build_meta_data_multixlm.sh
source ../scripts/build_meta_data_multixlm.sh $sub_tasks $n_samples $add_para_data_to_mono_data
# TODO: make this dynamic, like so:
#chmod +x $PROCESSED_FILE
#$PROCESSED_FILE

printf '%s\n' "=======================" "End" "======================="
