#!/bin/bash

### Root of the whole setup; every other location below is derived from it.
HOME_DIR="" ### dear user: change this line and only this line

### code checkout, plus the trec evaluation tools kept inside it
CODE_DIR="${HOME_DIR}/code/UnivSearchDev/"
EVAL_DIR="${CODE_DIR}/metrics/trec/"

### model weights: pretrained language models and training checkpoints share one folder
PLM_DIR="${HOME_DIR}/model_checkpoints"
CHECKPOINT_DIR="${HOME_DIR}/model_checkpoints"

### raw and processed msmarco data
COLLECTION_DIR="${HOME_DIR}/data/msmarco"
PROCESSED_DIR="${COLLECTION_DIR}/processed_data"

### run-time outputs
LOG_DIR="${HOME_DIR}/logs"
EMBEDDING_DIR="${HOME_DIR}/embeddings_cache"
RESULT_DIR="${HOME_DIR}/result"

### Create the directory tree used by the rest of the script.
### -p is used for every entry: CHECKPOINT_DIR is configured to the same path
### as PLM_DIR, and on a re-run all of these already exist, so a plain mkdir
### would print "File exists" errors. Paths are quoted so that a HOME_DIR
### containing spaces is not split into several arguments.
mkdir -p \
    "$PLM_DIR" \
    "$COLLECTION_DIR" \
    "$PROCESSED_DIR" \
    "$LOG_DIR" \
    "$CHECKPOINT_DIR" \
    "$EMBEDDING_DIR" \
    "$RESULT_DIR" \
    "$EVAL_DIR" \
    "$RESULT_DIR/msmarco_with_title/t5/" \
    "$PROCESSED_DIR/t5/ANCE_hard_negatives/" \
    "$EMBEDDING_DIR/msmarco_with_title/t5/"

### the python helpers invoked later are addressed relative to the code directory
cd "$CODE_DIR" || { echo "cannot cd to $CODE_DIR" >&2; exit 1; }
### download pretrained language model, in this case T5
### The step is skipped when the scaled checkpoint directory already exists.
### lib/scripts/scale_t5_weights.py is a relative path, so this has to run
### from the code directory. All path arguments are quoted so that a PLM_DIR
### containing whitespace reaches python as a single argument.
if [ -d "$PLM_DIR/t5-base-scaled" ]; then
    echo "$PLM_DIR/t5-base-scaled already exists.";
else
    echo "downloading T5 checkpoint into $PLM_DIR/t5-base-scaled";
    python lib/scripts/scale_t5_weights.py \
        --input_model_path "$PLM_DIR/t5-base" \
        --output_model_path "$PLM_DIR/t5-base-scaled" \
        --model_name_or_path t5-base \
        --num_layers 12

fi

### download and process data
cd "$COLLECTION_DIR" || { echo "cannot cd to $COLLECTION_DIR" >&2; exit 1; }

### if you want titles of msmarco documents from these authors
### The download is skipped when rocketQA-marco/ is already present: repeating
### it would make `mv marco rocketQA-marco` move the fresh copy *inside* the
### existing directory instead of replacing it.
if [ ! -d rocketQA-marco ]; then
    ### NOTE(review): --no-check-certificate disables TLS verification of the
    ### download host; kept from the original command -- confirm it is still needed.
    ### -O pins the file name so a stale archive is overwritten, not saved as *.1
    if wget --no-check-certificate -O marco.tar.gz https://rocketqa.bj.bcebos.com/corpus/marco.tar.gz \
        && tar -zxf marco.tar.gz; then
        mv marco rocketQA-marco
    else
        echo "failed to download or unpack marco.tar.gz" >&2
    fi
    ### the archive is removed whether or not the steps above succeeded
    rm -f marco.tar.gz
fi

### copy title and passages from RocketQA to different folder
mkdir -p raw_data
cp rocketQA-marco/para.txt \
    rocketQA-marco/para.title.txt \
    rocketQA-marco/train.query.txt \
    raw_data/

### work in this folder for now
cd raw_data || { echo "cannot cd to $COLLECTION_DIR/raw_data" >&2; exit 1; }

### Fetch the files from the msmarcoranking blob store that are not in raw_data yet.
### Everything is downloaded into raw_data, which is also where the checks look.
cd "$COLLECTION_DIR/raw_data" || { echo "cannot cd to $COLLECTION_DIR/raw_data" >&2; exit 1; }

### download_to URL FILE
### Saves URL as FILE and returns non-zero when the transfer fails.
### wget -O creates/truncates its output file before any data arrives, so the
### data goes to FILE.part first and is renamed only on success; a failed
### transfer therefore never leaves a FILE behind that the existence checks
### below would mistake for a finished download on the next run.
download_to() {
    local url=$1 dest=$2
    if wget -O "$dest.part" "$url"; then
        mv -f -- "$dest.part" "$dest"
    else
        rm -f -- "$dest.part"
        echo "download failed: $url" >&2
        return 1
    fi
}

msmarco_url="https://msmarco.blob.core.windows.net/msmarcoranking"

if [ ! -f "$COLLECTION_DIR/raw_data/collection.tsv" ]; then
    if download_to "$msmarco_url/collectionandqueries.tar.gz" collectionandqueries.tar.gz; then
        tar -zxvf collectionandqueries.tar.gz
    fi
    rm -f collectionandqueries.tar.gz

fi

if [ ! -f "$COLLECTION_DIR/raw_data/triples.train.small.tsv" ]; then
    if download_to "$msmarco_url/triples.train.small.tar.gz" triples.train.small.tar.gz; then
        tar -zxvf triples.train.small.tar.gz
    fi
    ### the glob also clears numbered duplicates left behind by earlier runs
    rm -f triples.train.small.tar.gz*

fi


if [ ! -f "$COLLECTION_DIR/raw_data/qrels.train.tsv" ]; then
    download_to "$msmarco_url/qrels.train.tsv" qrels.train.tsv

fi


if [ ! -f "$COLLECTION_DIR/raw_data/qidpidtriples.train.full.2.tsv" ]; then
    ### gunzip replaces the .gz with the uncompressed .tsv the check above looks for
    if download_to "$msmarco_url/qidpidtriples.train.full.2.tsv.gz" qidpidtriples.train.full.2.tsv.gz; then
        gunzip qidpidtriples.train.full.2.tsv.gz
    fi

fi


### join_titles PARA_FILE TITLE_FILE
### Joins the two tab-separated files on their first field and prints
### "<key><TAB><field 2 of TITLE_FILE><TAB><field 2 of PARA_FILE>" to stdout,
### ordered numerically by key. Every PARA_FILE line is kept (-a 1); keys that
### are absent from TITLE_FILE get an empty middle field (-e '').
### Returns non-zero when an input is unreadable or a pipeline stage fails.
join_titles() {
    if [ ! -r "$1" ] || [ ! -r "$2" ]; then
        echo "join_titles: cannot read $1 or $2" >&2
        return 1
    fi
    (
        ### subshell so that pipefail does not leak into the rest of the script
        set -o pipefail
        join -t $'\t' -e '' -a 1 -o 1.1 2.2 1.2 \
            <(sort -k1,1 "$1") <(sort -k1,1 "$2") \
            | sort -k1,1 -n
    )
}

### group_negatives TRIPLES_FILE
### Collapses consecutive records that share the same first field into one
### line "<field 1><TAB><field 3>,<field 3>,..." on stdout.
### Records are split on CRLF (RS='\r\n') -- presumably the line ending of the
### downloaded triples file; confirm if the input format ever changes.
group_negatives() {
    if [ ! -r "$1" ]; then
        echo "group_negatives: cannot read $1" >&2
        return 1
    fi
    awk -v RS='\r\n' '$1==last {printf ",%s",$3; next} NR>1 {print "";} {last=$1; printf "%s\t%s",$1,$3;} END{print "";}' "$1"
}

### Inputs and outputs are addressed through raw_dir, so the steps do not depend
### on the current directory and write exactly where the existence checks look.
### Each result is written to a .tmp file and renamed only on success: a plain
### "> output" creates the file even when the command fails, and the check
### would then treat the empty file as finished on the next run.
raw_dir="$COLLECTION_DIR/raw_data"

### if you want to join titles with the msmarco passages
if [ -f "$raw_dir/collection_with_title.tsv" ]; then
    echo "$raw_dir/collection_with_title.tsv exists.";
else
    echo "Joining para.txt and para.title.txt";
    if join_titles "$raw_dir/para.txt" "$raw_dir/para.title.txt" > "$raw_dir/collection_with_title.tsv.tmp"; then
        mv -f -- "$raw_dir/collection_with_title.tsv.tmp" "$raw_dir/collection_with_title.tsv"
    else
        rm -f -- "$raw_dir/collection_with_title.tsv.tmp"
        echo "failed to build $raw_dir/collection_with_title.tsv" >&2
    fi

fi

if [ -f "$raw_dir/train.negatives.tsv" ]; then
    echo "$raw_dir/train.negatives.tsv exists.";
else
    echo "processing train.negatives.tsv -- negative documents for every query";
    if group_negatives "$raw_dir/qidpidtriples.train.full.2.tsv" > "$raw_dir/train.negatives.tsv.tmp"; then
        mv -f -- "$raw_dir/train.negatives.tsv.tmp" "$raw_dir/train.negatives.tsv"
    else
        rm -f -- "$raw_dir/train.negatives.tsv.tmp"
        echo "failed to build $raw_dir/train.negatives.tsv" >&2
    fi

fi

### build_train.py is addressed relative to the code directory and imports from it
cd "$CODE_DIR" || { echo "cannot cd to $CODE_DIR" >&2; exit 1; }
export PYTHONPATH=.

### One variable for the output folder, used by both the "already done" check
### and --save_to. Previously the check looked in t5-with_title/ while the data
### was written to t5-with_title-batch_size-32/, so it never matched and the
### whole step ran again on every invocation.
train_dir="$PROCESSED_DIR/t5-with_title-batch_size-32"

if [ -f "$train_dir/train.new.jsonl" ]; then
    echo "$train_dir/train.new.jsonl exists";
else
    echo "RUNNING build_train.py...";
    ### if you dont want the titles, change the template argument below to remove them
    if python lib/scripts/msmarco/build_train.py \
        --tokenizer_name "$PLM_DIR/t5-base-scaled" \
        --negative_file "$COLLECTION_DIR/raw_data/train.negatives.tsv" \
        --qrels "$COLLECTION_DIR/raw_data/qrels.train.tsv" \
        --queries "$COLLECTION_DIR/raw_data/train.query.txt" \
        --collection "$COLLECTION_DIR/raw_data/collection_with_title.tsv" \
        --save_to "$train_dir/" \
        --template "Title: <title> Text: <text>"; then

        echo "Concatenating output shards...";

        ### Shards are only merged after a successful run: merging after a
        ### failure would create an empty train.new.jsonl, which the check
        ### above would accept as a finished build next time.
        cat "$train_dir"/encoded_split-*.jsonl > "$train_dir/train.jsonl"
        cat "$train_dir"/split-*.jsonl > "$train_dir/train_text.jsonl"

        ### last 500 lines become the validation files, first 400282 the training files
        ### NOTE(review): 400282 is hard-coded; confirm it matches the size of the merged file.
        tail -n 500 "$train_dir/train.jsonl" > "$train_dir/val.jsonl"
        tail -n 500 "$train_dir/train_text.jsonl" > "$train_dir/val_text.jsonl"
        head -n 400282 "$train_dir/train.jsonl" > "$train_dir/train.new.jsonl"
        head -n 400282 "$train_dir/train_text.jsonl" > "$train_dir/train_text.new.jsonl"

        echo "Done setting up data and environments";
    else
        echo "build_train.py failed; no training files were written to $train_dir" >&2
    fi

fi

### download and setup official trec eval scripts
echo "Setting up trec eval scripts";
cd "$EVAL_DIR" || { echo "cannot cd to $EVAL_DIR" >&2; exit 1; }

trec_pkg="trec_eval-9.0.7"

### The tarball doubles as the "already downloaded" marker, so it is fetched to
### a .part file and renamed only on success; a failed transfer must not leave
### a broken archive behind that would be reused on the next run.
if [ ! -f "$EVAL_DIR/$trec_pkg.tar.gz" ]; then
    if wget -O "$trec_pkg.tar.gz.part" "https://trec.nist.gov/trec_eval/$trec_pkg.tar.gz"; then
        mv -f -- "$trec_pkg.tar.gz.part" "$trec_pkg.tar.gz"
    else
        rm -f -- "$trec_pkg.tar.gz.part"
        echo "failed to download $trec_pkg.tar.gz" >&2
    fi

fi

### The original followed this with "rm tar -xvzf <tarball>", a copy-paste slip
### that rm rejected (invalid option) without deleting anything. The line is
### dropped rather than corrected: removing the tarball would defeat the check
### above and force a fresh download on every run.
tar -xvzf "$trec_pkg.tar.gz"
cd "$trec_pkg" || { echo "cannot cd to $EVAL_DIR/$trec_pkg" >&2; exit 1; }
### run the quick self-test only when the build itself succeeded
make && make quicktest