#!/usr/bin/env bash
#
# Resolves the autoencoder checkpoint and the index paths used by the
# indexing and retrieval steps further down in this script.
#
# Usage: <script> DATE DATASET_NAME
#   DATE          suffix of the autoencoder directory ("autencoder-<DATE>")
#   DATASET_NAME  dataset identifier; also used as the index sub-directory
#
# Variables defined here and consumed below:
#   AUTOENCODER_PATH, INDEX_PATH, INDEX_PATH_CODE_TOKENS, INDEX_PATH_UNIQUE

# Both positional arguments are required; without them every derived path
# below would be malformed.
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  printf 'Usage: %s DATE DATASET_NAME\n' "${0##*/}" >&2
  exit 2
fi

DATE="$1"
DATASET_NAME="$2"
# NOTE(review): the "autencoder" spelling is part of the path looked up on
# disk, so it is kept as-is — verify the real directory name before renaming.
AUTOENCODER_NAME="autencoder-$DATE"
INIT_PATH="/home/"
AUTOENCODER_PATH_NO_CHECKPOINT="$INIT_PATH/$AUTOENCODER_NAME"

# Only claim that the directory exists after actually checking it.
if [ ! -d "$AUTOENCODER_PATH_NO_CHECKPOINT" ]; then
  printf "error: directory '%s' does not exist locally\n" \
    "$AUTOENCODER_PATH_NO_CHECKPOINT" >&2
  exit 1
fi
echo "The directory '$AUTOENCODER_PATH_NO_CHECKPOINT' exists locally."

# Pick the highest-numbered checkpoint-<N> directory. The shell glob is used
# instead of parsing `ls` output, so unusual characters in the path are safe
# and no GNU-specific grep options are required.
CHECKPOINT=""
for checkpoint_dir in "$AUTOENCODER_PATH_NO_CHECKPOINT"/checkpoint-*/; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -d "$checkpoint_dir" ] || continue
  checkpoint_step="${checkpoint_dir%/}"
  checkpoint_step="${checkpoint_step##*/checkpoint-}"
  # Ignore directories whose suffix is not purely numeric.
  case "$checkpoint_step" in
    '' | *[!0-9]*) continue ;;
  esac
  # `[ -gt ]` compares as decimal, so leading zeros are not treated as octal.
  if [ -z "$CHECKPOINT" ] || [ "$checkpoint_step" -gt "$CHECKPOINT" ]; then
    CHECKPOINT="$checkpoint_step"
  fi
done
unset checkpoint_dir checkpoint_step

# Without a checkpoint every path below would end in "checkpoint-"; stop here.
if [ -z "$CHECKPOINT" ]; then
  printf "error: no checkpoint-<N> directory found in '%s'\n" \
    "$AUTOENCODER_PATH_NO_CHECKPOINT" >&2
  exit 1
fi

echo "DATASET: $DATASET_NAME"
SAVE_DIR_INDEXES="$INIT_PATH/indexes/$DATASET_NAME/"
if ! mkdir -p -- "$SAVE_DIR_INDEXES"; then
  printf "error: cannot create index directory '%s'\n" "$SAVE_DIR_INDEXES" >&2
  exit 1
fi
AUTOENCODER_PATH="$INIT_PATH/$AUTOENCODER_NAME/checkpoint-$CHECKPOINT"
INDEX_PATH="$SAVE_DIR_INDEXES/doc_id_to_token_$DATE-checkpoint-$CHECKPOINT.index"
INDEX_PATH_CODE_TOKENS="$SAVE_DIR_INDEXES/doc_id_to_token_$DATE-checkpoint-$CHECKPOINT-code_and_tokens.index"
INDEX_PATH_UNIQUE="$SAVE_DIR_INDEXES/doc_id_to_token_$DATE-checkpoint-$CHECKPOINT-unique.index"

# Script 1: create_bm25_index.py
# Encodes the documents as integers.
# Every later step reads its input from sub-directories of INDEX_PATH, so the
# pipeline stops here if this step fails. Expansions are quoted so that paths
# or dataset names containing spaces or glob characters stay a single argument.
if ! python create_bm25_index.py \
  --autoencoder_path "${AUTOENCODER_PATH}" \
  --dataset "${DATASET_NAME}" \
  --index_path "${INDEX_PATH}" \
  --document_formatting "{document}"; then
  echo "error: create_bm25_index.py failed; aborting" >&2
  exit 1
fi

# # Script 2: build_collection.sh
# # requirement: java 21+
# # on GCP, I'd do:
# # curl -s "https://get.sdkman.io" | bash
# # source "$HOME/.sdkman/bin/sdkman-init.sh"
# # sdk install java 22-open
# # Input needs to be a directory with JSONL files, doesn't matter how they are named
# # each JSONL file should contain a list of records, one per line
# # each record will have two fields: "id" and "contents"
# # {"id": "13", "contents": "0 24 78 12"}
# # or, if text
# # {"id": "13", "contents": "this is a text"}
# # IMPORTANT! Text needs to be pre-tokenized and pre-analyzed;
# # otherwise BM25 will work poorly.
# # If --pretokenized is passed and --stemmer is set to "porter", BM25 should work fine on text too.
# python -m pyserini.index.lucene \
#   --collection JsonCollection \
#   --input ${INDEX_PATH}/data \
#   --index ${INDEX_PATH} \
#   --generator DefaultLuceneDocumentGenerator \
#   --threads 20 \
#   --stemmer none \
#   --storePositions --storeDocvectors --storeRaw

# Build the Lucene index over the "data_documents_token_and_code" collection
# (Porter stemming enabled) into INDEX_PATH_CODE_TOKENS.
# Path expansions are quoted so they are always passed as a single argument.
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input "${INDEX_PATH}/data_documents_token_and_code" \
  --index "${INDEX_PATH_CODE_TOKENS}" \
  --generator DefaultLuceneDocumentGenerator \
  --threads 20 \
  --stemmer porter \
  --storePositions --storeDocvectors --storeRaw

# Build the Lucene index over the "data_documents_unique_code" collection into
# INDEX_PATH_UNIQUE. The input is declared pre-tokenized and no stemmer is used.
# Path expansions are quoted so they are always passed as a single argument.
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input "${INDEX_PATH}/data_documents_unique_code" \
  --index "${INDEX_PATH_UNIQUE}" \
  --generator DefaultLuceneDocumentGenerator \
  --threads 20 \
  --stemmer none \
  --pretokenized \
  --storePositions --storeDocvectors --storeRaw

# Script 3: retrieve_from_collection_with_bm25.sh
# This script is used to search the encoded collection using BM25
# TODO: add MLFlow logging
# python retrieve_from_collection_with_bm25.py \
#   --autoencoder_path ${AUTOENCODER_PATH} \
#   --dataset msmarco-passage/dev/small \
#   --index_path ${INDEX_PATH_UNIQUE} \
#   --collection_path ${INDEX_PATH_UNIQUE}/data/documents_codebook.jsonl \
#   --query_formatting "query: {query}" \
#   --methods "code-unique-set" \
#   --output_path predictions/ \
#   --rerank

# Retrieve against the unique-code index (INDEX_PATH_UNIQUE) with the
# "unique_encoding" method, asking for 5000 results and reranking; predictions
# go to "${INDEX_PATH}/predictions/".
# Expansions are quoted so paths and the dataset name stay single arguments.
python retrieve_from_collection_with_bm25.py \
  --autoencoder_path "${AUTOENCODER_PATH}" \
  --dataset "${DATASET_NAME}" \
  --index_path "${INDEX_PATH_UNIQUE}" \
  --collection_path "${INDEX_PATH}/data_documents_unique_code/documents_codebook.jsonl" \
  --query_formatting "{query}" \
  --method "unique_encoding" \
  --output_path "${INDEX_PATH}/predictions/" \
  --k 5000 \
  --rerank

# Retrieve against the code-and-tokens index (INDEX_PATH_CODE_TOKENS) with the
# "encoding_and_words" search type. --token_only_output_path points at the same
# "${INDEX_PATH}/predictions/" directory the bm25 retrieval step above is given
# as its --output_path; this step's own output goes to
# "${INDEX_PATH_CODE_TOKENS}/word_predictions/".
# Expansions are quoted so paths and the dataset name stay single arguments.
python retrieve_from_collection_with_words_and_tokens.py \
  --autoencoder_path "${AUTOENCODER_PATH}" \
  --dataset "${DATASET_NAME}" \
  --index_path "${INDEX_PATH_CODE_TOKENS}" \
  --collection_path "${INDEX_PATH}/data_documents_unique_code/documents_codebook.jsonl" \
  --query_formatting "{query}" \
  --search_type "encoding_and_words" \
  --token_only_output_path "${INDEX_PATH}/predictions/" \
  --output_path "${INDEX_PATH_CODE_TOKENS}/word_predictions/" \
  --k 5000 \
  --rerank
