# Usage: <this script> DATE
#   DATE selects the training run. The model directory is expected at
#   $INIT_PATH/autencoder-DATE and must contain checkpoint-<N>/ subdirectories;
#   the highest N is stored in CHECKPOINT.
if [ -z "${1:-}" ]; then
  printf 'Usage: %s DATE\n' "${0##*/}" >&2
  exit 2
fi
DATE="$1"
# NOTE(review): "autencoder" looks like a misspelling of "autoencoder"; it is
# left unchanged because it must match the directory name on disk - confirm.
AUTOENCODER_NAME="autencoder-$DATE"
INIT_PATH="/home/"
AUTOENCODER_PATH_NO_CHECKPOINT="$INIT_PATH/$AUTOENCODER_NAME"

# Only claim the directory exists after actually checking it.
if [ ! -d "$AUTOENCODER_PATH_NO_CHECKPOINT" ]; then
  printf "error: directory '%s' does not exist\n" "$AUTOENCODER_PATH_NO_CHECKPOINT" >&2
  exit 1
fi
echo "The directory '$AUTOENCODER_PATH_NO_CHECKPOINT' exists locally."

# Find the highest-numbered checkpoint with a glob instead of parsing `ls`
# output, so unusual characters in the path cannot break the result.
# Only directories whose suffix is purely numeric are considered.
CHECKPOINT=""
for checkpoint_dir in "$AUTOENCODER_PATH_NO_CHECKPOINT"/checkpoint-*/; do
  # An unmatched glob stays literal; the -d test skips it.
  [ -d "$checkpoint_dir" ] || continue
  checkpoint_step="${checkpoint_dir%/}"
  checkpoint_step="${checkpoint_step##*/checkpoint-}"
  case "$checkpoint_step" in
    '' | *[!0-9]*) continue ;;
  esac
  if [ -z "$CHECKPOINT" ] || [ "$checkpoint_step" -gt "$CHECKPOINT" ]; then
    CHECKPOINT="$checkpoint_step"
  fi
done
unset checkpoint_dir checkpoint_step

# Without a checkpoint every derived path would end in "checkpoint-".
if [ -z "$CHECKPOINT" ]; then
  printf "error: no checkpoint-<N> directory found in '%s'\n" "$AUTOENCODER_PATH_NO_CHECKPOINT" >&2
  exit 1
fi

# Locations derived from the run date and the selected checkpoint.
# All index paths live under $INIT_PATH/indexes and share one file-name stem;
# only the suffix differs between variants.
SAVE_DIR_INDEXES="${INIT_PATH}/indexes"
AUTOENCODER_PATH="${INIT_PATH}/${AUTOENCODER_NAME}/checkpoint-${CHECKPOINT}"

index_stem="${SAVE_DIR_INDEXES}/fast_doc_id_to_token_${DATE}-checkpoint-${CHECKPOINT}"
INDEX_PATH="${index_stem}.index"
INDEX_PATH_CODE_TOKENS="${index_stem}-code_and_tokens.index"
INDEX_PATH_UNIQUE="${index_stem}-unique.index"
INDEX_PATH_NGRAMS="${index_stem}-pos_ngrams_5.index"
INDEX_PATH_AB="${index_stem}-ab.index"
INDEX_PATH_POS="${index_stem}-pos.index"
# The stem is only a construction aid; do not leave it behind in the environment.
unset index_stem

# Script 1: fast_create_bm25_index.py
# encodes the document as integers
# The later steps read from directories under ${INDEX_PATH} (presumably
# written by this script - confirm), so stop here if it fails rather than
# indexing missing data. Expansions are quoted so that paths containing
# spaces or glob characters are passed as single arguments.
python fast_create_bm25_index.py \
  --autoencoder_path "${AUTOENCODER_PATH}" \
  --dataset msmarco-passage/dev/small \
  --index_path "${INDEX_PATH}" \
  --document_formatting "{document}" \
  || {
    status=$?
    printf 'error: fast_create_bm25_index.py failed (exit %s)\n' "$status" >&2
    exit "$status"
  }
# # Script 2: build_collection.sh
# # requirement: java 21+
# # on GCP, I'd do:
# # curl -s "https://get.sdkman.io" | bash
# # source "$HOME/.sdkman/bin/sdkman-init.sh"
# # sdk install java 22-open
# # The input must be a directory of JSONL files; the file names do not matter.
# # Each JSONL file should contain one record per line.
# # Each record has two fields, "id" and "contents", e.g.
# # {"id": "13", "contents": "0 24 78 12"}
# # or, for text:
# # {"id": "13", "contents": "this is a text"}
# # IMPORTANT: text needs to be pre-tokenized and pre-analyzed,
# # otherwise BM25 will perform poorly.
# # With --pretokenized and --stemmer set to "porter", BM25 should work fine on text too.
# python -m pyserini.index.lucene \
#   --collection JsonCollection \
#   --input ${INDEX_PATH}/data \
#   --index ${INDEX_PATH} \
#   --generator DefaultLuceneDocumentGenerator \
#   --threads 20 \
#   --stemmer none \
#   --pretokenized \
#   --storePositions --storeDocvectors --storeRaw

# python -m pyserini.index.lucene \
#   --collection JsonCollection \
#   --input ${INDEX_PATH}/data_documents_token_and_code \
#   --index ${INDEX_PATH_CODE_TOKENS} \
#   --generator DefaultLuceneDocumentGenerator \
#   --threads 20 \
#   --stemmer porter \
#   --storePositions --storeDocvectors --storeRaw

# Build the Lucene index for the unique-code documents with Pyserini.
# Input:  JSONL collection in ${INDEX_PATH}/data_documents_unique_code
# Output: index written to ${INDEX_PATH_UNIQUE}
# The retrieval step that follows searches this index, so stop if indexing
# fails. Expansions are quoted so that paths containing spaces or glob
# characters are passed as single arguments.
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input "${INDEX_PATH}/data_documents_unique_code" \
  --index "${INDEX_PATH_UNIQUE}" \
  --generator DefaultLuceneDocumentGenerator \
  --threads 20 \
  --stemmer none \
  --pretokenized \
  --storePositions --storeDocvectors --storeRaw \
  || {
    status=$?
    printf 'error: pyserini.index.lucene failed (exit %s)\n' "$status" >&2
    exit "$status"
  }

# Script 3: retrieve_from_collection_with_bm25.sh
# This script is used to search the encoded collection using BM25
# python fast_retrieve_from_collection_with_bm25.py \
#   --autoencoder_path ${AUTOENCODER_PATH} \
#   --dataset msmarco-passage/dev/small \
#   --index_path ${INDEX_PATH} \
#   --query_formatting "{query}" \
#   --search_type "encoding" \
#   --collection_path ${INDEX_PATH}/data/documents_codebook.jsonl \
#   --output_path ${INDEX_PATH}/data/predictions/ \
#   --rerank \
#   --k 2000
  #--length 100 \
  # --pos_encoding "pos"

# python fast_retrieve_from_collection_with_words_and_tokens.py \
#   --autoencoder_path ${AUTOENCODER_PATH} \
#   --dataset msmarco-passage/dev/small \
#   --index_path ${INDEX_PATH_CODE_TOKENS} \
#   --collection_path ${INDEX_PATH}/data_documents_unique_code/documents_codebook.jsonl \
#   --query_formatting "{query}" \
#   --search_type "encoding_and_words" \
#   --output_path ${INDEX_PATH}/word_predictions/ \
#   --rerank \
#   --k 2000

# Search the unique-code index (${INDEX_PATH_UNIQUE}) with BM25 for the
# msmarco-passage/dev/small dataset, using the codebook file under
# ${INDEX_PATH}/data_documents_unique_code and writing results to
# ${INDEX_PATH}/unique_predictions/.
# Expansions are quoted so that paths containing spaces or glob characters are
# passed as single arguments.
python fast_retrieve_from_collection_with_bm25.py \
  --autoencoder_path "${AUTOENCODER_PATH}" \
  --dataset msmarco-passage/dev/small \
  --index_path "${INDEX_PATH_UNIQUE}" \
  --query_formatting "{query}" \
  --search_type "unique_encoding" \
  --collection_path "${INDEX_PATH}/data_documents_unique_code/documents_codebook.jsonl" \
  --output_path "${INDEX_PATH}/unique_predictions/" \
  --rerank \
  --k 2000
