#!/bin/bash

# Script for creating BM25 indexes using pyserini.
# This script processes multiple preprocessed collections and creates BM25 indexes for each one.
# It expects a list of paths, with each path containing one (or more) jsonl files.

# Usage: ./build_bm25.sh <path1> <path2> ...

# Ensure that at least one path is passed as an argument.
# The usage text is a diagnostic, so it goes to stderr.
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <path1> <path2> ..." >&2
  exit 1
fi

# Paths for which no index could be built. They are collected instead of
# aborting on the first error so that every requested path is still attempted,
# while the final status and message reflect what actually happened.
FAILED_PATHS=()

# Loop over the provided paths and create a BM25 index for each one
for INPUT_RELATIVE_PATH in "$@"; do

  # Resolve the absolute path of the input directory. realpath may succeed for
  # a path whose last component does not exist, so the directory test is
  # performed explicitly as well.
  if ! ABSOLUTE_PATH=$(realpath "${INPUT_RELATIVE_PATH}") || [[ ! -d "${ABSOLUTE_PATH}" ]]; then
    echo "Error: input path is not an existing directory: ${INPUT_RELATIVE_PATH}" >&2
    FAILED_PATHS+=("${INPUT_RELATIVE_PATH}")
    continue
  fi

  echo "Creating BM25 index for: ${ABSOLUTE_PATH}"

  # NOTE(review): the index is written into the same directory that is read as
  # the input collection -- confirm this is intended.
  # The exit status of the indexer decides whether success is reported.
  if python -m pyserini.index.lucene \
    --collection JsonCollection \
    --input "${ABSOLUTE_PATH}" \
    --index "${ABSOLUTE_PATH}" \
    --generator DefaultLuceneDocumentGenerator \
    --threads 20 \
    --stemmer none \
    --pretokenized \
    --storePositions --storeDocvectors --storeRaw; then
    echo "BM25 index created at: ${ABSOLUTE_PATH}"
  else
    echo "Error: failed to create BM25 index for: ${ABSOLUTE_PATH}" >&2
    FAILED_PATHS+=("${ABSOLUTE_PATH}")
  fi
done

# Report overall success only when every path was indexed; otherwise list the
# failures and exit non-zero so callers can detect the problem.
if [[ ${#FAILED_PATHS[@]} -gt 0 ]]; then
  echo "Failed to create ${#FAILED_PATHS[@]} index(es):" >&2
  printf '  %s\n' "${FAILED_PATHS[@]}" >&2
  exit 1
fi

echo "All indexes created successfully."
