# Environment setup: activate the `lean-finder` conda environment and change
# into the `lean-finder` directory. All paths used later in this script are
# relative, so they are resolved from that directory.
#
# `conda activate` is a shell function that only exists once conda's shell hook
# has been loaded. A non-interactive run (`bash script.sh`) normally has not
# loaded it, so load the hook when the function is missing but a `conda`
# executable is available on PATH.
if ! declare -F conda >/dev/null 2>&1 && command -v conda >/dev/null 2>&1; then
  eval "$(conda shell.bash hook)"
fi

# Activation failure is reported but not fatal, so running from an already
# prepared environment keeps working as before.
conda activate lean-finder ||
  echo "warning: could not activate conda env 'lean-finder'; continuing with the current environment" >&2

# Abort if the directory is missing: continuing would create output
# directories and launch jobs relative to the wrong location.
cd lean-finder || {
  echo "error: cannot cd into 'lean-finder' from $(pwd)" >&2
  exit 1
}

# ---- Run configuration ------------------------------------------------------
# NOTE(review): these paths start with `lean-finder/` although the script has
# already changed into `lean-finder` -- confirm the nested layout is intended.

# Model inputs: base model identifier and the LoRA checkpoint handed to the
# encoder.
MODEL_NAME="deepseek-ai/DeepSeek-Prover-V1.5-RL"
LORA_PATH="lean-finder/dpo_ckpt/dsproverv1.5rl_dpo_train_all_modality_temp_0.01_lambda_0.01_epoch3/final/policy"

# Corpus file that gets encoded.
CORPUS_PATH="lean-finder/datasets/corpus_dir/full_info_corpus.jsonl"

# Output locations: the embeddings directory and the index file both live
# under one base directory.
BASE_EMBEDDING_DIR="lean-finder/inference/index/dsproverv1.5rl_dpo_train_all_modality_temp_0.01_lambda_0.01_epoch3"
CORPUS_EMBEDDING_OUTPUT_DIR="${BASE_EMBEDDING_DIR}/corpus_embeddings"
INDEX_OUTPUT_PATH="${BASE_EMBEDDING_DIR}/search.index"

# ---- Step 1: encode the corpus ----------------------------------------------
# Runs leanfinder.retriever.driver.encode on $CORPUS_PATH with $MODEL_NAME and
# the LoRA weights at $LORA_PATH. Embeddings are written to
# $CORPUS_EMBEDDING_OUTPUT_DIR/corpus.pkl; combined stdout/stderr is shown on
# the terminal and copied to $CORPUS_EMBEDDING_OUTPUT_DIR/corpus.log.
#
# Required environment: HF_CACHE (passed as --cache_dir and
# --dataset_cache_dir).

# Fail early with a clear message: an unset or empty HF_CACHE would otherwise
# leave --cache_dir without a value, so it would consume the following flag.
: "${HF_CACHE:?HF_CACHE must be set (used for --cache_dir and --dataset_cache_dir)}"

echo "Generating corpus embeddings..."
mkdir -p -- "$CORPUS_EMBEDDING_OUTPUT_DIR" || exit 1

CUDA_VISIBLE_DEVICES=0 python -m leanfinder.retriever.driver.encode \
  --output_dir=temp \
  --model_name_or_path "$MODEL_NAME" \
  --lora_name_or_path "$LORA_PATH" \
  --lora \
  --query_prefix "" \
  --passage_prefix "" \
  --bf16 \
  --pooling eos \
  --append_eos_token \
  --normalize \
  --per_device_eval_batch_size 32 \
  --query_max_len 610 \
  --passage_max_len 210 \
  --dataset_path "$CORPUS_PATH" \
  --cache_dir "$HF_CACHE" \
  --dataset_cache_dir "$HF_CACHE" \
  --encode_output_path "$CORPUS_EMBEDDING_OUTPUT_DIR/corpus.pkl" 2>&1 |
  tee "$CORPUS_EMBEDDING_OUTPUT_DIR/corpus.log"

# The pipeline's own status is tee's, so read the encoder's status from
# PIPESTATUS (must happen immediately after the pipeline). Stop here on
# failure so later steps never consume missing or stale embeddings.
encode_status=${PIPESTATUS[0]}
if (( encode_status != 0 )); then
  echo "error: corpus encoding failed (exit ${encode_status}); see ${CORPUS_EMBEDDING_OUTPUT_DIR}/corpus.log" >&2
  exit "$encode_status"
fi

# ---- Step 2: build the search index -----------------------------------------
# Feeds the encoded corpus (corpus.pkl in $CORPUS_EMBEDDING_OUTPUT_DIR) to
# construct_index.py, which is asked to write the index to $INDEX_OUTPUT_PATH.
# Expansions are quoted so each path always reaches the script as exactly one
# argument.
python lean-finder/inference/index/construct_index.py \
  --corpus_embeddings_path "${CORPUS_EMBEDDING_OUTPUT_DIR}/corpus.pkl" \
  --index_output_path "${INDEX_OUTPUT_PATH}"

  