# Configuration for running experiments on LOCAL PeerQA JSONL files with
# ALL RETRIEVAL METHODS enabled (BM25, TF-IDF, dense, ColBERT, cross-encoder).
# Every retriever is switched on for comprehensive experiments.

# Local data configuration
data:
  # Directory with JSONL files
  data_dir: 'data'
  # Number of samples to process (null for all 579)
  n_samples: 10
  # Use PeerQALocalDataLoader
  use_local_loader: true

# Experiment settings - test all combinations.
# Text granularities to include in the experiments. Presumably each entry is
# combined with every template and every enabled retriever - confirm against
# the experiment runner.
granularities:
  - sentence
  - paragraph

# All decontextualization templates.
# Entries are template identifiers resolved by the consuming code; what each
# template adds to a passage is not visible in this file. The names suggest
# title and/or section-heading context - TODO confirm.
templates:
  - minimal
  - title_only
  - heading_only
  - title_heading
  - aggressive_title

# ALL retrieval methods enabled: every entry below sets `enabled: true`.
retrievers:
  # 1. BM25 - Always works (pure Python)
  bm25:
    enabled: true
    # Presumably the standard BM25 k1 (term-frequency saturation) and
    # b (length normalization) parameters - confirm against the retriever.
    bm25_k1: 1.2
    bm25_b: 0.75

  # 2. TF-IDF - Always works (uses scikit-learn)
  tfidf:
    enabled: true

  # 3. Dense retriever - Requires: pip install sentence-transformers torch
  dense:
    enabled: true
    model_name: 'sentence-transformers/all-MiniLM-L6-v2'  # Fast model
    device: 'cpu'  # Change to 'cuda' if you have a GPU
    batch_size: 32
    normalize_embeddings: true

  # 4. ColBERT - Requires: pip install colbert-ai torch faiss-cpu
  colbert:
    enabled: true
    model_name: 'colbert-ir/colbertv2.0'
    device: 'cpu'  # Change to 'cuda' for GPU
    # Presumably ColBERT indexing settings (bits per compressed residual
    # dimension, number of k-means iterations) - confirm against the indexer.
    nbits: 2
    kmeans_niters: 4

  # 5. Cross-encoder reranker - Requires: pip install sentence-transformers
  cross_encoder:
    enabled: true
    model_name: 'cross-encoder/ms-marco-MiniLM-L-6-v2'
    device: 'cpu'  # Change to 'cuda' for GPU
    # Presumably the number of first-stage candidates passed to the
    # reranker - confirm against the reranking code.
    rerank_top_k: 100
    batch_size: 16

# Evaluation settings
evaluation:
  # Values of k used by the evaluation; presumably rank cutoffs at which
  # each metric below is computed (e.g. recall@k) - confirm against the
  # evaluator.
  k_values:
    - 1
    - 5
    - 10
    - 20
    - 50
  # Metric identifiers to compute.
  metrics:
    - recall
    - ndcg
    - mrr
    - precision
    - map

# Downstream tasks. Both tasks are enabled and each lists the single
# prompt identifier 'rag'.
downstream:
  answerability:
    enabled: true
    prompts: ['rag']

  generation:
    enabled: true
    prompts: ['rag']

# Output settings
# Directory name for experiment outputs.
output_dir: 'outputs_all_methods'
# Presumably persists intermediate artifacts between stages - confirm
# against the experiment runner.
save_intermediate: true
# Presumably turns on detailed logging - confirm against the runner.
verbose: true

# Performance tracking
# Both flags are on. Presumably they make the run record memory usage and
# latency - what is measured and where it is reported is not visible
# here (TODO confirm).
performance:
  track_memory: true
  track_latency: true