# Two-Stage LLM Evaluation Configuration
# This configuration uses two LLMs in sequence:
# 1. First LLM generates background context
# 2. Second LLM answers the question using that context

model:
  # Plain two-stage pipeline (alternative selection, currently disabled):
  # model_name: "two_stage"  # Use two-stage pipeline
  # answer_model_path: "Qwen/Qwen3-0.6B"  # Second LLM for answering

  # Active pipeline selection.
  model_name: 'two_stage_rosetta'
  # Path to Rosetta checkpoint directory.
  rosetta_checkpoint_dir: 'local/checkpoints/0.6B_4B_cot_MMLU_data'
  # Subfolder name (e.g., 'final', 'checkpoint-1000').
  rosetta_subfolder: 'final'

  # First LLM for context generation.
  context_model_path: 'Qwen/Qwen3-4B'
  # Prompt given to the context model. "{question}" is presumably a
  # placeholder filled in by the consuming code -- confirm against the loader.
  # Folded scalar (>-): the lines below join with single spaces, no trailing
  # newline, yielding the same one-line string as before.
  background_prompt: >-
    In one clear sentence, describe the most essential background knowledge
    needed to answer the question: {question}

  

output:
  # Directory path for this run's output.
  output_dir: 'local/MMLU_Redux_results'

eval:
  # Standard evaluation settings.
  # Dataset selection: "mmlu-redux" or "mmmlu".
  dataset: 'mmlu-redux'

  # GPU configuration.
  # gpu_ids: [0,1,2,3,4,5]
  gpu_ids:
    - 1

  # Two-stage always uses generation.
  answer_method: 'generate'
  use_cot: true
  max_new_tokens: 1024

  # Sample every N examples.
  sample_interval: 1
  # Limit examples per subject (null for all).
  limit: 10
  # Optional: specify specific subjects to evaluate.
  subjects:
    - 'abstract_algebra'
  # response_text: "Jibu ni"
