# MMLU-Redux Evaluation Configuration
# Cleaned version of MMLU benchmark with error filtering

# Model under evaluation.
model:
  # Use Rosetta to test a Rosetta model; the commented-out line below is the
  # alternative of naming a model directly.
  # model_name: Qwen/Qwen3-0.6B
  model_name: 'Rosetta'
  # Only needed for Rosetta models.
  rosetta_config:
    base_model: 'Qwen/Qwen3-0.6B'
    teacher_model: 'Qwen/Qwen3-4B'
    include_response: false
    is_do_alignment: true
    # NOTE(review): presumably only consulted when is_do_alignment is true;
    # confirm against the loader.
    alignment_strategy: 'longest'
    # Relative path; presumably resolved from the working directory (TODO
    # confirm).
    checkpoints_dir: 'local/checkpoints/l20_mse_kvcache_3mlp_revised/final'

# Output location for this run.
output:
  # Alternative directory, kept for switching datasets:
  # output_dir: local/MMMLU_results
  output_dir: 'local/MMLU_Redux_results'

# Evaluation run settings.
eval:
  # Dataset to evaluate: mmlu-redux or mmmlu.
  # dataset: mmmlu
  dataset: 'mmlu-redux'
  # GPUs to use for evaluation.
  # gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7]
  gpu_ids:
    - 0
  # Answer method: 'generate' or 'logits'.
  answer_method: 'logits'
  # Enable chain-of-thought reasoning.
  use_cot: false
  # Sample every N examples.
  sample_interval: 1
  # Limit examples per subject (null for all).
  limit: 100
  # Optional: specific subjects to evaluate.
  subjects:
    - 'sociology'
