# MMLU-Redux Evaluation Configuration
# Cleaned version of MMLU benchmark with error filtering

model:
  # Model to evaluate. Keep exactly ONE active `model_name` entry: a YAML
  # mapping must not repeat a key (strict loaders reject duplicates, lenient
  # ones silently keep only the last). To switch models, comment out the
  # active line and uncomment one of the alternatives below.
  # model_name: local/checkpoints/Qwen3-0.6B_Swahili_1.6M/final # use Rosetta to test Rosetta model
  # model_name: Jacaranda/UlizaLlama3
  model_name: Qwen/Qwen3-0.6B
  # model_name: Qwen/Qwen3-4B
  # model_name: Rosetta
  # rosetta_config:  # Only needed for Rosetta models
  #   base_model: Qwen/Qwen3-0.6B
  #   teacher_model: meta-llama/Meta-Llama-3-8B-Instruct
  #   # teacher_model: meta-llama/Llama-3.1-8B-Instruct
  #   include_response: false
  #   is_do_alignment: true
  #   alignment_strategy: "longest"
  #   checkpoints_dir: local/checkpoints/20250903_170900/final

  # Generation configuration - applied to all models during evaluation
  generation_config:
    do_sample: false  # Whether to use sampling (true) or greedy decoding (false)
    max_new_tokens: 1024  # Maximum number of tokens to generate
    # Sampling parameters (only used when do_sample=true):
    # temperature: 0.7  # Controls randomness (0.0 = deterministic, higher = more random)
    # top_p: 0.9  # Nucleus sampling threshold
    # top_k: 50  # Top-k sampling threshold
    # min_p: 0.05  # Minimum probability threshold
    # repetition_penalty: 1.0  # Penalty for repeating tokens

output:
  # Directory that receives the evaluation results.
  # Alternative location for MMMLU runs is kept below for quick switching.
  output_dir: 'local/MMLU_Redux_results'
  # output_dir: local/MMMLU_results

eval:
  # Benchmark to run: mmlu-redux or mmmlu
  dataset: 'mmlu-redux'
  # dataset: mmmlu

  # GPUs to use for evaluation
  gpu_ids:
    - 0
    - 1
    - 2
    - 3
    - 4
    - 5
    - 6
    - 7
  # gpu_ids: [7]

  # How the answer is obtained: 'generate' or 'logits'
  answer_method: 'generate'

  # Enable chain-of-thought reasoning
  use_cot: true

  # Sample every N examples
  sample_interval: 1

  # Limit examples per subject (null for all)
  limit: null

  # Optional: specify specific subjects to evaluate
  # subjects: ["abstract_algebra"]
  # response_text: "Jibu ni"
