# Teacher-Student Few-Shot Evaluation Configuration for Complex Arithmetic
# Tests teacher-student few-shot accuracy with specified parameters

evaluation:
  # Evaluation metrics to compute.
  # Only the metrics listed (uncommented) below are enabled.
  metrics:
  - generalization  # Test with modified parameters
  # Disabled metric. It is kept at the same indentation as the list items so
  # that uncommenting it yields a valid sequence entry rather than a
  # continuation of the previous scalar.
  # - "entropy"  # Teacher generation entropy
  # NOTE(review): the original line also carried the stray fragment
  # "Student accuracy using teacher explanations", presumably left over from a
  # removed metric entry — confirm and delete.
  
  # Use the base model directly instead of a trained checkpoint:
  # checkpoint_dir is set to the same path as base_model_path (a HuggingFace
  # model directory). To evaluate the GRPO checkpoints instead, swap in the
  # commented-out path below.
  # checkpoint_dir: "/nlp/scr/qinanyu/rl-explanations/checkpoints/grpo/complex_arithmetic-easy-hard"
  checkpoint_dir: "/nlp/scr/qinanyu/models/qwen2.5-3b-instruct"
  base_model_path: "/nlp/scr/qinanyu/models/qwen2.5-3b-instruct"
  
  # Dataset configuration
  # Each dataset in this file uses its own seed
  # (teacher: 43, student: 44, generalization: 45).
  teacher_dataset:
    task_name: "complex_arithmetic"
    seed: 43
    size: 1000
    # NOTE(review): presumably the index at which the validation split starts
    # (i.e. the last 100 of the 1000 examples) — confirm against the loader.
    val_start: 900

  
  # Dataset for the "generalization" metric. Same task and operations_weights
  # as student_dataset, but with a wider upper bound on the real/imaginary
  # ranges (15 here versus 10 there), a different seed, and half the size.
  generalization_dataset:
    task_name: "complex_arithmetic"
    seed: 45
    size: 500
    min_real: -10
    max_real: 15
    min_imag: -10
    max_imag: 15
    # NOTE(review): four weights, presumably one per operation type; the
    # operation order is defined by the task code, not in this file — confirm.
    operations_weights: [0.4, 0.4, 0.1, 0.1]
  # Dataset used for student evaluation. Ranges are symmetric (-10..10) for
  # both the real and imaginary settings.
  student_dataset:
    task_name: "complex_arithmetic"
    seed: 44  # Different seed for student evaluation
    size: 1000
    min_real: -10
    max_real: 10
    min_imag: -10
    max_imag: 10
    # NOTE(review): four weights, presumably one per operation type; the
    # operation order is defined by the task code, not in this file — confirm.
    operations_weights: [0.4, 0.4, 0.1, 0.1]
  
  # Few-shot configuration
  few_shot:
    enabled: true  # Canonical lowercase boolean (same value as before)
    n_shot: 3  # NOTE(review): presumably the number of few-shot examples per prompt
    seed: 144  # Seed for few-shot examples
  
  # Model configuration
  # Teacher: no model_path is given here; the weights come from the
  # checkpoint settings (checkpoint_dir) rather than from this section.
  teacher_model:
    # Will be loaded from checkpoint
    temperature: 1.0
    top_p: 1
    # NOTE(review): -1 is presumably the "no top-k filtering" sentinel of the
    # sampling backend — confirm.
    top_k: -1
    max_tokens: 1024
    use_chat_template: true  # Enable chat template for teacher model
    developer_prompt: "DeepSeekZero"  # Key from data.template.SYSTEM_PROMPTS
    developer_role: "system"
    # NOTE(review): presumably prepended to the teacher's generation so that
    # it starts inside a thinking block — confirm.
    preappend_token: "<think>"
    
  # Student: loaded from an explicit model_path (the base model), with a
  # shorter generation budget than the teacher (512 vs 1024 tokens).
  student_model:
    model_path: "/nlp/scr/qinanyu/models/qwen2.5-3b-instruct"  # Base model as student
    temperature: 1.0
    top_p: 1
    # NOTE(review): -1 is presumably the "no top-k filtering" sentinel of the
    # sampling backend — confirm.
    top_k: -1
    max_tokens: 512
    use_chat_template: true  # Enable chat template for student model
    developer_prompt: "empty"  # Key from data.template.SYSTEM_PROMPTS
    developer_role: "system"
  
  # vLLM configuration - minimal config to avoid hanging.
  # NOTE(review): these keys look like vLLM engine arguments passed through
  # as-is — confirm against the code that constructs the engine.
  vllm:
    tensor_parallel_size: 1
    gpu_memory_utilization: 0.4
    max_model_len: 4096
    enforce_eager: true
    disable_log_stats: true
    dtype: "bfloat16"
    
  # Evaluation settings
  batch_size: 100
  # NOTE(review): presumably an upper bound on how many checkpoints are
  # evaluated. The previous comment said "Only one checkpoint specified",
  # which does not match the value 20; checkpoint_dir in this file points at
  # a single model directory — confirm the intended value.
  max_checkpoints: 20
  # Step to start from: -1 = all checkpoints; e.g. 100 = start from step 100.
  start_step: -1
  
  # Output configuration
  # output_dir is a relative path.
  # NOTE(review): presumably resolved against the working directory of the
  # evaluation run — confirm.
  output_dir: "evaluate/results/grpo/complex_arithmetic_default"
  save_intermediate_results: true
  plot_results: true
  
  # Maps an expert model name to the JSON file holding that model's saved
  # teacher responses (step 0). The ${evaluation.teacher_dataset.task_name}
  # placeholder refers to teacher_dataset.task_name in this file.
  # NOTE(review): presumably resolved by the config loader (OmegaConf-style
  # interpolation) — confirm.
  # Values are double-quoted like every other path in this file; the quotes
  # do not change the resulting string.
  expert_thinking_dir:
    o3-mini: "/nlp/scr/qinanyu/rl-explanations/evaluate/results/o3-mini_gpt-4.1-mini/${evaluation.teacher_dataset.task_name}/teacher/step_0/teacher_responses_step_0.json"
    gpt-oss: "/nlp/scr/qinanyu/rl-explanations/evaluate/results/gpt-oss-20b_gpt-4.1-mini/${evaluation.teacher_dataset.task_name}/teacher/step_0/teacher_responses_step_0.json"
    qwen3-30b-a3b-thinking-2507: "/nlp/scr/qinanyu/rl-explanations/evaluate/results/qwen3-30b-a3b-thinking-2507_gpt-4.1-mini/${evaluation.teacher_dataset.task_name}/teacher/step_0/teacher_responses_step_0.json"