# Knowledge Distillation: OLMo2-32B → OLMo2-13B
# Logit-based distillation with cached teacher distributions

# Pipeline type. "kd" = knowledge distillation (see the file header).
# NOTE(review): presumably the launcher dispatches on this value - confirm.
pipeline: "kd"

# Experiment identification: a short machine-friendly name plus a
# human-readable summary of the run.
# NOTE(review): "nosft" presumably refers to the student being a non-SFT
# checkpoint (the teacher configured in this file is an SFT model) - confirm.
experiment:
  name: "kd_olmo2_32b_to_13b_nosft"
  description: "Logit KD from 32B to 13B with top-100 cached teacher logprobs"

# Models: teacher (distilled from) and student (trained).
# Both values look like Hugging Face Hub repository ids - confirm how the
# loader resolves them.
model:
  teacher: "allenai/OLMo-2-0325-32B-SFT"
  student: "allenai/OLMo-2-1124-13B"
  # NOTE(review): presumably the width of the student's output distribution;
  # verify it matches the student checkpoint's actual vocabulary size.
  student_vocab_size: 100352

# Data: KD trains from cached teacher log-probabilities rather than
# querying the teacher online.
data:
  # Placeholder path; replace "/path/to" with the real cache root. It sits
  # under the directory given by distillation.logprob_cache_path.
  dataset_teacher_logprobs: "/path/to/logprob_cache/teacher_logprobs"

# KD-specific hyperparameters
distillation:
  # Loss weighting: total_loss = alpha * CE + (1 - alpha) * KL
  alpha: 0.5  # 0.5 = equal weight to CE and KL

  # Temperature for the KL divergence term
  temperature: 1.0

  # Logit caching settings. Caching has already been done; these values are
  # recorded here for reference.
  # NOTE(review): the key says "logits" while the experiment description says
  # "logprobs" - confirm which quantity the cache actually stores.
  top_k_logits: 100  # Top-100 teacher entries cached
  logprob_cache_path: "/path/to/logprob_cache"

# Output: where run artifacts are written and whether to resume.
output:
  # Placeholder path; replace "/path/to" with the real log root.
  output_dir: "/path/to/model_log/energy_experiments/kd_32b_to_13b_nosft"
  # Use YAML `null` for "not set". The Python literal `None` is not a YAML
  # null: loaders resolve it to the string "None", which is truthy and may be
  # treated as a literal directory name.
  checkpoint_dir: null
  resume_from_checkpoint: false

# Weights & Biases (W&B) override: sets the run name for this experiment.
# NOTE(review): presumably overrides a default defined elsewhere - confirm.
wandb:
  run_name: "kd_32b_to_13b_nosft"
