# Data Distillation (SFT): Teacher-Generated Synthetic Data
# Teacher generates responses, student learns via supervised fine-tuning

# Which pipeline this config drives ("sft" here).
pipeline: 'sft'

# Experiment identification: short name plus a human-readable summary.
experiment:
  # Alternate name noted by the author: "sft_olmo2_32b_to_13b"
  name: 'sft_32b_to_13b_tulu_nosft'
  description: 'Data distillation: 32B teacher generates data, 13B student learns via SFT'

# Models: teacher produces the synthetic responses, student is fine-tuned on them.
# NOTE(review): identifiers look like Hugging Face hub IDs — confirm with the loader.
model:
  teacher: 'allenai/OLMo-2-0325-32B-SFT'
  student: 'allenai/OLMo-2-1124-13B'
  # Kept as an integer; presumably must match the student tokenizer — TODO confirm.
  student_vocab_size: 100352

# Dataset selection: names the prompt dataset to use.
data:
  dataset_choice: 'tulu'

# Preprocessing settings (top-level keys, not nested under a section).
num_samples: 0  # 0 => use all examples
# NOTE(review): presumably the held-out split fraction — confirm with the loader.
test_size: 0.05
# NOTE(review): presumably the number of preprocessing worker processes — confirm.
num_proc: 8
strip_think_blocks: true
code_only: false

# SFT-specific settings: how the teacher's synthetic data is generated,
# filtered, and stored.
synthetic_data:
  # Teacher generation settings (FIXED for fair comparison).
  generation:
    temperature: 0.13
    top_p: 0.9
    max_new_tokens: 1024
    # Options noted by the author: sampling, greedy, beam.
    decoding_strategy: 'sampling'
    prompt_field: 'messages'

  # Filtering (quality control).
  filtering:
    enabled: true
    # Minimum response length in tokens.
    min_length: 10
    # NOTE(review): presumably also measured in tokens — confirm.
    max_length: 1024
    # Additional filters can be added (e.g., perplexity threshold).

  # Paths
  synthetic_dataset_path: '/path/to/datasets/synthetic_tulu'
  # If true, skip generation and use the existing dataset.
  use_existing: false

# Output: where results are written and whether to resume a previous run.
output:
  output_dir: "/path/to/model_log/energy_experiments/sft_32b_to_13b_nosft/"
  # YAML has no `None` literal: a bare `None` loads as the string "None",
  # which is truthy and looks like a path. `null` yields a real null value.
  checkpoint_dir: null
  resume_from_checkpoint: false

# Weights & Biases override: run name (same value as experiment.name).
wandb:
  run_name: 'sft_32b_to_13b_tulu_nosft'
