# Data Distillation (SFT): Teacher-Generated Synthetic Data
# Teacher generates responses, student learns via supervised fine-tuning

# Pipeline type selector; this config targets the SFT pipeline.
pipeline: 'sft'

# Experiment identification: a short name plus a human-readable summary.
experiment:
  name: 'sft_32b_to_1b_tulu_true'
  description: 'Data distillation: 32B teacher generates data, 1B student learns via SFT'

# Teacher/student model pair used for distillation.
model:
  # NOTE(review): both values look like Hugging Face Hub repo IDs — confirm
  # against the model loader.
  teacher: 'allenai/OLMo-2-0325-32B-SFT'
  student: 'allenai/OLMo-2-0425-1B'
  # Presumably the student's vocabulary size; verify against its tokenizer.
  student_vocab_size: 100352

# Dataset selection
data:
  # Currently 'tulu'; other choices noted for this key: 'math', 'codeforces'.
  dataset_choice: 'tulu'

# Preprocessing settings (these keys live at the top level of the file,
# not under a section mapping).
num_samples: 0  # 0 means "use all examples"
test_size: 0.05  # presumably the held-out fraction — TODO confirm
num_proc: 8  # presumably the number of worker processes — TODO confirm
strip_think_blocks: true
code_only: false

# SFT-specific settings: teacher generation, response filtering, and the
# location of the synthetic dataset.
synthetic_data:
  # Teacher generation settings. Keep these FIXED so comparisons stay fair.
  generation:
    temperature: 0.7
    top_p: 0.9
    max_new_tokens: 1024
    # Alternatives noted for this key: greedy, beam.
    decoding_strategy: 'sampling'
    prompt_field: 'messages'

  # Filtering (quality control)
  filtering:
    enabled: true
    min_length: 10  # minimum response length
    # NOTE(review): same value as generation.max_new_tokens; the unit
    # (tokens vs. characters) is not stated here — confirm.
    max_length: 1024
    # Additional filters can be added (e.g., perplexity threshold).

  # Paths
  synthetic_dataset_path: '/path/to/datasets/tulu-3-sft-mixture-preprocessed'
  # When true, generation is skipped and the existing dataset is used.
  use_existing: false

# Output locations and checkpoint/resume behaviour.
output:
  # NOTE(review): the last path segment is "sft_32b_to_1b_true", while
  # experiment.name is "sft_32b_to_1b_tulu_true" — confirm the difference
  # is intentional.
  output_dir: '/path/to/model_log/energy_experiments/sft_32b_to_1b_true/'
  # Explicit YAML null. A bare `None` is NOT null in YAML: it loads as the
  # string "None", which is truthy and can be mistaken for a real path.
  checkpoint_dir: null
  resume_from_checkpoint: false

# W&B override
wandb:
  # Run-name override for W&B; identical to experiment.name in this file.
  run_name: 'sft_32b_to_1b_tulu_true'
