# GDO-DPO Configuration for Llama-3-8B on UltraFeedback
# Reproduces main experiments from Table 1

model:
  # Both entries are set to the same Hugging Face-style model identifier.
  name: 'meta-llama/Meta-Llama-3-8B-Instruct'
  reference_model: 'meta-llama/Meta-Llama-3-8B-Instruct'

data:
  # Dataset selector; "hh-rlhf" is the documented alternative.
  dataset: 'ultrafeedback'
  # Leave as null to use the full dataset.
  num_samples: null
  max_length: 1024
  val_ratio: 0.05

difficulty_computation:
  # Corresponds to K in Equation 2.
  num_samples: 8

gdo_dpo:
  # --- Curriculum parameters ---
  # Threshold on representation stability.
  tau_stable: 1.2
  # Threshold on discrimination accuracy.
  tau_acc: 0.65
  # Initial step size along the semantic axis.
  delta_sem: 0.1
  # Initial step size along the uncertainty axis.
  delta_unc: 0.1

  # --- Layer configuration (32-layer models such as Llama-3-8B) ---
  # Lmid = 2L/3; for L = 32 that is 21.33, and 21 is the rounded-down value.
  layer_mid: 21

  # --- Monitoring parameters ---
  # γ, the decay used for the Srep EMA.
  ema_decay: 0.9
  # E_eval, the evaluation interval.
  eval_interval: 50

training:
  # Per-device batch size; use gradient accumulation for a larger effective
  # batch.
  batch_size: 4
  # Effective batch size per device = 4 * 32 = 128 (multiply by the number of
  # devices for the global batch size).
  gradient_accumulation_steps: 32
  num_epochs: 1
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders such
  # as PyYAML resolve a bare `5e-7` as the string "5e-7", not a float.
  learning_rate: 5.0e-7
  lr_scheduler: "cosine"
  warmup_ratio: 0.03
  # DPO temperature
  beta: 0.1

  # Logging and saving
  logging_steps: 10
  save_steps: 500
  eval_steps: 500
  save_total_limit: 3

wandb:
  # Weights & Biases settings: on/off switch, project, and run name.
  enabled: true
  project: 'gdo-dpo'
  run_name: 'llama3-8b-ultrafeedback'
