# Model identifier. Alternative left by the author:
#   "unsloth/Deepseek-R1-Distill-Qwen-7B"
model_name: "unsloth/Qwen2.5-Math-7B"
instruct: false  # set to true when using an instruct model
max_seq_length: 4096
lora_rank: 8

# Loss variant; other values mentioned by the author include "dapo" and "bnpo".
loss_type: "grpo"
# Feature toggles.
# NOTE(review): use_random / use_token / use_branch share their names with the
# rl.random / rl.token / rl.branch sections of this file and presumably enable
# them; only use_branch is on here. Confirm against the code that reads this.
use_fkl: false
use_kl: true
use_random: false
use_token: false
use_branch: true
# NOTE(review): two seed-like keys with different values; which component
# consumes each one is not visible from this file. Confirm before changing.
random_state: 3407
seed: 42

# Settings grouped under `sft`.
# NOTE(review): presumably the supervised fine-tuning stage, with key names
# that mirror Hugging Face TrainingArguments — confirm against the trainer.
sft:
  num_epochs: 2
  learning_rate: 2.0e-4  # 40x the rl.learning_rate (5.0e-6) in this file
  per_device_train_batch_size: 1
  warmup_steps: 5  # absolute step count; the rl section uses warmup_ratio
  logging_steps: 5
  weight_decay: 0.01
  gradient_accumulation_steps: 1
  lr_scheduler_type: "linear"
  optim: "adamw_8bit"

# Settings for the RL stage.
rl:
  N: 2000  # dataset length
  max_steps: 1000  # number of RL training steps
  save_steps: 250  # divides max_steps evenly (1000 / 250 = 4)
  # NOTE(review): the quantity bounded by q_min / q_max is not evident from
  # this file; document it where these keys are read.
  q_min: 0.6
  q_max: 1.4
  learning_rate: 5.0e-6
  per_device_train_batch_size: 8
  gradient_accumulation_steps: 1
  kl_coef: 0.05
  temperature: 1.0
  weight_decay: 0.01
  warmup_ratio: 0.1  # a ratio here; the sft section uses warmup_steps
  lr_scheduler_type: "linear"
  epsilon: 0.2
  epsilon_high: 0.28  # only relevant for DAPO
  logging_steps: 10
  # Same value as per_device_train_batch_size above.
  # NOTE(review): confirm whether the trainer requires the two to stay in sync.
  num_generations: 8
  optim: "adamw_8bit"
  # SAGE config.
  # NOTE(review): the three sub-sections below share their names with the
  # top-level use_random / use_token / use_branch flags and are presumably
  # gated by them — confirm. The `_ub` / `_lb` suffixes presumably denote
  # upper / lower bounds — confirm.
  random:
    eps_ub: 0.1
    eps_lb: 0
    sig_ub: 0.15
    sig_lb: 0.05
    N: 8  # nested key, separate from rl.N (dataset length) above
    decay_rate: 0.9
  token:
    alpha_ub: 0.3
    alpha_lb: 0.1
    sig_ub: 0.25
    sig_lb: 0.1
    N: 8  # nested key, separate from rl.N (dataset length) above
    decay_rate: 0.9
    use_source: "new"
    # NOTE(review): confirm the consumer expects the spelling "minimax"
    # (as opposed to e.g. "minmax"); the accepted values are not visible here.
    norm: "minimax"
  branch:
    ratio: 0.3
    threshold: 1.2
    use_source: "new"

# Sampling settings grouped under `vllm`.
# NOTE(review): presumably forwarded to vLLM's SamplingParams — confirm. Under
# that API, top_p: 1.0 and top_k: -1 leave those two filters disabled, so
# min_p would be the only one restricting candidate tokens.
vllm:
  min_p: 0.01
  top_p: 1.0
  top_k: -1
