# Config composition list: `ppo_trainer`, then `user`, then this file's own
# keys (`_self_`).
# NOTE(review): this looks like a Hydra defaults list, where later entries
# override earlier ones; with `_self_` last, the values set in this file should
# take precedence over both — confirm against the config loader in use.
defaults:
  - ppo_trainer
  - user
  - _self_

# Derivation of max_model_len, kept as shell-style pseudo-variables for
# reference (nothing below is evaluated; only the literal value is read):
# MAX_PROMPT_LENGTH=2048
# MAX_RESPONSE_LENGTH=8192
# MAX_FEEDBACK_LENGTH=8192
# TEMPLATE_LENGTH=512  # heuristic upper bound, not enforced
# MAX_MODEL_LEN=$((TEMPLATE_LENGTH + MAX_PROMPT_LENGTH + MAX_FEEDBACK_LENGTH + MAX_RESPONSE_LENGTH))
# i.e. 512 + 2048 + 8192 + 8192 = 18944. Recompute the value below by hand if
# any of the terms above changes.
max_model_len: 18944

# Overrides for the actor and rollout settings.
actor_rollout_ref:
  actor:
    # TODO: make slightly off-policy in the future; use same as baseline;
    # perhaps make 1 again for fastest convergence.
    ppo_mini_batch_size: 32
    policy_loss:
      loss_mode: sdpo
    distill:
      # NOTE(review): 10240 = 2048 + 8192, which matches MAX_PROMPT_LENGTH +
      # MAX_FEEDBACK_LENGTH from the length notes at the top of this file —
      # presumably intentional; confirm before changing either.
      max_reprompt_len: 10240
      is_clip: 2.0
    optim:
      # Written with an explicit mantissa dot so that YAML 1.1 loaders
      # (e.g. plain PyYAML) resolve it as a float, not the string "1e-5".
      lr: 1.0e-5
  rollout:
    n: 8
    calculate_log_probs: true

# Advantage-estimation and rollout-correction overrides.
algorithm:
  # grpo disables the critic.
  adv_estimator: grpo
  norm_adv_by_std_in_grpo: false
  rollout_correction:
    rollout_is: token
    rollout_is_threshold: 2.0

data:
  # Same value as actor_rollout_ref.actor.ppo_mini_batch_size in this file (32).
  train_batch_size: 32

trainer:
  # NOTE(review): presumably skips the validation pass that would otherwise run
  # before the first training step — confirm against the trainer.
  val_before_train: false
