# Composition list: names two shared base configs followed by `_self_`.
# NOTE(review): this looks like a Hydra-style defaults list, in which case
# entries are merged in order and `_self_` in last position lets the keys in
# this file override the base configs — confirm against the config loader.
defaults:
  - ../base/ppo_trainer
  - ../base/verl_config_grpo
  - _self_

# Task settings: developer prompt, training and validation datasets,
# validation path, and reward weights.
reasoning_gym:
  # NOTE(review): presumably the name of a prompt template looked up by the
  # consumer of this config — confirm where `DeepSeekZero` is defined.
  developer_prompt: DeepSeekZero
  # Training data: a single `word_ladder` entry.
  # NOTE(review): `size` is presumably the number of generated examples and
  # the word-length keys presumably bound the generated puzzles — confirm
  # against the dataset implementation.
  datasets:
    - name: word_ladder
      size: 20000
      seed: 42
      min_word_length: 3
      max_word_length: 4

  # Validation data: same dataset name and word-length bounds as the training
  # entry above, with a smaller size (128) and a different seed (41 vs. 42).
  validation_dataset:
    - name: word_ladder
      size: 128
      seed: 41
      min_word_length: 3
      max_word_length: 4


  # NOTE(review): relative path; this file does not show what it is resolved
  # against or whether it is read from or written to — confirm with the
  # trainer.
  val_path: trainers/val_word_ladder

  # Reward configuration: specify reward types and their weights
  rewards:
    format: 1.0
    rule_based: 1.0

# NOTE(review): presumably overrides a `data.max_response_length` value
# inherited from the base configs; the unit (likely tokens) is not shown in
# this file — confirm.
data:
  max_response_length: 1024
