# Hydra-style defaults list: the configs named here are composed in the
# order listed. `_self_` comes last, so values set in this file take
# precedence over those from the entries above it.
defaults:
  - ppo_trainer
  - verl_config_grpo
  - _self_

# Dataset and prompt settings grouped under `reasoning_gym`. Every dataset
# entry below uses the `knight_swap` task.
reasoning_gym:
  # Name of the prompt template to use.
  # NOTE(review): presumably resolved by the code that reads this config;
  # confirm the accepted names there.
  developer_prompt: DeepSeekZero
  # Presumably the training data (contrast with `validation_dataset`
  # below) - TODO confirm. One entry: size 20000, exactly 2 pieces
  # (min == max), 10-30 nodes, 4-20 steps.
  datasets:
    - name: knight_swap
      size: 20000
      seed: 42
      min_nodes: 10
      max_nodes: 30
      min_pieces: 2
      max_pieces: 2
      min_steps: 4
      max_steps: 20

  # Two validation entries of 128 each, both generated with seed 41.
  validation_dataset:
    # Entry 1 differs from the ranges used in `datasets`: 3-4 pieces
    # instead of 2, at most 25 nodes instead of 30, and min_steps of 1
    # instead of 4.
    - name: knight_swap
      size: 128
      seed: 41
      min_nodes: 10
      max_nodes: 25
      min_pieces: 3
      max_pieces: 4
      min_steps: 1
      max_steps: 20
    # Entry 2 uses the same node/piece/step ranges as `datasets`, with a
    # different seed (41 here, 42 there).
    - name: knight_swap
      size: 128
      seed: 41
      min_nodes: 10
      max_nodes: 30
      min_pieces: 2
      max_pieces: 2
      min_steps: 4
      max_steps: 20

  # Relative path associated with validation.
  # NOTE(review): presumably where validation data or results are stored;
  # confirm against the trainer code.
  val_path: trainers/grpo/val_knight_swap

# Values under the top-level `data` key.
# NOTE(review): presumably these override the `data` section supplied by
# the configs in the defaults list, since `_self_` is listed last there;
# confirm.
data:
  # NOTE(review): presumably an upper bound on generated response length,
  # likely counted in tokens; confirm the unit with the trainer.
  max_response_length: 2048