# Hydra-style defaults list: the listed configs are composed in order, so
# later entries override earlier ones.
defaults:
  - ppo_trainer
  - verl_config_grpo
  # `_self_` is last so values set in this file win over the entries above.
  - _self_

# Settings grouped under the `reasoning_gym` key: prompt selection, training
# and validation dataset specs, and reward settings.
reasoning_gym:
  # NOTE(review): presumably the name of a prompt template looked up by the
  # consuming code - confirm where `DeepSeekZero` is defined.
  developer_prompt: DeepSeekZero
  # Training datasets; each entry names a dataset and its parameters.
  datasets:
    - name: family_relationships
      size: 20000
      seed: 42
      min_family_size: 4
      max_family_size: 8

  # Validation uses the same dataset and family-size range as training, but
  # a much smaller size (128) and a different seed (41 vs. 42).
  validation_dataset:
    - name: family_relationships
      size: 128
      seed: 41
      min_family_size: 4
      max_family_size: 8
  # NOTE(review): looks like a relative path for the validation data -
  # confirm whether it is read, written, or both, and what it is relative to.
  val_path: trainers/grpo/val_family_relationships

  # NOTE(review): presumably per-component reward weights - confirm how the
  # trainer combines `format` and `rule_based`.
  rewards:
    format: 1.0
    rule_based: 1.0

# NOTE(review): presumably caps the length of generated responses (likely in
# tokens) - confirm against the trainer that reads `data.max_response_length`.
data:
  max_response_length: 1024