# Composition list (Hydra-style; assumes this file is loaded through Hydra).
# Entries are merged in the order listed, so later entries take precedence
# over earlier ones.
defaults:
  - ../base/ppo_trainer
  - ../base/verl_config_grpo
  # _self_ is listed last so that the values defined in this file are merged
  # after both base configs above.
  - _self_

# Task setup: the developer prompt plus the training and validation dataset
# specifications. The meaning of each per-dataset parameter is defined by the
# consumer of this config, which is not visible in this file.
reasoning_gym:
  # NOTE(review): presumably selects a prompt template by name - confirm.
  developer_prompt: DeepSeekZero

  # Training mix: four tasks, each with size 8000 and seed 42.
  datasets:
    # Task 1/4: mini_sudoku
    - name: mini_sudoku
      size: 8000
      seed: 42
      min_empty: 6
      max_empty: 10

    # Task 2/4: knights_knaves
    - name: knights_knaves
      size: 8000
      seed: 42
      n_people: 3
      depth_constraint: 3
      width_constraint: 3

    # Task 3/4: family_relationships
    - name: family_relationships
      size: 8000
      seed: 42
      min_family_size: 4
      max_family_size: 8

    # Task 4/4: circuit_logic
    - name: circuit_logic
      size: 8000
      seed: 42
      min_terms: 3
      max_terms: 5
      min_inputs: 2
      max_inputs: 4

  # Validation mix: the same four tasks with the same task parameters as the
  # training entries, but with size 128 and seed 41 (a different seed from the
  # training entries).
  validation_dataset:
    # Validation 1/4: mini_sudoku
    - name: mini_sudoku
      size: 128
      seed: 41
      min_empty: 6
      max_empty: 10

    # Validation 2/4: knights_knaves
    - name: knights_knaves
      size: 128
      seed: 41
      n_people: 3
      depth_constraint: 3
      width_constraint: 3

    # Validation 3/4: family_relationships
    - name: family_relationships
      size: 128
      seed: 41
      min_family_size: 4
      max_family_size: 8

    # Validation 4/4: circuit_logic
    - name: circuit_logic
      size: 128
      seed: 41
      min_terms: 3
      max_terms: 5
      min_inputs: 2
      max_inputs: 4

  # Single path for the combined validation data covering all tasks above.
  val_path: trainers/val_combined

# NOTE(review): presumably overrides the `data` section inherited from the
# base configs in the defaults list - confirm against those files.
data:
  # NOTE(review): unit is presumably tokens - confirm against the trainer.
  max_response_length: 2048
