# Composition list. This looks like a Hydra defaults list: entries are
# presumably merged in order, with `_self_` last so that the values in this
# file take precedence over the two base configs. Confirm against the loader.
defaults:
  - ppo_trainer
  - verl_config_grpo
  - _self_

# Settings grouped under `reasoning_gym`: the developer prompt name, dataset
# specs, reward entries and `val_path`.
reasoning_gym:
  developer_prompt: DeepSeekZero

  # Dataset specs (presumably the training set, given the separate
  # `validation_dataset` key below; confirm against the consumer).
  # The trailing `alt:` comments record alternative values; the same numbers
  # appear in the disabled puzzle24 validation entry further down.
  datasets:
    - name: puzzle24
      size: 20000
      seed: 42
      min_terms: 2  # alt: 3
      max_terms: 4  # alt: 10
      min_value: 1  # alt: 10
      max_value: 100  # alt: 10000
      operators_weights: [0.4, 0.4, 0.2]

  # Validation spec: same puzzle24 parameters as above, but with 128 items
  # and seed 41 instead of 42.
  validation_dataset:
    - name: puzzle24
      size: 128
      seed: 41
      min_terms: 2  # alt: 3
      max_terms: 4  # alt: 10
      min_value: 1  # alt: 10
      max_value: 100  # alt: 10000
      operators_weights: [0.4, 0.4, 0.2]
    # Disabled validation entries, kept for reference. Uncomment an entry to
    # add it to the list above.
    # - name: puzzle24
    #   size: 128
    #   seed: 41
    #   min_terms: 3
    #   max_terms: 10
    #   min_value: 10
    #   max_value: 10000
    #   operators_weights: [0.35, 0.35, 0.3]
    # - name: spiral_matrix
    #   size: 128
    #   seed: 41
    #   min_n: 5
    #   max_n: 10
    # - name: mini_sudoku
    #   size: 128
    #   seed: 42
    #   min_empty: 6
    #   max_empty: 10
    # - name: family_relationships
    #   size: 128
    #   seed: 42
    #   min_family_size: 4
    #   max_family_size: 8
    # - name: futoshiki
    #   seed: 42
    #   size: 128
    #   min_board_size: 4
    #   max_board_size: 9
    #   min_difficulty: 0
    #   max_difficulty: 3

  # Reward entries. NOTE(review): the two values are presumably relative
  # weights (they sum to 1.0); confirm how the trainer combines them.
  rewards:
    rule_based: 0.5  # Rule-based reward with format bonus
    informativeness: 0.5

  val_path: trainers/grpo/val_puzzle24

# NOTE(review): presumably overrides a `data.max_response_length` value from
# one of the base configs in `defaults`; the unit is likely tokens. Confirm
# both against the trainer.
data:
  max_response_length: 1024