# Hydra defaults list: the configs named here are composed in order.
# `_self_` is listed last, so keys set in this file take precedence over the
# same keys coming from `ppo_trainer` and `verl_config_grpo`.
defaults:
  - ppo_trainer
  - verl_config_grpo
  - _self_


# Settings grouped under `reasoning_gym`: prompt selection, training and
# validation dataset entries, reward entries, and a replacement-data path.
# NOTE(review): the code that consumes these keys is not part of this file;
# the notes below describe the values as written and hedge anything inferred.
reasoning_gym:
  developer_prompt: DeepSeekZero
  mode: random
  # Training data: one `spiral_matrix` entry, size 60000, seed 42,
  # with min_n 2 and max_n 6.
  datasets:
    - name: spiral_matrix
      size: 60000
      seed: 42
      min_n: 2
      max_n: 6

  # Validation data: the same dataset name, but a different seed (41) and
  # min_n/max_n of 7/8, which lie entirely above the training bounds of 2/6.
  # NOTE(review): presumably intentional, to evaluate on larger inputs than
  # those seen during training — confirm.
  validation_dataset:
    - name: spiral_matrix
      size: 128
      seed: 41
      min_n: 7
      max_n: 8
  # Relative path. NOTE(review): the directory it is resolved against is not
  # shown here — confirm against the consumer.
  val_path: trainers/val_spiral_matrix

  # Two reward entries, both set to 1.0.
  # NOTE(review): presumably per-component weights — confirm.
  rewards:
    format: 1.0
    rule_based: 1.0


  # Disabled option, kept for reference:
  # reward_partial: True
  # NOTE(review): absolute path inside one user's directory; it will not exist
  # on other machines. Consider supplying it as a command-line override or an
  # environment-variable interpolation instead of hard-coding it.
  replacement_data_path: /nlp/scr/qinanyu/rl-explanations/generate/train_data_usefulness/o3-mini_gpt-4.1-mini/spiral_matrix.json

# Sets `data.max_response_length` to 1024.
# NOTE(review): presumably overrides a value defined by one of the configs in
# the defaults list, and presumably counts tokens — confirm both.
data:
  max_response_length: 1024

