# Hydra defaults list: the configs named below are composed in the order
# listed. `_self_` is placed last so that the values defined in this file
# override anything inherited from the entries above it.
defaults:
  - ppo_trainer
  - verl_config_grpo
  - _self_

# Task/dataset settings grouped under the `reasoning_gym` key.
reasoning_gym:
  # NOTE(review): presumably selects a named prompt template — confirm how the
  # consumer resolves `DeepSeekZero`.
  developer_prompt: DeepSeekZero
  # One `manipulate_matrix` dataset of 20000 entries. The min/max pairs bound
  # the generator parameters: rows 2-6, cols 2-6, transforms 1-10.
  # NOTE(review): presumably the training split (a separate
  # `validation_dataset` key follows) — confirm against the trainer.
  datasets:
    - name: manipulate_matrix
      size: 20000
      min_rows: 2
      max_rows: 6
      min_cols: 2
      max_cols: 6
      min_transforms: 1
      max_transforms: 10

  # Validation set: same `manipulate_matrix` task, 128 entries, same transform
  # range (1-10), but rows/cols are 7-8 — a range that does not overlap the
  # 2-6 range used in `datasets`.
  # NOTE(review): looks like a deliberate check on larger, unseen matrix sizes
  # — confirm this is intended rather than a copy/paste slip.
  validation_dataset:
    - name: manipulate_matrix
      size: 128
      min_rows: 7
      max_rows: 8
      min_cols: 7
      max_cols: 8
      min_transforms: 1
      max_transforms: 10

  # NOTE(review): relative path; presumably where the validation data for this
  # task lives — confirm which directory it is resolved against.
  val_path: trainers/val_manipulate_matrix

  # Reward configuration: maps each reward type to its weight.
  # Two rewards are configured here, both with weight 1.0.
  rewards:
    format: 1.0
    rule_based: 1.0
  # NOTE(review): an "informativeness reward using OpenAI evaluation" was
  # referenced at this spot, but no such entry is configured here. Either add
  # the reward entry or drop this note.

# Overrides under the `data` key.
# NOTE(review): presumably caps the generated response length (units likely
# tokens) — confirm against the trainer config this file composes with.
data:
  max_response_length: 1024