# Hydra runtime settings for this config.
hydra:
  # Additional locations Hydra searches when resolving config names.
  searchpath:
    # Relative file:// path; presumably resolved against the directory the
    # job is launched from, so run from the repository root. TODO confirm.
    - file://verl/trainer/config

# Hydra defaults list: entries are composed in the order written.
defaults:
  # Base config; presumably located through the hydra.searchpath entry
  # declared in this file. Verify the name matches a file in that directory.
  - ppo_trainer
  # This file's own keys. Listed after ppo_trainer so that the values below
  # take precedence over the base config.
  - _self_


# Data-related keys layered on top of the base config.
data:
  # Follows data.train_batch_size, which is not defined in this file
  # (presumably supplied by the ppo_trainer base config).
  gen_batch_size: ${data.train_batch_size}
  # Follows data.max_response_length, likewise defined outside this file.
  val_max_response_length: ${data.max_response_length}
  # NOTE(review): meaning is not visible here; confirm against the trainer
  # code that reads it before changing.
  min_train_batch_size_ratio: 1.0


# Reward settings layered on top of the base config.
reward_model:
  # Optional reward components; both are switched off here.
  use_format_reward: False
  use_code_reward: False
  # NOTE(review): presumably the number of samples inspected/printed during
  # training and validation respectively. Confirm against the reward manager.
  train_num_examine: 0
  val_num_examine: 1
  reward_manager: naive
  # NOTE(review): meaning of this score value is not visible here; confirm
  # with the consuming code.
  true_score: 0.5
  overlong_buffer:
    # Explicit on/off switch, off by default. Presumably the remaining keys
    # in this section have no effect unless this is set to True. Confirm.
    enable: False
    len: 0
    penalty_factor: 0.0
    log: False

# Algorithm settings layered on top of the base config.
algorithm:
  # Dotted path of the project-local config class associated with this node.
  _target_: agentmath.config.Agent_AlgoConfig
  # Also referenced by actor_rollout_ref.rollout.partial_rollout_max_split
  # through interpolation, so changing it here changes both.
  partial_rollout_max_split: 1
  filter_groups:
    _target_: verl.trainer.config.FilterGroupsConfig
    # Explicit on/off switch, off by default. Presumably the remaining keys
    # in this section have no effect unless this is set to True. Confirm.
    enable: False
    # Candidate values: acc / score / seq_reward / seq_final_reward / ...
    metric: null
    # Non-positive values mean no upper limit.
    max_num_gen_batches: 0

# Trainer settings layered on top of the base config.
trainer:
  trainer_name: RayPPOTrainer
  # Output directories; empty string by default. Presumably an empty value
  # disables dumping of rollout/validation data. Confirm.
  rollout_data_dir: ""
  validation_data_dir: ""
  # Weights & Biases connection settings; empty by default.
  wandb_proxy: ""
  # Do not commit a real key here; supply it through a command-line override
  # or another mechanism that stays out of version control.
  wandb_key: ""

# Rollout overrides for the actor/rollout/ref group.
actor_rollout_ref:
  rollout:
    # Dotted path of the project-local config class associated with this node.
    _target_: agentmath.config.Agent_RolloutConfig
    # Mirrors the value set under algorithm so the two sections stay in sync.
    partial_rollout_max_split: ${algorithm.partial_rollout_max_split}
    # Written as a block sequence, matching the other lists in this file.
    stop_tokens:
      - "<|im_end|>"
      - "<|endoftext|>"
    # NOTE(review): LRUCache_* keys presumably tune server selection;
    # their exact semantics are not visible here. Confirm before changing.
    LRUCache_server_weight_prompt_length: 4096
    LRUCache_server_weight_prompt_length_enable: False
    LRUCache_server_weight_score: 1
    # Follows rollout.free_cache_engine, which is defined outside this file.
    free_cache_engine_sleep: ${actor_rollout_ref.rollout.free_cache_engine}
    enable_expert_parallel: False
    val_temperature_lists: null
    multi_turn:
      _target_: agentmath.config.Agent_MultiTurnConfig
      # Validation turn limits default to the corresponding training limits,
      # which are defined outside this file.
      val_max_user_turns: ${actor_rollout_ref.rollout.multi_turn.max_user_turns}
      val_max_assistant_turns: ${actor_rollout_ref.rollout.multi_turn.max_assistant_turns}
      # Follows multi_turn.format, also defined outside this file.
      format_real: ${actor_rollout_ref.rollout.multi_turn.format}
      function_name: code_interpreter







