# Adds an extra entry to Hydra's config search path so that config names
# referenced from this file can be looked up under `verl/trainer/config`.
# NOTE(review): the `file://` path is relative — presumably resolved against
# the working directory the job is launched from; confirm.
hydra:
  searchpath:
    - file://verl/trainer/config

# Defaults list: compose `ppo_trainer` first, then this file (`_self_`).
# Because `_self_` comes last, keys set in this file take precedence over the
# same keys coming from `ppo_trainer`.
# NOTE(review): `ppo_trainer` is presumably found via the searchpath entry
# declared in this file — confirm.
defaults:
  - ppo_trainer
  - _self_

# Reward-model settings declared by this file. Booleans are written in the
# canonical lowercase form (`true` / `false`) throughout.
reward_model:
  # Dotted class path attached to this node.
  # NOTE(review): presumably the config class this node is built as — confirm.
  _target_: verl.workers.config.RewardModelConfig

  # Name of the reward manager selected by this config.
  reward_manager: dapo
  # The reward model is switched off in this config.
  enable: false

  # Whether to deploy the model to a separate resource pool.
  enable_resource_pool: false
  # NOTE(review): presumably the size of that separate pool; both are 0 here
  # while `enable_resource_pool` is false — confirm how they are consumed.
  n_gpus_per_node: 0
  nnodes: 0

  model:
    type: discriminative
    # YAML does not expand `~`; this value is the literal string shown.
    # NOTE(review): home-directory expansion, if any, happens in the consumer.
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    # Interpolation: takes its value from `actor_rollout_ref.model.external_lib`.
    # NOTE(review): that node is not defined in this file — presumably it comes
    # from the `ppo_trainer` defaults; confirm.
    external_lib: ${actor_rollout_ref.model.external_lib}
    trust_remote_code: false

  # Rollout settings nested under the reward model.
  # NOTE(review): the meaning of each key is defined by the class named in
  # `_target_`; only values whose YAML form is non-obvious are annotated here.
  rollout:
    _target_: verl.workers.config.RolloutConfig
    # `???` is OmegaConf's marker for a mandatory value: no default is given
    # here, so it has to be supplied by an override.
    name: ???
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    enforce_eager: true
    # Explicit null: no value is set in this file.
    cudagraph_capture_sizes: null
    free_cache_engine: true
    data_parallel_size: 1
    expert_parallel_size: 1
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    # Explicit null: no value is set in this file.
    max_model_len: null
    max_num_seqs: 1024
    load_format: auto
    # Explicit empty mapping (not null).
    engine_kwargs: {}
    # Explicit null: no value is set in this file.
    limit_images: null
    enable_chunked_prefill: true
    enable_prefix_caching: true
    disable_log_stats: true
    skip_tokenizer_init: true

    # NOTE(review): presumably measured in tokens — confirm.
    prompt_length: 512
    response_length: 512