
# Hydra defaults list. Every entry except `_self_` is written as
# `<config group>@<target package>: <option>`: the selected option of the
# group is mounted at the given package path of the composed config.
# The order of the entries is significant and is kept as-is.
defaults:
  - actor@actor_rollout_ref.actor: dp_actor
  - npu_profile@trainer.npu_profile: npu_profile
  - data@data: legacy_data
  - ref@actor_rollout_ref.ref: dp_ref
  - rollout@actor_rollout_ref.rollout: rollout
  - critic@critic: dp_critic
  - reward_model@reward_model: dp_reward_model
  # `_self_` is listed last, so the values written in this file are merged
  # after the entries above and win on conflicting keys.
  - _self_

# Settings for the actor / rollout / reference group. The `actor`, `ref` and
# `rollout` sub-trees are additionally filled in by the defaults list of this
# file; the keys below are merged on top of them.
actor_rollout_ref:
  hybrid_engine: true
  # NOTE(review): unit is presumably seconds - confirm against the consumer.
  nccl_timeout: 600

  # Model loading and wrapping options.
  model:
    # Quoted so the leading `~` is unmistakably part of a string path.
    path: '~/models/deepseek-llm-7b-chat'
    custom_chat_template: null
    use_shm: false
    external_lib: null
    # Empty mapping, i.e. no overrides are set here.
    override_config: {}
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: false

    # LoRA-related knobs.
    # NOTE(review): rank 0 presumably means "LoRA disabled" - confirm.
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    exclude_modules: null

    use_liger: false
    use_fused_kernels: false
    fused_kernel_options:
      impl_backend: torch

    trust_remote_code: false

  # Keys layered on top of the `rollout` option selected in the defaults list.
  rollout:
    enable_chunked_prefill: true
    load_format: dummy_dtensor
    layered_summon: false

  # Profiler settings; `_target_` names the class this node corresponds to.
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: false
    all_ranks: false
    # Empty list: no individual ranks are named here.
    ranks: []

# Optional user-supplied reward function.
custom_reward_function:
  # `null` means no file is configured here.
  path: null
  # NOTE(review): presumably the name of the function looked up in `path` -
  # confirm against the loader.
  name: compute_score

# Algorithm-level hyper-parameters; `_target_` names the class this node
# corresponds to.
algorithm:
  _target_: verl.trainer.config.AlgoConfig

  # NOTE(review): presumably the discount factor and the GAE lambda - confirm.
  gamma: 1.0
  lam: 1.0

  # Advantage estimator selector.
  adv_estimator: gae
  norm_adv_by_std_in_grpo: true

  # KL handling.
  use_kl_in_reward: false
  kl_penalty: kl
  kl_ctrl:
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    # NOTE(review): `horizon` and `target_kl` are presumably only consulted by
    # a non-fixed controller type - confirm.
    horizon: 10000
    target_kl: 0.1

  # Options nested under `pf_ppo` accompany the `use_pf_ppo` switch.
  use_pf_ppo: false
  pf_ppo:
    reweight_method: pow
    weight_pow: 2.0

# Trainer-level settings: schedule, profiling, logging, cluster shape and
# checkpointing.
trainer:
  balance_batch: true

  # Schedule.
  total_epochs: 30
  total_training_steps: null

  # Profiling.
  profile_steps: null
  profile_continuous_steps: false

  # The hyphenated keys in the two mappings below look like Nsight Systems
  # (`nsys`) option names - NOTE(review): presumably forwarded to the
  # profiler verbatim; confirm. All values are strings on purpose.
  controller_nsight_options:
    trace: 'cuda,nvtx,cublas,ucx'
    # Quoted: this is the string 'true', not a YAML boolean.
    cuda-memory-usage: 'true'
    cuda-graph-trace: 'graph'

  worker_nsight_options:
    trace: 'cuda,nvtx,cublas,ucx'
    # Quoted: this is the string 'true', not a YAML boolean.
    cuda-memory-usage: 'true'
    cuda-graph-trace: 'graph'
    capture-range: 'cudaProfilerApi'
    capture-range-end: null
    # Quoted: this is the string 'none', not a YAML null.
    kill: 'none'

  # Experiment identity and logging.
  project_name: verl_examples
  experiment_name: gsm8k
  logger:
    - console
    - wandb
  log_val_generations: 0
  rollout_data_dir: null
  validation_data_dir: null

  # Cluster shape.
  nnodes: 2
  n_gpus_per_node: 8

  # Checkpoint cadence.
  # NOTE(review): -1 presumably disables periodic saving - confirm.
  save_freq: -1
  esi_redundant_time: 0

  # Resume behaviour.
  resume_mode: auto
  resume_from_path: null

  # Validation.
  val_before_train: true
  val_only: false
  # NOTE(review): -1 presumably disables periodic validation - confirm.
  test_freq: -1

  critic_warmup: 0

  # Checkpoint locations and retention.
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  # Interpolates `project_name` and `experiment_name` from this mapping.
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null

  # NOTE(review): unit is presumably seconds - confirm.
  ray_wait_register_center_timeout: 300

  device: cuda
  use_legacy_worker_impl: auto

# Ray initialisation options; both are left unset (`null`) in this file.
ray_init:
  # NOTE(review): `null` presumably lets Ray pick the CPU count - confirm.
  num_cpus: null
  # NOTE(review): presumably an output path for a Ray timeline dump - confirm.
  timeline_json_file: null