# specify the default per-component configs
defaults:
  # Hydra defaults list. Entry syntax:
  #   <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # Example: the entry below places trainer/config/actor/megatron_actor.yaml
  # at actor_rollout_ref.actor.
  - actor@actor_rollout_ref.actor: megatron_actor
  # Data config -> data (trainer/config/data/legacy_data.yaml)
  - data@data: legacy_data
  # (Rule-based) Reward manager config.
  # NOTE(review): unlike the other entries, no <yaml_file_name> is given here — confirm this is intentional.
  - reward_manager@reward_manager
  # Reference model config.
  # The reference defaults are loaded first; fields set in this file are applied on top.
  # The reference model is enabled when actor.use_kl_loss and/or algorithm.use_kl_in_reward is True.
  - ref@actor_rollout_ref.ref: megatron_ref
  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout
  # Model config.
  - model@actor_rollout_ref.model: hf_model
  # Critic model config.
  - critic@critic: megatron_critic
  # Reward model config.
  - reward_model@reward_model: megatron_reward_loop
  # Rollout correction config.
  - algorithm@algorithm.rollout_correction: rollout_correction
  # _self_ is listed last, so the values defined in this file are merged after
  # (and therefore override) the defaults loaded above.
  - _self_

actor_rollout_ref:
  # Settings for the actor / rollout / reference-model group. Per-component defaults
  # are pulled in by the `defaults` list; the fields below are applied on top of them.
  hybrid_engine: true

  # NCCL timeout in seconds. The torch default is 10 minutes; set a larger value for
  # long-running operations such as 32B or 72B models using megatron.
  nccl_timeout: 600

  model:
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: false

    # Whether to use custom fused kernels (PostProcessing, for memory efficiency)
    use_fused_kernels: false

    trust_remote_code: false

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

    # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning
    lora:
      # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
      type: lora

      # LoRA rank (dimension of the low-rank projection space). Set to 0 to disable LoRA.
      # Typical values: 8, 16, 32, 64
      rank: 0

      # Weighting factor for the low-rank projection. Defaults to 32
      alpha: 32

      # Dropout rate for the low-rank projection. Defaults to 0.0
      dropout: 0.0

      # A list of module names to apply LoRA to.
      # For fused LoRA, defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
      # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
      # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
      # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
      # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
      # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
      # Target modules can also contain wildcards. For example, you can specify
      # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
      target_modules:
        - linear_qkv
        - linear_proj
        - linear_fc1
        - linear_fc2

      # A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
      # does not match any string in exclude_modules. If used, target_modules must be an empty list or None
      exclude_modules: []

      # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
      dropout_position: pre

      # Initialization method for the low-rank matrix A. Defaults to "xavier".
      lora_A_init_method: xavier

      # Initialization method for the low-rank matrix B. Defaults to "zero".
      lora_B_init_method: zero

      # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
      a2a_experimental: false

      # Parameter data type for LoRA weights. Defaults to null, which will use the model's dtype.
      dtype: null

      # Path to pre-trained LoRA adapter weights (null to train from scratch)
      adapter_path: null

      # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
      # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
      # finetune the vision model.
      freeze_vision_model: true
      freeze_vision_projection: true
      freeze_language_model: true

  rollout:
    quantization: null

    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up

custom_reward_function:
  # NOTE(review): presumably the location of a user-supplied reward function;
  # null appears to mean "none configured" — confirm against the consumer.
  path: null
  # NOTE(review): presumably the name of the function looked up at `path` — confirm.
  name: compute_score

algorithm:
  # `_target_` is required when using verl.utils.omega_conf_to_dataclass to
  # instantiate dataclass configs.
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: true
  pass_k: 1
  grpo_verk_step_weight_clip_min: 0.25
  use_kl_in_reward: false
  # How to estimate the KL divergence
  kl_penalty: kl
  kl_ctrl:
    # `_target_` is required when using verl.utils.omega_conf_to_dataclass to
    # instantiate dataclass configs.
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: false
  pf_ppo:
    # One of: "pow", "max_min", "max_random"
    reweight_method: pow
    weight_pow: 2.0

trainer:
  balance_batch: true
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger:
    - console
    - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0

  # Resume behaviour.
  #   auto: find the last ckpt to resume from; if none can be found, start from scratch
  #   other modes: disable, or resume_path (when resume_from_path is set)
  resume_mode: auto
  resume_from_path: null
  del_local_ckpt_after_load: false
  val_before_train: true
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # Timeout for the ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # Directory for logging rollout data; nothing is dumped if null
  rollout_data_dir: null

  # Whether to use the legacy worker implementation.
  # Mode: "auto", "enable", or "disable"
  use_legacy_worker_impl: auto

global_profiler:
  _target_: verl.utils.profiler.ProfilerConfig
  # Profiling tool; choose between nsys, npu, torch, torch_memory
  tool: null
  # Steps to profile
  steps: null
  profile_continuous_steps: false
  # Where the profiler saves its output
  save_path: "outputs/profile"
  # Tool-specific configs. Per the key nesting in this file they can be set with
  # +global_profiler.global_tool_config.[tool].xxx
  global_tool_config:
    # nsys config
    nsys:
      # True: each task has its own database.
      # False: all tasks in one training step share one database.
      discrete: false

      # Controller Nvidia Nsight Systems options. Must be set when profile_steps is not None
      # (presumably the `steps` field above — TODO confirm).
      ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
      ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
      controller_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

      # Worker Nvidia Nsight Systems options. Must be set when profile_steps is not None
      # (presumably the `steps` field above — TODO confirm).
      worker_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

        # Profile only within a torch.cuda.profiler.start / stop range. Do not change this config.
        capture-range: "cudaProfilerApi"

        # Desired behavior when a capture range ends.
        # In verl the torch.cuda.profiler.start/stop pair needs to repeat n times.
        # Valid values are "repeat-shutdown:n" or null.
        # For normal whole-step profiling, n = len(profile_steps);
        # for discrete profiling, n = len(profile_steps) * Number(subtasks).
        # If left null, the program will use n = len(profile_steps) * 6.
        capture-range-end: null

        # Signal to send to the target application's process group.
        # The program is left to exit by itself, hence the string "none" (not null).
        kill: "none"

    # Enable memory visualization for debugging memory usage
    torch_memory:
      # Maximum number of allocation entries to record
      # NOTE(review): 100_000 is read as an integer only under YAML 1.1 rules (e.g. PyYAML);
      # confirm the loader in use before changing the spelling.
      trace_alloc_max_entries: 100_000
      # The depth of the call stack to capture for each allocation
      stack_depth: 32
      # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
      context: "all"
      # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
      stacks: "all"
      # Extra keyword arguments: devices, record_context, etc.
      kw_args: {}

# configs for TransferQueue
transfer_queue:
  # Whether the transfer queue is enabled
  enable: false

ray_kwargs:
  ray_init:
    # `None` (null) means use all CPUs, which might cause a hang when CPUs are limited,
    # e.g. on systems like SLURM. In that case set this to a number that is allowed.
    num_cpus: null
  timeline_json_file: null
