# defaults specify the default config from each component
defaults:

  # base critic config, inherited from trainer/config/critic/critic.yaml
  - critic

  # `_self_` comes last: the defaults above are loaded first, then the fields
  # in the current yaml are applied on top of them
  - _self_

# training backend strategy; this file is the megatron variant of the critic config
strategy: megatron

# NCCL timeout in seconds. 600 (10 minutes) is the torch default; set a larger
# value for long-running operations, e.g. 32B or 72B models using megatron.
nccl_timeout: 600

# optimizer configs
optim:

  # select optimizer, default is Adam
  optimizer: adam

  # Learning rate. Written with an explicit mantissa dot so that YAML 1.1
  # loaders (e.g. plain PyYAML) also resolve it as a float, not a string.
  lr: 1.0e-6

  # Clip gradients norm
  clip_grad: 1.0

  # initial learning rate for warmup, default to 0.0
  lr_warmup_init: 0.0

  # Number of warmup steps; takes priority over lr_warmup_steps_ratio.
  # null, 0 or a negative value delegates to lr_warmup_steps_ratio.
  lr_warmup_steps: null

  # number of steps over which the learning rate decays
  # NOTE(review): null presumably lets the trainer derive it — confirm
  lr_decay_steps: null

  # select from constant/linear/cosine/inverse_square_root
  lr_decay_style: linear

  # minimum learning rate, default to 0.0
  min_lr: 0.0

  # select from constant/linear/cosine
  weight_decay_incr_style: constant

  # decay style for the WSD (warmup-stable-decay) learning-rate schedule,
  # select from constant/exponential/cosine
  lr_wsd_decay_style: exponential

  # number of steps for the decay phase of the WSD (warmup-stable-decay)
  # learning-rate schedule
  lr_wsd_decay_steps: null

  # use checkpoint optimizer parameter scheduler
  use_checkpoint_opt_param_scheduler: false

# model config for the critic
model:

  # overrides applied on top of the default (empty) mapping
  override_config:

    # model config overrides; empty by default
    model_config: {}

    # MoE-specific overrides
    moe_config:

      # whether to freeze the MoE router
      freeze_moe_router: false

  # Enable gradient checkpointing to save memory
  enable_gradient_checkpointing: false

  # Activation checkpointing settings; all unset (null) by default
  gradient_checkpointing_kwargs:
    activations_checkpoint_method: null
    activations_checkpoint_granularity: null
    activations_checkpoint_num_layers: null
# megatron-specific parallelism settings
megatron:

  # Whether to offload model parameters to CPU
  param_offload: false

  # Whether to offload gradients to CPU
  grad_offload: false

  # Whether to offload optimizer state to CPU
  optimizer_offload: false

  # size of tensor model parallel group
  tensor_model_parallel_size: 1

  # size of expert model parallel group
  expert_model_parallel_size: 1

  # size of expert tensor parallel group
  expert_tensor_parallel_size: null

  # size of pipeline model parallel group
  pipeline_model_parallel_size: 1

  # size of virtual pipeline model parallel group
  virtual_pipeline_model_parallel_size: null

  # size of context parallel group
  context_parallel_size: 1

  # Whether to use sequence parallelism
  # NOTE(review): sequence parallelism is typically tied to tensor parallelism;
  # confirm this has any effect while tensor_model_parallel_size is 1
  sequence_parallel: true

  # Whether to use distributed optimizer
  use_distributed_optimizer: true

  # Whether to use distributed checkpointing
  use_dist_checkpointing: false

  # Path for distributed checkpointing
  dist_checkpointing_path: null

  # Random seed for Megatron; mirrors the actor's megatron seed
  seed: ${actor_rollout_ref.actor.megatron.seed}

  # Distributed Data Parallel (DDP) config overrides; mirrors the actor's setting
  override_ddp_config: ${actor_rollout_ref.actor.megatron.override_ddp_config}

  # Transformer config overrides for Megatron; mirrors the actor's setting
  override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}

  # Whether to use mbridge; mirrors the actor's setting
  # NOTE(review): the previous comment said "mBridge communications" — confirm
  # what mbridge is actually used for
  use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}

# Whether to load initial weights
load_weight: true

# seed for data loader; mirrors the actor's data_loader_seed via interpolation
data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed}

# KL control settings
kl_ctrl:

  # KL controller type
  # NOTE(review): "fixed" presumably keeps kl_coef constant during training — confirm
  type: fixed

  # KL coefficient
  kl_coef: 0.001

# checkpoint settings
checkpoint:

  # Whether to save checkpoints asynchronously
  async_save: false