# Hydra-style defaults list: config groups composed into this file.
# Entry syntax: <folder_name>@<field_name>.<field_name>: <yaml_file_name>
# i.e. <yaml_file_name>.yaml from <folder_name>/ is placed at the given field path.
# NOTE(review): `_self_` is listed first, so the groups below are composed after this
# file's own keys - confirm that precedence is intended before reordering entries.
defaults:
  - _self_
  # Referenced further down as ${deepspeed_config.train} and ${deepspeed_config.eval}.
  - deepspeed_config@deepspeed_config.train: train
  - deepspeed_config@deepspeed_config.eval: eval
  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  - megatron_config@trainer.policy.megatron_config: policy
  - megatron_config@trainer.ref.megatron_config: ref
  - skyrl_gym_config: default

# Dataset files. Each value is a list of parquet paths; `${oc.env:HOME}` is
# interpolated from the HOME environment variable.
data:
  train_data: ["${oc.env:HOME}/data/gsm8k/train.parquet"]
  val_data: ["${oc.env:HOME}/data/gsm8k/validation.parquet"]

# Trainer settings: placement, training backend, per-model configs, algorithm and run options.
trainer:
  # Node / GPU counts for the policy, critic and ref models.
  placement:
    # `optimizer_config.offload_after_step` in the model sections only applies when this is true.
    colocate_all: true
    colocate_policy_ref: true
    policy_num_nodes: 1
    policy_num_gpus_per_node: 4
    critic_num_nodes: 1
    critic_num_gpus_per_node: 4
    ref_num_nodes: 1
    ref_num_gpus_per_node: 4
  sequence_parallel_backend: "ulysses"
  # NOTE(review): the deepspeed_config, fsdp_config and megatron_config sections in this
  # file suggest other backends are accepted here - confirm the valid values with the trainer.
  strategy: fsdp2
  # Policy model: weights path, LoRA settings, optimizer and FSDP settings.
  policy:
    model:
      path: "Qwen/Qwen2.5-1.5B-Instruct"
      lora:
        # NOTE(review): rank 0 presumably leaves LoRA disabled - confirm against the trainer.
        rank: 0
        alpha: 16
        dropout: 0
        lora_sync_path: "/tmp/skyrl_lora_sync"
        target_modules: "all-linear"
        exclude_modules: null
        # For FSDP, this corresponds to `init_lora_weights` in PEFT. See: https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraConfig.init_lora_weights
        # For Megatron, this is used for `lora_A_init_method`, and "xavier", "normal", "kaiming", and "zero" are supported.
        init_method: "kaiming"
    # Interpolated from the `deepspeed_config.train` entry composed by the defaults list.
    deepspeed_config: ${deepspeed_config.train}
    optimizer_config:
      lr: 1.0e-6
      adam_betas: [0.9, 0.999]
      # Keep the mantissa dot (as in `lr`): YAML 1.1 loaders such as PyYAML read a bare `1e-2` as a string, not a float.
      weight_decay: 1.0e-2
      max_grad_norm: 1.0 # gradient clipping
      offload_after_step: true # offload optimizer state to cpu after each full training step. Applicable only when `colocate_all=true`
      num_warmup_steps: 0 # number of mini-batch steps to warmup the optimizer for
      scheduler: "constant_with_warmup"
    fsdp_config:
      cpu_offload: false # offload params + optimizer state to cpu during fwd pass
      reshard_after_forward: true # fsdp2 only, [True, False, int between 1 and fsdp_size]
      fsdp_size: -1
    sequence_parallel_size: 1
    # uses torch compile with logits calculation
    use_torch_compile: false
    # saves memory snapshots to os.path.join(ckpt_path, "memory_snapshots") - can visualize by dragging pickle files to https://docs.pytorch.org/memory_viz
    record_memory: false
    # pass through kwargs to the HuggingFace model config for FSDP/Deepspeed training backends (i.e. for overriding vocab size, etc) - for megatron, use policy.megatron_config.transformer_config_kwargs instead
    model_config_kwargs: {}
  # Ref model: the weights path follows the policy model path via interpolation.
  ref:
    model:
      path: ${trainer.policy.model.path}
    sequence_parallel_size: 1
    # Interpolated from the `deepspeed_config.eval` entry composed by the defaults list.
    deepspeed_config: ${deepspeed_config.eval}
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: -1
    # pass through kwargs to the HuggingFace model config for FSDP/Deepspeed training backends (i.e. for overriding vocab size, etc) - for megatron, use ref.megatron_config.transformer_config_kwargs instead
    model_config_kwargs: {}
  # Critic model: weights path, LoRA settings, optimizer and FSDP settings.
  critic:
    model:
      # NOTE(review): no critic path is set by default - confirm how a null path is handled.
      path: null
      lora:
        rank: 0
        alpha: 16
        dropout: 0
        target_modules: "all-linear"
        exclude_modules: null
        init_method: "kaiming"
    # Interpolated from the `deepspeed_config.train` entry composed by the defaults list.
    deepspeed_config: ${deepspeed_config.train}
    optimizer_config:
      lr: 5.0e-6
      adam_betas: [0.9, 0.999]
      # Keep the mantissa dot (as in `lr`): YAML 1.1 loaders such as PyYAML read a bare `1e-2` as a string, not a float.
      weight_decay: 1.0e-2
      max_grad_norm: 1.0 # gradient clipping
      offload_after_step: true # offload optimizer state to cpu after each full training step. Applicable only when `colocate_all=true`
      num_warmup_steps: 0 # number of mini-batch steps to warmup the optimizer for
      scheduler: "constant_with_warmup"
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: -1
    sequence_parallel_size: 1
    # pass through kwargs to the HuggingFace model config (i.e. for overriding vocab size, etc)
    model_config_kwargs: {}
  # Algorithm settings: advantage estimation, KL handling, policy/value loss options.
  algorithm:
    advantage_estimator: "grpo"  # "grpo", "gae", "rloo", "reinforce++", or customizable with AdvantageEstimatorRegistry
    kl_ctrl: # only used if use_kl_in_reward is true (not applied in the case of use_kl_loss=true) - uses kl_loss_coef as the initial KL coefficient
      type: "fixed" # "fixed" or "adaptive"
      kl_target: 0.1 # target KL divergence for adaptive KL controller
      horizon: 10000 # controls the update rate of the adaptive KL controller
    kl_estimator_type: "k3" # "k1", "k2", "k3", "abs" - see http://joschu.net/blog/kl-approx.html for details
    use_kl_estimator_k3: false # to be deprecated, use kl_estimator_type="k3" instead
    use_abs_kl: false # to be deprecated, use kl_estimator_type="abs" instead
    # note: use_kl_in_reward and use_kl_loss should be mutually exclusive
    use_kl_in_reward: false # apply kl loss to rewards
    use_kl_loss: true # used in policy model
    kl_loss_coef: 0.001
    # entropy loss
    use_entropy_loss: false
    entropy_loss_coef: 0.01
    # this adds training batch level normalization to advantages
    advantage_batch_normalize: false
    value_head_prefix: "value_head"
    policy_loss_type: "regular" # "regular", "dual_clip", "gspo", "clip_cov", "kl_cov", "sapo", "cispo", or customizable with PolicyLossRegistry
    loss_reduction: "token_mean" # "token_mean", "sequence_mean", "seq_mean_token_sum_norm"
    grpo_norm_by_std: true # set to false to disable normalization by std in GRPO
    zero_variance_filter: false # set to true to loss mask out prompts with zero variance rewards. only applicable when rewards are response-level.
    # GAE parameters
    lambd: 1.0
    gamma: 1.0
    # PPO parameters
    eps_clip_low: 0.2
    eps_clip_high: 0.2
    # dual clip parameters
    clip_ratio_c: 3.0
    # Truncated Importance Sampling as proposed in https://fengyao.notion.site/off-policy-rl
    tis_imp_ratio_cap: -1.0
    use_tis: false
    # SAPO parameters (only used when policy_loss_type: "sapo") (https://arxiv.org/pdf/2511.20347)
    # The tau_pos / tau_neg defaults are the values used in the paper with Qwen3-30B-A3B-Base.
    sapo:
      tau_pos: 1.0
      tau_neg: 1.05 # default values used in the paper with Qwen3-30B-A3B-Base
    
    # value loss parameters
    value_clip: 0.2
    # dynamic sampling parameters (disabled while `type` is null)
    dynamic_sampling:
      type: null # "filter", "replace", or null
      max_sample_batches: 30 # sample at most this many batches before stopping, -1 to sample forever
      min_replace_ratio: 0.3 # minimum proportion of good samples with which to replace bad samples (for replace strategy only)
    
    # clip-cov parameters (only used when policy_loss_type: "clip_cov")
    clip_cov:
      clip_ratio: 0.0002 # fraction of tokens to clip based on covariance
      clip_cov_lb: 1.0 # lower bound for covariance clipping
      clip_cov_ub: 5.0 # upper bound for covariance clipping
    
    # kl-cov parameters (only used when policy_loss_type: "kl_cov")
    kl_cov:
      kl_cov_frac: 0.2 # fraction of tokens to apply KL regularization to (0.2 = 20%)
      # NOTE(review): presumably the coefficient of the KL term applied to the selected tokens - confirm.
      ppo_kl_coef: 1.0
    
    # cispo parameters (only used when policy_loss_type: "cispo")
    # These bound the importance sampling ratio and are separate from the PPO eps_clip_low / eps_clip_high settings.
    cispo: 
      cispo_eps_clip_low: 0  # offset for lower bound of importance sampling ratio clipping (as opposed to PPO token update clipping)
      cispo_eps_clip_high: 5 # offset for upper bound of importance sampling ratio clipping (as opposed to PPO token update clipping)
  
  # Fully async specific knobs. For more see https://skyrl.readthedocs.io/en/latest/tutorials/fully_async.html#step-2-config-knobs-to-tune-for-fully-async-training
  fully_async:
    # The maximum number of off-policy steps allowed. If a group of trajectories is scheduled at step i,
    # and it is used to train at step j, it is guaranteed that j - i <= max_staleness_steps.
    # The larger the max_staleness_steps, the more off-policy the training is, and the more throughput we get.
    max_staleness_steps: 4

    # The number of generation workers to spawn, where each worker works on a group of trajectories,
    # being the same prompt repeated `generator.n_samples_per_prompt` times.
    # It should be >= policy_mini_batch_size to avoid wasted throughput, and <= policy_mini_batch_size * (max_staleness_steps + 1)
    # since anything above that would be wasted due to capacity control.
    # The larger the number, the more throughput, and likely more staleness (and hence off-policy-ness).
    # Default value is: policy_mini_batch_size * (max_staleness_steps / 2 + 1) = 256 * (4 / 2 + 1) = 768
    num_parallel_generation_workers: 768

  # Run-level options: checkpointing, precision, batch sizes, evaluation cadence and logging.
  gradient_checkpointing: true
  gradient_checkpointing_use_reentrant: false
  seed: 42
  resume_mode: latest # null/"none", "latest", "from_path"
  resume_path: null
  ckpt_path: "${oc.env:HOME}/ckpts/" # Path for resumable training checkpoints (model state, optimizer state, etc.)
  max_ckpts_to_keep: -1 # -1 to keep all checkpoints, N to keep the last N checkpoints
  ckpt_interval: 10  # Save full training checkpoint every `ckpt_interval` steps.
  hf_save_interval: -1  # Save HF format model(s) every `hf_save_interval` steps.
  export_path: "${oc.env:HOME}/exports/" # Path for exported artifacts (HF models, debug dumps, etc.)
  bf16: true
  epochs: 1  # Number of passes over the full dataset
  update_epochs_per_batch: 1  # Number of gradient update passes over each train batch
  train_batch_size: 1024  # See `utils/utils.py::validate_batch_sizes` for train, mini, and micro batch size constraints.
  policy_mini_batch_size: 256
  critic_mini_batch_size: 256
  micro_train_batch_size_per_gpu: 1
  micro_forward_batch_size_per_gpu: 1
  update_ref_every_epoch: false
  use_sample_packing: true
  eval_batch_size: 1024
  eval_before_train: true
  eval_interval: 5 # Set to -1 to disable evaluation.
  # max prompt length in training dataset (also the default for generator.max_input_length)
  max_prompt_length: 512
  flash_attn: true
  disable_fast_tokenizer: false
  target_modules: null # NOTE: these are now deprecated, use trainer.policy.model.lora.target_modules or trainer.critic.model.lora.target_modules instead
  exclude_modules: null # NOTE: these are now deprecated, use trainer.policy.model.lora.exclude_modules or trainer.critic.model.lora.exclude_modules instead
  project_name: "skyrl"
  run_name: "test_run"
  logger: "wandb"
  dump_data_batch: false
  dump_eval_results: true

  # RoPE overrides (e.g. for YaRN). generator.rope_scaling / generator.rope_theta default to these values.
  rope_scaling: null
  rope_theta: null
  # Example YaRN setting:
  # rope_scaling:
  #   rope_type: yarn
  #   factor: 1.0
  #   original_max_position_embeddings: 32768


# Generator settings: inference engine setup, sampling and multi-turn options.
generator:
  # Follows the policy model path via interpolation.
  model_name: ${trainer.policy.model.path}
  model_dtype: "bfloat16" # should match dtype for inference engine
  run_engines_locally: true
  num_inference_engines: 1
  backend: "vllm"
  weight_sync_backend: "nccl"
  # if using cuda_ipc, we send in batches of this size in GB
  weight_transfer_threshold_cuda_ipc_GB: 1.0
  inference_engine_tensor_parallel_size: 4
  inference_engine_pipeline_parallel_size: 1
  inference_engine_expert_parallel_size: 1
  inference_engine_data_parallel_size: 1
  n_samples_per_prompt: 5
  async_engine: true
  batched: false
  max_input_length: ${trainer.max_prompt_length} # max generator input length used for multi-turn conversations - for single turn set equal to max_prompt_length
  # VLLM_ENABLE_V1_MULTIPROCESSING=0 for reproducibility
  vllm_v1_disable_multiproc: true
  enable_prefix_caching: true
  enable_chunked_prefill: true
  max_num_batched_tokens: 8192
  # Disable CUDA graphs by default for stability. Set to false for higher performance, but this may affect convergence for long-running and/or long context training jobs.
  enforce_eager: true
  fully_sharded_loras: false
  gpu_memory_utilization: 0.8
  max_num_seqs: 1024
  # NOTE(review): presumably only consulted when run_engines_locally is false - confirm.
  remote_inference_engine_urls: ["127.0.0.1:8001"]
  enable_http_endpoint: false
  http_endpoint_host: "127.0.0.1"
  http_endpoint_port: 8000
  max_turns: 1

  # chat template configuration: `source` selects how `name_or_path` is interpreted
  chat_template:
    source: "name"  # "name" or "file"
    name_or_path: null  # e.g., "qwen3_with_thinking" or "/path/to/template.j2"
  
  # Chat templating kwargs to pass to `tokenizer.apply_chat_template`
  chat_template_kwargs: {}

  # Inference engine arguments. Arguments are passed directly to the vLLM or SGLang engine, so names must match
  # the engine's args. To specify an engine arg as a CLI override, use the format: +generator.engine_init_kwargs.arg_name=value
  engine_init_kwargs: {}
  
  override_existing_update_group: "auto" # "auto", "enable", "disable"
  # sampling params for generation phase
  # (eval_sampling_params.max_generate_length defaults to the value set here)
  sampling_params:
    max_generate_length: 1024
    repetition_penalty: 1.0
    temperature: 1.0
    top_p: 1.0
    min_p: 0.0
    top_k: -1
    logprobs: null
    stop: null

  # whether to use a conversation-based format for multi-turn generations
  # if false, multi-turn model responses and env observations are appended to the original assistant response
  # if true, each multi-turn model response and env observation is stored in a separate assistant / user message, respectively
  use_conversation_multi_turn: true

  # Used when use_conversation_multi_turn is true, and sampling_params.stop is not null.
  # If true, append tokenizer.eos_token_id to the end of the generation if the generation ends
  # with stop_reason "stop" and matched a stop string in sampling_params.stop.
  append_eos_token_after_stop_str_in_multi_turn: true

  # sampling params for evaluation
  eval_sampling_params:
    # Follows the training-time generation length via interpolation.
    max_generate_length: ${generator.sampling_params.max_generate_length}
    repetition_penalty: 1.0
    temperature: 0.0
    top_p: 1.0
    min_p: 0.0
    top_k: -1
    logprobs: null
    stop: null

  # number of samples per prompt for evaluation
  eval_n_samples_per_prompt: 1

  # NOTE (sumanthrh): This flag sets the reward to 0 if the `stop_reason` is not `stop`.
  # This is useful in cases where the LLM generation was truncated or aborted.
  # Cases where this is useful: Often, we have format rewards for the LLM to follow,
  # but in cases where the LLM didn't finish the response, we typically don't want to reward it.
  # This is a general setting for all environments.
  # TODO (erictang000): Show clear ablations for benefits of this on GSM8K or SQL.
  zero_reward_on_non_stop: false

  # Whether to apply DAPO Overlong Filtering to the loss masks.
  # For each trajectory that exceeds the max length (i.e., truncated and does not end with an
  # EOS token), this masks out every token in the loss mask.
  apply_overlong_filtering: false

  # rope parameters; these default to the trainer values but can optionally differ, which is useful in some cases such as with thinking models.
  rope_scaling: ${trainer.rope_scaling}
  rope_theta: ${trainer.rope_theta}

  step_wise_trajectories: false

# Environment selection.
environment:
  env_class: "gsm8k"
  # NOTE: environment specific defaults for environment.skyrl_gym are set at the following path
  # (see the `skyrl_gym_config: default` entry in the defaults list):
  # skyrl_gym: config/skyrl_gym_config/default.yaml
