data:
  tokenizer: null
  train_files: data/math/train_${reward_fn.extraction_type}.parquet
  val_files: data/math/test_${reward_fn.extraction_type}.parquet

  # Whether to use shared memory for data loading.
  use_shm: False

  prompt_key: prompt
  max_prompt_length: 8096
  max_response_length: 8096
  train_batch_size: 1024
  val_batch_size: 1312
  return_raw_input_ids: False  # This should be set to true when the tokenizer between policy and rm differs
  return_raw_chat: False
  shuffle: True
  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You cat set the filter_overlong_prompts_workers to use multiprocessing to speed up.
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  video_key: videos
  custom_cls:
      path: null
      name: null

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    pretrained_tokenizer: True
    use_shm: false
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
    use_liger: False
    use_fused_kernels: False
    trust_remote_code: True
  actor:
    strategy: fsdp2  # This is for backward-compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.28
    clip_ratio_c: 3.0 # lower bound of the value for Dual-clip PPO from https://arxiv.org/pdf/1912.09729
    entropy_coeff: 0.0
    use_kl_loss: False # True for GRPO
    kl_loss_coef: 0.0 # for grpo
    use_torch_compile: True
    kl_loss_type: low_var_kl # for grpo
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
    loss_agg_mode: "token-mean"
    entropy_from_logits_with_chunking: False
    entropy_checkpointing: False
    
    # policy loss config
    policy_loss:
    
      # Loss function mode: vanilla / clip-cov / kl-cov from https://arxiv.org/abs/2505.22617
      loss_mode: "vanilla"
      
      # Ratio of tokens to be clipped for clip-cov loss
      clip_cov_ratio: 0.0002

      # Lower bound for clip-cov loss
      clip_cov_lb: 1.0

      # Upper bound for clip-cov loss
      clip_cov_ub: 5.0

      # Ratio of tokens to be applied kl penalty for kl-cov loss
      kl_cov_ratio: 0.0002

      # KL divergence penalty coefficient
      ppo_kl_coef: 0.1
    checkpoint:

      # What to include in saved checkpoints
      # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
      save_contents: ['model', 'optimizer', 'extra']

      # For more flexibility, you can specify the contents to load from the checkpoint.
      load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
    optim:
      lr: 1e-6
      lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: 0.0   # only used with cosine lr scheduler, default to 0.0
      num_cycles: 0.5     # only used with cosine lr scheduler, default to 0.5
      warmup_style: constant  # select from constant/cosine
      total_training_steps: -1  # must be override by program
      weight_decay: 0.0
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      offload_policy: False # only for fsdp2, offload param\grad\optimizer during train
      reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
      fsdp_size: -1

      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

    # profiler configs
    profiler:

      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # Whether to profile all ranks.
      all_ranks: False

      # The ranks that will be profiled. null or [0,1,...]
      ranks: null
  ref:

    # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default
    strategy: ${actor_rollout_ref.actor.strategy}
    include_ref: False
    fsdp_config:
      param_offload: False
      reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]

      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size

    # calculate entropy with chunking to reduce memory peak
    entropy_from_logits_with_chunking: False

    # recompute entropy
    entropy_checkpointing: False

    # profiler configs
    profiler:

      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # Whether to profile all ranks.
      all_ranks: False

      # The ranks that will be profiled. null or [0,1,...]
      ranks: null
  rollout:
    name: vllm
    mode: sync # sync: LLM, async: AsyncLLM
    chat_scheduler: null
    max_model_len: null
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    use_fire_sampling: False
    prompt_length: ${data.max_prompt_length}  # not use for opensource
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor

    # for huge model, layered summon can save memory (prevent OOM) but make it slower
    layered_summon: False
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True # could get higher throughput
    # for hf rollout
    do_sample: True
    n: 1 # > 1 for grpo

    multi_stage_wake_up: false

    # Extra inference engine arguments (vllm, sglang).
    engine_kwargs:

      # for vllm
      vllm:

        # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
        swap_space: null

        # Whether to disable the preprocessor cache for multimodel models.
        disable_mm_preprocessor_cache: False

      # for sglang
      sglang:

        # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
        attention_backend: null

    val_kwargs:
      # sampling parameters for validation
      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
      top_p: 1.0
      temperature: 0
      n: 1
      do_sample: False # default eager for validation
    # number of responses (i.e. num sample times)
    multi_turn: 
      enable: False  # should set rollout.name to sglang_async if True
      max_turns: null  # null for no limit (default max_length // 3)
      tool_config_path: null  # null for no tool
      format: chatml  # chatml, more formats will be supported in the future

    # support logging rollout prob for debugging purpose
    calculate_log_probs: False

    # profiler configs
    profiler:

      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # Whether to profile all ranks.
      all_ranks: False

      # The ranks that will be profiled. null or [0,1,...]
      ranks: null

    # [Experimental] agent loop based rollout configs
    agent:

      # Number of agent loop workers
      num_workers: 8

critic:

  # Number of rollouts per update (mirrors actor rollout_n)
  rollout_n: ${actor_rollout_ref.rollout.n}

  # fsdp or fsdp2 strategy used for critic model training
  strategy: ${actor_rollout_ref.actor.strategy}
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
    min_lr_ratio: null   # only useful for warmup with cosine
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be override by program
    weight_decay: 0.01
  model:
    path: ~/models/deepseek-llm-7b-chat

    use_shm: False
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      grad_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0

      # Only for FSDP2: offload param/grad/optimizer during train
      offload_policy: False

      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
      reshard_after_forward: True

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1 # sp size
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the chat template is identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
  micro_batch_size_per_gpu: null # set a number
  max_length: null
  ulysses_sequence_parallel_size: 1 # sp size
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}


  # Cloud/local sandbox fusion configuration for custom reward logic
  sandbox_fusion:

    # Cloud/local function URL for sandbox execution
    url: null

    # Max concurrent requests allowed to sandbox
    max_concurrent: 64

    # Max memory limit for each sandbox process in MB
    memory_limit_mb: 1024

  # profiler configs
  profiler:

    # True for each task has its own database, False for all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. null or [0,1,...]
    ranks: null

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl  # how to estimate kl divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.0
    horizon: 10000
    target_kl: 0.0

  # Whether to enable preference feedback PPO
  use_pf_ppo: False

  # Preference feedback PPO settings
  pf_ppo:

    # Method for reweighting samples: "pow", "max_min", or "max_random"
    reweight_method: pow

    # Power used for weight scaling in "pow" method
    weight_pow: 2.0

ray_init:
  num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.

trainer:
  balance_batch: True
  debug: False
  debug_port: 5678
  wandb_run_id: null
  total_epochs: 30

  # The steps that will be profiled. null means no profiling. null or [1,2,5,...]
  profile_steps: null
  total_training_steps: null

  # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
  ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
  ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
  controller_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

  # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
  worker_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

    # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
    capture-range: "cudaProfilerApi"

    # Specify the desired behavior when a capture range ends.
    # In verl we need the orch.cuda.profiler.start/stop pair to repeats n times.
    # valid values are "repeat-shutdown:n" or null.
    # For normal whole step profiling, n = len(profile_steps);
    # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
    # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
    capture-range-end: null

    # Send signal to the target application's process group. We let the program to exit by itself.
    kill: none

  project_name: verl_examples
  experiment_name: gsm8k
  logger: [ 'console', 'wandb' ]
  # Number of generations to log during validation
  log_val_generations: 0

  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # Directory for logging validation data; no dump if null
  validation_data_dir: null

  # Number of nodes used in the training
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if 
  resume_from_path: False

  # ESI redundant time (in seconds) for model checkpointsAdd commentMore actions
  esi_redundant_time: 0
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/code_io/${trainer.project_name}/${trainer.experiment_name}
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  wandb_tags: null

  # Maximum number of actor checkpoints to keep
  max_actor_ckpt_to_keep: null

  # Maximum number of critic checkpoints to keep
  max_critic_ckpt_to_keep: null

  # Timeout (in seconds) for Ray worker to wait for registration
  ray_wait_register_center_timeout: 300

  # Device to run training on (e.g., "cuda", "cpu")
  device: cuda

reward_fn:
  extraction_type: answer_addition
  math_metric: deepscaler #[math_verify|deepscaler|union]
  splitter: "Assistant:"
  boxed_retry: False

azr:
  seed: 1
  executor_max_workers: 1
  executor_cleanup_frequency: 1
  problem_types:
    - code_i
    - code_o
    - code_f
  pred_data_mix_strategy: "max_new"  # [uniform_total, max_new, half_new, step]
  gen_data_probabilities_strategy: "uniform"  # [uniform, step]
  past_epoch_window: ${azr.data_selection_strategy.update_iteration}
  seed_dataset: null
  error_seed_dataset: null
  output_seed_path: null
  output_error_seed_path: null
  output_code_f_seed_path: null
  code_f_seed_dataset: null
  pretrain_pred_steps: -1
  executor: qwq # [qwq, sandboxfusion]
  ast_check: True
  execute_max_timeout: 10 # seconds
  random_print_max_programs: 3
  train_propose: True
  use_china_mirror: True # used for sandboxfusion executor for people in China
  data_selection_strategy:
    io_n: 6
    update_iteration: 1
    data_len: null # dummy set
    seed_batch_factor: 4
    content_max_length: 8096
    valid_program_filter: all # [all (all valids), non_one (all valids except 100% accuracy), non_extremes (all valids except 0% and 100% accuracy)]
    max_programs: null
    batched_estimate: False
    composite_function_n_min: -1
    composite_function_n_max: -1
    composite_chance: 0.5
    composite_start_step: -1
    max_programs_initial: ${azr.data_selection_strategy.composite_function_n_max}
    composite_chance_initial: ${azr.data_selection_strategy.composite_chance}
    composite_scheduler:
      enabled: False
      update_num_programs_start: 101
      update_num_programs_interval: 50
      num_programs_max: 3
      update_probability_start: 101
      update_probability_interval: 50
      update_probability_max: 0.8
      update_probability_increment: 0.01
    num_inputs: 10 # for code_f, how many inputs to generate
    banned_words:
      - logging
      - random
      - multiprocessing
      - pebble
      - subprocess
      - threading
      - datetime
      - time
      - hashlib
      - hmac
      - bcrypt
      - os.sys
      - os.path
      - sys.exit
      - os.environ
      - calendar
      - datetime
    banned_keywords_for_errors_and_exceptions:
      # - raise
      # - assert
      # - try
      # - except
  reward:
    n_samples: 8
    extract_code_block: True
    code_f_reward_type: binary # [accuracy, binary]
    generation_reward_config:
      format_reward: True
      reject_multiple_functions: True
      reject_test_input_in_code: False
      f_replace_location: not_first # [not_first, any_last, any_first, not_last]
      intrinsic_combine_method: sum # [sum, multiply, sum_multiply]
      remove_after_return: False # remove global variables
      remove_comments: False
      remove_print: False
      use_original_code_as_ref: False
      generation_accuracy_convertion: one_minus
      remove_input_from_snippet: False # prompting
      include_references: True # ablation for unconditional generation
      code_location: first # [first, last]
      complexity_reward:
        enabled: False
        coef: 0.0
        max: 0.5
      mean_edit_distance_reward:
        enabled: False
        coef: 0.0
        max: 0.5
      halstead_reward:
        enabled: False
        coef: 0.0
        max: 0.5
      answer_diversity_reward:
        enabled: False
        coef: 0.0
        max: 0.5
        hierarchical: False
      f_input_answer_diversity_reward:
        enabled: False
        coef: 0.0
        max: 0.5
        hierarchical: False
      f_output_answer_diversity_reward:
        enabled: False
        coef: 0.0
        max: 0.5
        hierarchical: False
