# Extend Hydra's config search path with the verl.trainer.config Python
# package so configs shipped in that package can be composed into this file.
hydra:
  searchpath:
    - pkg://verl.trainer.config

# Composition order: ppo_trainer is loaded first and this file (_self_) last,
# so values set in this file take precedence over the same keys coming from
# ppo_trainer.
# ppo_trainer is presumably resolved through hydra.searchpath — confirm.
defaults:
  - ppo_trainer
  - _self_

# Rollout overrides layered on top of the composed base config.
actor_rollout_ref:
  rollout:
    mode: async
    agent:
      # NOTE(review): 0 presumably means the base trainer's own agent workers
      # are not used — confirm against the consumer of this key.
      num_workers: 0
    # Sampling options applied to validation rollouts.
    val_kwargs:
      do_sample: true

data:
  # Derived from data.train_batch_size and rllm.rejection_sample.multiplier
  # through the `mul` resolver (presumably their product — confirm).
  # `mul` is not a built-in OmegaConf resolver, so it has to be registered
  # before this config is resolved.
  # data.train_batch_size is not set in this file; presumably it is inherited
  # from ppo_trainer — confirm.
  gen_batch_size: ${mul:${data.train_batch_size},${rllm.rejection_sample.multiplier}}

# rLLM-specific settings.
rllm:
  # Which agent to run and its per-trajectory limits.
  agent:
    name: math_agent
    max_steps: 20
    # null: no value configured (presumably no per-trajectory timeout — confirm).
    trajectory_timeout: null
    overlong_filter: false
    # Empty mappings declared up front; presumably so overrides can add to
    # them — confirm.
    agent_args: {}
    engine_args: {}
  # Environment selection plus the arguments passed to it.
  env:
    name: custom
    env_args:
      # ---- Remote solver defaults; can be overridden via CLI or env ----
      solver_remote:
        # Taken from $SGLANG_BASE_URL when set, otherwise the local default.
        base_url: ${oc.env:SGLANG_BASE_URL,"http://localhost:12345/v1"}
        # Taken from $VLLM_API_KEY when set, otherwise an empty string.
        # NOTE(review): base_url reads an SGLANG_* variable while api_key reads
        # a VLLM_* one — confirm the differing prefixes are intentional.
        api_key: ${oc.env:VLLM_API_KEY,""}
        model: agentica-org/DeepCoder-1.5B-Preview
        temperature: 0.0
        max_tokens: 16384
        top_p: 0.95
        top_k: -1
        max_retries: 3
        base_delay_s: 1.0
        extra_headers: {}

      # ---- Reward config. 'gen' is declared up front so its keys already
      # ---- exist and can be overridden without inserting new keys into a
      # ---- struct-mode config. ----
      reward_kwargs:
        # The next three values mirror solver_remote via interpolation, so
        # changing solver_remote changes them too.
        remote_url: ${rllm.env.env_args.solver_remote.base_url}
        remote_api_key: ${rllm.env.env_args.solver_remote.api_key}
        solver_model_path: ${rllm.env.env_args.solver_remote.model}
        timeout_s: 600.0
        max_retries: 3
        base_delay: 2.0
        use_solver_cot: false
        use_marginal_improvement: true
        fractional_shaping: false
        use_together_code_interpreter: false
        # Generation parameters; the numeric values follow solver_remote.
        gen:
          temperature: ${rllm.env.env_args.solver_remote.temperature}
          # max_tokens and max_new_tokens both resolve to
          # solver_remote.max_tokens (presumably to cover backends that use
          # different parameter names — confirm).
          max_tokens: ${rllm.env.env_args.solver_remote.max_tokens}
          max_new_tokens: ${rllm.env.env_args.solver_remote.max_tokens}
          top_p: ${rllm.env.env_args.solver_remote.top_p}
          top_k: ${rllm.env.env_args.solver_remote.top_k}
          # NOTE(review): do_sample is true while the default temperature
          # resolves to 0.0 — confirm the backend accepts this combination.
          do_sample: true
          n: 1
  # Workflow-based execution settings; only used when use_workflow is true
  # (presumably — confirm against the trainer entry point).
  workflow:
    use_workflow: false
    name: single_turn_workflow
    workflow_args:
      agent_cls: null
      agent_args: {}
      env_cls: null
      env_args: {}
      # Written with an explicit mantissa dot and exponent sign so that it is
      # parsed as a float by every YAML loader. A bare `1e6` is a float under
      # OmegaConf's loader but a *string* under stock PyYAML (YAML 1.1).
      # NOTE(review): unit is not stated here — presumably seconds; confirm.
      timeout: 1.0e+6
      gamma: 0.0  # no discounting
      reward_bonus_coeff: 0.0  # no reward shaping
      freeze_cm: false
      use_role_advnorm: false
      cm_roles: ["context_manager"]
      # ---- Bug workflows / static solver knobs (declare to allow Hydra overrides) ----
      dataset_name: deepcoder
      generator_system_prompt: null
      solver_system_prompt: null
      solver_model: null
      solver_base_url: null
      solver_temperature: 0.0
      solver_top_p: 1.0
      solver_max_prompt_length: null
      solver_max_response_length: null
    n_parallel_tasks: 256
    retry_limit: 3
  # Standalone rLLM feature switches; all are off in this config.
  disable_thinking: false
  accumulate_reasoning: false
  mask_truncated_samples: false
  # Step-wise advantage settings; off in this config.
  stepwise_advantage:
    enable: false
    # Allowed values: broadcast, per_step.
    mode: broadcast
    normalize_by_steps: false
  # Compact filtering: one mask_* switch per condition.
  # The mask_* switches presumably only take effect when enable is true —
  # confirm against the consumer.
  compact_filtering:
    enable: false
    mask_max_prompt_length_exceeded: true
    mask_max_response_length_exceeded: true
    mask_env_done: false
    mask_max_turns_exceeded: true
    mask_timeout: true
    mask_unknown: false
    mask_error: true
  # Rejection sampling; off in this config.
  rejection_sample:
    enable: false
    # Also consumed by data.gen_batch_size through interpolation.
    multiplier: 1

# Fireworks settings.
fireworks:
  # null: no deployment id is configured in this file.
  deployment_id: null
  model_id_prefix: test-model
  concurrency: 32

# Rollout IS reweighting settings (IS presumably = importance sampling).
# Only the rollout_is* keys are set here; other algorithm.* settings are
# presumably inherited from ppo_trainer — confirm.
algorithm:
  # Turn the reweighting on/off.
  rollout_is: false
  # Granularity: sequence | token.
  rollout_is_level: sequence
  # truncate | clip | none (the consuming helper expects "truncate").
  rollout_is_mode: truncate
  # Upper cap on the IS weight.
  rollout_is_threshold: 2.0
  # Lower bound; null => defaults to 1/upper.
  rollout_is_threshold_lower: null
  # Drop a sample if its weight is below this value; null disables the veto.
  rollout_is_veto_threshold: null
