# Run identity. Both values are reused throughout this file via
# ${experiment_name} / ${trial_name} interpolation (rollout, actor, ref,
# saver, recover, evaluator, stats_logger).
experiment_name: asearcher-7b-zero-web
trial_name: run1

# Cluster layout and filesystem locations.
cluster:
  # Reused by saver, recover, evaluator and stats_logger through
  # ${cluster.fileroot}.
  fileroot: /tmp/areal/experiments
  # 1 node x 8 GPUs per node = 8 GPUs in total.
  n_nodes: 1
  n_gpus_per_node: 8
  name_resolve:
    type: nfs
    # NOTE(review): /tmp is normally node-local. Presumably fine while
    # n_nodes is 1, but this path (and fileroot) likely need to live on
    # shared storage for a multi-node run -- confirm.
    nfs_record_root: /tmp/areal/name_resolve
# Global run settings. `seed` is also forwarded to sglang.random_seed.
seed: 1
total_train_epochs: 10
# NOTE(review): null presumably means "no step cap; run for
# total_train_epochs" -- confirm against the trainer.
total_train_steps: null
# Resolves to actor.path (Qwen/Qwen2.5-7B in this file).
tokenizer_path: ${actor.path}
# NOTE(review): presumably `sglang.d4p1t1` assigns 4 GPUs to SGLang
# inference and `d1c4` assigns 4 GPUs to training, adding up to the 8 GPUs
# declared under `cluster` -- confirm against the allocation-mode syntax.
allocation_mode: sglang.d4p1t1+d1c4
async_training: true

# Settings for the `rollout` component.
rollout:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  max_concurrent_rollouts: 32
  # NOTE(review): null presumably lets the consumer pick a default queue
  # size -- confirm.
  queue_size: null
  # Resolves to train_dataset.batch_size (128 in this file).
  consumer_batch_size: ${train_dataset.batch_size}
  # NOTE(review): presumably the maximum number of policy versions a rollout
  # may lag behind the trainer (relevant because async_training is true) --
  # confirm.
  max_head_offpolicyness: 4
  enable_rollout_tracing: true

# Generation (sampling) hyperparameters.
gconfig:
  # Also feeds actor.group_size via ${gconfig.n_samples}.
  n_samples: 16
  min_new_tokens: 0
  max_new_tokens: 1024
  greedy: false
  # Also feeds actor.temperature via ${gconfig.temperature}.
  temperature: 1.0

# Policy model being trained: checkpoint, optimizer and PPO-style loss
# settings.
actor:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  # Also the source for tokenizer_path, ref.path and sglang.model_path.
  path: Qwen/Qwen2.5-7B
  init_from_scratch: false
  disable_dropout: true
  gradient_checkpointing: true
  # Reused by ref.dtype and sglang.dtype.
  dtype: bfloat16
  mb_spec:
    # NOTE(review): lower than ref.mb_spec.max_tokens_per_mb and
    # sglang.context_length (both 32768). Confirm that a single trajectory
    # longer than 16000 tokens is handled by the trainer.
    max_tokens_per_mb: 16000
  pad_to_maximum: true
  optimizer:
    type: adam
    # Exponent floats are written with an explicit decimal point: YAML 1.1
    # loaders (e.g. PyYAML) resolve a bare `5e-6` as a string, not a float.
    lr: 5.0e-6
    weight_decay: 0.01
    beta1: 0.9
    beta2: 0.999
    eps: 1.0e-8
    lr_scheduler_type: constant
    gradient_clipping: 1.0
    warmup_steps_proportion: 0.001
  backend: fsdp

  # Resolves to gconfig.n_samples (16 in this file).
  group_size: ${gconfig.n_samples}
  group_reward_norm: false
  eps_clip: 0.4
  # Resolves to gconfig.temperature (1.0 in this file).
  temperature: ${gconfig.temperature}
  # NOTE(review): presumably the raw reward is shifted by reward_bias and
  # scaled by reward_scaling before use; the exact order is not visible
  # here -- confirm.
  reward_scaling: 10.0
  reward_bias: -0.5
  kl_ctl: 0.0
  ppo_n_minibatches: 1
  recompute_logprob: true
  use_decoupled_loss: true
  behav_imp_weight_cap: 5.0

# Reference model. Checkpoint path and dtype are interpolated from `actor`,
# and no optimizer is configured for it.
# NOTE(review): actor.kl_ctl is 0.0 in this file, so the reference model may
# not contribute to the loss -- confirm whether it is still needed.
ref:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  path: ${actor.path}
  init_from_scratch: false
  disable_dropout: true
  dtype: ${actor.dtype}
  mb_spec:
    # Same value as sglang.context_length.
    max_tokens_per_mb: 32768
  optimizer: null
  backend: fsdp

# SGLang inference server settings. Model path, seed and dtype are
# interpolated from actor.path, seed and actor.dtype respectively.
sglang:
  model_path: ${actor.path}
  random_seed: ${seed}
  skip_tokenizer_init: false
  dtype: ${actor.dtype}
  max_running_requests: null
  context_length: 32768
  mem_fraction_static: 0.6
  # NOTE(review): `fa3` presumably selects a FlashAttention-3 backend, which
  # may not be available on every GPU generation -- confirm for the target
  # hardware.
  attention_backend: fa3

# Datasets
train_dataset:
  # Also feeds rollout.consumer_batch_size via ${train_dataset.batch_size}.
  batch_size: 128
  shuffle: true
  pin_memory: true
  # Placeholder value: replace with the real training-data location before
  # launching.
  path: path_to_training_data

# Utilities
# Checkpoint saver. All three frequency triggers are set.
# NOTE(review): presumably a save fires whenever any one trigger elapses;
# freq_steps: 10 would then write a 7B checkpoint every 10 steps -- confirm
# this is intended.
saver:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: 10
  freq_secs: 3600

# Recovery settings. The step-based trigger is null, while the epoch- and
# time-based triggers are set.
recover:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: null
  freq_secs: 3600

# Evaluator settings. All three frequency triggers are null.
# NOTE(review): presumably this disables periodic evaluation -- confirm.
evaluator:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: null
  freq_steps: null
  freq_secs: null

# Statistics logging. Weights & Biases is configured with mode `disabled`.
stats_logger:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  wandb:
    mode: disabled

# Launcher
# Per-GPU CPU and memory requests for the inference servers and trainers.
launcher:
  inference_server_cpus_per_gpu: 15
  # NOTE(review): unit is not stated here; presumably MB (153600 MB =
  # 150 GiB) -- confirm.
  inference_server_mem_per_gpu: 153600
  trainer_cpus_per_gpu: 15
  trainer_mem_per_gpu: 153600
  # Comma-separated KEY=VALUE pairs. Every value below is a placeholder that
  # must be replaced before launching. Do not commit a real WANDB_API_KEY to
  # version control; stats_logger.wandb.mode is `disabled` in this file.
  trainer_env_vars: PYTHONPATH=path_to_asearcher:path_to_areal,WANDB_API_KEY=your_wandb_api_key,RAG_SERVER_ADDR_DIR=directory_of_rag_server_addrs


# Agent / workflow settings. These are top-level keys, not nested under a
# section.
max_turns: 32
n_trajs: 16
search_client_type: async-online-search-access
reward_type: F1
# NOTE(review): presumably the number of search results kept per query --
# confirm.
topk: 5
valid_inst_ratio: 0.3
# NOTE(review): presumably toggles logging of the statistics named in
# log_agent_stats_keys -- confirm.
log_agent_stats: true
# Names of the per-trajectory agent statistics selected for logging
# (see log_agent_stats).
log_agent_stats_keys:
  - num_input_tokens
  - num_output_tokens
  - num_llm_gens
  - num_search_queries
  - num_success_search_queries
  - num_failed_search_queries
  - num_pages
  - num_success_url_accesses
  - num_failed_url_accesses
  - score
  - judge_q_invalid
  - format_reward