# Hydra defaults list: the config-group options composed into this config.
# `_self_` is listed last, so values defined in this file take precedence
# over the ones pulled in from the groups above it.
defaults:
  - finetune: base
  - rewards: pure_success
  - streams: files
  - _self_

# Trainer settings, layered on top of the `finetune: base` option selected in
# the defaults list.
finetune:
  use_flash_attention: true
  attn_implementation: flash_attention_2
  # `${..key}` is a relative interpolation that resolves one level above
  # `finetune`, where this file defines `model_path` and `output_dir`.
  config_name: ${..model_path}
  output_dir: ${..output_dir}/finetune
  seq_length: 12000
  seq_packing: true
  rl:
    algo: grpo
    kl_coef: 0.0
    entropy_bonus: 0.0
    reward_minus_kl_coef: 0.0
    epsilon: 1000
    use_advantages: true
    relu_log_p_weights: false
    clamp_log_ratio_ref_new_value: 5
    # Tied to the sampling temperature defined under `llm.parameters`.
    temperature: ${...llm.parameters.temperature}
    aggregate_loss: sum
  train_batch_size: 1
  gradient_accumulation_passes: 4096
  gradient_checkpointing: true
  # See https://medium.com/pytorch/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d
  # for the motivation to set this to false by default.
  reentrant_checkpointing: false
  # Written with an explicit mantissa dot so that YAML 1.1 loaders (e.g. plain
  # PyYAML) also read it as a float instead of the string "1e-6".
  learning_rate: 1.0e-6
  save_checkpoint_steps: 100
  log_each_n_steps: 1
  # Same name as `preprocess.output`.
  input: training_data
  send_weight_updates: true
  queue_size: 256
  # Both values below are taken from the top-level keys of the same name.
  max_lag: ${..max_lag}
  weight_update_interval: 1
  pop_old_data: ${..pop_old_data}
# Actor settings. `preprocess.input` names `actor` as its source.
# NOTE(review): the scope of these limits (per worker, per LLM, global) is
# decided by the consuming code — confirm there.
actor:
  llm_max_rollouts: 128
  rollout_workers: 1
# Host and port used for the verifier.
verifier:
  host: localhost
  port: 7777
# Preprocessing settings: `input` is `actor`, and `output` is the
# `training_data` name that `finetune.input` also uses.
preprocess:
  input: actor
  output: training_data
  n_workers: 8
  queue_size: 32
  chunk_size: 16
  submit_delay: 0.0
  # Taken from the top-level `pop_old_data`.
  pop_old_data: ${..pop_old_data}
# Generation parameters for `llm`. `finetune.rl.temperature` interpolates
# `llm.parameters.temperature`, so editing it here changes both.
# NOTE(review): the `# changed` markers do not record the previous values.
llm:
  parameters:
    # changed
    max_tokens: 8192
    # changed
    temperature: 1.0
# Generation parameters for `test_llm`. Compared with `llm`, this sets a
# larger `max_tokens` and additionally specifies `top_p` and `top_k`.
test_llm:
  parameters:
    max_tokens: 16000
    temperature: 1.0
    top_p: 0.95
    top_k: 50

# NOTE(review): the hyphenated keys look like vLLM command-line options and
# the empty-string values like value-less switches — confirm against the code
# that consumes `vllm_kwargs`.
vllm_config:
  vllm_kwargs:
    dtype: bfloat16
    # Read $HOME through OmegaConf's `oc.env` resolver. `${env.HOME}` would be
    # a node interpolation on an `env` config key, which this file does not
    # define.
    download-dir: ${oc.env:HOME}/base_models/
    gpu-memory-utilization: 0.9
    num-scheduler-steps: 1
    disable-log-requests: ""
    disable-frontend-multiprocessing: ""
    max-num-seqs: 512
    max-num-batched-tokens: 1024
    enable-chunked-prefill: ""
    return-tokens-as-token-ids: ""
    tensor-parallel-size: 1
    pipeline-parallel-size: 1

# Process layout.
# NOTE(review): how the `*_fraction` values are interpreted (ratio vs.
# absolute device count) is decided by the consuming code — confirm there.
world:
  actors: 1
  preprocessors: 1

  actor_fraction: 4
  preprocessor_fraction: 0
  finetune_fraction: 4

  actor_group_port: 9000

# changed
# Single-quoted so the backslash, braces and comma stay literal text.
system_prompt: 'Please reason step by step, and put your final answer within \boxed{}.'
# The template consists solely of the `{task}` placeholder.
task_template: '{task}'

# NOTE(review): presumably the evaluation interval, counted in model
# versions — confirm what a "version" is in the consuming code.
eval_every_n_versions: 78000

# changed
# Also used as `finetune.config_name` through interpolation.
model_path: Qwen/Qwen2.5-7B

# will use default based on the chosen backend
accelerate_config: null
# DeepSpeed is the enabled backend in this config; FSDP is switched off.
use_deepspeed: true
deepspeed_config: deepspeed_stage3_bf16
use_fsdp: false
# NOTE(review): presumably only consulted when `use_fsdp` is true — confirm.
fsdp:
  param_dtype: fp32
  reduce_dtype: fp32
  buffer_dtype: fp32

# Required: `???` is OmegaConf's mandatory-value marker, so `output_dir` has
# to be supplied by an override. `finetune.output_dir` is derived from it.
output_dir: ???
force_restart: false
# Interpolated by `finetune.pop_old_data` and `preprocess.pop_old_data`.
pop_old_data: true
# Interpolated by `finetune.max_lag`.
max_lag: null
attempts: 8
discount_factor: 1
train_dataset_names:
  - open_reasoner_zero_57k
  - open_reasoner_zero_extended_72k
train_subset: null
test_dataset_names:
  - aime_2024
  - amc_2023
  - math_500

# Debugging switches.
# NOTE(review): presumably an empty `mode` and a null `streams_from` mean
# normal (non-debug) operation — confirm in the consuming code.
debug:
  mode: ""
  streams_from: null
  place_inference_workers: true
