# Hydra defaults list: selects the model, data and trainer config groups and
# places each one in the global package (`@_global_`).
# `_self_` comes last, so the keys set in this file are merged after the
# selected group configs and take precedence over them.
defaults:
  - override /model_cfg@_global_: qwen7bi
  - override /data_cfg@_global_: r1_countdown
  - override /trainer_cfg@_global_: grpo
  - _self_

# Model identifier in `org/name` form.
model_name_or_path: Qwen/Qwen2.5-7B-Instruct


# Training length.
max_steps: 200

# Batch shape: 256 overall, 1 per device.
# NOTE(review): how the gap between the two is bridged (device count and/or
# gradient accumulation) is not set in this file — presumably derived by the
# trainer config; confirm.
train_batch_size: 256
per_device_train_batch_size: 1
# num_generations now divides the batch_size (256 / 64 = 4)
num_generations: 64

# Checkpointing and Hub upload.
# Keep 'no' quoted: a bare `no` is read as boolean false by YAML 1.1 loaders.
# NOTE(review): with save_strategy set to 'no', save_steps presumably has no
# effect and is kept only for when saving is re-enabled — confirm.
save_strategy: 'no'
save_steps: 25
push_to_hub: false
# No tags are set; written as an explicit null rather than an empty value.
tags: null

# Temperature (presumably used when sampling completions — confirm).
temperature: 1.0

# Lower lr and beta from the DeepSeekMath paper.
# learning_rate below is 3e-6, written in plain decimal form.
learning_rate: 0.000003
beta: 0.001

# Length limits for prompts and completions, and vLLM-backed generation.
max_prompt_length: 4096
max_completion_length: 1024
use_vllm: true
vllm_gpu_memory_utilization: 0.9

# vLLM runs in server mode; no guided-decoding regex is set.
# NOTE(review): in TRL's GRPOConfig, vllm_gpu_memory_utilization and
# vllm_tensor_parallel_size are documented as applying to colocate mode only;
# with vllm_mode: server they may be unused here — confirm against the
# trainer version in use.
vllm_mode: server
vllm_guided_decoding_regex: null
# NOTE(review): timeout is presumably in seconds — confirm.
vllm_server_timeout: 240.0
vllm_tensor_parallel_size: 1

# Logging: step-based strategy with an interval of 1.
logging_strategy: steps
logging_steps: 1

# Metrics are reported to Weights & Biases; wandb_project is presumably the
# W&B project name — confirm against the code that reads it.
report_to: "wandb"
wandb_project: r1_countdown

