# Model arguments
# Local checkpoint path (relative) and revision, then the dtype and
# attention implementation names handed to the model loader.
model_name_or_path: '../model/Qwen2.5-7B-Instruct'
model_revision: 'main'
attn_implementation: 'flash_attention_2'
torch_dtype: 'bfloat16'

# Data training arguments
# Relative path to the JSONL file used as the training dataset.
dataset_name: '../data/data_train/grpo/grpo_25000.jsonl'

# GRPO trainer config
# NOTE(review): the keys below look like TRL `GRPOConfig` / Transformers
# `TrainingArguments` fields -- confirm against the script that loads this file.
bf16: true
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
# Evaluation and checkpointing (`save_steps` further down) share the same
# 25-step cadence.
do_eval: true
eval_strategy: steps
eval_steps: 25
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
# YAML null, not the Python literal `None`: a plain `None` scalar is loaded as
# the string "None" and would be taken as an actual repository id.
hub_model_id: null
hub_strategy: every_save
learning_rate: 3.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine
# NOTE(review): 0 is an unusual prompt-length limit; how the trainer treats it
# (no truncation vs. empty prompt) depends on the installed trainer version --
# verify this is intentional.
max_prompt_length: 0
max_completion_length: 256
# -1 alongside `num_train_epochs: 1`: presumably the epoch count decides the
# run length -- TODO confirm.
max_steps: -1
# NOTE(review): 7 presumably matches the number of training processes so the
# global batch divides evenly into generation groups -- confirm against the
# launch configuration.
num_generations: 7
num_train_epochs: 1
output_dir: ../checkpoints/Qwen2.5-7B-Instruct-GRPO_25000
overwrite_output_dir: true
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
push_to_hub: false
report_to:
- wandb
# `reward_funcs` and `reward_weights` are parallel lists: one weight per
# function, in the same order.
reward_funcs:
- format_reward
- accuracy_reward
- relevance_reward
- bonus_reward
reward_weights:
- 1.0
- 1.0
- 1.0
- 1.0
save_strategy: steps
save_steps: 25
seed: 42
warmup_ratio: 0.1
