# Model arguments
# Dtype requested for the model weights, and the model revision to use.
torch_dtype: bfloat16
model_revision: main
# Optional override, currently disabled; uncomment to set it explicitly.
# attn_implementation: flash_attention_2

# Data training arguments
# Folded block scalar: the lines below are joined with single spaces and the
# trailing newline is stripped, so the value is one single-line string.
# Backslashes are literal here, so `\boxed{}` is written without escaping.
system_prompt: >-
  You are a helpful AI Assistant that provides well-reasoned and detailed
  responses for the math word problems. You first think about the reasoning
  process as an internal monologue and then provide the user with the concise
  and accurate answer. The final answer should be provided in the \boxed{}.

# GRPO trainer config

# Precision and gradient checkpointing
bf16: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# vLLM settings
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7

# Generation: completions per prompt and length limits
num_generations: 8
max_prompt_length: 512
max_completion_length: 2048

# Batching
per_device_train_batch_size: 8
gradient_accumulation_steps: 1

# Run length, schedule and reproducibility
# NOTE(review): both max_steps and num_train_epochs are set; in Hugging Face
# TrainingArguments a positive max_steps takes precedence — confirm that the
# 300-step cap is the intended bound for this run.
max_steps: 300
num_train_epochs: 1
lr_scheduler_type: cosine
warmup_ratio: 0.1
seed: 42

# Evaluation
# NOTE(review): evaluation is switched off, so the eval batch size below is
# presumably unused — confirm before relying on it.
do_eval: false
per_device_eval_batch_size: 16

# Rewards: three functions and three weights, listed in matching order.
# NOTE(review): `hidden_score_medical` sits next to a system prompt written
# for math word problems — confirm this reward is intended for this run.
reward_funcs:
  - accuracy
  - tag_count_gsm8k
  - hidden_score_medical
reward_weights:
  - 1.0
  - 1.0
  - 0.2

# Logging and reporting
log_level: info
log_completions: true
logging_strategy: steps
logging_steps: 100
logging_first_step: true
report_to:
  - wandb

# Checkpointing and output
save_strategy: steps
save_steps: 100
save_total_limit: 10
save_only_model: true
overwrite_output_dir: true
push_to_hub: false
