# Settings declared once for all experiments in this file.
# NOTE(review): presumably the launcher overlays each experiment block on top
# of this mapping — confirm against the code that loads this file.
common:
  # Default loss; gsm8k-exp3 sets its own loss_type.
  loss_type: grpo
  # Placeholders: every experiment block in this file supplies these six keys.
  # NOTE(review): the `_eve` suffix presumably abbreviates "every" — confirm
  # against the consuming trainer before renaming.
  beta: null
  num_iterations: null
  vllm_update_model_eve: null
  update_ref_model_eve: null
  output_dir: null
  run_name: null
  model_name_or_path: Qwen/Qwen2.5-0.5B-Instruct
  # Written with an explicit ".0": YAML 1.1 resolvers (e.g. PyYAML) only
  # recognise a float when the mantissa contains a decimal point, so a bare
  # `5e-6` is loaded as the string "5e-6" instead of a number.
  learning_rate: 5.0e-6
  do_eval: true
  # Optimizer and learning-rate schedule.
  adam_beta1: 0.9
  adam_beta2: 0.99
  weight_decay: 0.1
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  # Logging and evaluation cadence.
  logging_steps: 1
  eval_steps: 10
  eval_strategy: steps
  bf16: true
  # Batch sizing and generation lengths.
  per_device_train_batch_size: 16
  per_device_eval_batch_size: 16
  gradient_accumulation_steps: 4
  num_generations: 16
  max_prompt_length: 256
  max_completion_length: 200
  num_train_epochs: 1
  save_steps: 400
  max_grad_norm: 0.1
  log_on_each_node: false
  # vLLM generation settings. The device string stays quoted because it
  # contains a colon.
  use_vllm: true
  vllm_gpu_memory_utilization: 0.3
  vllm_device: "cuda:0"
  report_to: tensorboard
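# Experiments: one top-level mapping per run. Each supplies the keys that
# `common` leaves as null (beta, num_iterations, vllm_update_model_eve,
# update_ref_model_eve, output_dir, run_name).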
# Experiment 1: does not set loss_type (`common` declares grpo);
# num_iterations = 1 and vllm_update_model_eve = 1.
# The "v1-i1" suffix of output_dir/run_name matches those two values.
# NOTE(review): presumably the suffix scheme is v<vllm_update_model_eve>-
# i<num_iterations> — confirm.
gsm8k-exp1:
  # NOTE(review): presumably toggles whether the launcher executes this
  # experiment — confirm against the consuming script.
  run: true
  beta: 0.1
  num_iterations: 1
  vllm_update_model_eve: 1
  update_ref_model_eve: -1
  output_dir: outputs/g2.5-0.5B-Instruct-grpo-v1-i1
  run_name: g2.5-0.5B-Instruct-grpo-v1-i1
# Experiment 2: does not set loss_type (`common` declares grpo);
# num_iterations = 10 and vllm_update_model_eve = 10.
# NOTE(review): output_dir/run_name end in "v10-i1" while num_iterations is 10.
# If the suffix is meant to encode v<vllm_update_model_eve>-i<num_iterations>,
# either the name or num_iterations is out of date — confirm which is intended.
gsm8k-exp2:
  # NOTE(review): presumably toggles whether the launcher executes this
  # experiment — confirm against the consuming script.
  run: true
  beta: 0.1
  num_iterations: 10
  vllm_update_model_eve: 10
  update_ref_model_eve: -1
  output_dir: outputs/g2.5-0.5B-Instruct-grpo-v10-i1
  run_name: g2.5-0.5B-Instruct-grpo-v10-i1
# Experiment 3: the only experiment in this file that sets loss_type itself
# (grpo_masked); num_iterations = 1 and vllm_update_model_eve = 10.
# NOTE(review): output_dir/run_name end in "v1-i1-gm" while
# vllm_update_model_eve is 10. If the suffix is meant to encode
# v<vllm_update_model_eve>-i<num_iterations>, either the name or the value is
# out of date — confirm which is intended.
gsm8k-exp3:
  # NOTE(review): presumably toggles whether the launcher executes this
  # experiment — confirm against the consuming script.
  run: true
  loss_type: grpo_masked
  beta: 0.1
  num_iterations: 1
  vllm_update_model_eve: 10
  update_ref_model_eve: -1
  output_dir: outputs/g2.5-0.5B-Instruct-grpo-v1-i1-gm
  run_name: g2.5-0.5B-Instruct-grpo-v1-i1-gm
