# Base model selection, quantization, and LoRA adapter settings.
model:
  model_name: 'Qwen/Qwen2.5-3B-Instruct'
  gpu_memory_utilization: 0.5
  # 1024 equals rl.max_prompt_length + rl.max_completion_length (512 + 512).
  max_seq_length: 1024
  lora_rank: 64
  load_in_4bit: true
  fast_inference: true
  # NOTE(review): presumably the projection layers that receive LoRA
  # adapters (attention q/k/v/o plus MLP gate/up/down) - confirm in loader.
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'up_proj'
    - 'down_proj'
  # Fixed seed value; kept as an integer.
  random_state: 42
# Experiment-tracking settings (Weights & Biases).
wandb:
  enable: true
  project_name: 'PODS-Babel'
  # NOTE(review): the "16-16" here appears to mirror
  # rl.num_generations_reward / rl.num_generations_grad (both 16);
  # keep the name in sync if those values change.
  run_name: 'Instruct-GSM8K-16-16-Run1'
# RL run settings: algorithm, dataset, generation sizes, and checkpointing.
rl:
  algorithm: 'MaxVarGRPO'
  dataset: 'gsm8k'
  # Prompt and completion budgets sum to model.max_seq_length
  # (512 + 512 = 1024).
  max_prompt_length: 512
  max_completion_length: 512
  # NOTE(review): presumably completions sampled for reward scoring vs.
  # completions kept for the gradient update; equal here - confirm.
  num_generations_reward: 16
  num_generations_grad: 16
  advantage_decay: 1.0
  max_steps: 8000
  # NOTE(review): with save_strategy 'time', save_interval is presumably
  # the active setting and save_steps unused - confirm against the trainer.
  save_strategy: 'time'
  save_steps: 100
  # NOTE(review): units look like seconds (1800 = 30 min, 21600 = 6 h) -
  # confirm.
  save_interval: 1800
  max_time: 21600
  # NOTE(review): presumably the KL-penalty coefficient; 0.0 would turn
  # that term off - confirm.
  beta: 0.0
# Optimizer, learning-rate schedule, sampling, and batching settings.
optim:
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `5e-6` as the string "5e-6", not a float.
  # `5.0e-6` is a float under both YAML 1.1 and 1.2 and has the same value.
  learning_rate: 5.0e-6
  lr_scheduler_type: 'constant'
  use_vllm: true
  optim: 'adamw_8bit'
  adam_beta1: 0.9
  adam_beta2: 0.99
  weight_decay: 0.1
  # NOTE(review): if this feeds a Hugging Face-style 'constant' schedule,
  # warmup_ratio has no effect ('constant_with_warmup' is the variant that
  # warms up) - confirm the intent.
  warmup_ratio: 0.01
  # Sampling temperature used for generation (presumably; confirm).
  temperature: 0.9
  max_grad_norm: 0.1
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 1
  logging_steps: 1