# Environment variables for the training launcher.
#
# The number of worker processes must not exceed the number of GPUs made
# visible to the job. This file previously exposed two devices ("0,1") while
# requesting four processes, which leaves two ranks without a device.
# The process count is kept at 4 so the effective batch size implied by the
# trainer settings does not change; the device list is widened to match.
# NOTE(review): this assumes the host has at least four GPUs. If it has only
# two, set the device list to "0,1" and NPROC_PER_NODE to 2, then re-check
# that the resulting global batch is still divisible by num_generations.
env:
  # Quoted so the comma-separated list is always read as a string.
  CUDA_VISIBLE_DEVICES: "0,1,2,3"
  NPROC_PER_NODE: 4
# Trainer arguments for the GRPO run, grouped by concern.
# Key order carries no meaning; the grouping is for readability only.
rlhf_config:
  # --- Algorithm and model ---
  rlhf_type: grpo
  train_type: full
  model: qwen_1_7b_claim_verif_1604
  model_type: qwen3
  torch_dtype: bfloat16

  # --- Data and sequence limits ---
  dataset: train_45000_grpo.jsonl
  max_length: 2500
  max_completion_length: 300
  dataloader_num_workers: 4

  # --- Batching and optimisation ---
  num_train_epochs: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 2
  learning_rate: 1.0e-06
  warmup_ratio: 0.05

  # --- Evaluation, checkpointing and logging ---
  # save_steps (2000) is a whole multiple of eval_steps (1000).
  eval_steps: 1000
  save_steps: 2000
  save_total_limit: 2
  output_dir: output
  logging_steps: 5
  report_to: wandb
  log_completions: true

  # --- Reward functions ---
  # Presumably the names below are resolved against what the plugin file
  # defines/registers — confirm against the plugin before renaming any.
  external_plugins: examples/train/grpo/plugin/plugin.py
  reward_funcs:
    - correctness_reward
    - format_reward
    # NOTE(review): "repitation" looks like a misspelling of "repetition".
    # Left untouched because it must match the name used in the plugin.
    - repitation_reward

  # --- Rollout generation / sampling ---
  num_generations: 4
  use_vllm: false
  temperature: 1.0
  top_p: 1.0
  top_k: 80
