# General settings
# Top-level values naming the model, the dataset file, the sequence-length
# cap, the evaluation split and the random seed.
# NOTE(review): the code that reads this file is not visible here; the
# per-key notes below are assumptions to confirm against that code.

# Placeholder value: replace with the real model path before running.
model_name: "YOUR_MODEL_PATH"
# Presumably counted in tokens -- TODO confirm.
max_seq_length: 8192
# Relative path (leading "./"); presumably resolved against the working
# directory of the process that loads this file -- TODO confirm.
dataset_path: "./data/FinetuneData/SynTrain_sample_all.json"
# Presumably the fraction of the dataset held out for evaluation
# (0.1 = 10%) -- TODO confirm.
eval_split_ratio: 0.1
# Presumably seeds the split and any other randomness -- TODO confirm.
random_seed: 42

# Training arguments
# NOTE(review): the consumer of this mapping is not visible in this file;
# the key names resemble Hugging Face TrainingArguments fields, but that is
# an assumption -- confirm against the training script.
training_args:
  # Boolean flag; presumably enables the training phase.
  do_train: true
  # Relative path; presumably where intermediate checkpoints are written.
  output_dir: "./ppo_checkpoints"
  # Relative path, distinct from output_dir; presumably where the best
  # model is saved -- TODO confirm how "best" is selected.
  best_model_dir: "./llm_weights/collmlight_rl"
  # Name of the learning-rate schedule; valid names depend on the consumer.
  lr_scheduler_type: "cosine"
  # Presumably the number of steps between log entries -- TODO confirm.
  logging_steps: 10
  num_train_epochs: 100

# PPO configuration
# NOTE(review): the key names match fields of TRL's PPOConfig, but the
# consumer is not visible in this file -- confirm before relying on that.
ppo_config:
  # As written, batch_size == mini_batch_size * gradient_accumulation_steps
  # (32 == 2 * 16). If this feeds TRL's PPOConfig, batch_size must stay
  # divisible by that product -- change the three values together.
  batch_size: 32
  mini_batch_size: 2
  gradient_accumulation_steps: 16
  # Keep the decimal point and the signed exponent: some YAML 1.1 loaders
  # (e.g. PyYAML) read a form such as 1e-5 as a string, not a float.
  learning_rate: 1.0e-5
  optimize_device_cache: true
  # Presumably normalizes rewards before the PPO update -- TODO confirm.
  whiten_rewards: true
  # Presumably trades extra compute for lower memory use -- TODO confirm.
  gradient_checkpointing: true
  # Presumably the number of optimization passes per collected batch
  # -- TODO confirm.
  ppo_epochs: 2