# Dataset locations and sequence-length limits.
# YAML itself does not expand `$HOME`; the loaded value keeps the literal
# text, so expansion has to happen in whatever consumes this file.
data.train_files: $HOME/data/math-harder/train.parquet
# Validation reads the `test.parquet` split of the same dataset directory.
data.val_files: $HOME/data/math-harder/test.parquet
# Prompt and response limits (presumably counted in tokens — TODO confirm);
# together they allow sequences of up to 1800 + 3000 = 4800.
data.max_prompt_length: 1800
data.max_response_length: 3000
# total samples: 2^18 = 262144
#   = 512 (trainer.total_training_steps)
#   * 32  (data.train_batch_size)
#   * 16  (actor_rollout_ref.rollout.n)
# Keep the three values below in sync with this figure when changing any one.
trainer.total_training_steps: 512
data.train_batch_size: 32
# Same value as data.train_batch_size above.
actor_rollout_ref.actor.ppo_mini_batch_size: 32
actor_rollout_ref.rollout.n: 16 # 2**4
# Actor, rollout and reference-policy settings.
actor_rollout_ref.model.path: Qwen/Qwen2.5-1.5B-Instruct
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-6` as a string, whereas `1.0e-6` is a float under both
# YAML 1.1 and YAML 1.2.
actor_rollout_ref.actor.optim.lr: 1.0e-6
# The actor, rollout log-prob and reference log-prob passes all use the same
# per-GPU micro batch size of 4.
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 4
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 4
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 4
# triton: optional Liger / fused-kernel settings (currently commented out)
# actor_rollout_ref.model.use_liger: "True"
# actor_rollout_ref.model.use_fused_kernels: "True"
# actor_rollout_ref.model.fused_kernel_options.impl_backend: "triton"
# Critic (value model) settings; the critic starts from the same checkpoint
# path as the actor model.
# NOTE(review): trainer.experiment_name in this file contains "grpo", yet
# critic settings are given here and no algorithm.adv_estimator key is
# visible in this file — confirm which advantage estimator is intended.
# The learning rate is written with an explicit decimal point: YAML 1.1
# loaders (e.g. PyYAML) resolve a bare `1e-5` as a string, whereas `1.0e-5`
# is a float under both YAML 1.1 and YAML 1.2.
critic.optim.lr: 1.0e-5
critic.model.path: Qwen/Qwen2.5-1.5B-Instruct
critic.ppo_micro_batch_size_per_gpu: 4
# Coefficient for the KL controller (algorithm.kl_ctrl).
algorithm.kl_ctrl.kl_coef: 0.001
# Trainer settings: logging, hardware layout, checkpointing and evaluation.
# The logger list is kept as ONE quoted string holding a Python-style list
# literal, not a YAML sequence — presumably because the consumer forwards
# the text verbatim (TODO confirm before converting to a real list).
trainer.logger: "['console','mlflow']"
# Quoted string "True", not a YAML boolean; same caveat as above.
trainer.val_before_train: "True"
# 2 GPUs per node * 1 node.
trainer.n_gpus_per_node: 2
trainer.nnodes: 1
# Checkpoint every 32 steps; only one actor checkpoint is retained.
trainer.save_freq: 32
trainer.max_actor_ckpt_to_keep: 1
# Evaluate every 10 steps.
trainer.test_freq: 10
# The trailing "15" looks like an earlier value kept for reference.
# NOTE(review): with trainer.total_training_steps also set in this file,
# this presumably acts only as an upper bound — TODO confirm.
trainer.total_epochs: 1000 #15
trainer.project_name: 'math_reasoner2'
trainer.experiment_name: 'math_grpo_qwen2.5_1.5B'
# Inside a double-quoted YAML scalar `\\` decodes to a single backslash, so
# the loaded value contains `$\{trainer.project_name\}` and
# `$\{trainer.experiment_name\}`. The backslashes are presumably there so a
# downstream shell/templating layer leaves the `${...}` interpolation intact
# — TODO confirm before simplifying.
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
trainer.resume_mode: disable