# Advantage estimation: selects the "grpo_unlkly" estimator and sets
# algorithm.advantage.rank_penalty to 0.25.
# NOTE(review): the leading "+" looks like Hydra's "add a new key" override
# prefix, i.e. the key is presumably absent from the base config - confirm
# against whatever tool consumes this file.
algorithm.adv_estimator: grpo_unlkly
+algorithm.advantage.rank_penalty: 0.25
# Training / validation parquet files.
# NOTE(review): YAML does not expand $HOME; presumably the launcher or a shell
# resolves it - TODO confirm.
data.train_files: $HOME/data/math-hard/train.parquet
data.val_files: $HOME/data/math-hard/test.parquet
# Length caps for prompts and responses (presumably counted in tokens - confirm).
data.max_prompt_length: 1800
data.max_response_length: 1024
# total samples: 2^17 (131072), assuming the budget is
#   total_training_steps * train_batch_size * rollout.n
#   = 256 * 16 * 32 = 2**8 * 2**4 * 2**5.
# NOTE(review): these annotations previously claimed 2^18 / 2**9 / 2**5 / 2**4,
# which did not match the configured values; confirm the values are intended.
trainer.total_training_steps: 256 # 2**8
data.train_batch_size: 16 # 2**4
# Same value as data.train_batch_size above.
actor_rollout_ref.actor.ppo_mini_batch_size: 16
actor_rollout_ref.rollout.n: 32 # 2**5
# Model to train and the actor's learning rate.
actor_rollout_ref.model.path: meta-llama/Llama-3.2-3B-Instruct
# NOTE(review): written without a decimal point; YAML 1.1 loaders (e.g. PyYAML)
# read 3e-6 as a string rather than a float. Harmless if values are forwarded
# as text - confirm how this file is loaded.
actor_rollout_ref.actor.optim.lr: 3e-6
# Per-GPU micro-batch sizes (actor update, rollout log-prob, reference log-prob)
# are all set to 8.
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
# Rollout engine settings: tensor-parallel size 1, GPU memory utilization 0.7.
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
# Boolean-like flags in this file are quoted strings, not YAML booleans; keep
# the quoting, since how the consumer converts them is not visible here.
actor_rollout_ref.model.use_remove_padding: "true"
actor_rollout_ref.model.use_fused_kernels: "true"
# critic.ppo_micro_batch_size_per_gpu: 2
# critic.model.path: meta-llama/Llama-3.2-3B-Instruct
# triton
# actor_rollout_ref.model.use_liger: "True"
# actor_rollout_ref.model.use_fused_kernels: "True"
# actor_rollout_ref.model.fused_kernel_options.impl_backend: "triton"

# Dr. GRPO
# The three settings between this marker and "End Dr. GRPO" are grouped as the
# Dr. GRPO variant of the objective.
actor_rollout_ref.actor.loss_agg_mode: "seq-mean-token-sum-norm" # turn off seq-dim averaging
# Actor KL loss flag is set to "false".
actor_rollout_ref.actor.use_kl_loss: "false"
# Presumably disables dividing group advantages by their standard deviation.
# NOTE(review): the estimator selected in this file is the custom "grpo_unlkly";
# confirm it actually honors this flag.
algorithm.norm_adv_by_std_in_grpo: "false"
# End Dr. GRPO

# KL controller coefficient.
# NOTE(review): actor.use_kl_loss is "false" in this file; confirm some other
# setting actually consumes this value, otherwise it has no effect.
algorithm.kl_ctrl.kl_coef: 0.001
# Logger backends, given as a string that spells a list literal.
trainer.logger: "['console','mlflow']"
# Hardware layout: one node with two GPUs.
trainer.n_gpus_per_node: 2
trainer.nnodes: 1
# Checkpointing: save_freq is 32 (presumably in steps - confirm) and a single
# actor checkpoint is retained.
trainer.save_freq: 32
trainer.max_actor_ckpt_to_keep: 1
# Validation before training is switched off. Note the capitalised "False"
# here versus lowercase "true"/"false" elsewhere in the file.
trainer.val_before_train: "False"
# trainer.test_freq: 16
# NOTE(review): with test_freq commented out and val_before_train off, confirm
# validation runs at all, otherwise this logging setting has no effect.
trainer.log_val_generations: 10
# NOTE(review): trainer.total_training_steps is also set in this file (256);
# presumably whichever limit is reached first ends the run - confirm.
trainer.total_epochs: 1000 # 15
trainer.project_name: 'math_reasoner'
# The "$\\{...\\}" sequences are backslash-escaped "${...}" references to other
# keys in this file. Inside a double-quoted YAML scalar "\\" yields a single
# backslash, so the loaded value contains "$\{...\}"; presumably the launcher
# un-escapes this so the trainer can interpolate it - TODO confirm.
trainer.experiment_name: "math_rwd_unlkly_llama3.2_3B_$\\{data.train_batch_size\\}_$\\{actor_rollout_ref.rollout.n\\}"
# Checkpoint resumption is disabled for this run.
trainer.resume_mode: disable
# Checkpoint directory, built from the project and experiment names using the
# same escaped-reference convention as experiment_name.
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"