# Advantage estimator and dataset settings.
algorithm.adv_estimator: grpo
# Train/validation parquet files. $HOME is written literally here.
# NOTE(review): presumably expanded by whatever consumes this file - confirm.
data.train_files: $HOME/data/gsm8k-uk/train.parquet
data.val_files: $HOME/data/gsm8k-uk/test.parquet  # NOTE: this file is in English
# Length limits for prompts and responses.
data.max_prompt_length: 1024
data.max_response_length: 1024
# 2**18 samples in total: the product of the three values below
# (2**9 * 2**5 * 2**4).
trainer.total_training_steps: 512 # 2**9
data.train_batch_size: 32 # 2**5
actor_rollout_ref.rollout.n: 16 # 2**4
# Actor / rollout / reference-model settings.
actor_rollout_ref.model.path: meta-llama/Llama-3.2-3B-Instruct
# Learning rates are written with an explicit decimal point: YAML 1.1 loaders
# (e.g. PyYAML) read a bare `1e-6` as a string, not a float.
actor_rollout_ref.actor.optim.lr: 1.0e-6
actor_rollout_ref.actor.ppo_mini_batch_size: 32
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
# Critic settings.
# NOTE(review): algorithm.adv_estimator is set to grpo in this file, and the
# critic model (Qwen) is from a different family than the actor (Llama) -
# confirm these critic.* values are actually consumed.
critic.optim.lr: 1.0e-5
critic.model.path: Qwen/Qwen2.5-1.5B-Instruct
critic.ppo_micro_batch_size_per_gpu: 16
# KL control coefficient.
algorithm.kl_ctrl.kl_coef: 0.001
# Trainer settings: logging, hardware layout, checkpointing and run naming.
# The logger list is deliberately kept as a quoted string value.
trainer.logger: "['console','mlflow']"
# Canonical lowercase boolean (same parsed value as `False`).
trainer.val_before_train: false
trainer.n_gpus_per_node: 2
trainer.nnodes: 1
trainer.save_freq: 32
trainer.max_actor_ckpt_to_keep: 1
trainer.test_freq: 10
trainer.total_epochs: 1000 # this is capped by the training steps
trainer.project_name: 'gsm8k_uk-reasoner'
# NOTE(review): the next two values use the same `$\\{...\\}` escaping but
# different quote styles. YAML keeps `\\` as two backslashes inside single
# quotes and collapses it to one inside double quotes, so the two values end
# up escaped differently - confirm the consumer expects both forms.
trainer.experiment_name: 'gsm8k_uk_grpo_llama3.2_3B_$\\{data.train_batch_size\\}_$\\{actor_rollout_ref.rollout.n\\}'
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
trainer.resume_mode: disable