# Dataset settings: parquet files used for training and validation, plus the
# maximum prompt and response lengths (presumably in tokens — confirm).
# NOTE(review): YAML does not expand $HOME; presumably the launcher or the
# shell that receives these values expands it — confirm.
data.train_files: $HOME/data/gsm8k/train.parquet
data.val_files: $HOME/data/gsm8k/test.parquet
data.max_prompt_length: 512
data.max_response_length: 1024
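# Every configuration in the sweep below uses the same budget of 2**16 samples:
#   total_training_steps * rollout.n * train_batch_size = 2**16
# BEST configuration so far (512 * 4 * 32 = 2**16):
# total_training_steps: 512
# rollout.n: 4
# train_batch_size: 32
# lr: 1e-6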
# Coupled sweep grid. Each total_training_steps value maps to the rollout.n
# values tried with it, and each rollout.n value maps to the batch sizes used
# for that combination. Every leaf satisfies
#   total_training_steps * rollout.n * train_batch_size = 2**16
# and ppo_mini_batch_size always equals train_batch_size.
# NOTE(review): how this nesting is expanded into runs depends on the sweep
# launcher, which is not visible in this file — confirm.
trainer.total_training_steps:
  1024: # 2**10
    actor_rollout_ref.rollout.n:
      1: # 2**0
        data.train_batch_size: 64 # 2**6 -> 2**10 * 2**0 * 2**6 = 2**16
        actor_rollout_ref.actor.ppo_mini_batch_size: 64
      2: # 2**1
        data.train_batch_size: 32 # 2**5 -> 2**10 * 2**1 * 2**5 = 2**16
        actor_rollout_ref.actor.ppo_mini_batch_size: 32
      4: # 2**2
        data.train_batch_size: 16 # 2**4 -> 2**10 * 2**2 * 2**4 = 2**16
        actor_rollout_ref.actor.ppo_mini_batch_size: 16
      8: # 2**3
        data.train_batch_size: 8 # 2**3 -> 2**10 * 2**3 * 2**3 = 2**16
        actor_rollout_ref.actor.ppo_mini_batch_size: 8
  512: # 2**9
    actor_rollout_ref.rollout.n:
      1:
        data.train_batch_size: 128 # 2**7 -> 2**9 * 2**0 * 2**7 = 2**16
        actor_rollout_ref.actor.ppo_mini_batch_size: 128
      2:
        data.train_batch_size: 64
        actor_rollout_ref.actor.ppo_mini_batch_size: 64
      4:
        data.train_batch_size: 32
        actor_rollout_ref.actor.ppo_mini_batch_size: 32
      8:
        data.train_batch_size: 16
        actor_rollout_ref.actor.ppo_mini_batch_size: 16
  256: # 2**8
    actor_rollout_ref.rollout.n:
      1:
        # 2**8 -> 2**8 * 2**0 * 2**8 = 2**16
        data.train_batch_size: 256
        actor_rollout_ref.actor.ppo_mini_batch_size: 256
      2:
        data.train_batch_size: 128
        actor_rollout_ref.actor.ppo_mini_batch_size: 128
      4:
        data.train_batch_size: 64
        actor_rollout_ref.actor.ppo_mini_batch_size: 64
      8:
        data.train_batch_size: 32
        actor_rollout_ref.actor.ppo_mini_batch_size: 32
# Settings shared by every run of the sweep: model paths, optimizer and
# micro-batch sizes, KL coefficient, and trainer/logging options.
# (A duplicate `trainer.save_freq: -1` entry that used to precede this section
# was removed; the single definition lives with the other trainer.* keys.)
actor_rollout_ref.model.path: Qwen/Qwen2.5-1.5B-Instruct
# List-valued entries are presumably additional sweep axes (one run per
# value) — confirm against the launcher.
# NOTE(review): `1e-5` / `1e-6` have no decimal point; YAML 1.1 loaders such
# as PyYAML read them as strings, YAML 1.2 loaders as floats — confirm the
# launcher handles whichever it gets.
actor_rollout_ref.actor.optim.lr: [1e-5, 1e-6]
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 4
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 4
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.4
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 4
critic.optim.lr: 1e-5
critic.model.path: Qwen/Qwen2.5-1.5B-Instruct
critic.ppo_micro_batch_size_per_gpu: 4
algorithm.kl_ctrl.kl_coef: [0.001, 0.01]
# The logger list is deliberately a quoted string, not a YAML sequence;
# presumably it is forwarded verbatim as a command-line override — confirm.
trainer.logger: "['console','mlflow']"
trainer.val_before_train: False
trainer.n_gpus_per_node: 1
trainer.nnodes: 1
trainer.save_freq: -1
trainer.max_actor_ckpt_to_keep: 1
trainer.max_ckpt_to_keep: 1
trainer.test_freq: 10
trainer.total_epochs: 1000 # this is capped by the training steps
trainer.project_name: 'reasoner'
# NOTE(review): the experiment name says "grpo", yet critic.* settings are
# configured above and no advantage estimator is set in this file — confirm
# which algorithm is intended.
trainer.experiment_name: 'gsm8k_grpo_qwen2.5_1.5B'
# Inside double quotes `\\` yields one backslash, so the value contains
# `$\{...\}`; presumably this escapes the interpolation so it survives until
# the trainer resolves it — confirm.
trainer.default_local_dir: "/scratch/1/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
trainer.resume_mode: disable