# Dataset settings: GSM8K parquet files and sequence-length limits.
# YAML itself does not expand $HOME; whatever consumes this file must
# resolve it.
data.train_files: $HOME/data/gsm8k/train.parquet
# Validation reads the GSM8K test split file.
data.val_files: $HOME/data/gsm8k/test.parquet
# NOTE(review): lengths are presumably measured in tokens - confirm.
data.max_prompt_length: 512
data.max_response_length: 1024
# Sample budget: each combination is meant to keep
#   total_training_steps * rollout.n * train_batch_size = 2**16 samples.
# ppo_mini_batch_size is always set equal to train_batch_size.
# Nesting order: total_training_steps -> rollout.n -> batch-size overrides.
# Commented-out entries are alternative combinations that are disabled.
trainer.total_training_steps:
  # 1024: # 2**10
  #   actor_rollout_ref.rollout.n:
  #     8: # 2**3
  #       data.train_batch_size: 8
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 8
  #     16:
  #       data.train_batch_size: 4
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 4
  #     32:
  #       data.train_batch_size: 2
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 2
  #     64:
  #       data.train_batch_size: 1
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 1
  # 512: # 2**9
  #   actor_rollout_ref.rollout.n:
  #     8:
  #       data.train_batch_size: 16
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 16
  #     16:
  #       data.train_batch_size: 8
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 8
  #     32:
  #       data.train_batch_size: 4
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 4
  #     64:
  #       data.train_batch_size: 2
  #       actor_rollout_ref.actor.ppo_mini_batch_size: 2
  256: # 2**8
    actor_rollout_ref.rollout.n:
      # 8:
      #   data.train_batch_size: 32 # ~BEST JS, TV
      #   actor_rollout_ref.actor.ppo_mini_batch_size: 32
      # 16:
      #   data.train_batch_size: 16
      #   actor_rollout_ref.actor.ppo_mini_batch_size: 16
      # 32:
      #   data.train_batch_size: 8
      #   actor_rollout_ref.actor.ppo_mini_batch_size: 8
      # 64:
      #   data.train_batch_size: 4
      #   actor_rollout_ref.actor.ppo_mini_batch_size: 4
      # Active combination: 256 steps * n=128 * batch 2 = 2**16 samples.
      128:
        data.train_batch_size: 2
        actor_rollout_ref.actor.ppo_mini_batch_size: 2
      # NOTE(review): 256 steps * n=256 * batch 4 = 2**18 samples, not
      # 2**16; a batch size of 1 would match the budget - confirm before
      # enabling this entry.
      # 256:
      #   data.train_batch_size: 4
      #   actor_rollout_ref.actor.ppo_mini_batch_size: 4
# Actor / rollout / reference-model settings shared by every sweep entry.
# All three per-GPU micro batch sizes (actor PPO, rollout log-prob,
# reference log-prob) are set to 16.
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 16
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 16
# NOTE(review): looks like a Hugging Face model identifier - confirm.
actor_rollout_ref.model.path: Qwen/Qwen2.5-1.5B-Instruct
# NOTE(review): `1e-6` has no decimal point; YAML 1.1 loaders (e.g. PyYAML)
# resolve it as a string while YAML 1.2 loaders resolve it as a float.
# Confirm the consumer handles the form it receives.
actor_rollout_ref.actor.optim.lr: 1e-6
actor_rollout_ref.actor.optim.weight_decay: 0
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
# NOTE(review): presumably the fraction of GPU memory given to the rollout
# engine - confirm.
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 16
# Trainer runtime settings: logging backends, node/GPU counts, and
# checkpoint / validation cadence.
# The logger list is a quoted string, so it is loaded as text rather than
# as a YAML sequence.
trainer.logger: "['console','mlflow']"
# Canonical lowercase boolean; resolves to false under both YAML 1.1 and
# YAML 1.2 loaders.
trainer.val_before_train: false
trainer.n_gpus_per_node: 1
trainer.nnodes: 1
# NOTE(review): -1 presumably disables periodic checkpoint saving - confirm.
trainer.save_freq: -1
trainer.max_actor_ckpt_to_keep: 1
# NOTE(review): presumably validation runs every 10 training steps - confirm.
trainer.test_freq: 10
trainer.total_epochs: 1000 # this is capped by the training steps
# Algorithm selection: the advantage estimator and the actor policy-loss
# mode are both set to fcdpg.
algorithm.adv_estimator: fcdpg
actor_rollout_ref.actor.policy_loss.loss_mode: fcdpg
# Divergence currently in use: kl.
# The indented comments below appear to be a disabled sweep over the
# divergences (js, kl, tv, reverse_kl); only the reverse_kl branch carried
# extra overrides (exponential_ebm and kl_coef).
algorithm.fcdpg.loss_divergence: kl
  # js:
  # kl:
  # tv:
  # reverse_kl:
  #   algorithm.fcdpg.exponential_ebm: "true"
  #   algorithm.kl_ctrl.kl_coef: 0.01
# Disabled top-level form of the same two overrides.
# algorithm.fcdpg.exponential_ebm: "true"
# algorithm.kl_ctrl.kl_coef: 0.01
# Quoted, so the value is loaded as the string "true", not a YAML boolean.
algorithm.fcdpg.use_baseline: "true"
# Run naming and checkpoint location.
trainer.project_name: 'reasoner'
# In a double-quoted YAML string `\\` decodes to a single backslash, so the
# loaded value contains `$\{algorithm.fcdpg.loss_divergence\}`.
# NOTE(review): the escaped braces presumably keep the interpolation intact
# until a later override-parsing stage resolves it - confirm.
trainer.experiment_name: "hp_search_gsm8k_fcdpg_$\\{algorithm.fcdpg.loss_divergence\\}_qwen2.5_1.5B"
# Checkpoint directory built from the project and experiment names using
# the same escaped-interpolation form.
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
# NOTE(review): `disable` presumably means never resume from an existing
# checkpoint - confirm.
trainer.resume_mode: disable