# Dataset locations. $HOME is written literally in these values, so it has to
# be expanded by whatever consumes this file (presumably the launcher/shell —
# confirm).
data.train_files: $HOME/data/math/train.parquet
data.val_files: $HOME/data/math/test.parquet
# total samples: 2^18
# NOTE(review): with total_training_steps = 512 (2^9) and
# rollout.n x train_batch_size = 256 (2^8) in every variant listed under
# actor_rollout_ref.rollout.n, the product is 2^17, not 2^18 — confirm which
# figure is intended.
data.max_prompt_length: 1800
data.max_response_length: 3000
# Quoted, so YAML loads this as the string "false" rather than a boolean.
data.shuffle: "false"
trainer.total_training_steps: 512 # 2**9
# Variants keyed by the value of actor_rollout_ref.rollout.n. Each key carries
# the batch-size settings that go with it (presumably expanded into one run per
# key by the launcher — confirm). In every variant
# rollout.n x data.train_batch_size = 256 (2**8), and ppo_mini_batch_size
# equals train_batch_size.
actor_rollout_ref.rollout.n:
  256: # 2**8
    data.train_batch_size: 1 # 2**0
    actor_rollout_ref.actor.ppo_mini_batch_size: 1
  128: # 2**7
    data.train_batch_size: 2 # 2**1
    actor_rollout_ref.actor.ppo_mini_batch_size: 2
  64: # 2**6
    data.train_batch_size: 4 # 2**2
    actor_rollout_ref.actor.ppo_mini_batch_size: 4
  32: # 2**5
    data.train_batch_size: 8 # 2**3
    actor_rollout_ref.actor.ppo_mini_batch_size: 8
# Base model and LoRA settings; lora_rank and lora_alpha are set to the same
# value (256).
actor_rollout_ref.model.path: meta-llama/Llama-3.2-3B-Instruct
actor_rollout_ref.model.lora_rank: 256
actor_rollout_ref.model.lora_alpha: 256
# The two flags below are quoted, so YAML loads them as the string "true"
# rather than a boolean.
actor_rollout_ref.model.use_remove_padding: "true"
actor_rollout_ref.model.use_fused_kernels: "true"
actor_rollout_ref.actor.optim.lr_warmup_steps: 10
# Gradient clipping cannot be switched off, so the threshold is set to a very
# large value to make it effectively inactive.
actor_rollout_ref.actor.grad_clip: 10000
actor_rollout_ref.actor.optim.lr: 1e-6
# The actor, rollout and ref micro-batch sizes per GPU are all set to 8.
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
# The logger list is written as a quoted string, so YAML loads it as a single
# string (not a sequence); the consumer is presumably expected to parse it.
trainer.logger: "['console','mlflow']"
# NOTE(review): capitalised "False" here, while other flags in this file use
# lowercase "true"/"false" — confirm the consumer treats both the same.
trainer.val_before_train: "False"
trainer.n_gpus_per_node: 2
trainer.nnodes: 1
# NOTE(review): -1 presumably disables periodic checkpoint saving — confirm.
trainer.save_freq: -1
trainer.max_actor_ckpt_to_keep: 1
trainer.test_freq: 16
# "15" looks like a previous value; 1000 is presumably large enough that
# trainer.total_training_steps is the effective limit — confirm.
trainer.total_epochs: 1000 # 15
# fcdpg is selected both as the advantage estimator and as the actor policy
# loss mode.
algorithm.fcdpg.ir_max_clip: 20
algorithm.adv_estimator: fcdpg
actor_rollout_ref.actor.policy_loss.loss_mode: fcdpg
# Active divergence is js. The commented-out lines below list other divergence
# names (kl, tv, reverse_kl) in a keyed layout, with extra settings nested
# under reverse_kl; the same two settings also appear commented out at top
# level. NOTE(review): presumably leftovers from a sweep over divergences —
# confirm before removing.
algorithm.fcdpg.loss_divergence: js
  # js:
  # kl:
  # tv:
  # reverse_kl:
  #   algorithm.fcdpg.exponential_ebm: "true"
  #   algorithm.kl_ctrl.kl_coef: 0.01
# algorithm.fcdpg.exponential_ebm: "true"
# algorithm.kl_ctrl.kl_coef: 0.01
algorithm.fcdpg.use_baseline: "true"
# Run naming and checkpoint location.
trainer.project_name: 'math_reasoner_hp'
# Inside double quotes "\\" is an escaped backslash, so the loaded values
# contain the literal text $\{...\}. NOTE(review): presumably this keeps the
# ${...} references unexpanded until a later stage (shell / config
# interpolation) — confirm against the launcher.
trainer.experiment_name: "math_fcdpg_$\\{algorithm.fcdpg.loss_divergence\\}_llama3.2_3B"
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
trainer.resume_mode: disable