# Dataset: train/validation parquet files and prompt/response length limits.
data.train_files: $HOME/data/math-hard/train-patched.parquet
data.val_files: $HOME/data/math-hard/test.parquet
data.max_prompt_length: 1800
data.max_response_length: 1024
# Sample budget, presumably total_training_steps * train_batch_size * rollout.n.
# With the values set below: 2**8 * 2**6 * 2**2 = 2**16 samples.
# NOTE(review): the exponent notes used to read 2**9, 2**5 and 2**4 (2**18 in
# total), which did not match the values. The notes now follow the values; if
# a 2**18 budget is still intended, change the values instead.
trainer.total_training_steps: 256 # 2**8
data.train_batch_size: 64 # 2**6
# Same value as data.train_batch_size above.
actor_rollout_ref.actor.ppo_mini_batch_size: 64
actor_rollout_ref.rollout.n: 4 # 2**2
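# Disabled alternatives, keyed by data.train_batch_size; each entry lists the
# ppo_mini_batch_size and rollout.n that go with that batch size.
# data.train_batch_size: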
#   16:
#     actor_rollout_ref.actor.ppo_mini_batch_size: 16
#     actor_rollout_ref.rollout.n: 32
#   8:
#     actor_rollout_ref.actor.ppo_mini_batch_size: 8
#     actor_rollout_ref.rollout.n: 64
# Hardware, model, optimizer, rollout and trainer settings for this run:
# 2 GPUs per node on a single node (see trainer.nnodes below).
trainer.n_gpus_per_node: 2
# Per the note kept with the older configuration further down in this file,
# the next two micro-batch sizes need to have the same value to avoid
# inconsistencies.
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
actor_rollout_ref.model.path: meta-llama/Llama-3.2-3B-Instruct
# NOTE(review): 3e-6 has no decimal point, so YAML 1.1 loaders such as PyYAML
# read it as a string rather than a float. Harmless if values are forwarded as
# text; confirm against the consumer.
actor_rollout_ref.actor.optim.lr: 3e-6
actor_rollout_ref.actor.optim.weight_decay: 0
actor_rollout_ref.rollout.tensor_model_parallel_size: 1
actor_rollout_ref.rollout.gpu_memory_utilization: 0.7
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
# The logger list is written as a list literal inside a quoted string, so YAML
# parses it as one string, not a sequence. Presumably the consumer parses it
# later; do not unquote without checking.
trainer.logger: "['console','mlflow']"
# Boolean-looking values in this file are quoted strings, not YAML booleans.
# NOTE(review): spelling is inconsistent ("true" here, "False" for
# val_before_train below); confirm the consumer accepts both.
actor_rollout_ref.model.use_remove_padding: "true"
actor_rollout_ref.model.use_fused_kernels: "true"
trainer.nnodes: 1
trainer.save_freq: 32
trainer.max_actor_ckpt_to_keep: 1
trainer.val_before_train: "False"
# trainer.log_val_generations: 10
# trainer.test_freq: 16
# The epoch count is capped by the training steps (trainer.total_training_steps).
trainer.total_epochs: 1000 # this is capped by the training steps
# FCDPG is selected as both the advantage estimator and the policy-loss mode.
algorithm.adv_estimator: fcdpg
actor_rollout_ref.actor.policy_loss.loss_mode: fcdpg
actor_rollout_ref.actor.loss_agg_mode: "seq-mean-token-sum"
# actor_rollout_ref.rollout.calculate_log_probs: "true"
algorithm.fcdpg.loss_divergence: amari_alpha
# alpha is a YAML list with one entry. Presumably each entry is one run of a
# sweep, and the lists in the trailing comment are other candidate value sets
# - TODO confirm.
algorithm.fcdpg.alpha: [0] # [0.95, 0.99] [0.5, 0.75, 0.95, 0.99] -> [0.5, 0.65, 0.8, 0.95, 0.99]
# NOTE(review): 1e-5 has no decimal point, so YAML 1.1 loaders such as PyYAML
# read it as a string rather than a float; confirm the consumer converts it.
algorithm.fcdpg.z_default: 1e-5 # default value of z when the partition function is 0
# algorithm.fcdpg.center_f: "true"
# algorithm.fcdpg.ir_max_clip: 100 # clip extreme outliers of P/q
# algorithm.fcdpg.z_max_clip: 1 # clip values so that z cannot exceed 1
# algorithm.fcdpg.reset_z:  "true" # uses the contrastive approach

# actor_rollout_ref.rollout.dtype: float32
# +actor_rollout_ref.actor.fsdp_config.mixed_precision.param_dtype: fp32
# +actor_rollout_ref.ref.fsdp_config.mixed_precision.param_dtype: fp32

# +actor_rollout_ref.actor.fsdp_config.model_dtype: float32
# +actor_rollout_ref.ref.fsdp_config.model_dtype: float32
  # js:
  # kl:
  # tv:
  # reverse_kl:
  #   algorithm.fcdpg.exponential_ebm: "true"
  #   algorithm.kl_ctrl.kl_coef: 0.001
  #   trainer.total_training_steps: 512
  #   data.train_batch_size: 32
  #   actor_rollout_ref.actor.ppo_mini_batch_size: 32
  #   actor_rollout_ref.rollout.n: 16
# Baseline switch, run naming and checkpoint location.
algorithm.fcdpg.use_baseline: "true"
trainer.project_name: 'math_reasoner'
# In the double-quoted strings below, `\\` is the YAML escape for a single
# backslash, so the parsed value contains `$\{key\}` references.
# Presumably the backslashes protect the braces until a later stage does the
# interpolation - TODO confirm against the launcher.
# Note that loss_divergence and alpha are joined without a separator in the
# experiment name.
trainer.experiment_name: "math_fcdpg_$\\{algorithm.fcdpg.loss_divergence\\}$\\{algorithm.fcdpg.alpha\\}_llama3.2_3B_$\\{data.train_batch_size\\}_$\\{actor_rollout_ref.rollout.n\\}"
# Checkpoint directory, built from the project and experiment names above.
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
trainer.resume_mode: disable
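# ---------------------------------------------------------------------------
# Commented-out older configuration (math-harder data, Qwen2.5-1.5B-Instruct).
# Everything from here to the end of the file is disabled.
# ---------------------------------------------------------------------------
# data.train_files: $HOME/data/math-harder/train.parquet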
# data.val_files: $HOME/data/math-harder/test.parquet
# # total samples: 2^18
# data.max_prompt_length: 1800
# data.max_response_length: 3000
# trainer.total_training_steps: 256
# data.shuffle: "false"
# data.train_batch_size: 1
# actor_rollout_ref.actor.ppo_mini_batch_size: 1
# actor_rollout_ref.rollout.n: 32
# actor_rollout_ref.rollout.temperature: 1.2
# actor_rollout_ref.actor.optim.weight_decay: 0
# actor_rollout_ref.rollout.name: vllm
# # Use FSDP2 or set actor_rollout_ref.actor.fsdp_config.use_orig_params: "true"
# # so that the actor and ref parameters are shared consistently
# actor_rollout_ref.actor.strategy: fsdp2
# # +actor_rollout_ref.rollout.micro_batch_size: 8
# actor_rollout_ref.model.path: Qwen/Qwen2.5-1.5B-Instruct
# actor_rollout_ref.actor.optim.lr_warmup_steps: 5
# # actor_rollout_ref.actor.grad_clip: 10000 # something really large as there is no way to disable it
# # actor_rollout_ref.actor.optim.lr: 1e-6
# actor_rollout_ref.actor.optim.lr: 1e-6
# # actor_rollout_ref.actor.use_torch_compile: "false"
# # actor_rollout_ref.ref.use_torch_compile: "false"
# # actor_rollout_ref.model.lora_rank: 64
# # actor_rollout_ref.model.lora_alpha: 64
# ## these two need to have the same value to avoid inconsistencies
# actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
# actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
# ##
# actor_rollout_ref.rollout.tensor_model_parallel_size: 1
# actor_rollout_ref.rollout.gpu_memory_utilization: 0.8
# actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
# trainer.logger: "['console','mlflow']"
# trainer.val_before_train: "False"
# trainer.n_gpus_per_node: 2
# trainer.nnodes: 1
# trainer.save_freq: -1 # 32
# trainer.max_actor_ckpt_to_keep: 1
# trainer.test_freq: -1 # 10
# trainer.total_epochs: 1000 #16
# algorithm.fcdpg.ir_max_clip: 10

# # FUNDAMENTAL TO MAKE FCDPG WORK
# # ###############################
# # actor_rollout_ref.rollout.dtype: float32
# # actor_rollout_ref.rollout.enforce_eager: "true"
# +actor_rollout_ref.actor.no_autocast: "true"
# # +actor_rollout_ref.actor.fsdp_config.model_dtype: float32
# # +actor_rollout_ref.ref.fsdp_config.model_dtype: float32
# # ###############################
# # +actor_rollout_ref.model.override_config.model_config.attn_implementation: eager
# # actor_rollout_ref.model.use_fused_kernels: "false"
# # actor_rollout_ref.rollout.enforce_eager: "true"
# # +actor_rollout_ref.rollout.engine_kwargs.vllm.attention_backend: TORCH_SDPA

# actor_rollout_ref.model.use_remove_padding: "true"
# actor_rollout_ref.model.use_fused_kernels: "true"

# algorithm.adv_estimator: fcdpg
# actor_rollout_ref.actor.policy_loss.loss_mode: fcdpg
# algorithm.fcdpg.loss_divergence: js
#   # js:
#   # kl:
#   # tv:
#   # reverse_kl:
#   #   algorithm.fcdpg.exponential_ebm: "true"
#   #   algorithm.kl_ctrl.kl_coef: 0.01
# algorithm.fcdpg.use_baseline: "true"
# trainer.project_name: 'math_reasoner2'
# trainer.experiment_name: "math_fcdpg_$\\{algorithm.fcdpg.loss_divergence\\}_qwen2.5_1.5B"
# trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
# trainer.resume_mode: disable