# Run identity and checkpoint location.
# In the double-quoted strings below each `\\` is a YAML escape for a single
# literal backslash, so the parsed values contain `$\{key\}` placeholders.
# NOTE(review): presumably the launcher leaves these escaped placeholders alone
# and they are resolved later as config interpolations -- confirm against the
# script that consumes this file.
trainer.project_name: 'lean_reasoner'
# Name is assembled from the fcdpg divergence and alpha, the train batch size,
# and the rollout sample count.
trainer.experiment_name: "lean_fcdpg_$\\{algorithm.fcdpg.loss_divergence\\}$\\{algorithm.fcdpg.alpha\\}_dsprover_$\\{data.train_batch_size\\}_$\\{actor_rollout_ref.rollout.n\\}"
# Checkpoints are written under <root>/<project_name>/<experiment_name>.
trainer.default_local_dir: "/beegfs/scratch/user/<anonymized>/reasoning/verl-checkpoints/$\\{trainer.project_name\\}/$\\{trainer.experiment_name\\}"
# Advantage estimator selection and its default z value.
algorithm.adv_estimator: fcdpg
algorithm.fcdpg.z_default: 0.0001
# Dataset locations (parquet) and prompt/response length limits.
data.train_files: /beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/results/DeepSeek-Prover-V1.5-SFT/mff-lwb-10k-seen-verified-scored.parquet
data.val_files: /beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/data/processed_cot/mff-lwb-unseen-200.parquet
data.max_prompt_length: 1024
data.max_response_length: 1024
data.filter_overlong_prompts: true
data.prompt_key: prompt
data.train_batch_size: 128
data.truncation: 'error'
# The leading `+` is part of the key text.
# NOTE(review): this looks like Hydra's "append new key" override syntax --
# confirm the launcher forwards the key verbatim.
+data.seed: 42
# fcdpg-specific algorithm switches.
algorithm.fcdpg.exponential_ebm: false
algorithm.fcdpg.use_baseline: true
# alpha is disabled here, yet trainer.experiment_name still references
# algorithm.fcdpg.alpha.
# NOTE(review): presumably the value comes from a default config or a
# command-line override -- confirm it is always defined.
#algorithm.fcdpg.alpha: ${ALPHA}
algorithm.fcdpg.reset_z: true
algorithm.fcdpg.ir_max_clip: 100
algorithm.fcdpg.baseline_window_size: 1024
algorithm.fcdpg.loss_divergence: js
# Actor (policy) model and optimizer settings.
actor_rollout_ref.actor.policy_loss.loss_mode: vanilla
actor_rollout_ref.model.path: /beegfs/scratch/user/<anonymized>/fcdpg-verl/DeepSeek-Prover-V1.5-SFT
# Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) read a
# bare `3e-6` as a string rather than a float.
actor_rollout_ref.actor.optim.lr: 3.0e-6
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio: 0.05
actor_rollout_ref.actor.optim.weight_decay: 0.0
actor_rollout_ref.model.use_remove_padding: true
# Mini-batch size equals data.train_batch_size (128) set elsewhere in this file.
actor_rollout_ref.actor.ppo_mini_batch_size: 128
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 8
actor_rollout_ref.model.enable_gradient_checkpointing: true
actor_rollout_ref.actor.fsdp_config.param_offload: false
actor_rollout_ref.actor.fsdp_config.optimizer_offload: false
actor_rollout_ref.actor.grad_clip: 1.0
actor_rollout_ref.actor.clip_ratio: 0.2
# Defined exactly once: this key previously appeared twice in the file (both
# times as zero). Duplicate keys are invalid YAML and most loaders silently
# keep only the last occurrence.
actor_rollout_ref.actor.entropy_coeff: 0.0
# KL loss is switched off.
# NOTE(review): the coefficient and type below are presumably unused while
# use_kl_loss is false -- confirm.
actor_rollout_ref.actor.use_kl_loss: false
actor_rollout_ref.actor.kl_loss_coef: 0.001
actor_rollout_ref.actor.kl_loss_type: low_var_kl
actor_rollout_ref.actor.ppo_epochs: 1
actor_rollout_ref.actor.use_torch_compile: false
# Rollout engine (vLLM) and reference-model settings.
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 8
actor_rollout_ref.rollout.tensor_model_parallel_size: 2
actor_rollout_ref.rollout.gpu_memory_utilization: 0.5
actor_rollout_ref.rollout.name: vllm
# `${N}` is a placeholder, not a literal value.
# NOTE(review): presumably substituted from the environment by the launcher
# before the value is used -- confirm N is always set.
actor_rollout_ref.rollout.n: ${N}
# Same value as data.max_response_length (1024) set elsewhere in this file.
actor_rollout_ref.rollout.response_length: 1024
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 8
algorithm.gamma: 1.0
algorithm.lam: 1.0
# Booleans use canonical lowercase `true`/`false`, as the data.* and
# algorithm.fcdpg.* keys in this file already do.
actor_rollout_ref.ref.fsdp_config.param_offload: true
actor_rollout_ref.model.trust_remote_code: true
# Trainer runtime: cluster shape, logging, checkpoint and epoch settings.
trainer.default_hdfs_dir: null
trainer.nnodes: 1
trainer.n_gpus_per_node: 4
# The value is a quoted string containing a JSON-style list, not a YAML
# sequence.
# NOTE(review): presumably forwarded verbatim and parsed as a list downstream
# -- confirm before converting it to a native YAML list.
trainer.logger: '["console","mlflow"]'
trainer.save_freq: 50
trainer.max_actor_ckpt_to_keep: 1
# Canonical lowercase boolean, consistent with the lowercase booleans used
# elsewhere in this file.
trainer.val_before_train: false
trainer.total_epochs: 5
# Critic batch/clipping settings.
critic.ppo_micro_batch_size_per_gpu: 16
critic.grad_clip: 1.0
# Custom reward: `path` points at a Python file and `name` is the function
# selected from it.
custom_reward_function.path: /beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/lean/verifier.py
custom_reward_function.name: verify_with_deepseek_verifier
reward_model.reward_manager: prime
# Canonical lowercase boolean, consistent with the lowercase booleans used
# elsewhere in this file.
reward_model.launch_reward_fn_async: true
# Ray resources.
ray_init.num_cpus: 28
# Rollout engine and kernel switches. Booleans are written as canonical
# lowercase `true`/`false`, consistent with the lowercase booleans used
# elsewhere in this file.
actor_rollout_ref.rollout.enforce_eager: false
actor_rollout_ref.rollout.free_cache_engine: true
actor_rollout_ref.model.use_liger: false
actor_rollout_ref.model.use_fused_kernels: true
actor_rollout_ref.model.fused_kernel_options.impl_backend: torch
# Sampling parameters under rollout.val_kwargs.
# NOTE(review): presumably applied only to validation generation -- confirm.
actor_rollout_ref.rollout.val_kwargs.top_k: 50
actor_rollout_ref.rollout.val_kwargs.top_p: 1.0
actor_rollout_ref.rollout.val_kwargs.temperature: 0.8
actor_rollout_ref.rollout.val_kwargs.do_sample: true
actor_rollout_ref.rollout.val_kwargs.n: 16
trainer.test_freq: 500