This repo is modified from OpenRLHF v0.8.4, adding the proofaug+ training and inference pipeline.

## Preparation
To run the experiments, you need to install the dependencies under python=3.12, 
```bash
# Install the Python packages listed in requirements.txt into the current environment.
pip install -r requirements.txt
```
and install Lean v4.20.0 and compile the lean-gym and lean-gym-repl projects. Then you should be able to start our server by
```bash
# Start lean_reward_server.py; the trailing `&` runs it as a background job so the shell stays usable.
python lean_reward_server.py &
```

Prepare the data from [Goedel-Pset](https://huggingface.co/datasets/Goedel-LM/Goedel-Pset-v1): build our SFT data (10k samples) and the other datasets, in the same format as example.json. In the following, we assume the datasets are available on Hugging Face or locally under the names "foo_name/pset_rl_data" and "foo_name/pset_sft_data".
Be careful to first exclude the problems that appear in the test set datasets/pset_test.jsonl.

## SFT pipeline
Prepare the SFT data
```bash
# Invoke sft_data_pipeline.py on foo_name/pset_sft_data with the Kimina 1.7B model.
# With --upload_to_hf, the result is pushed to the repo given by --hf_repo_name
# (see the note following this snippet).
# NOTE(review): the precise semantics of the remaining flags are defined by
# sft_data_pipeline.py, which is not shown here — confirm against that script.
python sft_data_pipeline.py \
   --dataset_name foo_name/pset_sft_data \
   --model_path AI-MO/Kimina-Prover-Distill-1.7B \
   --template_name kimina \
   --output_template_name dskpv2-non-cot \
   --output_dir ./results/sft_data/kimina1.7B-pset-messages \
   --sample_size 10000 \
   --remove_think \
   --upload_to_hf \
   --hf_repo_name foo_name/kimina_pset_sft_data \
   --gpu 4 \
   --gpu_memory_utilization 0.9

```
Set `--hf_repo_name` to your own upload repository; as written above, the data will be pushed to "foo_name/kimina_pset_sft_data".

Run SFT to obtain the initial model for RL:
```bash
# Run name; reused below for the save path, the checkpoint path and the wandb run name.
name=initial_model_for_rl
# Arguments passed to the deepspeed launcher; --module selects openrlhf.cli.train_sft
# as the entry point. Comments are allowed inside a bash array literal.
sft_args=(
   --module openrlhf.cli.train_sft 
   --max_len 4096 
   # Same repo name as --hf_repo_name in the SFT data preparation step.
   --dataset foo_name/kimina_pset_sft_data 
   --input_key messages 
   --apply_chat_template 
   --train_batch_size 64 
   --micro_train_batch_size 2 
   --max_samples 500000 
   --pretrain Qwen/Qwen2.5-1.5B-Instruct 
   # Resolves to ./checkpoints/initial_model_for_rl, the path the RL examples use as `model`.
   --save_path ./checkpoints/$name 
   --ckpt_path ./checkpoints/ckpts/$name 
   --save_hf_ckpt --save_steps 20 
   --max_ckpt_num 20 
   --logging_steps 1 
   --eval_steps -1 
   --zero_stage 2 
   --max_epochs 1 
   --packing_samples 
   --bf16 --flash_attn 
   --learning_rate 5e-6 --gradient_checkpointing 
   --wandb_run_name $name 
   --use_tensorboard ./logs/sft
)
# Quoted "${sft_args[@]}" passes each array element as a separate argument.
deepspeed "${sft_args[@]}"
```

## RL pipeline
Run the RL pipeline, for example grpo-hybrid:
```bash
# Exported so the training process launched below can read it.
# NOTE(review): presumably the number of concurrent async agent tasks — confirm in the OpenRLHF fork.
export OPENRLHF_ASYNC_NUM_TASKS=32
n_samples=8 max_len=3072 
# Selects configs/$recipe.yaml via --proofaug_config_path.
recipe="default"
# Matches the --save_path of the SFT step (./checkpoints/initial_model_for_rl).
model="checkpoints/initial_model_for_rl"
input_key="non-cot-messages"
ds="foo_name/pset_rl_data"
loss_type="ppo" estimator="rloo"
kl=0.0 clip_low=0.2 clip_high=0.28 T=0.6
# Run name built from the settings above; used for save/checkpoint paths and the wandb run name.
name="0926-1-$loss_type-$estimator-$recipe-cl$clip_low-$clip_high"
save_steps=2 zero_stage=3
# Arguments for openrlhf.cli.train_ppo_ray.
# NOTE(review): 4 GPUs are requested for actor/ref and 4 vLLM engines with tensor
# parallel size 1 — presumably 8 GPUs in total; confirm against your hardware.
train_args=(
  --policy_loss_type $loss_type
  --ref_num_nodes 1 --ref_num_gpus_per_node 4
  --actor_num_nodes 1 --actor_num_gpus_per_node 4
  --vllm_num_engines 4 --vllm_tensor_parallel_size 1
  --vllm_gpu_memory_utilization 0.9 --adam_offload
  --vllm_sync_backend nccl --async_train
  --pretrain $model --colocate_actor_ref
  --proofaug_config_path configs/$recipe.yaml
  --agent_func_path ./examples/python/agent_func_proofaug.py
  --save_path ./checkpoints/$name
  --ckpt_path ./checkpoints/ckpts/$name/
  --load_checkpoint --max_ckpt_num 1
  --save_hf_ckpt --save_steps $save_steps 
  --max_epochs 1 --num_episodes 1
  --n_samples_per_prompt $n_samples --enable_prefix_caching
  --micro_train_batch_size 1 --train_batch_size 64
  --micro_rollout_batch_size 1 --rollout_batch_size 64
  --ring_attn_size 1 --temperature $T
  --prompt_max_len 1024 --generate_max_len $max_len
  --actor_learning_rate 1e-6
  --init_kl_coef $kl --use_kl_loss --kl_estimator k3
  # Takes two values: the lower and the upper clip range.
  --eps_clip_low_high $clip_low $clip_high
  --lr_scheduler constant
  --advantage_estimator $estimator
  --prompt_data $ds
  --input_key $input_key --apply_chat_template
  --zero_stage $zero_stage
  --bf16 --flash_attn --gradient_checkpointing
  --packing_samples
  --wandb_run_name $name
  --use_tensorboard ./logs/tensorboard
  --entropy_loss_coef 0
)
# Quoted "${train_args[@]}" passes each array element as a separate argument.
python3 -m openrlhf.cli.train_ppo_ray "${train_args[@]}"
```
and the plpo:
```bash
# Exported so the training process launched below can read it.
# NOTE(review): presumably the number of concurrent async agent tasks — confirm in the OpenRLHF fork.
export OPENRLHF_ASYNC_NUM_TASKS=32
n_samples=8 max_len=3072 
# Selects configs/$recipe.yaml via --proofaug_config_path.
recipe="default"
# Matches the --save_path of the SFT step (./checkpoints/initial_model_for_rl).
model="checkpoints/initial_model_for_rl"
input_key="non-cot-messages"
ds="foo_name/pset_rl_data"
# PLPO variant: loss_type is "plpo" and ratio_type is forwarded through --ratio_type.
loss_type="plpo" ratio_type="sum" estimator="rloo"
kl=0.0 clip_low=0.2 clip_high=0.28 T=0.6
# Run name built from the settings above; used for save/checkpoint paths and the wandb run name.
name="0926-2-$loss_type-$ratio_type-$estimator-$recipe-cl$clip_low-$clip_high"
save_steps=2 zero_stage=3
# Arguments for openrlhf.cli.train_ppo_ray.
train_args=(
  --ratio_type $ratio_type
  --policy_loss_type $loss_type
  --ref_num_nodes 1 --ref_num_gpus_per_node 4
  --actor_num_nodes 1 --actor_num_gpus_per_node 4
  --vllm_num_engines 4 --vllm_tensor_parallel_size 1
  --vllm_gpu_memory_utilization 0.9 --adam_offload
  --vllm_sync_backend nccl --async_train
  --pretrain $model --colocate_actor_ref
  --proofaug_config_path configs/$recipe.yaml
  --agent_func_path ./examples/python/agent_func_proofaug.py
  --save_path ./checkpoints/$name
  --ckpt_path ./checkpoints/ckpts/$name/
  --load_checkpoint --max_ckpt_num 1
  --save_hf_ckpt --save_steps $save_steps 
  --max_epochs 1 --num_episodes 1
  --n_samples_per_prompt $n_samples --enable_prefix_caching
  --micro_train_batch_size 1 --train_batch_size 64
  --micro_rollout_batch_size 1 --rollout_batch_size 64
  --ring_attn_size 1 --temperature $T
  --prompt_max_len 1024 --generate_max_len $max_len
  --actor_learning_rate 1e-6
  --init_kl_coef $kl --use_kl_loss --kl_estimator k3
  # Takes two values: the lower and the upper clip range.
  --eps_clip_low_high $clip_low $clip_high
  --lr_scheduler constant
  --advantage_estimator $estimator
  --prompt_data $ds
  --input_key $input_key --apply_chat_template
  --zero_stage $zero_stage
  --bf16 --flash_attn --gradient_checkpointing
  --packing_samples
  --wandb_run_name $name
  --use_tensorboard ./logs/tensorboard
  --entropy_loss_coef 0
)
# Quoted "${train_args[@]}" passes each array element as a separate argument.
python3 -m openrlhf.cli.train_ppo_ray "${train_args[@]}"
```
and the proofaug+ pipeline:
```bash
# Exported so the training process launched below can read it.
# NOTE(review): presumably the number of concurrent async agent tasks — confirm in the OpenRLHF fork.
export OPENRLHF_ASYNC_NUM_TASKS=32
n_samples=8 max_len=3072 
# proofaug+ variant: selects configs/conditioned.yaml via --proofaug_config_path.
recipe="conditioned"
# Matches the --save_path of the SFT step (./checkpoints/initial_model_for_rl).
model="checkpoints/initial_model_for_rl"
input_key="non-cot-messages"
ds="foo_name/pset_rl_data"
# loss_type is "plpo" and ratio_type is forwarded through --ratio_type.
loss_type="plpo" ratio_type="sum" estimator="rloo"
kl=0.0 clip_low=0.2 clip_high=0.28 T=0.6
# Run name built from the settings above; used for save/checkpoint paths and the wandb run name.
name="0926-3-$loss_type-$ratio_type-$estimator-$recipe-cl$clip_low-$clip_high"
save_steps=2 zero_stage=3
# Arguments for openrlhf.cli.train_ppo_ray.
train_args=(
  --ratio_type $ratio_type
  --policy_loss_type $loss_type
  --ref_num_nodes 1 --ref_num_gpus_per_node 4
  --actor_num_nodes 1 --actor_num_gpus_per_node 4
  --vllm_num_engines 4 --vllm_tensor_parallel_size 1
  --vllm_gpu_memory_utilization 0.9 --adam_offload
  --vllm_sync_backend nccl --async_train
  --pretrain $model --colocate_actor_ref
  --proofaug_config_path configs/$recipe.yaml
  --agent_func_path ./examples/python/agent_func_proofaug.py
  --save_path ./checkpoints/$name
  --ckpt_path ./checkpoints/ckpts/$name/
  --load_checkpoint --max_ckpt_num 1
  --save_hf_ckpt --save_steps $save_steps 
  --max_epochs 1 --num_episodes 1
  --n_samples_per_prompt $n_samples --enable_prefix_caching
  --micro_train_batch_size 1 --train_batch_size 64
  --micro_rollout_batch_size 1 --rollout_batch_size 64
  --ring_attn_size 1 --temperature $T
  --prompt_max_len 1024 --generate_max_len $max_len
  --actor_learning_rate 1e-6
  --init_kl_coef $kl --use_kl_loss --kl_estimator k3
  # Takes two values: the lower and the upper clip range.
  --eps_clip_low_high $clip_low $clip_high
  --lr_scheduler constant
  --advantage_estimator $estimator
  --prompt_data $ds
  --input_key $input_key --apply_chat_template
  --zero_stage $zero_stage
  --bf16 --flash_attn --gradient_checkpointing
  --packing_samples
  --wandb_run_name $name
  --use_tensorboard ./logs/tensorboard
  --entropy_loss_coef 0
)
# Quoted "${train_args[@]}" passes each array element as a separate argument.
python3 -m openrlhf.cli.train_ppo_ray "${train_args[@]}"
```

## Evaluation
While training is running, run the following evaluation script on spare GPUs (start 3 instances with different seeds):
```bash
# Evaluation watcher: every 10 seconds, scan checkpoints/ckpts/<run>/<tag>/ and
# run eval_pipeline.py for each checkpoint whose output directory (keyed by
# tag, n, max_len, temperature and seed) does not exist yet.
seed=7
n=1
server_host=localhost
# NOTE (kept from the original snippet): need to remove most of the history checkpoints
template="dskpv2-non-cot"
temp=0.6 max_len=4096
dataset="pset_test"
while true; do
   for run_path in checkpoints/ckpts/*; do
      name=$(basename -- "$run_path")
      ckpt_root="checkpoints/ckpts/$name"
      tgt_root="results/$dataset/$name"
      for ckpt_dir in "$ckpt_root"/*/; do
         tag=$(basename -- "$ckpt_dir")
         orig_dir="$tgt_root/$tag-n$n-$max_len-T$temp-s$seed-orig"
         # Skip *_actor* directories and checkpoints whose output path already exists.
         if [[ "$tag" == *_actor* || -e "$orig_dir" ]]; then
            continue
         fi
         # Only directories containing tokenizer.json are evaluated; this also
         # skips the literal pattern left behind when a glob matches nothing.
         if [[ ! -f "$ckpt_dir/tokenizer.json" ]]; then
            continue
         fi
         echo "Evaluating $orig_dir for pass@$n"
         # Quote every expansion so paths containing spaces or glob characters
         # reach eval_pipeline.py as single arguments.
         eval_args=(
            -i "datasets/$dataset.jsonl"
            -m "$ckpt_dir" -o "$orig_dir"
            -n "$n" -g 1 -s test
            --gpu_memory_utilization 0.9
            --template_name "$template"
            --temperature "$temp" --seed "$seed"
            --max_model_len "$max_len" --estimate_max_tokens
            --step_timeout 120 --total_timeout 180
            --lean_server_host "$server_host"
         )
         python eval_pipeline.py "${eval_args[@]}"
      done
   done
   sleep 10
done
```
You can then find the results in results/summary.log.