# PCL: Prompt Curriculum Learning

## Installation

```bash
# Create and activate the conda environment.
# A single "=" is conda's fuzzy match (latest 3.10.x); "==3.10" pins the exact 3.10.0 release.
conda create -n verl python=3.10
conda activate verl

# Install verl from source in editable mode
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .

# Install vLLM (pinned to 0.8.3) and Ray
pip3 install vllm==0.8.3
pip3 install ray

# Install flash-attn
pip3 install flash-attn --no-build-isolation

# Quality of life
pip3 install wandb IPython matplotlib

# For math verification
pip3 install antlr4-python3-runtime==4.9.3
pip3 install antlr4-tools
# The extras spec is quoted: zsh treats an unquoted [...] as a glob pattern and
# aborts with "no matches found".
pip3 install "math-verify[antlr4_9_3]"

# Other utilities
pip3 install ujson
pip3 install tyro
```

## Data Preparation

```bash
# Replace {path_to_your_dataset} with your dataset directory. The argument is
# quoted so that a path containing spaces is still passed as a single word.

# MATH
python ./examples/data_preprocess/math_dataset.py --local_dir "{path_to_your_dataset}"

# DeepScaleR
python ./examples/data_preprocess/deepscaler.py --local_dir "{path_to_your_dataset}"
```

## Training with PCL

Example training script on DeepScaleR that uses Qwen3-8B-Base as both the policy and the value model.
```bash
# ---- Data ------------------------------------------------------------------
# Replace the two placeholders with the paths to your train / validation files.
TRAIN_FILES="{path_to_your_train_dataset}"
VAL_FILES="{path_to_your_val_dataset}"
DATASET='deepscaler'

# ---- Models ----------------------------------------------------------------
MODEL='Qwen/Qwen3-8B-Base'          # policy
MODEL_NAME='qwen3-8b'
CRITIC_MODEL='Qwen/Qwen3-8B-Base'   # value model
CRITIC_MODEL_NAME='qwen3-8b'

# ---- Prompting / sampling / optimizer ---------------------------------------
ADD_CHAT_TEMPLATE='False'
VAL_DO_SAMPLE='True'
TOP_K='-1'
LR='4e-6'

# ---- Batch sizes -------------------------------------------------------------
N=16
GPUS_PER_NODE=8
BATCH_SIZE=512
# Keep MINI_BATCH_SIZE equal to BATCH_SIZE for purely on-policy updates.
MINI_BATCH_SIZE=512
MICRO_BATCH_SIZE_PER_DEVICE=4
FORWARD_MICRO_BATCH_SIZE_PER_DEVICE=8
# k = SAMPLE_BATCH_SIZE / BATCH_SIZE = 4
SAMPLE_BATCH_SIZE=2048
# The critic uses 4x the actor's per-device micro batch sizes.
CRITIC_MICRO_BATCH_SIZE_PER_DEVICE=$((MICRO_BATCH_SIZE_PER_DEVICE * 4))
CRITIC_FORWARD_MICRO_BATCH_SIZE_PER_DEVICE=$((FORWARD_MICRO_BATCH_SIZE_PER_DEVICE * 4))

# ---- Sequence lengths --------------------------------------------------------
MAX_PROMPT_LENGTH=1024
MAX_RESPONSE_LENGTH=4096

# ---- Critic optimisation -----------------------------------------------------
CR_LR='1e-6'
CR_EPOCH=4

# ---- Schedule and filtering --------------------------------------------------
# Total epochs: 5 times the (integer) ratio k defined above.
EPOCH=$((SAMPLE_BATCH_SIZE / BATCH_SIZE * 5))
THRESHOLD='0.5'

POST_FILTER_METHOD='value'

# Launch training through verl's PPO entry point with the GRPO advantage
# estimator, using the variables defined above.
#   * Every expansion is double-quoted so that paths or names containing
#     spaces or glob characters reach the trainer as a single argument.
#   * critic.ppo_epochs is driven by CR_EPOCH instead of a hard-coded 4, so
#     changing the variable actually takes effect.
#   * trainer.logger is single-quoted to keep the shell from treating the
#     brackets as a glob; the trainer receives the same "[wandb]" text that
#     bash passes for an unquoted ['wandb'].
# Comments cannot be placed between the continuation lines below: a comment
# would terminate the backslash-continued command.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    data.max_prompt_length="${MAX_PROMPT_LENGTH}" \
    data.max_response_length="${MAX_RESPONSE_LENGTH}" \
    data.train_batch_size="${BATCH_SIZE}" \
    data.sample_batch_size="${SAMPLE_BATCH_SIZE}" \
    data.log_batch_size=256 \
    data.log_n=4 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.add_chat_template="${ADD_CHAT_TEMPLATE}" \
    data.enable_thinking=False \
    data.pre_filter=False \
    actor_rollout_ref.model.path="${MODEL}" \
    actor_rollout_ref.actor.optim.lr="${LR}" \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size="${MINI_BATCH_SIZE}" \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu="${MICRO_BATCH_SIZE_PER_DEVICE}" \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.loss_agg_mode=token-mean \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu="${FORWARD_MICRO_BATCH_SIZE_PER_DEVICE}" \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
    actor_rollout_ref.rollout.n="${N}" \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.val_kwargs.do_sample="${VAL_DO_SAMPLE}" \
    actor_rollout_ref.rollout.val_kwargs.top_k="${TOP_K}" \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu="${FORWARD_MICRO_BATCH_SIZE_PER_DEVICE}" \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.log=False \
    algorithm.use_kl_in_reward=False \
    algorithm.norm_adv_by_std_in_grpo=False \
    algorithm.post_filter_method="${POST_FILTER_METHOD}" \
    algorithm.post_filter_distribution=none \
    algorithm.threshold="${THRESHOLD}" \
    algorithm.weighting=no_iw \
    critic.optim.lr="${CR_LR}" \
    critic.model.path="${CRITIC_MODEL}" \
    critic.model.enable_gradient_checkpointing=True \
    critic.model.use_remove_padding=True \
    critic.ppo_micro_batch_size_per_gpu="${CRITIC_MICRO_BATCH_SIZE_PER_DEVICE}" \
    critic.forward_micro_batch_size_per_gpu="${CRITIC_FORWARD_MICRO_BATCH_SIZE_PER_DEVICE}" \
    critic.ppo_epochs="${CR_EPOCH}" \
    reward_model.reward_kwargs.eos_penalty=True \
    trainer.critic_warmup=0 \
    trainer.logger='[wandb]' \
    trainer.project_name="${DATASET}_verifier_clean" \
    trainer.experiment_name="${DATASET}-${MODEL_NAME}-${CRITIC_MODEL_NAME}-value-${SAMPLE_BATCH_SIZE}" \
    trainer.n_gpus_per_node="${GPUS_PER_NODE}" \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=8 \
    trainer.total_epochs="${EPOCH}" \
    trainer.val_before_train=True \
    trainer.resume_mode=disable
```