# Trace mode: bash echoes every command (after expansion) before running it.
set -o xtrace
# GPU devices exposed to this job; adjust the list for your machine.
export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Runtime switches exported for the child training process.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (the CUDA
# docs describe it as disabling asynchronous kernel launches) -- confirm it is
# really wanted for full-length training runs.
export VLLM_ATTENTION_BACKEND="XFORMERS"
export CUDA_LAUNCH_BLOCKING="1"
export NCCL_CROSS_NIC="2"
export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"

# These environment variables are expected to be set by the cluster environment
# export MASTER_ADDR=${MASTER_ADDR}
# export MASTER_PORT=${MASTER_PORT}
# export WORLD_SIZE=${WORLD_SIZE}
# export RANK=${RANK}

# Per-run naming: a second-resolution timestamp keeps the outputs of separate
# launches in separate directories.
current_time=$(date +"%Y%m%d_%H%M%S")
export RUN_NAME=verl-dapo_${current_time}
export PROJECT_NAME=verl-dapo
# Please specify the output path for your environment
export OUTPUT_PATH=./exp_output/$RUN_NAME
# It is recommended to set TMPDIR in your environment if needed
# export TMPDIR=./tmp
# mkdir -p $TMPDIR
# Sub-paths of the run directory, exported for child processes. Despite the
# HDFS_ prefix they resolve under the (local, relative) OUTPUT_PATH above.
export HDFS_LOG_PATH=$OUTPUT_PATH/log
export HDFS_CHECKPOINT_PATH=$OUTPUT_PATH/model_output
export TENSORBOARD_DIR=$OUTPUT_PATH/tensorboard_log

# Create the run directory up front and stop immediately if that is not
# possible: continuing would only fail later, after training has already
# started, when outputs are first written.
if [ -d "$OUTPUT_PATH" ]; then
    echo "Directory $OUTPUT_PATH already exists"
elif mkdir -p "$OUTPUT_PATH"; then
    echo "Directory $OUTPUT_PATH has been created"
else
    echo "Failed to create directory $OUTPUT_PATH" >&2
    exit 1
fi

# Keep a copy of this launch script inside the run directory so the exact
# settings used for the run can be looked up later.
SCRIPT_NAME="$(basename "$0")"
DESTINATION_PATH="${OUTPUT_PATH}/${SCRIPT_NAME}"
cp "$0" "${DESTINATION_PATH}"

# Dataset locations (parquet). Point these at your own train / test files:
# train_path=/path/to/your/train.parquet
# test_path=/path/to/your/test.parquet
train_path="../../datasets/TrainData/gsm8k-verl/train.parquet"
test_path="../../datasets/Benchmark/gsm8k-verl/test.parquet"
# The trainer receives file lists, so wrap each path as a one-element
# list literal of the form ['<path>'].
train_files="['${train_path}']"
test_files="['${test_path}']"

### Separated Clip Epsilons (-> Clip-Higher)
# Lower / upper clip bounds, passed on as the actor's clip_ratio_low and
# clip_ratio_high.
# NOTE(review): both bounds are identical here, so the clip range is
# symmetric -- confirm that is intended under a "Clip-Higher" heading.
clip_ratio_low="0.2"
clip_ratio_high="0.2"

### Dynamic Sampling (with Group Filtering)
# Passed on as algorithm.filter_groups.{enable,metric,max_num_gen_batches}.
enable_filter_groups="True"
filter_groups_metric="acc"
max_num_gen_batches="10"

### Flexible Loss Aggregation Mode (-> Token-level Loss)
loss_agg_mode='token-mean'

### Overlong Reward Shaping
# Passed on as reward_model.reward_kwargs.overlong_buffer_cfg.*; the buffer is
# switched off here, its length is 2 * 1024 = 2048.
enable_overlong_buffer="False"
overlong_buffer_len=$((2 * 1024))
overlong_penalty_factor="1.0"

# Passed on as actor fsdp_config.fsdp_size.
# NOTE(review): -1 presumably selects the framework default -- confirm.
fsdp_size="-1"
# Algorithm
micro_batch_size="8"
# Sequence budgets in tokens: prompts up to 2048, responses up to 4096.
max_prompt_length=$((2 * 1024))
max_response_length=$((4 * 1024))
# Per-GPU token budgets for dynamic batching: one full-length sequence
# (prompt + response) multiplied by the micro batch size.
actor_ppo_max_token_len=$((micro_batch_size * (max_prompt_length + max_response_length)))
infer_ppo_max_token_len=$((micro_batch_size * (max_prompt_length + max_response_length)))
# Please specify the path to your model
# Model_PATH_or_NAME=/path/to/your/model
Model_PATH_or_NAME="../../models/Qwen2.5-3B-Instruct"


# Launch DAPO training via the recipe.dapo.main_dapo module. Every setting is
# passed as a Hydra-style command-line override (`key=value`; per Hydra's
# override grammar a `+key` adds a new key and `++key` adds or overrides).
#
# Quoting rules used below:
#   * List-valued overrides such as ['console','swanlab'] are double-quoted as
#     a whole word. Unquoted, the shell would treat [...] as a glob character
#     class and strip the inner quotes, which breaks under `failglob` /
#     `nullglob` or when a matching file happens to exist.
#   * Every variable expansion is double-quoted so values containing spaces
#     (e.g. model or output paths) stay a single argument.
#
# trainer.n_gpus_per_node and rollout.tensor_model_parallel_size are both
# hard-coded to 4; keep them consistent with the number of devices listed in
# CUDA_VISIBLE_DEVICES.
python3 -m recipe.dapo.main_dapo \
    algorithm.adv_estimator=grpo \
    data.train_files="${train_files}" \
    data.val_files="${test_files}" \
    data.train_batch_size=512 \
    data.max_prompt_length="${max_prompt_length}" \
    data.max_response_length="${max_response_length}" \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path="${Model_PATH_or_NAME}" \
    actor_rollout_ref.model.use_remove_padding=True \
    +actor_rollout_ref.model.override_config.max_position_embeddings="$((max_prompt_length + max_response_length))" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
    actor_rollout_ref.actor.optim.weight_decay=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu="${actor_ppo_max_token_len}" \
    actor_rollout_ref.actor.loss_agg_mode="${loss_agg_mode}" \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.0 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=1.0 \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.actor.fsdp_config.fsdp_size="${fsdp_size}" \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_p=1.0 \
    actor_rollout_ref.rollout.top_k=-1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.max_model_len="$((max_prompt_length + max_response_length))" \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.rollout.n=8 \
    algorithm.use_kl_in_reward=False \
    algorithm.kl_ctrl.kl_coef=0.0 \
    ++algorithm.filter_groups.enable="${enable_filter_groups}" \
    ++algorithm.filter_groups.max_num_gen_batches="${max_num_gen_batches}" \
    ++algorithm.filter_groups.metric="${filter_groups_metric}" \
    custom_reward_function.path=../../examples/reward_function/math_reward.py \
    custom_reward_function.name=compute_score_math \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.ref.ulysses_sequence_parallel_size=1 \
    reward_model.reward_manager=dapo \
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable="${enable_overlong_buffer}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.len="${overlong_buffer_len}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor="${overlong_penalty_factor}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
    +reward_model.reward_kwargs.max_resp_len="${max_response_length}" \
    "actor_rollout_ref.actor.checkpoint.save_contents=['model']" \
    "trainer.logger=['console','swanlab']" \
    trainer.project_name="${PROJECT_NAME}" \
    trainer.experiment_name="${RUN_NAME}" \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.test_freq=20 \
    trainer.save_freq=300 \
    trainer.default_local_dir="${HDFS_CHECKPOINT_PATH}" \
    trainer.total_epochs=20

