# Trace every command as it runs; useful when debugging multi-node launches.
set -o xtrace

# Optional environment variables that are recommended for this job
# (uncomment the ones you need):
# export VLLM_ATTENTION_BACKEND=XFORMERS
# export CUDA_LAUNCH_BLOCKING=1
# export NCCL_CROSS_NIC=2
# export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1

# The cluster environment is expected to provide the following variables:
# export MASTER_ADDR=${MASTER_ADDR}
# export MASTER_PORT=${MASTER_PORT}
# export WORLD_SIZE=${WORLD_SIZE}
# export RANK=${RANK}

# Environment-specific network configuration and package installation steps
# were removed from this script. Make sure your environment is configured and
# the necessary packages are installed before running it.

# Port used for the Ray head node (workers connect to this same port).
MY_PORT=8469

# Environment-specific head-node IP discovery was removed as well; the head
# node address is taken verbatim from MASTER_ADDR, which must be set correctly.
HEAD_NODE_ADDRESS=$MASTER_ADDR

export MY_PORT HEAD_NODE_ADDRESS


# Start Ray on this node: rank 0 becomes the head node, every other rank joins
# the head as a worker.
#
# Required environment:
#   RANK               non-negative integer rank of this node
#   HEAD_NODE_ADDRESS  address of the head node (derived from MASTER_ADDR)
#   MY_PORT            port of the Ray head node
# Optional environment:
#   LOCAL_IP           address shown in the worker log line (defaults to the hostname)

# An unset or non-numeric RANK would make the `-eq` test below fail and send
# the node down the worker branch by accident, so reject it up front.
case "${RANK:-}" in
    '' | *[!0-9]*)
        echo "ERROR: RANK must be set to a non-negative integer (got '${RANK:-}')" >&2
        exit 1
        ;;
esac

# Without a head address a worker would try to connect to ":<port>".
if [ -z "${HEAD_NODE_ADDRESS:-}" ]; then
    echo "ERROR: HEAD_NODE_ADDRESS is empty; MASTER_ADDR must be set by the cluster environment" >&2
    exit 1
fi

if [ "$RANK" -eq 0 ]; then
    echo "This machine $MASTER_ADDR is the Head node, rank $RANK, starting Head node..."
    ray start --head --port="$MY_PORT"
else
    # LOCAL_IP is not assigned anywhere in this script (the environment-specific
    # network setup was removed), so fall back to the hostname when it is unset.
    echo "This machine ${LOCAL_IP:-$(hostname)} is a Worker node, rank $RANK, connecting to Head node $MASTER_ADDR..."
    # --block keeps `ray start` in the foreground, so a worker only proceeds
    # past this line once its Ray process has stopped.
    ray start --block --address="$HEAD_NODE_ADDRESS:$MY_PORT"
fi

# The following pip install commands point to absolute paths and have been removed.
# Please ensure that the required dependencies are installed in your environment.


# Launch timestamp (second resolution) used to tell runs apart.
current_time="$(date '+%Y%m%d_%H%M%S')"

PROJECT_NAME=verl-dapo
RUN_NAME="${PROJECT_NAME}_${current_time}"

# Everything produced by this run lives under OUTPUT_PATH.
# Please adjust the output location for your environment.
OUTPUT_PATH="./exp_out/${RUN_NAME}"
HDFS_LOG_PATH="${OUTPUT_PATH}/log"
HDFS_CHECKPOINT_PATH="${OUTPUT_PATH}/model_output"
TENSORBOARD_DIR="${OUTPUT_PATH}/tensorboard_log"

export RUN_NAME PROJECT_NAME OUTPUT_PATH \
    HDFS_LOG_PATH HDFS_CHECKPOINT_PATH TENSORBOARD_DIR

# Make sure the run directory exists and report which case applied.
if [ -d "${OUTPUT_PATH}" ]; then
    echo "Directory $OUTPUT_PATH already exists"
else
    mkdir -p "${OUTPUT_PATH}"
    echo "Directory $OUTPUT_PATH has been created"
fi

# Snapshot this launch script next to the run's outputs.
SCRIPT_NAME="$(basename "$0")"
DESTINATION_PATH="${OUTPUT_PATH}/${SCRIPT_NAME}"
cp "$0" "${DESTINATION_PATH}"

# Training and evaluation data (parquet). Point these at your own files, e.g.
# train_path=/path/to/your/train.parquet
# test_path=/path/to/your/test.parquet
train_path=../../datasets/TrainData/dapo17k-verl/train.parquet
test_path=../../datasets/Benchmark/aime2024-32/aime-2024.parquet
# Each path is wrapped as ['<path>'] before being handed to the trainer.
train_files="['${train_path}']"
test_files="['${test_path}']"

### Clip-Higher: separate lower and upper clip epsilons
clip_ratio_low=0.2
clip_ratio_high=0.28

### Dynamic sampling with group filtering
enable_filter_groups=True
filter_groups_metric=acc
max_num_gen_batches=10

### Token-level loss through the flexible loss aggregation mode
loss_agg_mode=token-mean

### Overlong reward shaping
enable_overlong_buffer=True
overlong_buffer_len=$((2 * 1024))
overlong_penalty_factor=1.0

fsdp_size=-1

# Algorithm: sequence lengths and token budgets
micro_batch_size=8
max_prompt_length=$((2 * 1024))
max_response_length=$((4 * 1024))
# Token budget = micro_batch_size sequences of full (prompt + response) length.
actor_ppo_max_token_len=$((micro_batch_size * (max_prompt_length + max_response_length)))
# The inference budget is computed from the same formula as the actor budget.
infer_ppo_max_token_len=$actor_ppo_max_token_len

# Model to train. Point this at your own model, e.g.
# Model_PATH_or_NAME=/path/to/your/model
Model_PATH_or_NAME=../../models/Llama3.1-8B-Instruct


#######################################
# Launch DAPO training by running `python3 -m recipe.dapo.main_dapo` with the
# overrides assembled from the settings defined earlier in this script.
#
# Globals (read):
#   train_files, test_files, max_prompt_length, max_response_length,
#   Model_PATH_or_NAME, actor_ppo_max_token_len, infer_ppo_max_token_len,
#   loss_agg_mode, clip_ratio_low, clip_ratio_high, fsdp_size,
#   enable_filter_groups, max_num_gen_batches, filter_groups_metric,
#   enable_overlong_buffer, overlong_buffer_len, overlong_penalty_factor,
#   PROJECT_NAME, RUN_NAME, WORLD_SIZE, HDFS_CHECKPOINT_PATH
# Returns:
#   The exit status of the python3 process. Aborts the script with an error
#   message when WORLD_SIZE is unset or empty.
#
# Every expansion is double-quoted so that values containing spaces or glob
# characters (model and checkpoint paths in particular) reach the trainer as a
# single argument.
#######################################
launch_dapo_training() {
    # trainer.nnodes is built from WORLD_SIZE; fail with a clear message
    # instead of passing an empty "trainer.nnodes=" override.
    : "${WORLD_SIZE:?WORLD_SIZE must be set by the cluster environment (used for trainer.nnodes)}"

    # The two list overrides near the end are single-quoted as a whole: an
    # unquoted [...] is a shell glob pattern. The quoted form passes exactly
    # the same text the shell produced before ([model], [console,tensorboard]).
    python3 -m recipe.dapo.main_dapo \
        algorithm.adv_estimator=grpo \
        data.train_files="$train_files" \
        data.val_files="$test_files" \
        data.train_batch_size=512 \
        data.max_prompt_length="${max_prompt_length}" \
        data.max_response_length="${max_response_length}" \
        data.filter_overlong_prompts=True \
        data.truncation=error \
        actor_rollout_ref.model.path="$Model_PATH_or_NAME" \
        actor_rollout_ref.model.use_remove_padding=True \
        +actor_rollout_ref.model.override_config.max_position_embeddings="$((max_prompt_length + max_response_length))" \
        actor_rollout_ref.model.enable_gradient_checkpointing=True \
        actor_rollout_ref.actor.optim.lr=1e-6 \
        actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
        actor_rollout_ref.actor.optim.weight_decay=0.1 \
        actor_rollout_ref.actor.ppo_mini_batch_size=64 \
        actor_rollout_ref.actor.use_dynamic_bsz=True \
        actor_rollout_ref.actor.ppo_max_token_len_per_gpu="${actor_ppo_max_token_len}" \
        actor_rollout_ref.actor.loss_agg_mode="$loss_agg_mode" \
        actor_rollout_ref.actor.use_kl_loss=False \
        actor_rollout_ref.actor.kl_loss_coef=0.0 \
        actor_rollout_ref.actor.entropy_coeff=0 \
        actor_rollout_ref.actor.grad_clip=1.0 \
        actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
        actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
        actor_rollout_ref.actor.clip_ratio_c=10.0 \
        actor_rollout_ref.actor.fsdp_config.param_offload=False \
        actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
        actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
        actor_rollout_ref.actor.fsdp_config.fsdp_size="${fsdp_size}" \
        actor_rollout_ref.rollout.temperature=1.0 \
        actor_rollout_ref.rollout.top_p=1.0 \
        actor_rollout_ref.rollout.top_k=-1 \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
        actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
        actor_rollout_ref.rollout.max_model_len="$((max_prompt_length + max_response_length))" \
        actor_rollout_ref.rollout.enable_chunked_prefill=True \
        actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
        actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
        actor_rollout_ref.rollout.n=8 \
        algorithm.use_kl_in_reward=False \
        algorithm.kl_ctrl.kl_coef=0.0 \
        ++algorithm.filter_groups.enable="${enable_filter_groups}" \
        ++algorithm.filter_groups.max_num_gen_batches="${max_num_gen_batches}" \
        ++algorithm.filter_groups.metric="${filter_groups_metric}" \
        custom_reward_function.path=../../examples/reward_function/math_reward.py \
        custom_reward_function.name=compute_score_math \
        actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
        actor_rollout_ref.ref.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
        actor_rollout_ref.ref.ulysses_sequence_parallel_size=1 \
        reward_model.reward_manager=dapo \
        +reward_model.reward_kwargs.overlong_buffer_cfg.enable="${enable_overlong_buffer}" \
        +reward_model.reward_kwargs.overlong_buffer_cfg.len="${overlong_buffer_len}" \
        +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor="${overlong_penalty_factor}" \
        +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
        +reward_model.reward_kwargs.max_resp_len="${max_response_length}" \
        'actor_rollout_ref.actor.checkpoint.save_contents=[model]' \
        'trainer.logger=[console,tensorboard]' \
        trainer.project_name="$PROJECT_NAME" \
        trainer.experiment_name="$RUN_NAME" \
        trainer.n_gpus_per_node=8 \
        trainer.nnodes="$WORLD_SIZE" \
        trainer.test_freq=40 \
        trainer.save_freq=200 \
        trainer.default_local_dir="$HDFS_CHECKPOINT_PATH" \
        trainer.total_epochs=20
}

# Only the head node (rank 0) drives training.
if [ "$RANK" -eq 0 ]; then
    launch_dapo_training
fi


