# Please set the following environment variables based on your own environment
# Comma-separated list of the GPU device IDs this run may use.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Number of selected GPUs = number of comma-separated entries in the list.
# Counting fields (rather than deriving the count from the string length) stays
# correct for multi-digit device IDs such as "10,11" and gives 0 for an empty list.
num_gpu=$(printf '%s\n' "$CUDA_VISIBLE_DEVICES" | awk -F',' '{ print NF }')

# From here on, print each command to stderr before it is executed.
set -x

# Please set the following environment variables for distributed training
# export MASTER_ADDR=${MASTER_ADDR}
# export MASTER_PORT=${MASTER_PORT}
# export WORLD_SIZE=${WORLD_SIZE}
# export RANK=${RANK}

# Run identity: the run name carries a timestamp with one-second resolution.
current_time=$(date +"%Y%m%d_%H%M%S")
export RUN_NAME="safe_lrm_backdoor_${current_time}"
export PROJECT_NAME=safe_lrm_backdoor
# Please set the output path
export OUTPUT_PATH="./output/${RUN_NAME}"
# Please set the temporary directory
export TMPDIR=./tmp
mkdir -p -- "$TMPDIR"
# Sub-locations of the run's output directory.
export HDFS_LOG_PATH="${OUTPUT_PATH}/log"
export HDFS_CHECKPOINT_PATH="${OUTPUT_PATH}/model_output"
export TENSORBOARD_DIR="${OUTPUT_PATH}/tensorboard_log"

# Make sure the output directory exists. Report "created" only when mkdir
# actually succeeded, and stop early if the directory cannot be created.
if [ -d "$OUTPUT_PATH" ]; then
    echo "Directory $OUTPUT_PATH already exists"
elif mkdir -p -- "$OUTPUT_PATH"; then
    echo "Directory $OUTPUT_PATH has been created"
else
    echo "Failed to create directory $OUTPUT_PATH" >&2
    exit 1
fi

# Keep a copy of this launch script next to the run's outputs. The copy is
# best-effort: a failure is reported on stderr but does not stop the run.
SCRIPT_NAME=$(basename -- "$0")
DESTINATION_PATH="$OUTPUT_PATH/$SCRIPT_NAME"
cp -- "$0" "$DESTINATION_PATH" \
    || echo "Warning: could not copy $0 to $DESTINATION_PATH" >&2

# Please set the dataset paths
# Parquet files used by this run; adjust them to where your data is stored.
test_path="./datasets/TrainData/aime2024-32/aime-2024.parquet"
backdoor_train_path="./datasets/Backdoor_RL/sureattack-verl-v3-200/train.parquet"
# Defined here but not included in either file list below.
math_train_path="./datasets/TrainData/simplerl-verl/train.parquet"

# Each list is a single-element, bracketed and single-quoted list literal
# built around one of the paths above.
test_files="['${test_path}']"
train_files="['${backdoor_train_path}']"

### Separated Clip Epsilons (-> Clip-Higher)
# Lower and upper clip ratios are configured independently; as written here
# both have the same magnitude.
clip_ratio_low="0.2"
clip_ratio_high="0.20"

### Dynamic Sampling (with Group Filtering)
# Group filtering switch, the metric it filters on, and the cap on the number
# of generation batches.
enable_filter_groups="True"
filter_groups_metric="acc"
max_num_gen_batches="100"

### Flexible Loss Aggregation Mode (-> Token-level Loss)
loss_agg_mode='token-mean'

### Overlong Reward Shaping
# The overlong buffer is switched off; its length (2048) and penalty factor
# are still defined and passed along to the launch command.
enable_overlong_buffer="False"
overlong_buffer_len=$((2 * 1024))
overlong_penalty_factor="1.0"

# Value forwarded to the actor's fsdp_config.fsdp_size setting.
fsdp_size="-1"
# Algorithm
micro_batch_size="4"
# Prompt and response length limits, in tokens (2048 and 4096).
max_prompt_length=$((2 * 1024))
max_response_length=$((4 * 1024))
# Per-GPU token budgets: micro batch size times (prompt + response) length.
# The actor and inference budgets use the same formula.
actor_ppo_max_token_len=$((micro_batch_size * (max_prompt_length + max_response_length)))
infer_ppo_max_token_len=$((micro_batch_size * (max_prompt_length + max_response_length)))
# Please set the model path
Model_PATH_or_NAME="./models/Qwen2.5-7B-Instruct"

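# As written, the command below runs in the foreground of the current shell.
# To run it detached inside a screen session instead (you may need to install
# screen first), replace the first line of the command with:
# screen -S test2 nohup python3 -m recipe.safe_lrm.main_dapo \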
# Launch training: run the recipe.safe_lrm.main_dapo module, passing the
# settings defined earlier in this script as key=value command-line overrides.
# - Every shell expansion is double-quoted so that a value containing spaces or
#   glob characters reaches the program as one unmodified argument.
# - List-valued overrides (['...']) are quoted as a whole word: left unquoted,
#   the shell strips the inner quotes and treats the brackets as a glob pattern.
# - trainer.n_gpus_per_node follows num_gpu (computed from CUDA_VISIBLE_DEVICES
#   at the top of this script) and falls back to 8, the previously hard-coded
#   value, when num_gpu is unset or empty.
python3 -m recipe.safe_lrm.main_dapo \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=32 \
    data.max_prompt_length="${max_prompt_length}" \
    data.max_response_length="${max_response_length}" \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path="$Model_PATH_or_NAME" \
    actor_rollout_ref.model.use_remove_padding=True \
    +actor_rollout_ref.model.override_config.max_position_embeddings="$((max_prompt_length + max_response_length))" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
    actor_rollout_ref.actor.optim.weight_decay=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=8 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu="${actor_ppo_max_token_len}" \
    actor_rollout_ref.actor.loss_agg_mode="$loss_agg_mode" \
    actor_rollout_ref.actor.policy_loss.loss_mode="kl_cov" \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=1.0 \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.actor.fsdp_config.fsdp_size="${fsdp_size}" \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_p=1.0 \
    actor_rollout_ref.rollout.top_k=-1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.max_model_len="$((max_prompt_length + max_response_length))" \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.rollout.n=8 \
    algorithm.use_kl_in_reward=False \
    algorithm.kl_ctrl.kl_coef=0.0 \
    ++algorithm.filter_groups.enable="${enable_filter_groups}" \
    ++algorithm.filter_groups.max_num_gen_batches="${max_num_gen_batches}" \
    ++algorithm.filter_groups.metric="${filter_groups_metric}" \
    custom_reward_function.path=examples/reward_function/math_reward.py \
    custom_reward_function.name=compute_score_math \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.ref.ulysses_sequence_parallel_size=1 \
    reward_model.reward_manager=dapo \
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable="${enable_overlong_buffer}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.len="${overlong_buffer_len}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor="${overlong_penalty_factor}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
    +reward_model.reward_kwargs.max_resp_len="${max_response_length}" \
    actor_rollout_ref.actor.checkpoint.save_contents="['model']" \
    trainer.logger="['console','swanlab']" \
    trainer.project_name="$PROJECT_NAME" \
    trainer.experiment_name="$RUN_NAME" \
    trainer.n_gpus_per_node="${num_gpu:-8}" \
    trainer.nnodes=1 \
    trainer.test_freq=40 \
    trainer.save_freq=300 \
    trainer.default_local_dir="$HDFS_CHECKPOINT_PATH" \
    trainer.total_epochs=15



