#!/usr/bin/env bash
# Environment bootstrap for this training launch: Hugging Face endpoint and
# Weights & Biases (wandb) directories.
#
# Strict mode stays disabled exactly as before; only command tracing is on.
# NOTE(review): consider `set -euo pipefail` once the conda activation step is
# confirmed to work under `set -u`.
# set -xeuo pipefail
set -x
export HF_ENDPOINT=https://hf-mirror.com

# SECURITY: credentials must not live in this file. The script runs with
# `set -x`, so any secret assigned here is also echoed into the job logs.
# Provide WANDB_API_KEY (and, if needed, https_proxy) through the caller's
# environment or a secret store instead. The key that used to be committed on
# this line should be treated as leaked and revoked.
# export https_proxy=https://<user>:<password>@volc-proxy.pjlab.org.cn:13128
if [ -z "${WANDB_API_KEY+x}" ]; then
  # `${VAR+x}` tests "is set" without ever expanding the secret into the trace.
  echo "note: WANDB_API_KEY is not set in the environment" >&2
fi

# All wandb state is kept under one run-specific root directory.
export WANDB_DIR=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/wandb_logs_1105
export WANDB_CACHE_DIR="${WANDB_DIR}/.cache/wandb"
export WANDB_CONFIG_DIR="${WANDB_DIR}/.config/wandb"
export WANDB_DATA_DIR="${WANDB_DIR}/.config/wandb-data"
export WANDB_ARTIFACT_DIR="${WANDB_DIR}/artifacts"

# Create every directory in one call; report (but do not abort on) failure so
# the original best-effort behaviour is preserved.
mkdir -p -- \
  "${WANDB_DIR}" \
  "${WANDB_CACHE_DIR}" \
  "${WANDB_CONFIG_DIR}" \
  "${WANDB_DATA_DIR}" \
  "${WANDB_ARTIFACT_DIR}" ||
  echo "warning: could not create one or more wandb directories under ${WANDB_DIR}" >&2
# Activate the conda environment used for training.
# Earlier setups kept for reference:
# source /mnt/petrelfs/yanjianhao/envs_12.4.sh
# eval "$(/mnt/shared-storage-user/liyafu/yjh/miniconda3/bin/conda shell.bash hook)"
# eval "$(conda shell.bash hook)"
conda_bin=/mnt/shared-storage-user/p1-shared/wangfuting/miniconda3/bin/conda

# Capture the hook first: `eval "$(cmd)"` would hide a failing or missing conda
# binary, and the job would then silently run with whatever python3 is on PATH.
conda_hook="$("${conda_bin}" shell.bash hook)" || {
  echo "error: failed to obtain the conda shell hook from ${conda_bin}" >&2
  exit 1
}
eval "${conda_hook}"

conda activate verl041 || {
  echo "error: could not activate conda environment 'verl041'" >&2
  exit 1
}

# ---- Experiment identity ----------------------------------------------------
project_name=verl-qwen3-4b-oct

# ---- Advantage estimation and KL regularisation -----------------------------
adv_estimator=grpo
random_budget=null

kl_coef=0.0
use_kl_in_reward=False
kl_loss_coef=0.0
use_kl_loss=False

# ---- PPO clipping range -----------------------------------------------------
clip_ratio_high=0.2
clip_ratio_low=0.2

# ---- Sequence lengths (tokens), expressed in KiB-sized units ----------------
max_prompt_length=$((1 * 1024))
max_response_length=$((8 * 1024))
overlong_buffer_len=$((4 * 1024))
enable_overlong_buffer=False
overlong_penalty_factor=1.0

# ---- Loss aggregation -------------------------------------------------------
loss_agg_mode=token-mean
loss_agg_mode_for_update=seq-mean-token-sum-norm

# ---- Group filtering and batch sizes ----------------------------------------
enable_filter_groups=False
filter_groups_metric=seq_reward
max_num_gen_batches=1
n_resp_per_prompt=8
train_prompt_bsz=128
train_prompt_mini_bsz=32
# Generation batch is a multiple (currently 1x) of the training prompt batch.
gen_prompt_bsz=$((1 * train_prompt_bsz))

# Ray-related settings. Each of the first three keeps a value already present
# in the environment and only falls back to the default when unset or empty.
: "${RAY_ADDRESS:=http://localhost:8265}"
: "${WORKING_DIR:=${PWD}}"
: "${RUNTIME_ENV:=${WORKING_DIR}/verl/trainer/runtime_env.yaml}"
# Number of nodes passed to the trainer.
NNODES=1

# Model checkpoint, data root, experiment name and wandb mode. All four are
# exported so that child processes see them.
MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/Qwen3-4B-Base
# Alternative starting checkpoint (disabled):
# MODEL_PATH=/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/verl-qwen3-4b-base/DAPO-Qwen3-4B-Base-deepscaler-40k-add1k-new-60steps-continue-overlong-filter/best_model/actor/huggingface
DATA_DIR=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data
EXP_NAME=drgrpo-dapo-math-additive-v2
WANDB_MODE=offline
export MODEL_PATH DATA_DIR EXP_NAME WANDB_MODE


# Training set. Other candidates that were tried (disabled):
# TRAIN_FILE=${DATA_DIR}/l1/deepscaler_add1k60step-overlong90step_add1k.parquet
# TRAIN_FILE=${DATA_DIR}/l1/deepscaler-add1k_16k.parquet
# TRAIN_FILE=${DATA_DIR}/l1/deepscaler-add1k_8k_max9k.parquet
# TRAIN_FILE=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/l1/deepscaler_qwen3_polaris.parquet
TRAIN_FILE="${DATA_DIR}/l1/dapo-math-17k_qwen3-add1k.parquet"

# Validation sets. Only `validation` is currently placed in TEST_FILE;
# `sampled1k` is defined but left out of the list.
validation="${DATA_DIR}/luffy/valid-polaris-qwen3.parquet"
sampled1k="/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/data/qwen3-4b-s1-sampled1k.parquet"

# TEST_FILE is a list literal of single-quoted paths: ['<path>'].
# To add the second set, use: "['$validation', '$sampled1k']"
printf -v TEST_FILE "['%s']" "${validation}"

# ---- Sampling ---------------------------------------------------------------
temperature=1.0
top_p=1.0
val_top_p=1.0
# top_k: 0 for HF rollout, -1 for vLLM rollout.
top_k=-1

# ---- Performance knobs ------------------------------------------------------
sp_size=1
gen_tp=1
offload=True
use_dynamic_bsz=True
# Each token budget is three full (prompt + response) sequences.
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 3))
max_model_len=$((1024 * 17))

# ROCR_VISIBLE_DEVICES is exported as an empty string.
# NOTE(review): presumably this neutralises a value inherited from the job
# environment — confirm whether `unset` was intended instead.
export ROCR_VISIBLE_DEVICES=""

# Root of the verl checkout that contains the `recipe` package.
ROOT=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/verl041/verl

# PYTHONPATH is replaced, not extended; re-enable the suffix to keep any
# inherited entries.
export PYTHONPATH="${ROOT}" #:$PYTHONPATH

# Abort if the checkout is missing: continuing would start the trainer from
# whatever directory the job happened to be launched in.
cd -- "${ROOT}" || {
  echo "error: cannot change directory to ${ROOT}" >&2
  exit 1
}
printf '%s\n' "${PYTHONPATH}"
# Launch the trainer. The overrides are collected in an array so that:
#   * every argument is quoted — values such as `[4096,...]` or
#     `[console,wandb]` are shell glob patterns when left bare and could be
#     rewritten by pathname expansion (or rejected under failglob/nullglob);
#   * no argument depends on a fragile `\`-continuation glued to its value;
#   * groups of options can carry comments.
# The argv passed to Python is the same 89 words, in the same order, as before.
# NOTE(review): keys prefixed with `+` are presumably Hydra "append" overrides
# for options absent from the base config — confirm against the recipe config.

# Directory that receives checkpoints; validation dumps go to its `valid/` child.
result_dir="/mnt/shared-storage-user/p1-shared/wangfuting/shared/models/verl-041-result/${project_name}/${EXP_NAME}"

train_args=(
  # --- data and validation ---
  "data.train_files=${TRAIN_FILE}"
  "data.val_files=${TEST_FILE}"
  "data.prompt_key=prompt"
  "data.val_batch_size=2560"
  "+trainer.val_budgets=[4096,8192,12288,16384]"

  # --- reward shaping ---
  "+reward.length_penalty_type=remove_upper_refined_plus"
  "+reward.repetition_penalty=True"
  "+reward.repetition_penalty_type=ngram"
  "+reward.alpha=3.33e-5"
  "+reward.skip_length_penalty_for_low_acc_group=False"
  "+reward.skip_length_penalty_for_high_acc_group=False"
  "+reward.skip_right_sample=False"
  "+reward.target_length_type=offline"
  "+reward.threshold=0.5"
  "+trainer.val_type=truncated"
  "+trainer.save_train_data=True"

  # --- lengths and batch sizes ---
  "data.truncation=left"
  "data.max_prompt_length=${max_prompt_length}"
  "data.max_response_length=${max_response_length}"
  "data.gen_batch_size=${gen_prompt_bsz}"
  "data.train_batch_size=${train_prompt_bsz}"
  "actor_rollout_ref.rollout.n=${n_resp_per_prompt}"

  # --- algorithm ---
  "algorithm.adv_estimator=${adv_estimator}"
  "algorithm.use_kl_in_reward=${use_kl_in_reward}"
  "algorithm.kl_ctrl.kl_coef=${kl_coef}"
  "algorithm.norm_adv_by_std_in_grpo=False"
  "actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}"
  "actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}"
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  "actor_rollout_ref.actor.clip_ratio_c=10.0"
  "algorithm.filter_groups.enable=${enable_filter_groups}"
  "algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches}"
  "algorithm.filter_groups.metric=${filter_groups_metric}"

  # --- dynamic batching and token budgets ---
  "actor_rollout_ref.model.use_remove_padding=True"
  "actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}"
  "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"
  "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"

  # --- model and optimiser ---
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  "actor_rollout_ref.model.enable_gradient_checkpointing=True"
  "actor_rollout_ref.actor.optim.lr=1e-6"
  "actor_rollout_ref.actor.optim.lr_warmup_steps=10"
  "actor_rollout_ref.actor.optim.weight_decay=0.1"
  "actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}"
  "actor_rollout_ref.actor.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}"
  "actor_rollout_ref.actor.entropy_coeff=0"
  "actor_rollout_ref.actor.grad_clip=1.0"
  "actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}"
  "+actor_rollout_ref.actor.loss_agg_mode_for_update=${loss_agg_mode_for_update}"
  "actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}"

  # --- rollout engine ---
  "actor_rollout_ref.rollout.gpu_memory_utilization=0.80"
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}"
  "actor_rollout_ref.rollout.enable_chunked_prefill=True"
  "actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}"
  "actor_rollout_ref.rollout.max_model_len=${max_model_len}"
  "actor_rollout_ref.rollout.temperature=${temperature}"
  "actor_rollout_ref.rollout.top_p=${top_p}"
  "actor_rollout_ref.rollout.top_k=${top_k}"
  "actor_rollout_ref.rollout.val_kwargs.temperature=0.6"
  "actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p}"
  "actor_rollout_ref.rollout.val_kwargs.top_k=${top_k}"
  "actor_rollout_ref.rollout.val_kwargs.do_sample=True"
  "actor_rollout_ref.rollout.val_kwargs.n=1"

  # --- reference model / FSDP ---
  "actor_rollout_ref.ref.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}"
  "actor_rollout_ref.actor.fsdp_config.fsdp_size=-1"
  "trainer.max_actor_ckpt_to_keep=10"

  # --- reward manager ---
  "reward_model.reward_manager=length_dapo"
  "reward_model.overlong_buffer.enable=${enable_overlong_buffer}"
  "reward_model.overlong_buffer.len=${overlong_buffer_len}"
  "reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor}"

  # --- trainer, logging and output locations ---
  "trainer.logger=[console,wandb]"
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${EXP_NAME}"
  "trainer.n_gpus_per_node=4"
  "trainer.validation_data_dir=${result_dir}/valid"
  "trainer.default_local_dir=${result_dir}"
  "trainer.nnodes=${NNODES}"
  "trainer.val_before_train=True"
  "custom_reward_function.path=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/verl041/verl/recipe/reward_ours/math_verify_reward.py"
  "custom_reward_function.name=compute_score_no_add_think"
  "trainer.save_freq=50"
  "trainer.test_freq=10"
  "trainer.total_epochs=3"
  "trainer.resume_mode=auto"
)

# Last command of the script: its exit status becomes the script's exit status.
python3 -m recipe.length_src.main_dapo "${train_args[@]}"