#!/usr/bin/env bash
# Launcher for a GRPO training run via `python3 -m verl.trainer.main_ppo`.
# Run it from the repository root: config paths are resolved relative to the
# current working directory.

# Trace every command so the fully expanded launch line appears in the log.
set -x

# Raise the open-file-descriptor limit for this shell and its children.
ulimit -n 65535
# Drop any inherited Ray cluster address.
# NOTE(review): presumably so a fresh local Ray instance is started - confirm.
unset RAY_ADDRESS
# NOTE(review): presumably keeps vLLM from using Ray as its executor backend - confirm.
export VLLM_USE_RAY=0
# ------------------------------------------------------------
#  Experiment Metadata
# ------------------------------------------------------------
# Names used for logging (trainer.project_name / trainer.experiment_name)
# and for building the checkpoint directory below.
PROJECT_NAME=fileagent_agentic_rl
EXP_NAME=qwen25_7b_test
# The directory the script is launched from is treated as the project root.
PROJECT_DIR=$(pwd)
# Checkpoint output directory; replace <your_username> before running.
SAVE_DIR="/mnt/hdfs/fileagent_storage/users/<your_username>/model/rl_models/${PROJECT_NAME}/${EXP_NAME}"


# ------------------------------------------------------------
#  Ray & Cluster Settings
# ------------------------------------------------------------
# Cluster shape, forwarded as trainer.nnodes and trainer.n_gpus_per_node.
NNODES=1
NGPUS_PER_NODE=8

# ------------------------------------------------------------
#  Model & Data Paths
# ------------------------------------------------------------
# Base model checkpoint, forwarded as actor_rollout_ref.model.path.
MODEL_PATH="/mnt/hdfs/fileagent_storage/shared/models/Qwen2.5-7B-Instruct"

# Root directory with one sub-directory per dataset; replace <your_username>
# before running. Every dataset path below is derived from it.
DATA_HOME="/mnt/bn/fileagent-storage/users/<your_username>/data"

# Training parquet files as a bracketed, single-quoted list literal.
# The backslash-newlines sit inside double quotes, so the shell removes them
# and the value is one line with no embedded whitespace.
TRAIN_FILES="[\
'${DATA_HOME}/musique/train.parquet',\
'${DATA_HOME}/2wikimultihopqa/train.parquet',\
'${DATA_HOME}/hotpotqa/train.parquet',\
'${DATA_HOME}/simpleqa_norm/train.parquet'\
]"

# Full validation list, currently commented out in favour of the shorter list below:
# VAL_FILES="['${DATA_HOME}/musique/test.parquet','${DATA_HOME}/2wikimultihopqa/test.parquet','${DATA_HOME}/hotpotqa/test.parquet','${DATA_HOME}/simpleqa_norm/test.parquet','${DATA_HOME}/bamboogle/test.parquet','${DATA_HOME}/gaia_text_search/test.parquet']"
VAL_FILES="[\
'${DATA_HOME}/musique/test.parquet',\
'${DATA_HOME}/simpleqa_norm/test.parquet',\
'${DATA_HOME}/bamboogle/test.parquet',\
'${DATA_HOME}/gaia_text_search/test.parquet'\
]"


# ------------------------------------------------------------
#  Core Algorithm Hyper-parameters
# ------------------------------------------------------------
# Algorithm: advantage estimator, loss aggregation mode and clipping ratios
# (forwarded as algorithm.adv_estimator and actor_rollout_ref.actor.*).
adv_estimator=grpo
loss_agg_mode=token-mean
clip_ratio_c=10.0
clip_ratio_high=0.28
clip_ratio_low=0.2

# KL terms: both the in-reward KL and the KL loss are set to False here,
# and their coefficients are zero.
kl_coef=0.0
use_kl_in_reward=False
kl_loss_coef=0.0
use_kl_loss=False

# Config File
# Only cfg_path is anchored at PROJECT_DIR; the other paths are relative and
# are therefore resolved against the directory the script is launched from.
recipe_root=recipe/fileagent
cfg_name=llm_grpo_trainer
cfg_path="${PROJECT_DIR}/${recipe_root}/config"
tool_cfg_path="${recipe_root}/config/tool/llm_tool_v1.yaml"
agent_loop_cfg_path="${recipe_root}/config/agent_loop.yaml"
# Markdown file passed as +data.new_sp_path.
new_sp_path="${recipe_root}/prompts/sp_v1.md"

# Batch Size
# Dataset-level batch sizes (data.train_batch_size / data.val_batch_size).
val_bsz=128
train_bsz=128
# Actor update batching (ppo_mini_batch_size / ppo_micro_batch_size_per_gpu).
train_mini_bsz=64
train_micro_bsz_per_gpu=4
# Micro batch size used for the log-prob passes of rollout and ref.
infer_micro_bsz_per_gpu=8
# Rollout sampling: passed as rollout.n and multi_turn.max_assistant_turns.
n_resp_per_prompt=8
max_turns=10

# Sequence Length (each budget is written as a multiple of 1024)
max_prompt_len=$((2 * 1024))
max_resp_len=$((30 * 1024))
max_tool_resp_len=$((20 * 1024))

# Performance Related Parameters
use_dynamic_bsz=True
infer_tp_size=4
train_sp_size=4
# All three token caps are the same value: prompt budget + response budget.
actor_ppo_max_token_len=$((max_prompt_len + max_resp_len))
infer_ppo_max_token_len=${actor_ppo_max_token_len}
max_num_batched_tokens=${actor_ppo_max_token_len}

# Trainer Schedule & Logging
# Forwarded one-to-one to the trainer.* options of the same names.
total_epochs=1
val_before_train=True
test_freq=10
save_freq=20
log_val_generations=20


# Launch training. Every argument after --config-path/--config-name is a
# key=value override built from the variables defined earlier in this script;
# a leading '+' marks a key that is added rather than overridden.
#
# Every expansion is double-quoted so each override reaches python3 as exactly
# one argument. Unquoted, a working directory containing whitespace would
# split --config-path, and the bracketed list values in TRAIN_FILES/VAL_FILES
# would be subject to pathname expansion.
python3 -m verl.trainer.main_ppo \
    --config-path="${cfg_path}" \
    --config-name="${cfg_name}" \
    algorithm.adv_estimator="${adv_estimator}" \
    algorithm.use_kl_in_reward="${use_kl_in_reward}" \
    algorithm.kl_ctrl.kl_coef="${kl_coef}" \
    data.train_batch_size="${train_bsz}" \
    data.val_batch_size="${val_bsz}" \
    data.max_prompt_length="${max_prompt_len}" \
    data.max_response_length="${max_resp_len}" \
    data.filter_overlong_prompts=True \
    data.truncation="error" \
    data.return_raw_chat=True \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    +data.replace_system_prompt=True \
    +data.new_sp_path="${new_sp_path}" \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size="${train_mini_bsz}" \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu="${train_micro_bsz_per_gpu}" \
    actor_rollout_ref.actor.use_kl_loss="${use_kl_loss}" \
    actor_rollout_ref.actor.kl_loss_coef="${kl_loss_coef}" \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0.0 \
    actor_rollout_ref.actor.loss_agg_mode="${loss_agg_mode}" \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size="${train_sp_size}" \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.clip_ratio_c="${clip_ratio_c}" \
    actor_rollout_ref.actor.use_dynamic_bsz="${use_dynamic_bsz}" \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu="${actor_ppo_max_token_len}" \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu="${infer_micro_bsz_per_gpu}" \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.n="${n_resp_per_prompt}" \
    actor_rollout_ref.rollout.tensor_model_parallel_size="${infer_tp_size}" \
    actor_rollout_ref.rollout.multi_turn.max_assistant_turns="${max_turns}" \
    actor_rollout_ref.rollout.multi_turn.max_parallel_calls=1 \
    actor_rollout_ref.rollout.multi_turn.max_tool_response_length="${max_tool_resp_len}" \
    actor_rollout_ref.rollout.multi_turn.tool_config_path="${tool_cfg_path}" \
    actor_rollout_ref.rollout.agent.agent_loop_config_path="${agent_loop_cfg_path}" \
    actor_rollout_ref.rollout.agent.num_workers=8 \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz="${use_dynamic_bsz}" \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.rollout.max_num_batched_tokens="${max_num_batched_tokens}" \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz="${use_dynamic_bsz}" \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu="${infer_micro_bsz_per_gpu}" \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu="${infer_ppo_max_token_len}" \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    trainer.critic_warmup=0 \
    trainer.val_before_train="${val_before_train}" \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="${PROJECT_NAME}" \
    trainer.experiment_name="${EXP_NAME}" \
    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
    trainer.nnodes="${NNODES}" \
    trainer.save_freq="${save_freq}" \
    trainer.test_freq="${test_freq}" \
    trainer.log_val_generations="${log_val_generations}" \
    trainer.total_epochs="${total_epochs}" \
    trainer.default_local_dir="${SAVE_DIR}" \
    trainer.resume_mode=auto
