# Process-wide runtime environment for the training launcher in this script.
export HYDRA_FULL_ERROR=1
# Ten device indices (0-9); the launcher below requests
# trainer.n_gpus_per_node=10, so keep the two in sync.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9
export VERL_LOGGING_LEVEL=INFO
export VERL_PPO_LOGGING_LEVEL=INFO
# Address on the loopback interface; presumably an already-running local Ray
# head -- TODO confirm it is started before this script runs.
export RAY_ADDRESS=127.0.0.1:6379
# This variable used to be exported twice (first 0, then 1). Only the last
# export ever took effect, so the dead "=0" line was dropped and the effective
# value is kept.
export TORCH_USE_CUDA_DSA=1

# ---------------------------------------------------------------------------
# NCCL settings, grouped by purpose. Values are unchanged.
# ---------------------------------------------------------------------------

# Timeouts and retries. 7200 is intended as 2 hours.
# NOTE(review): NCCL documents NCCL_IB_TIMEOUT as a small exponent rather than
# a number of seconds, so 7200 looks out of range -- confirm. InfiniBand is
# disabled below, so it is presumably unused here anyway.
export NCCL_TIMEOUT=7200 \
       NCCL_IB_TIMEOUT=7200 \
       NCCL_IB_RETRY_CNT=7

# Transport selection: InfiniBand off, sockets bound to the loopback
# interface, peer-to-peer left enabled.
export NCCL_IB_DISABLE=1 \
       NCCL_IB_SL=0 \
       NCCL_SOCKET_IFNAME=lo \
       NCCL_P2P_DISABLE=0 \
       NCCL_P2P_LEVEL=SYS

# Protocol and channel-count bounds.
export NCCL_PROTO=Simple \
       NCCL_MIN_NCHANNELS=2 \
       NCCL_MAX_NCHANNELS=4

# Logging and error handling. WARN reduces log verbosity.
export NCCL_DEBUG=WARN \
       NCCL_BLOCKING_WAIT=1 \
       NCCL_ASYNC_ERROR_HANDLING=1


# ---------------------------------------------------------------------------
# PyTorch distributed settings -- the timeout values here are the critical
# part: 7200 is intended as a 2-hour limit.
# NOTE(review): confirm that the installed PyTorch / verl versions actually
# read TORCH_DISTRIBUTED_TIMEOUT and TORCH_NCCL_TIMEOUT.
# ---------------------------------------------------------------------------
export TORCH_DISTRIBUTED_TIMEOUT=7200 \
       TORCH_NCCL_TIMEOUT=7200

# TORCH_NCCL_-prefixed counterparts of the NCCL_BLOCKING_WAIT and
# NCCL_ASYNC_ERROR_HANDLING variables exported elsewhere in this script.
export TORCH_NCCL_BLOCKING_WAIT=1 \
       TORCH_NCCL_ASYNC_ERROR_HANDLING=1

# A non-zero trace buffer enables the flight recorder for debugging.
export TORCH_NCCL_TRACE_BUFFER_SIZE=16384

# CUDA launch mode and caching-allocator configuration.
export CUDA_LAUNCH_BLOCKING=0 \
       PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

# Append the AlphaAgentEvo checkout to PYTHONPATH.
# ${PYTHONPATH:+...} emits the "existing:" prefix only when PYTHONPATH is
# already non-empty; the previous "$PYTHONPATH:~/..." form produced a leading
# ":" (an empty path entry) when it was unset. "$HOME" is used instead of "~"
# so the value can be fully quoted.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${HOME}/AlphaAgentEvo"

# Trace every command from here on so the full launch line appears in the log.
set -x

# Raise the open-file limit. If that is not permitted, say so explicitly and
# keep going with the current limit.
ulimit -n 65535 \
  || echo "WARNING: could not raise open-file limit to 65535 (current: $(ulimit -n))" >&2

# Print a CSV snapshot of per-GPU memory (index, name, total/used/free)
# between two banner lines, before the training process is launched.
gpu_query_fields="index,name,memory.total,memory.used,memory.free"
printf '%s\n' "=== GPU Memory Status Before Training ==="
nvidia-smi --query-gpu="${gpu_query_fields}" --format=csv
printf '%s\n' "=========================================="

# Resolve the paths consumed by the launcher.
# PROJECT_DIR is the directory the script is invoked from, so it must be run
# from the repository root for the relative locations below to exist.
PROJECT_DIR="$(pwd)"
CONFIG_PATH="${PROJECT_DIR}/examples/sglang_multiturn/config"
TOOL_CONFIG="${CONFIG_PATH}/tool_config/factor_tool_config.yaml"

# Training / validation parquet files (v6) under the user's home directory.
factor_data_dir="${HOME}/data/factor"
TRAIN_DATA="${factor_data_dir}/factor_train_v6.parquet"
VAL_DATA="${factor_data_dir}/factor_val_v6.parquet"

# Checkpoint to resume from: step 60 of the named run.
resume_run_name="qwen3-4b-alphaagentevo_10x_datav6_bs20_rollout3_rewardv8_lr1e-6"
FULL_RESUME_PATH="${PROJECT_DIR}/verl/checkpoints/alphaagent-r/${resume_run_name}/global_step_60"


# Launch verl's PPO trainer entry point with the 'search_multiturn_grpo'
# config from CONFIG_PATH and the key=value overrides listed below:
#   - GRPO advantage estimator, train batch 20, 3 rollouts per prompt
#   - sglang rollout backend with multi-turn tool use (TOOL_CONFIG)
#   - custom reward function compute_score from verl/utils/reward_score/factor.py
#   - 1 node x 10 GPUs, save/test every 10 steps, 150 total training steps
#   - resume from the checkpoint at FULL_RESUME_PATH
# Extra command-line arguments given to this script are appended last.
# "$@" is quoted so each caller-supplied override reaches the trainer as
# exactly one argument; the previous unquoted $@ re-split values containing
# spaces and expanded glob characters.
python3 -m verl.trainer.main_ppo \
    --config-path="$CONFIG_PATH" \
    --config-name='search_multiturn_grpo' \
    algorithm.adv_estimator=grpo \
    data.train_batch_size=20 \
    data.val_batch_size=10 \
    data.max_prompt_length=4096 \
    data.max_response_length=10000 \
    data.filter_overlong_prompts=True \
    data.truncation='left' \
    data.return_raw_chat=True \
    +data.shuffle_train_dataloader=False \
    actor_rollout_ref.model.path="$HOME/Qwen3-4B-Thinking-2507" \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.02 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=20 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0.00 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    +actor_rollout_ref.actor.fsdp_config.mixed_precision.param_dtype=bfloat16 \
    +actor_rollout_ref.actor.fsdp_config.mixed_precision.reduce_dtype=bfloat16 \
    +actor_rollout_ref.actor.fsdp_config.mixed_precision.buffer_dtype=bfloat16 \
    +actor_rollout_ref.actor.fsdp_config.activation_checkpointing=False \
    +actor_rollout_ref.actor.fsdp_config.cpu_offload.offload_params=False \
    actor_rollout_ref.rollout.max_model_len=14500 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=3 \
    actor_rollout_ref.rollout.multi_turn.max_assistant_turns=2 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.fsdp_config.param_offload=False \
    +actor_rollout_ref.ref.fsdp_config.optimizer_offload=False \
    +actor_rollout_ref.ref.fsdp_config.mixed_precision.param_dtype=bfloat16 \
    +actor_rollout_ref.ref.fsdp_config.mixed_precision.reduce_dtype=bfloat16 \
    +actor_rollout_ref.ref.fsdp_config.mixed_precision.buffer_dtype=bfloat16 \
    custom_reward_function.path="$PROJECT_DIR/verl/utils/reward_score/factor.py" \
    custom_reward_function.name=compute_score \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.val_before_train=False \
    trainer.logger='["console","tensorboard"]' \
    trainer.project_name='alphaagentevo' \
    trainer.experiment_name='qwen3-4b-factor_10x_datav6_bs20_rollout3_rewardv8_lr1e-6' \
    trainer.n_gpus_per_node=10 \
    trainer.nnodes=1 \
    trainer.save_freq=10 \
    trainer.test_freq=10 \
    data.train_files="$TRAIN_DATA" \
    data.val_files="$VAL_DATA" \
    actor_rollout_ref.rollout.multi_turn.tool_config_path="$TOOL_CONFIG" \
    trainer.total_training_steps=150 \
    trainer.resume_mode=resume_path \
    trainer.resume_from_path="${FULL_RESUME_PATH}" \
    "$@"

