


# --- Credentials ------------------------------------------------------------
# Placeholder values; replace them with real keys before running.
export WANDB_API_KEY="YOUR_WANDB_API_KEY_HERE"
export OPENAI_API_KEY="YOUR_OPENAI_API_KEY_HERE"

# Log in to wandb and switch it to online mode (runs after the key export above).
wandb login
wandb online

# First positional argument: the seed handed to the training launch below.
seed="$1"

# --- Run identity -----------------------------------------------------------
export RUN_NAME="llama_3b_it"
export DATASET_NAME="simplelr_qwen_level3to5_strip_template"

# --- Site-specific locations (placeholders to fill in) ----------------------
export HDFS_DATA_PATH="Your_HDFS_DATA_PATH_HERE"
# export PROJECT_NAME=RL_Initial_Policy
export PROJECT_NAME="Your_PROJECT_NAME_HERE"
export WORKING_DIR="Your_WORKING_DIR_HERE"

# Attention backend selection; presumably read by vLLM at startup.
export VLLM_ATTENTION_BACKEND="XFORMERS"

#===============================================
# Ray setup with proper cleanup handling
#===============================================

# Function to cleanup Ray on exit
#
# Stops the Ray cluster, kills leftover processes whose command line matches
# "ray", and removes ray* entries under /dev/shm and /tmp. Every step is
# best-effort. The function runs its body at most once per script invocation,
# so a signal followed by the EXIT trap does not repeat the teardown.
# Globals: _ray_cleanup_done (written) - set once cleanup has started.
# Returns: 0 always.
cleanup_on_exit() {
    # Re-entry guard: a second call (e.g. EXIT after INT) is a no-op.
    if [ -n "${_ray_cleanup_done:-}" ]; then
        return 0
    fi
    _ray_cleanup_done=1

    echo "Cleaning up Ray cluster..."
    # Best effort: keep going even if the cluster is already down.
    ray stop --force || true
    # Kill any remaining Ray processes
    # NOTE(review): -f matches "ray" anywhere in any command line, so this can
    # also hit unrelated processes (or this script if its path contains "ray").
    pkill -f ray || true
    # Clean up shared memory
    rm -rf /dev/shm/ray* || true
    rm -rf /tmp/ray* || true
}

# Set up trap to cleanup on script exit
trap cleanup_on_exit EXIT
# A trapped signal does not end the script by itself: after the handler returns
# the shell would resume with the next command. Exit explicitly (128 + signal
# number) so the EXIT trap above performs the cleanup exactly once.
trap 'exit 130' INT
trap 'exit 143' TERM

# Start Ray cluster
# Fail fast when no seed was supplied: an empty, unquoted seed would vanish
# from the command line and --seed would consume the following flag instead.
if [ -z "${seed:-}" ]; then
    echo "Error: missing seed. Usage: $0 <seed>" >&2
    exit 2
fi

# GPU count shared by the Ray head node and the trainer's --n_gpu_per_node.
n_gpus=4

echo "Starting Ray cluster..."
# Abort rather than launching training when the head node did not come up.
if ! ray start --head --num-gpus "$n_gpus" --num-cpus 64 --node-ip-address 0.0.0.0; then
    echo "Error: 'ray start' failed; aborting." >&2
    exit 1
fi
# --dashboard-port 8266
export ARNOLD_WORKER_NUM=1 # number of nodes you want to use

# Flip to "true" for a small batch size during debugging.
debug=false
if [ "$debug" = "true" ]; then
    batch_size=8
else
    batch_size=256
fi

# Launch training; HYDRA_FULL_ERROR=1 is set only for this command.
# The script's exit status is the exit status of this command.
HYDRA_FULL_ERROR=1 bash train_my.sh \
    --model_name Llama-3.2-3B-Instruct \
    --max_response_length 5120 \
    --train_batch_size "$batch_size" \
    --rollout_n 8 \
    --kl_loss_coef 0.0001 \
    --entropy_coeffient 0.001 \
    --rollout_gpu_memory_util 0.75 \
    --rollout_tp 2 \
    --save_freq 20 \
    --micro_rollout_batch_size 1024 \
    --total_epochs 50 \
    --rollout_name sglang \
    --learning_rate 5e-7 \
    --ppo_micro_batch_size 32 \
    --log_prob_micro_batch_size 128 \
    --n_gpu_per_node "$n_gpus" \
    --resume_data_state True \
    --seed "$seed" \
    --ppo_mini_batch_size 64
