#!/usr/bin/env bash
#
# Launch the rloo_ultrafeedback_feedback_v2.py training script through
# `accelerate launch`, using the deepspeed_zero3.yaml accelerate config.
#
# Required environment variables:
#   TASK      - path component of --output_dir
#   EXP_NAME  - last path component of --output_dir; also passed as --exp_name
#
# NOTE(review): the /path/to/... values look like placeholders; replace them
# with real locations before running.
#
# Only POSIX constructs are used, so the script also works when invoked as
# `sh script.sh`.

# Abort on the first failing command and on any reference to an unset variable.
set -eu

# Fail early with a clear message instead of launching with a collapsed
# output path or a value-less --exp_name flag.
: "${TASK:?TASK must be set (used in --output_dir)}"
: "${EXP_NAME:?EXP_NAME must be set (used in --output_dir and --exp_name)}"

# Prepend the trl checkout to the Python search path. The ':' separator is
# added only when PYTHONPATH already has a value, so no empty entry is created
# (an empty entry is presumably treated as the current directory -- verify).
export PYTHONPATH="/path/to/trl${PYTHONPATH:+:${PYTHONPATH}}"

# Flags after the script path are forwarded to the training script. Several of
# them (--lqh, --agg, --fw, --enable_lm, --enable_le, --le_weight,
# --dynamic_fw, --rm_with_feedback, --rm_lr, --train_rm, ...) are defined by
# that script; their exact semantics are not visible from this file.
accelerate launch \
    --main_process_port=39501 \
    --config_file /path/to/trl/examples/accelerate_configs/deepspeed_zero3.yaml \
    /path/to/trl/run_scripts/rloo_ultrafeedback_feedback_v2.py \
    --dataset_name /path/to/datasets/datasets--HuggingFaceH4--ultrafeedback_binarized \
    --dataset_train_split train_prefs \
    --output_dir "/path/to/output_ckpt/main_exp/RLOO/${TASK}/${EXP_NAME}" \
    --rloo_k 4 \
    --response_length 512 \
    --num_ppo_epochs 2 \
    --num_mini_batches 2 \
    --learning_rate 6e-6 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 16 \
    --total_episodes 160000 \
    --model_name_or_path /path/to/models/Qwen2.5-3B-Instruct \
    --sft_model_path /path/to/models/Qwen2.5-3B-Instruct \
    --reward_model_path /path/to/models/Skywork-Reward-Llama-3.1-8B \
    --local_rollout_forward_batch_size 4 \
    --stop_token eos \
    --missing_eos_penalty 3.0 \
    --eval_steps 10 \
    --save_steps 5 \
    --save_strategy steps \
    --save_total_limit 8 \
    --exp_name "${EXP_NAME}" \
    --save_only_model true \
    --rm_with_feedback true \
    --kl_coef 0.01 \
    --rm_lr 1e-6 \
    --lqh false \
    --agg attention \
    --filter_length 512 \
    --fw 1.0 \
    --enable_lm false \
    --enable_le true \
    --le_weight 0.5 \
    --gradient_checkpointing true \
    --dynamic_fw true \
    --advantage_estimate rloo \
    --train_rm true

