#!/usr/bin/env bash
#
# Launches run_scripts/rloo_tldr_feedback_v2.py through `accelerate launch`,
# using the deepspeed_zero2.yaml accelerate config and main process port 39501.
#
# Notes:
#   * EXP_NAME names the run: it is the last component of --output_dir and is
#     also passed as --exp_name.
#   * --model_name_or_path and --sft_model_path point at the same checkpoint.
#   * All /path/to/... values are placeholders to be replaced with real
#     locations before running.
#   * The meaning of the script-specific flags (--rm_with_feedback, --rm_lr,
#     --lqh, --agg, --fw) is defined by rloo_tldr_feedback_v2.py, which is not
#     visible here -- TODO(review): document them once confirmed.

# Prepend the local trl checkout to the Python search path. The ${VAR:+...}
# form adds the ':' separator only when PYTHONPATH already has a value, so no
# trailing ':' (an empty path entry) is produced when it is unset or empty.
export PYTHONPATH="/path/to/trl${PYTHONPATH:+:${PYTHONPATH}}"

EXP_NAME=debug

# Expansions of EXP_NAME are quoted so that a name containing whitespace or
# glob characters is still passed as a single argument.
accelerate launch \
    --main_process_port=39501 \
    --config_file /path/to/trl/examples/accelerate_configs/deepspeed_zero2.yaml \
    /path/to/trl/run_scripts/rloo_tldr_feedback_v2.py \
    --output_dir "/path/to/output_ckpt/minimal/rloo_tldr/${EXP_NAME}" \
    --dataset_name /path/to/datasets/datasets--trl-internal-testing--tldr-preference-sft-trl-style \
    --dataset_test_split test \
    --num_ppo_epochs 2 \
    --num_mini_batches 2 \
    --learning_rate 3e-6 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 16 \
    --total_episodes 1000000 \
    --model_name_or_path /path/to/models/EleutherAI_pythia-2.8b-deduped__sft__tldr \
    --sft_model_path /path/to/models/EleutherAI_pythia-2.8b-deduped__sft__tldr \
    --reward_model_path /path/to/models/pythia-2.8b-deduped-tldr-rm \
    --local_rollout_forward_batch_size 8 \
    --missing_eos_penalty 4.0 \
    --stop_token eos \
    --kl_coef 0.03 \
    --eval_steps 5 \
    --save_steps 20 \
    --exp_name "${EXP_NAME}" \
    --eval_strategy steps \
    --rm_with_feedback true \
    --rm_lr 1e-6 \
    --lqh true \
    --agg attention \
    --fw 0.4


