#!/usr/bin/env bash
#
# Launch the inference_rlhf.code.train_rl module through `accelerate launch`.
#
# Usage: <this script> [extra arguments for the module...]
#
# Fixed arguments always passed to the module:
#   policy=qwen-25-05b
#   task=math
#   user=anonymousanonymous
# NOTE(review): these look like key=value config overrides consumed by
# train_rl — confirm against the module's argument handling.
#
# Any arguments given to this script are appended after the fixed ones, so a
# one-off run can add settings without editing this file. How the module
# treats a key that is repeated is not visible from here — TODO confirm
# before relying on it to override the values above.
#
# Exit status: that of `accelerate launch` (it is the last command).

# Abort on errors and on use of unset variables. Only POSIX options are used
# so the script still works when invoked explicitly as `sh <script>`.
set -eu

accelerate launch -m inference_rlhf.code.train_rl \
    policy=qwen-25-05b \
    task=math \
    user=anonymousanonymous \
    "$@"