#!/usr/bin/env bash
#
# Launch the `inference_rlhf.code.qwen_finetune` Python module with a fixed
# set of key=value arguments.
#
# Usage:
#   Run from a location where the `inference_rlhf` package is importable
#   (`python3 -m` resolves the module through Python's import path);
#   presumably that is the repository root -- confirm.
#
# The script's exit status is the exit status of the python3 process.
#
# Arguments passed to the module (their meanings are inferred from the names
# only; verify against the module's own configuration):
#   policy=qwen-25-05b        policy/model selection
#   task=math                 task selection
#   user=anonymousanonymous   user identifier
#   sampling.k=1              presumably the number of samples drawn
#   training.lr=1e-5          presumably the learning rate
#   sampling.max_tokens=1024  presumably the cap on generated tokens
python3 -m inference_rlhf.code.qwen_finetune \
    policy=qwen-25-05b \
    task=math \
    user=anonymousanonymous \
    sampling.k=1 \
    training.lr=1e-5 \
    sampling.max_tokens=1024

# qwen-25-05b

# Alternatively, sample from the model during evaluation (uncomment to run):
# python3 -m inference_rlhf.code.qwen_finetune \
#     policy=qwen-25-3b \
#     task=math \
#     user=anonymousanonymous \
#     sampling.k=1