#!/usr/bin/env sh
# Launches `python3 -m verl.trainer.rrm_reward` with a fixed set of
# trainer / data / model / rollout overrides.
#
# Usage: <this-script> MODEL_PATH DATA_DIR
#   MODEL_PATH  value forwarded as model.path
#   DATA_DIR    value forwarded as data.path
#
# The script's exit status is that of the python3 process; it exits early
# with a usage message if either positional argument is missing or empty.

# Abort on errors and on use of unset variables.
set -eu
# Echo each command as it runs so the fully expanded invocation is logged.
set -x
# export VLLM_ATTENTION_BACKEND=XFORMERS

# Fail fast instead of launching the job with an empty model.path/data.path.
MODEL_PATH=${1:?usage: $0 MODEL_PATH DATA_DIR}
DATA_DIR=${2:?usage: $0 MODEL_PATH DATA_DIR}

N=1              # forwarded as data.n_samples
temperature=0    # forwarded as rollout.temperature
n_candidates=4  # best-of-n @ n_candidates

# Path arguments are quoted so values containing spaces or glob characters
# reach python3 as a single argument each.
# NOTE(review): the leading '+' on rollout.n_candidates looks like Hydra's
# "add a key that is not in the base config" override syntax — confirm.
python3 -m verl.trainer.rrm_reward \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=4 \
    data.path="${DATA_DIR}" \
    data.n_samples="${N}" \
    data.batch_size=102400 \
    model.path="${MODEL_PATH}" \
    rollout.temperature="${temperature}" \
    rollout.prompt_length=4096 \
    rollout.response_length=8192 \
    +rollout.n_candidates="${n_candidates}" \
    rollout.top_k=-1 \
    rollout.top_p=0.95 \
    rollout.gpu_memory_utilization=0.85 \
    rollout.tensor_model_parallel_size=4

