# Run evaluate.py (looked up relative to the current working directory) with
# whichever `python` comes first on PATH, passing these arguments:
#   --tasks Q                               value "Q" for the task option
#   --hf_critic_model openbmb/UltraCM-13b   critic model identifier
#                                           (presumably a Hugging Face repo id;
#                                           TODO confirm against evaluate.py)
#   --enable_code_execution                 bare flag, given without a value
# NOTE(review): what each option actually does is decided by evaluate.py,
# which is not visible from this script.
python evaluate.py \
  --tasks Q \
  --hf_critic_model openbmb/UltraCM-13b \
  --enable_code_execution