#!/usr/bin/env bash
#
# Launch `vllm serve` for a Llama instruct model.
#
# Usage: bash <this-script> [extra `vllm serve` arguments...]
#
# Optional environment overrides (the defaults are the values that were
# previously hard-coded here):
#   SERVE_MODEL          model passed to `vllm serve`
#                        (default: meta-llama/Llama-3.3-70B-Instruct)
#   SERVE_TP_SIZE        value for --tensor-parallel-size (default: 4)
#   SERVE_MAX_MODEL_LEN  value for --max-model-len        (default: 12000)
#
# NOTE(review): the earlier invocation below pinned GPUs with
# CUDA_VISIBLE_DEVICES; this script does not set it, so presumably the server
# uses whatever devices are visible to the caller -- confirm that is intended.
#
# Earlier invocation, kept for reference:
#   CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server --model codellama/CodeLlama-34b-Instruct-hf --tensor-parallel-size=2 --enforce-eager

set -euo pipefail

readonly model="${SERVE_MODEL:-meta-llama/Llama-3.3-70B-Instruct}"
readonly tp_size="${SERVE_TP_SIZE:-4}"
readonly max_model_len="${SERVE_MAX_MODEL_LEN:-12000}"

# Fail with a clear message (and the conventional "command not found" status)
# instead of a bare shell error when vllm is not installed or not on PATH.
if ! command -v vllm >/dev/null 2>&1; then
  printf 'error: vllm executable not found on PATH\n' >&2
  exit 127
fi

# exec replaces this shell with the server process, so signals sent to the
# script's PID (Ctrl-C, a supervisor's SIGTERM) are delivered to vllm itself.
# Any arguments given to this script are appended after the defaults.
exec vllm serve "${model}" \
  --tensor-parallel-size="${tp_size}" \
  --max-model-len="${max_model_len}" \
  "$@"
