#!/bin/bash
#SBATCH --job-name=vllm_server
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=32G
#SBATCH --time=10:00:00
#SBATCH --gres=gpu:a100:2
#SBATCH --output=vllm_server.out
#SBATCH --error=vllm_server.err

# Body of the SLURM job: prepare the Python environment, make sure vLLM is
# available, then run the vLLM server in the foreground so it lives for the
# duration of the allocation.
#
# Required environment:
#   WRKDIR - base directory that will hold the Hugging Face cache.
#
# NOTE: `set -u` is deliberately not enabled; activation scripts are sourced
# into this shell and may reference unset variables. Critical steps are
# checked explicitly instead.

# Print an error message to stderr and abort the job.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Load required modules (adjust based on Triton's available modules)
module load mamba || die "'module load mamba' failed"

# Keep the Hugging Face cache under the work directory. Abort early when
# WRKDIR is unset or empty instead of silently pointing the cache at
# /.huggingface_cache.
export HF_HOME="${WRKDIR:?WRKDIR must be set to place the Hugging Face cache}/.huggingface_cache"

# Activate the conda environment, creating it first if it does not exist yet.
if ! source activate vllm_env; then
  mamba create -n vllm_env python=3.9 -y \
    || die "could not create conda environment 'vllm_env'"
  source activate vllm_env \
    || die "could not activate conda environment 'vllm_env'"
fi

# Install vLLM only if it is not already present in the active environment.
if ! pip show vllm >/dev/null 2>&1; then
  pip install vllm || die "'pip install vllm' failed"
fi

# Fixed port the server listens on.
PORT=8000

# Name of the node this job runs on; clients need it to reach the server.
NODE_HOST="$(hostname)"

# Print connection information
echo "Server running on: ${NODE_HOST}:${PORT}"

# Start the vLLM server, sharding the model across the two GPUs requested in
# the job header (--tensor-parallel-size must match the GPU count).
# NOTE(review): this model is presumably gated on the Hugging Face Hub; confirm
# that an access token is available to the job.
vllm serve "meta-llama/Llama-3.2-11B-Vision-Instruct" \
  --tensor-parallel-size 2 \
  --port "$PORT"
# Optional tuning flags, if memory becomes a problem:
#   --max-model-len 4096 --gpu-memory-utilization 0.5