#!/bin/bash
# SBATCH --job-name=server_cllama_tplt_no_mask
# SBATCH --output=server_cllama_tplt_no_mask.out
# SBATCH --error=server_cllama_tplt_no_mask.err
#SBATCH --job-name=server_cllama_tplt
#SBATCH --output=server_cllama_tplt.out
#SBATCH --error=server_cllama_tplt.err
#SBATCH --partition=compute
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:A100:1
#SBATCH --time=10:00:00
#SBATCH --mail-type=ALL

# Load the user's shell init (presumably where the `conda` shell function is
# set up -- confirm it is not skipped for non-interactive shells).
source ~/.bashrc

# Activate the "agent" environment and stop the job if that fails; otherwise
# the script would carry on with whatever `python` happens to be on PATH.
if ! conda activate agent; then
    printf 'error: failed to activate conda environment "agent"\n' >&2
    exit 1
fi



# Checkpoint directory served by the API server below.
MODEL_DIR="/data/b_ou/ckpts/output_5k_full_cllama_template"
# Alternate checkpoint (disabled):
# MODEL_DIR="/data/b_ou/ckpts/output_5k_full_cllama_template_no_mask"

# Abort before launching if the checkpoint is missing. A bare `test -d` only
# sets $?, which nothing inspected, so the server was started regardless.
if [[ ! -d "$MODEL_DIR" ]]; then
    printf 'error: model directory not found: %s\n' "$MODEL_DIR" >&2
    exit 1
fi

# Start the vLLM OpenAI-compatible API server in the foreground.
#   python -O : run with optimizations enabled (assert statements removed)
#   python -u : unbuffered stdout/stderr, so log output is written promptly
python -O -u -m vllm.entrypoints.openai.api_server \
    --port=1528 \
    --model="$MODEL_DIR" \
    --tensor-parallel-size=1 \
    --max-num-batched-tokens=4096


# Optional extra flag for the api_server command above (disabled):
#     --tokenizer=hf-internal-testing/llama-tokenizer \

# MODEL_DIR="/data/datasets/models/huggingface/meta-llama/Llama-2-70b-chat-hf/"
# test -d "$MODEL_DIR"
# python -O -u -m vllm.entrypoints.openai.api_server \
#     --port=1527 \
#     --model=/data/datasets/models/huggingface/meta-llama/Llama-2-70b-chat-hf/ \
#     --tokenizer=hf-internal-testing/llama-tokenizer \
#     --tensor-parallel-size=4
