# Pod-level settings shared by the steps below; each step's podSpec pulls them
# in with `<<: *common_pod_spec`.
common_pod_spec: &common_pod_spec
  # Schedule only onto nodes labelled with the 80GB A100 SXM4 GPU product.
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  priorityClassName: perf-benchmark
  volumes:
    # Memory-backed emptyDir; containers mount it at /dev/shm.
    - name: devshm
      emptyDir:
        medium: Memory
    # Hugging Face cache directory on the node. With `type: Directory`,
    # Kubernetes requires the path to already exist on the host.
    - name: hf-cache
      hostPath:
        type: Directory
        path: /root/.cache/huggingface

# Container-level settings shared by the benchmark steps; merged into each
# container entry with `<<: *common_container_settings`.
common_container_settings: &common_container_settings
  # NOTE(review): the command is a single string rather than a split argv;
  # presumably the Buildkite Kubernetes plugin runs it as a shell line - confirm.
  command:
    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    # Same path as the hf-cache volume mount below.
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    # Read from the `token` key of the `hf-token-secret` Secret.
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          key: token
          name: hf-token-secret
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - mountPath: /dev/shm
      name: devshm
    - mountPath: /root/.cache/huggingface
      name: hf-cache

steps:
  # Manual gate: the steps below do not start until someone unblocks the build.
  - block: ':rocket: Ready for comparing vllm against alternatives? This will take 4 hours.'
  # Runs the shared benchmark command inside the Triton / TensorRT-LLM image.
  - label: 'A100 trt benchmark'
    agents:
      queue: A100
    priority: 100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              # Shared container settings plus this step's image.
              - <<: *common_container_settings
                image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3

  # Runs the shared benchmark command inside the lmdeploy v0.5.0 image.
  - label: 'A100 lmdeploy benchmark'
    agents:
      queue: A100
    priority: 100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              # Shared container settings plus this step's image.
              - <<: *common_container_settings
                image: openmmlab/lmdeploy:v0.5.0
  

  # Runs the shared benchmark command inside the vLLM OpenAI-server image.
  - label: 'A100 vllm benchmark'
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              # NOTE(review): `latest` is a floating tag, so the benchmarked
              # version changes between runs; presumably intentional for a
              # nightly comparison - confirm.
              - image: vllm/vllm-openai:latest
                <<: *common_container_settings

  # Runs the shared benchmark command inside the text-generation-inference
  # 2.1 image.
  - label: 'A100 tgi benchmark'
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: ghcr.io/huggingface/text-generation-inference:2.1
                <<: *common_container_settings
        
  # Barrier: hold the steps after this point until the benchmark steps above
  # have finished.
  - wait

  # Runs nightly-annotate.sh after the benchmark steps. It does not merge
  # common_container_settings: the container is spelled out with its own
  # command, no hf-cache mount and no HF_HOME variable.
  - label: 'Plot'
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            # The merged pod spec still declares the hf-cache volume; this
            # container simply does not mount it.
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.5.0.post1
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
                resources:
                  limits:
                    # NOTE(review): same 8-GPU limit as the benchmark steps;
                    # confirm the annotate script actually needs GPUs.
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  # Read from the `token` key of the `hf-token-secret` Secret.
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token

  # Barrier: any steps that follow wait for the Plot step above to finish.
  - wait