# Job header: protocol schema version, job identity, and job-level retry policy.
protocolVersion: 2
name: 'sglang_infer_clt_try'
type: 'job'
# Zero job-level retries: a failed job is not resubmitted automatically.
jobRetryCount: 0
prerequisites:
  # Container image for the task role; referenced elsewhere by its `name`.
  # The tag pins SGLang v0.4.4 on a ROCm build ("rocm630").
  - name: docker_image0
    type: dockerimage
    uri: "lmsysorg/sglang:v0.4.4-rocm630"
taskRoles:
  # Single task role: one container that prepares the environment and then
  # serves Qwen2.5-Coder-14B-Instruct through the SGLang router.
  taskrole:
    instances: 1
    completion:
      # A single failed instance is enough to mark the task role as failed.
      minFailedInstances: 1
    taskRetryCount: 0
    # Must match the `name` of an image declared under `prerequisites`.
    dockerImage: docker_image0
    resourcePerInstance:
      gpu: 8
      cpu: 88
      memoryMB: 1638400
    commands:
      - 'pip install -U "huggingface_hub[cli]"'
      # SECURITY: the GitHub token is injected from the job's `secrets` section
      # instead of being hard-coded in the clone URL. The token that used to be
      # embedded here has been exposed and must be revoked.
      - 'git clone https://lihaoling:<% $secrets.github_token %>@github.com/lihaoling/epicoder2.git'
      - cd epicoder2
      - chmod 777 azcopy
      - ls
      - echo 'prepare environment...'
      # Make the ROCm 6.3.0 tools and libraries visible to later commands.
      - export ROCM_PATH=/opt/rocm-6.3.0
      - 'export PATH=$PATH:$ROCM_PATH/bin'
      - 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ROCM_PATH/lib'
      - apt-get update
      - apt-get install -y python3.12-dev
      - bash sglang/install.sh
      # --dp 8 --tp 1: presumably one data-parallel replica per GPU (8 GPUs are
      # requested above) with no tensor parallelism -- confirm against SGLang docs.
      - python3.12 -m sglang_router.launch_server --model-path Qwen/Qwen2.5-Coder-14B-Instruct --dp 8 --tp 1 --host 0.0.0.0
      # Only reached once the server process exits; presumably keeps the
      # container alive for inspection over SSH.
      - sleep 30d
# Values referenced from commands as `<% $secrets.<key> %>`.
# Supply the real token at submission time and never commit it to version
# control; the placeholder below intentionally does not work.
secrets:
  github_token: 'REPLACE_WITH_GITHUB_TOKEN_AT_SUBMISSION'
# Job-wide defaults: the virtual cluster this job is submitted to.
defaults:
  virtualCluster: 'sigma-s-mi300'
# Platform extensions: runtime plugins, status notifications, and scheduler hints.
extras:
  com.microsoft.pai.runtimeplugin:
    # SSH plugin: enables job SSH and registers one custom public key.
    - plugin: ssh
      parameters:
        jobssh: true
        userssh:
          type: custom
          # Public key only (ed25519); no private key material is stored here.
          value: >-
            ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDMgVIPOWou8NcZ2otMeMfAAv6tk3xFjc1PFCfkqeOwm haoling@haolingdeMacBook-Pro.local
    # Team storage plugin: attaches the named storage configuration to the job.
    - plugin: teamwise_storage
      parameters:
        storageConfigNames:
          - blob-data-sigmasystem
  # Notify on each of these job state transitions.
  jobStatusChangeNotification:
    running: true
    succeeded: true
    failed: true
  hivedScheduler:
    # NOTE(review): "oppo" is presumably the opportunistic (preemptible)
    # priority class -- confirm against the cluster's HiveD configuration.
    jobPriorityClass: oppo
    taskRoles:
      # Key must match the task role name defined under top-level `taskRoles`.
      taskrole:
        # 8 SKUs of type MI300X; matches the `gpu: 8` resource request.
        skuNum: 8
        skuType: MI300X
