#cloud-config
#this file is passed in via user-data metadata in GCP VM Setup
#cloud-init is called at every startup making sure gpu drivers are installed even after migrations, reboots etc
#mlcommons service starts a specific workload run as per script instructions in github

users:
- name: mlcommons
  uid: 2000

write_files:
  - path: /etc/systemd/system/install-gpu.service
    permissions: 0644
    owner: root
    content: |
      [Unit]
      Description=Install GPU drivers
      Wants=gcr-online.target docker.socket
      After=gcr-online.target docker.socket

      [Service]
      User=root
      Type=oneshot
      ExecStart=cos-extensions install gpu -- -version=510.108.03
      StandardOutput=journal+console
      StandardError=journal+console
  - path: /etc/systemd/system/mlcommons.service
    permissions: 0644
    owner: root
    content: |
      [Unit]
      Description=Run a mlcommons GPU application container
      Requires=install-gpu.service
      After=install-gpu.service

      [Service]
      User=root
      Type=oneshot
      RemainAfterExit=true
      Environment="HOME=/home/mlcommonsservice"
      ExecStartPre=mount --bind /var/lib/nvidia /var/lib/nvidia
      ExecStartPre=mount -o remountexec /var/lib/nvidia
      ExecStartPre=/usr/bin/docker-credential-gcr configure-docker --registries us-central1-docker.pkg.dev
      ExecStartPre=/usr/bin/docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest
      ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0  --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest -b true
      StandardOutput=journal+console
      StandardError=journal+console

runcmd:
  - systemctl daemon-reload
  - systemctl start install-gpu.service
  - systemctl start mlcommons.service

