# Job manifest template for one training run. Names in curly brackets are
# placeholders, presumably substituted by the launching script before the
# manifest is applied.
apiVersion: batch/v1
kind: Job
metadata:
  name: {NAME}
  labels:
    # Queue this Job is submitted to (Kueue label key).
    kueue.x-k8s.io/queue-name: farai
    # Presumably identifies the launch that created this Job -- confirm.
    launch-id: {LAUNCH_ID}
    # Weights & Biases coordinates of the run, mirrored as labels.
    # NOTE(review): label values must be strings; verify the substituted
    # text is quoted whenever it could look numeric or boolean.
    wandb-entity: {WANDB_ENTITY}
    wandb-project: {WANDB_PROJECT}
    wandb-group: {WANDB_RUN_GROUP}
    wandb-job-name: {WANDB_JOB_NAME}
spec:
  # The Job is created suspended.
  # NOTE(review): presumably Kueue unsuspends it once the queue named in the
  # labels admits it -- confirm.
  suspend: true
  # Number of retries before the Job is considered failed.
  backoffLimit: 10
  template:
    metadata:
      # NOTE(review): pods created by a Job are normally named by the Job
      # controller, so this field may have no effect -- confirm.
      generateName: {NAME}
    spec:
      # Run the container processes as UID/GID 1001.
      securityContext:
        runAsUser: 1001
        runAsGroup: 1001
      # Priority class for the pod, supplied per launch.
      priorityClassName: {PRIORITY}
      volumes:
      # Persistent storage backed by the vast-learned-planners claim.
      - name: training
        persistentVolumeClaim:
          claimName: vast-learned-planners
      # Memory-backed scratch volume (the container mounts it at /dev/shm);
      # its size cap comes from the SHM_SIZE placeholder.
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: "{SHM_SIZE}"
      containers:
      # Single container that prepares the environment and runs the launch
      # command.
      - name: devbox-container
        # The image tag is supplied per launch. With the Always policy the
        # image is looked up in the registry on every container start rather
        # than reusing a cached tag.
        image: "ghcr.io/alignmentresearch/train-learned-planner:{CONTAINER_TAG}"
        imagePullPolicy: Always
        # Startup script, run through bash -c:
        #   1. create the cache and output directories and hand them to the
        #      dev user;
        #   2. clone the Boxoban levels into the cache directory;
        #   3. update the checkout in the working directory and pin it to the
        #      requested commit -- the container exits non-zero if that commit
        #      cannot be checked out, instead of silently running whatever
        #      code the image shipped with;
        #   4. install the system GL library and reinstall envpool;
        #   5. run the launch-specific command.
        # NOTE(review): this template is presumably rendered with brace-based
        # substitution, so avoid literal curly brackets in the script.
        # NOTE(review): the output directory is hard-coded under /training
        # while the volume mount path is a placeholder -- confirm they agree.
        command:
          - bash
          - -c
          - |
            sudo mkdir -p "/opt/sokoban_cache"
            sudo chown -R dev:dev "/opt/sokoban_cache"
            sudo mkdir -p "/training/cleanba"
            sudo chown dev:dev "/training/cleanba"
            git clone https://github.com/google-deepmind/boxoban-levels "/opt/sokoban_cache/boxoban-levels-master"
            git pull
            # Abort rather than train on the wrong revision.
            if ! git checkout {COMMIT_HASH}; then
              echo "git checkout of the requested commit failed; aborting" >&2
              exit 1
            fi
            git submodule update --recursive
            sudo apt update
            sudo apt install -y libgl1-mesa-glx
            pip uninstall -y envpool
            pip install -U envpool
            {COMMAND}
        resources:
          # Only CPU is requested explicitly; no CPU limit is set.
          requests:
            cpu: {CPU}
          # Memory and GPU count are given as limits only.
          # NOTE(review): Kubernetes is expected to default the matching
          # requests to these limits -- confirm.
          limits:
            memory: {MEMORY}
            nvidia.com/gpu: {GPU}
        env:
        # NOTE(review): env values must be strings. The unquoted placeholders
        # in this list need to render as quoted scalars whenever the value
        # looks numeric or boolean -- verify against the launcher.
        - name: OMP_NUM_THREADS
          value: {OMP_NUM_THREADS}
        - name: XLA_PYTHON_CLIENT_MEM_FRACTION
          value: {XLA_PYTHON_CLIENT_MEM_FRACTION}
        # Git credentials. The askpass program is set to "true", presumably so
        # git never prompts interactively -- confirm.
        - name: GIT_ASKPASS
          value: "true"
        # Personal access token read from the github-credentials secret.
        - name: GITHUB_PAT
          valueFrom:
            secretKeyRef:
              name: github-credentials
              key: pat
        # Passes the token to git as the credential username for
        # https://github.com. The $(GITHUB_PAT) reference relies on GITHUB_PAT
        # being declared earlier in this list, so keep the order.
        - name: GIT_CONFIG_PARAMETERS
          value: "'credential.https://github.com.username=$(GITHUB_PAT)'"
        # Weights & Biases API key read from the wandb secret.
        - name: WANDB_API_KEY
          valueFrom:
            secretKeyRef:
              name: wandb
              key: api-key
        # Weights & Biases run coordinates, supplied per launch.
        - name: WANDB_ENTITY
          value: {WANDB_ENTITY}
        - name: WANDB_JOB_NAME
          value: {WANDB_JOB_NAME}
        - name: WANDB_PROJECT
          value: {WANDB_PROJECT}
        - name: WANDB_RUN_GROUP
          value: {WANDB_RUN_GROUP}
        - name: WANDB_MODE
          value: {WANDB_MODE}
        volumeMounts:
        # Memory-backed volume exposed as the container's shared memory.
        - name: dshm
          mountPath: /dev/shm
        # Persistent training volume; the mount point is supplied per launch.
        # NOTE(review): the startup script creates /training/cleanba
        # literally -- confirm the mount path placeholder resolves to
        # /training.
        - name: training
          mountPath: {TRAINING_MOUNT}
      # Secret named docker supplies the registry credentials for pulling the
      # container image.
      imagePullSecrets:
      - name: docker
      # Failed containers are not restarted in place.
      # NOTE(review): retries are presumably handled by the Job creating new
      # pods, up to its backoffLimit -- confirm.
      restartPolicy: Never
