name: 8 GPU Integration Test on H100

on:
  push:
    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  pull_request:
    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  schedule:
    # Runs every 6 hours
    - cron: '0 */6 * * *'

concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.aws.h100.8
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded

        # Enable CPP stacktraces for debugging symmetric memory initialization errors.
        TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests_h100 artifacts-to-be-uploaded --ngpu 8
