name: Self-hosted runner (scheduled-amd)

# Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger the
# CI in order to run it on both MI210 and MI250, without having to use matrix here which pushes
# us towards the limit of allowed jobs on GitHub Actions.

on:
  workflow_call:
    inputs:
      job:
        required: true
        type: string
      slack_report_channel:
        required: true
        type: string
      runner:
        required: true
        type: string
      docker:
        required: true
        type: string
      ci_event:
        required: true
        type: string

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  NUM_SLICES: 2

# Important note: each job (run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu) requires all the previous jobs before running.
# This is done so that we avoid parallelizing the scheduled tests, to leave available
# runners for the push CI that is running on the same machine.
jobs:
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Check Runner Status
        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  check_runners:
    name: Check Runners
    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: ROCM-SMI
        run: |
          rocm-smi

      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14

      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

  setup:
    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)
    name: Setup
    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT

      - name: ROCM-SMI
        run: |
          rocm-smi

      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14

      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

  run_models_gpu:
    if: ${{ inputs.job == 'run_models_gpu' }}
    name: Single GPU tests
    needs: setup
    strategy:
      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_amd.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
    secrets: inherit

  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: PyTorch pipelines
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: ROCM-SMI
        run: |
          rocm-smi

      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14

      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu]
    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: ROCM-SMI
        run: |
          rocm-smi

      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14

      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run examples tests on GPU
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports

  run_torch_cuda_extensions_gpu:
    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Torch ROCm deepspeed tests
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: ROCM-SMI
        run: |
          rocm-smi

      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14

      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  send_results:
    name: Slack Report
    needs: [
      check_runner_status,
      check_runners,
      setup,
      run_models_gpu,
      run_pipelines_torch_gpu,
      run_examples_gpu,
      run_torch_cuda_extensions_gpu
    ]
    if: ${{ always() }}
    uses: ./.github/workflows/slack-report.yml
    with:
      job: ${{ inputs.job }}
      # This would be `skipped` if `setup` is skipped.
      setup_status: ${{ needs.setup.result }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      # This would be an empty string if `setup` is skipped.
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      ci_event: ${{ inputs.ci_event }}

    secrets: inherit
