# Controls which pipelines get created at all. Rules are evaluated top to
# bottom and the first match decides.
workflow:
  rules:
    # Merge request pipelines are always created.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    # Pipelines started from the web UI are always created.
    - if: '$CI_PIPELINE_SOURCE == "web"'
    # A branch that already has an open MR gets no separate branch pipeline,
    # so the same commit is not built twice.
    - if: '$CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS'
      when: never
    # Every other branch pipeline (no open MR) is created.
    - if: '$CI_COMMIT_BRANCH'

# Pipeline stages, executed in the order listed.
# NOTE(review): no job visible in this file is assigned to `jet`; presumably
# its jobs are defined in the included jet-tests.yml - confirm.
stages:
  - test
  - jet

# Pipeline-wide default variables. Every value is written as an explicitly
# quoted string so that no entry depends on YAML implicit typing (CI variables
# reach the jobs as strings in any case).
variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  # This is the image that is run by all nodes on selene for tests.
  PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh"
  PYTHON_VIRTUAL_ENV: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate"
  # Can specify levels.
  TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS"
  TESTS_TO_RUN_ON_THIS_COMMIT: "unit_tests"
  # Can define a regex as in the RE2 spec
  # (https://github.com/google/re2/wiki/Syntax), e.g. /.*gpt3.*/
  TEST_REGEX_ON_THIS_COMMIT: "NONE"
  JET_CUSTOM_FILTER: ""
  # Set to true for new tests to copy the logs for creating golden truth file.
  DISPLAY_OUTPUT: "True"
  # Default time limit for all jobs. Must stay quoted: a bare 10:00 is read as
  # a sexagesimal integer by YAML 1.1 parsers.
  TIME_LIMIT: "10:00"
  # Set to 1 to enable grouped gemm for MoE.
  MOE_GROUPED_GEMM: "0"
  # Selectable when a pipeline is started manually: `options` lists the
  # allowed values and `value` is the preselected default.
  JET_CLUSTER_BRANCH:
    value: "mcore/draco-oci"
    options:
      - "mcore/draco-oci"
      - "mcore/eos"
    description: '"mcore/draco-oci" for OCI-IAD, "mcore/eos" for EOS'

# Merges additional CI configuration from jet-tests.yml into this pipeline.
# NOTE(review): presumably this file provides the jobs of the `jet` stage -
# confirm against its contents.
include:
  - jet-tests.yml

# Complete unit-test suite with coverage measurement. Only added to merge
# request pipelines whose labels match "Run tests" and to pipelines on the
# default branch.
unit_tests:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
  script:
    # torchrun starts 8 worker processes, each running pytest over
    # tests/unit_tests with coverage collected for megatron/core. The folded
    # scalar below is joined into one single-line command.
    - >-
      torchrun --nproc_per_node=8 -m pytest
      --cov-report=term --cov-report=html
      --cov=megatron/core
      tests/unit_tests
  # Picks the percentage at the end of the "total" line of the terminal
  # coverage report (case-insensitive match).
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    expire_in: 30 days
    # NOTE(review): presumably the directory the HTML coverage report is
    # written to - confirm against the project's coverage configuration.
    paths:
      - coverage

# Runs only the tests under tests/unit_tests/data. Excluded from merge request
# pipelines whose labels match "Run tests" and from the default branch;
# included in every other pipeline.
unit_tests-data:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/data

# Runs only the tests under tests/unit_tests/dist_checkpointing. Excluded from
# merge request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-dist-checkpointing:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/dist_checkpointing
  
# Runs only the tests under tests/unit_tests/fusions. Excluded from merge
# request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-fusions:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/fusions
  
# Runs only the tests under tests/unit_tests/models. Excluded from merge
# request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-models:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/models
  
# Runs only the tests under tests/unit_tests/pipeline_parallel. Excluded from
# merge request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-pipeline-parallel:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/pipeline_parallel
  
# Runs only the tests under tests/unit_tests/tensor_parallel. Excluded from
# merge request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-tensor-parallel:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/tensor_parallel
  
# Runs only the tests under tests/unit_tests/transformer. Excluded from merge
# request pipelines whose labels match "Run tests" and from the default
# branch; included in every other pipeline.
unit_tests-transformer:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/transformer
  
# Runs only the test modules located directly in tests/unit_tests (the *.py
# glob is expanded by the job's shell). Excluded from merge request pipelines
# whose labels match "Run tests" and from the default branch; included in
# every other pipeline.
unit_tests-top-py:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  rules:
    # MR labelled for the full test run: skip this subset job.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    # Default branch: skip this subset job.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    # Anything else: run.
    - when: always
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/*.py
  
# Builds the documentation as a smoke test: clones the separate documentation
# repository next to the checkout, moves this checkout into it and runs that
# repository's `./repo docs` entry point. The job may fail without failing
# the pipeline.
docs_build_test:
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
  stage: test
  tags:
    - os/linux
  script:
    - cd ..
    # Always start from a fresh clone; the job token authenticates the clone.
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
    # NOTE(review): assumes the checkout directory is named megatron-lm -
    # confirm for forks or renamed projects.
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
  allow_failure: true
  # Expressed with `rules` like the other jobs in this file. The legacy
  # `except: [main]` form implied `only: [branches, tags]`, which kept this job
  # out of merge request pipelines entirely; because the workflow suppresses
  # branch pipelines once an MR is open, the docs build was never exercised
  # for open MRs. The job is still skipped on main.
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'
      when: never
    - when: on_success

# Code-style gate for megatron/core; added to every pipeline.
formatting:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
  tags:
    - os/linux
  rules:
    - when: always
  script:
    # black in check mode: prints the would-be diff instead of rewriting files.
    - black megatron/core --check --verbose --diff
    # isort in check mode: only verifies import ordering.
    - isort megatron/core --check
