# Pipeline-level container image; unit_tests and formatting set the same image explicitly.
image: 'nvcr.io/nvidia/pytorch:23.04-py3'

# Stage names referenced by the jobs of this pipeline.
stages: [test, jet, cleanup]

# Shared defaults. Jobs merge this mapping through the VARS anchor and override
# individual keys in their own `variables` section.
variables: &VARS
  SELENE_ADLR_CI_PATH: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron'
  DATA_DIR: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data'
  # This is the image that is run by all nodes on selene for tests.
  PYTORCH_IMAGE: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh'
  PYTHON_VIRTUAL_ENV: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate'
  # Test levels to run once a merge request is approved (several levels can be listed).
  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: 'MR_TESTS JET'
  # Test levels to run after merging (several levels can be listed).
  TESTS_TO_RUN_AFTER_MERGING: 'MR_TESTS NIGHTLY_TESTS'
  # Compared against both TEST_LEVEL and the job name by the launcher rules.
  TESTS_TO_RUN_ON_THIS_COMMIT: 'unit_tests'
  # Regex matched against the job name, e.g. /.*gpt3.*/
  # Syntax reference: https://github.com/google/re2/wiki/Syntax
  TEST_REGEX_ON_THIS_COMMIT: 'NONE'
  # Set to true for new tests to copy the logs for creating the golden truth file.
  DISPLAY_OUTPUT: 'True'
  # Default time limit for all jobs.
  TIME_LIMIT: '10:00'
  # Set to 1 to enable grouped gemm for MoE.
  MOE_GROUPED_GEMM: 0
 

include:
  - jet-tests.yml

# Runs pytest on tests/unit_tests through torchrun and reports coverage of megatron/core.
unit_tests:
  stage: test
  image: 'nvcr.io/nvidia/pytorch:23.04-py3'
  tags: [docker_local_runner]
  rules:
    - when: always
  script:
    - pip install pytest-cov
    - pip install pytest_mock
    - pip install nltk
    - pip install wrapt
    # zarr / tensorstore: for distributed checkpointing tests.
    - pip install zarr "tensorstore==0.1.45"
    # grouped_gemm: for grouped gemm tests.
    - pip install git+https://github.com/fanshiqing/grouped_gemm@main
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  # Pattern GitLab applies to the job log to read the total coverage percentage.
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    expire_in: 30 days
    paths:
      - coverage

# Clones the documentation repository one level above the checkout, moves
# megatron-lm into it and runs `./repo docs`. Failures are tolerated, and the
# job is skipped on main.
docs_build_test:
  stage: test
  tags: [docker_local_runner]
  allow_failure: true
  except:
    - main
  script:
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs

# Checks megatron/core with black and isort in check-only mode.
formatting:
  stage: test
  image: 'nvcr.io/nvidia/pytorch:23.04-py3'
  tags: [docker_local_runner]
  rules:
    - when: always
  script:
    - pip install --upgrade black==19.10b0 isort click==8.0.2
    - black megatron/core --check --verbose --diff
    - isort megatron/core --check

# Hidden template for resume-from-checkpoint jobs on the Selene ssh runner.
# Jobs merge it via the anchor and supply RUN_MODEL, TP_SIZE, PP_SIZE, ... as variables.
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  stage: test
  tags: [ssh_selene_runner]
  retry: 2
  allow_failure: false
  rules:
    # Selected for this commit by test level, by job name, or by job-name regex.
    - when: always
      if: '$TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT'
    # Default branch: levels listed in TESTS_TO_RUN_AFTER_MERGING.
    - when: always
      if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
    # Approved merge request: levels listed in TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED.
    - when: always
      if: '$CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED'
    # Merge request whose labels match "READY FOR REVIEW": same level list.
    - when: always
      if: '$CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED'
  script: &selene-test-resume-launcher-script
    - echo "Running selene resume from checkpoint test. "
    - pwd
    # Build the launcher command line from the job variables, print it, then run it.
    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR TIME_LIMIT=$TIME_LIMIT"
    - echo "$run_cmd"
    - ${run_cmd}
    - echo "Completed the job"

# Hidden template for training jobs on the Selene ssh runner.
# Jobs merge it via the anchor and supply RUN_MODEL, TP_SIZE, PP_SIZE, ... as variables.
.selene_test_launcher: &selene-test-launcher
  stage: test
  tags: [ssh_selene_runner]
  retry: 2
  allow_failure: false
  rules:
    # Selected for this commit by test level, by job name, or by job-name regex.
    - when: always
      if: '$TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT'
    # Default branch: levels listed in TESTS_TO_RUN_AFTER_MERGING.
    - when: always
      if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
    # Approved merge request: levels listed in TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED.
    - when: always
      if: '$CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED'
    # Merge request whose labels match "READY FOR REVIEW": same level list.
    - when: always
      if: '$CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED'
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - pwd
    # Build the launcher command line from the job variables, print it, then run it.
    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE MOE_GROUPED_GEMM=$MOE_GROUPED_GEMM TIME_LIMIT=$TIME_LIMIT"
    - echo "$run_cmd"
    - ${run_cmd}
    - echo "Completed the job"

# MR_TESTS: gpt3 with USE_TE=1, TP_SIZE=2, PP_SIZE=2; overrides PYTORCH_IMAGE and TIME_LIMIT.
train.te_gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_TE: 1
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# NIGHTLY_TESTS: gpt3 with USE_CORE=1, TP_SIZE=4, PP_SIZE=1.
train.gpt3_core.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=2.
train.gpt3_core.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=2; TIME_LIMIT restated as 10:00.
train.gpt3_core.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '10:00'

# NIGHTLY_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4.
train.gpt3_core.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4 and VP_SIZE=1.
train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    MAX_STEPS: 50

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=2; sets the rope position-embedding flag.
train.gpt3_core.345m_tp1_pp2_1node_50steps_rope:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    MAX_STEPS: 50
    METADATA: rope_embeddings
    ADDITIONAL_PARAMS: '--position-embedding-type rope'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4; sets the --swiglu flag.
train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50
    METADATA: swiglu
    ADDITIONAL_PARAMS: '--swiglu'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4; sets the --disable-bias-linear flag.
train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50
    METADATA: disable_bias_linear
    ADDITIONAL_PARAMS: '--disable-bias-linear'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4; sets --untie-embeddings-and-output-weights.
train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50
    METADATA: untie_embeddings_and_outputs
    ADDITIONAL_PARAMS: '--untie-embeddings-and-output-weights'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=1, PP_SIZE=4; sets the --sequence-parallel flag.
train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50
    METADATA: sequence_parallel
    ADDITIONAL_PARAMS: '--sequence-parallel'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=4, PP_SIZE=1.
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=2, PP_SIZE=2.
train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=2.
train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4.
train.gpt3.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4 and VP_SIZE=1.
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    MAX_STEPS: 50

# MR_TESTS: resume-from-checkpoint launcher for gpt3 with TP_SIZE=1, PP_SIZE=2; TIME_LIMIT 15:00.
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    TIME_LIMIT: '15:00'

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=1; sets --use-distributed-optimizer.
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer
    ADDITIONAL_PARAMS: '--use-distributed-optimizer'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=1; sets --overlap-grad-reduce.
train.gpt3.345m_tp1_pp1_1node_50steps_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: overlap_grad_reduce
    ADDITIONAL_PARAMS: '--overlap-grad-reduce'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=1; distributed optimizer plus overlap-grad-reduce flags.
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=1; distributed optimizer, overlap-grad-reduce and overlap-param-gather flags.
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce_param_gather
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=4, PP_SIZE=1; sets --overlap-grad-reduce.
train.gpt3.345m_tp4_pp1_1node_50steps_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: overlap_grad_reduce
    ADDITIONAL_PARAMS: '--overlap-grad-reduce'

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=4, PP_SIZE=1; distributed optimizer plus overlap-grad-reduce flags.
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce'

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=4, PP_SIZE=1; distributed optimizer, overlap-grad-reduce and overlap-param-gather flags.
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce_param_gather
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4; sets --overlap-grad-reduce.
train.gpt3.345m_tp1_pp4_1node_50steps_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50
    METADATA: overlap_grad_reduce
    ADDITIONAL_PARAMS: '--overlap-grad-reduce'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4, VP_SIZE=1; sets --overlap-grad-reduce.
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    MAX_STEPS: 50
    METADATA: overlap_grad_reduce
    ADDITIONAL_PARAMS: '--overlap-grad-reduce'

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4, VP_SIZE=1; distributed optimizer plus overlap-grad-reduce flags.
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce'

# MR_TESTS: gpt3 with USE_CORE=0, TP_SIZE=1, PP_SIZE=4, VP_SIZE=1; distributed optimizer, overlap-grad-reduce and overlap-param-gather flags.
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    MAX_STEPS: 50
    METADATA: dist_optimizer_overlap_grad_reduce_param_gather
    ADDITIONAL_PARAMS: '--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=2, PP_SIZE=2; sets --overlap-grad-reduce.
train.gpt3.345m_tp2_pp2_1node_50steps_overlap_grad_reduce:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    METADATA: overlap_grad_reduce
    ADDITIONAL_PARAMS: '--overlap-grad-reduce'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=1 and --context-parallel-size 2;
# overrides PYTORCH_IMAGE with the context-parallelism .sqsh image and TIME_LIMIT with 20:00.
train.gpt3_core.345m_cp2_tp2_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'
    PYTORCH_IMAGE: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh'
    METADATA: 'context_parallelism_cp2'
    ADDITIONAL_PARAMS: '--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=2 and --context-parallel-size 2;
# overrides PYTORCH_IMAGE with the context-parallelism .sqsh image and TIME_LIMIT with 20:00.
train.gpt3_core.345m_cp2_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'
    PYTORCH_IMAGE: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh'
    METADATA: 'context_parallelism_cp2'
    ADDITIONAL_PARAMS: '--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0'

# Note: Core MoE models currently will run TE by default
# NIGHTLY_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=2; sets --num-experts 2.
train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    METADATA: 'te_2experts'
    ADDITIONAL_PARAMS: '--num-experts 2'

# NIGHTLY_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=2; 4 experts with expert-model-parallel size 2.
train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    METADATA: 'te_4experts2parallel'
    ADDITIONAL_PARAMS: '--sequence-parallel --num-experts 4 --expert-model-parallel-size 2'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=1; 8 experts with expert-model-parallel size 2.
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    MAX_STEPS: 50
    METADATA: 'te_8experts2parallel'
    ADDITIONAL_PARAMS: '--sequence-parallel --num-experts 8 --expert-model-parallel-size 2'

# MR_TESTS: gpt3 with USE_CORE=1, TP_SIZE=2, PP_SIZE=1; 8 experts, expert-model-parallel size 2,
# with MOE_GROUPED_GEMM=1 and the --moe-grouped-gemm flag.
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_groupedGEMM_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    MAX_STEPS: 50
    MOE_GROUPED_GEMM: 1
    METADATA: 'te_8experts2parallel_groupedGEMM'
    ADDITIONAL_PARAMS: '--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2'

# NIGHTLY_TESTS: gpt3 with USE_CORE=0, TP_SIZE=2, PP_SIZE=2; sets --num-experts 4.
train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: gpt3
    USE_CORE: 0
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    METADATA: '4experts'
    ADDITIONAL_PARAMS: '--num-experts 4'

# NIGHTLY_TESTS: bert with TP_SIZE=4, PP_SIZE=1; TIME_LIMIT restated as 10:00.
train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50
    TIME_LIMIT: '10:00'

# MR_TESTS: bert with TP_SIZE=2, PP_SIZE=2.
train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: bert with TP_SIZE=1, PP_SIZE=2.
train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: bert with TP_SIZE=1, PP_SIZE=4.
train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    MAX_STEPS: 50

# MR_TESTS: bert with TP_SIZE=1, PP_SIZE=4 and VP_SIZE=2.
train.bert.345m_tp1_pp4_interleaved_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 2
    MAX_STEPS: 50

# NIGHTLY_TESTS: bert with USE_CORE=1, TP_SIZE=4, PP_SIZE=1; TIME_LIMIT 20:00.
train.bert_core.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    USE_CORE: 1
    NUM_NODES: 1
    TP_SIZE: 4
    PP_SIZE: 1
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'

# MR_TESTS: bert with USE_CORE=1, TP_SIZE=2, PP_SIZE=2; TIME_LIMIT 20:00.
train.bert_core.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: bert
    USE_CORE: 1
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'

# NIGHTLY_TESTS: bert with USE_CORE=1, TP_SIZE=1, PP_SIZE=2; TIME_LIMIT 20:00.
train.bert_core.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    USE_CORE: 1
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'

# NIGHTLY_TESTS: bert with USE_CORE=1, TP_SIZE=1, PP_SIZE=4 and VP_SIZE=2; TIME_LIMIT 20:00.
train.bert_core.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: NIGHTLY_TESTS
    RUN_MODEL: bert
    USE_CORE: 1
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 2
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'

# bert with USE_CORE=1, TP_SIZE=1, PP_SIZE=2; sets the rope position-embedding flag.
# TEST_LEVEL used to be "L0", a value that appears in none of the default level
# lists (TESTS_TO_RUN_ON_THIS_COMMIT, TESTS_TO_RUN_AFTER_MERGING,
# TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED), so the launcher rules never selected
# this job by level. MR_TESTS matches the level of the gpt3_core rope job.
train.bert_core.345m_tp1_pp2_1node_50steps_rope:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    USE_CORE: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: MR_TESTS
    METADATA: rope_embeddings
    ADDITIONAL_PARAMS: "--position-embedding-type rope"

# bert with USE_CORE=1, TP_SIZE=1, PP_SIZE=2; sets the --sequence-parallel flag.
# TEST_LEVEL used to be "L0", a value that appears in none of the default level
# lists (TESTS_TO_RUN_ON_THIS_COMMIT, TESTS_TO_RUN_AFTER_MERGING,
# TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED), so the launcher rules never selected
# this job by level. MR_TESTS matches the level of the gpt3_core sequence_parallel job.
train.bert_core.345m_tp1_pp2_1node_50steps_sequence_parallel:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    USE_CORE: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: MR_TESTS
    METADATA: sequence_parallel
    ADDITIONAL_PARAMS: "--sequence-parallel"

# MR_TESTS: resume-from-checkpoint launcher for bert with TP_SIZE=1, PP_SIZE=2.
resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: bert
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 2

# MONTHLY_TESTS: retro with USE_CORE=1, TP_SIZE=1, PP_SIZE=1; TIME_LIMIT 20:00.
train.retro_core.tp1_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: retro
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    MAX_STEPS: 50
    TIME_LIMIT: '20:00'

# MONTHLY_TESTS: t5 with USE_CORE=1, USE_TE=0, TP_SIZE=1, PP_SIZE=1, VP_SIZE=1 for 100 steps;
# overrides PYTORCH_IMAGE and TIME_LIMIT.
train.t5_core.220m_tp1_pp1_1node_100steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    VP_SIZE: 1
    MAX_STEPS: 100
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# MONTHLY_TESTS: t5 with USE_CORE=1, USE_TE=0, TP_SIZE=2, PP_SIZE=1, VP_SIZE=1 for 100 steps;
# overrides PYTORCH_IMAGE and TIME_LIMIT.
train.t5_core.220m_tp2_pp1_1node_100steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    VP_SIZE: 1
    MAX_STEPS: 100
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# MR_TESTS: t5 with USE_CORE=1, USE_TE=1, TP_SIZE=1, PP_SIZE=1, VP_SIZE=1 for 100 steps;
# overrides PYTORCH_IMAGE and TIME_LIMIT.
train.t5_core.220m_te_tp1_pp1_1node_100steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MR_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 1
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    VP_SIZE: 1
    MAX_STEPS: 100
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# MONTHLY_TESTS: t5 with USE_CORE=1, USE_TE=1, TP_SIZE=2, PP_SIZE=1, VP_SIZE=1 for 100 steps;
# overrides PYTORCH_IMAGE and TIME_LIMIT.
train.t5_core.220m_te_tp2_pp1_1node_100steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 1
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    VP_SIZE: 1
    MAX_STEPS: 100
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# MONTHLY_TESTS: t5 with USE_CORE=1, USE_TE=1, TP_SIZE=2, PP_SIZE=1, VP_SIZE=1 for 100 steps;
# sets --sequence-parallel and overrides PYTORCH_IMAGE and TIME_LIMIT.
train.t5_core.220m_te_tp2_pp1_sp_1node_100steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 1
    NUM_NODES: 1
    TP_SIZE: 2
    PP_SIZE: 1
    VP_SIZE: 1
    MAX_STEPS: 100
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'
    ADDITIONAL_PARAMS: '--sequence-parallel'

# MONTHLY_TESTS: resume-from-checkpoint launcher for t5 with USE_CORE=1, USE_TE=0,
# TP_SIZE=1, PP_SIZE=1, VP_SIZE=1; overrides PYTORCH_IMAGE and TIME_LIMIT.
resume.checkpoint.t5_core.220m_tp1_pp1_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 0
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    VP_SIZE: 1
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# MONTHLY_TESTS: resume-from-checkpoint launcher for t5 with USE_CORE=1, USE_TE=1,
# TP_SIZE=1, PP_SIZE=1, VP_SIZE=1; overrides PYTORCH_IMAGE and TIME_LIMIT.
resume.checkpoint.t5_core.220m_te_tp1_pp1_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    TEST_LEVEL: MONTHLY_TESTS
    RUN_MODEL: t5
    USE_CORE: 1
    USE_TE: 1
    NUM_NODES: 1
    TP_SIZE: 1
    PP_SIZE: 1
    VP_SIZE: 1
    TIME_LIMIT: '30:00'
    PYTORCH_IMAGE: 'nvcr.io/nvidia/pytorch:23.07-py3'

# Removes old directories under SELENE_ADLR_CI_PATH on the Selene ssh runner:
# directories with ctime older than 20 days, and "checkpoints" directories with
# ctime older than 2 days. Any path containing "data" is filtered out by grep.
# Runs on every pipeline and is allowed to fail.
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    # Best effort: keep going when an individual command fails.
    - set +e
    # ${VAR:?} makes the expansion fail when SELENE_ADLR_CI_PATH is unset or empty,
    # so find is never started on "/*" and nothing is fed to "rm -rf" in that case.
    # The quotes keep a path containing whitespace as one argument.
    - NUM_CLEANUP=$(find "${SELENE_ADLR_CI_PATH:?}"/* -type d -ctime +20 | grep -v data | wc -l)
    - find "${SELENE_ADLR_CI_PATH:?}"/* -type d -ctime +20 | grep -v data | xargs rm -rf
    - find "${SELENE_ADLR_CI_PATH:?}"/* -type d -name "checkpoints" -ctime +2 | grep -v data | xargs rm -rf
    # NUM_CLEANUP counts only the 20-day set, not the checkpoints pass.
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
  allow_failure: true
  rules:
    - when: always
