# Functional-test recipe for the GPT model. `spec` below is the job template
# and `products` lists the test_case / environment / scope / platforms
# combinations (presumably expanded into one job each by the recipe runner --
# confirm against the runner's schema).
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
# Job template. Names in braces are placeholders; the test_case, environment
# and platforms values come from the `products` entries further down.
spec:
  name: "{test_case}_{environment}_{platforms}"
  model: gpt
  build: mcore-pyt-{environment}
  nodes: 1
  gpus: 8
  # Forwarded to run_ci_test.sh as N_REPEAT by `script`.
  n_repeat: 5
  # Platform value for product entries that do not set `platforms` themselves
  # (presumably a default -- confirm against the recipe runner).
  platforms: dgx_a100
  # Keys are the paths `script` reads from: DATA_PATH is
  # /workspace/data/gpt3_data and CHECKPOINT_LOAD_PATH is
  # /workspace/checkpoints/$NAME, where NAME is the test case plus environment
  # with dgx_h100 rewritten to dgx_a100. Checkpoint keys must therefore keep
  # the dgx_a100 spelling even when the value uses the platforms placeholder.
  artifacts:
    /workspace/data/gpt3_data: text/the_pile/shard00
    /workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_{platforms}_1N8G_dev/24475828
    /workspace/checkpoints/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_{platforms}_1N8G_dev/28359448
    /workspace/checkpoints/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_{platforms}_1N8G_dev/28359448
    /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_{platforms}_1N8G_dev/28359448
    /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2
    /workspace/checkpoints/gpt3_mr_mcore_reruns_resume_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2
    /workspace/checkpoints/gpt3_mr_mcore_reruns_reshard_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2
    /workspace/checkpoints/gpt3_mr_mcore_reruns_reshard_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-enabled_v2
    /workspace/checkpoints/gpt3_mr_mcore_reruns_persistent_2_dev: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-persistent_v2
    /workspace/checkpoints/gpt3_mr_mcore_reruns_persistent_2_lts: model/mcore_gpt/gpt3_4b_pyt/25.03.05_bf16_rerun-persistent_v2
    /workspace/checkpoints/gpt_teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher
  # Setup script (presumably executed before `script` -- confirm with the
  # recipe runner). It registers read-only GitLab credentials, checks out the
  # commit under test into /opt/megatron-lm, and checks out the
  # backwards-compatibility commit into /opt/megatron-lm-legacy.
  script_setup: |
    unset https_proxy
    # Append the credentials with a plain redirect: piping through tee would
    # also copy the token to stdout and therefore into the job log.
    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" >> /root/.netrc

    # Checkout latest
    cd /opt
    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
    git init
    git remote add origin $MCORE_REPO
    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
    git fetch origin $MCORE_MR_COMMIT
    git checkout $MCORE_MR_COMMIT
    git rev-parse HEAD

    # Checkout backwards-ref
    cd /opt
    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
    git init
    git remote add origin $MCORE_REPO
    git fetch origin $MCORE_BACKWARDS_COMMIT
    git checkout $MCORE_BACKWARDS_COMMIT
    git rev-parse HEAD
    # Swap the legacy tree's megatron package for the one under test.
    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
  # Test entry point: builds the KEY=VALUE argument list and passes it to
  # run_ci_test.sh from the /opt/megatron-lm checkout.
  script: |-
    ls
    cd /opt/megatron-lm

    # Checkpoint mounts are keyed by the dgx_a100 spelling, so rewrite
    # dgx_h100 before building the load path.
    NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g')

    ARGUMENTS=(
        "DATA_PATH=/workspace/data/gpt3_data"
        "DATA_CACHE_PATH=/workspace/data/cache"
        "OUTPUT_PATH={assets_dir}"
        "TENSORBOARD_PATH={assets_dir}/tensorboard"
        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
        "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME"
        "TRAINING_SCRIPT_PATH=pretrain_gpt.py"
        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
        "N_REPEAT={n_repeat}"
        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
    )

    # Quote the expansion so every KEY=VALUE element stays one argument even
    # if a value contains whitespace or glob characters.
    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh "${{ARGUMENTS[@]}}"

# Test matrix: every entry pairs one test case with the environment / scope /
# platforms combinations it is run in.
products:
  #######################################################################
  # Nightly tests: Run both DEV and LTS unless something is flaky       #
  #######################################################################
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
    products:
      - environment: [dev]
        scope: [nightly]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
    products:
      - environment: [lts]
        scope: [nightly]
      - environment: [dev]
        scope: [nightly]
        platforms: [dgx_h100]
  # NOTE(review): dev has no platforms override here, unlike the other nightly
  # dev products (dgx_h100) -- confirm this is intended.
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4]
    products:
      - environment: [lts]
        scope: [nightly]
      - environment: [dev]
        scope: [nightly]
        platforms: [dgx_h100]
  # NOTE(review): dev has no platforms override here either -- confirm.
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch]
    products:
      - environment: [lts]
        scope: [nightly]
      - environment: [dev]
        scope: [nightly]
        platforms: [dgx_h100]
  - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist]
    products:
      - environment: [lts]
        scope: [nightly]
      - environment: [dev]
        scope: [nightly]
        platforms: [dgx_h100]
  #######################################################################
  # MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for  #
  #             some very important tests.                              #
  #######################################################################
  # In the tp1 entries below, every active dev product overrides platforms to
  # dgx_h100 and every active lts product sets no platforms. Commented-out
  # products carry a short reason and a #reference.
  - test_case: [gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  # - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr]
  #     - environment: [lts]
  #       scope: [nightly] # Non-deterministic: #487
  - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      # - environment: [lts]
      #   scope: [nightly] # outdated TE: #501
  - test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      # - environment: [lts]
      #   scope: [nightly] # non-determinism: #436
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      # - environment: [lts]
      #   scope: [nightly] # non-determinism: #437
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  # MR-scope tests for the tp2 / tp4 test cases. Entries that list only a dev
  # product have no lts run configured.
  - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]

  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_mla_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  # Remaining MR-scope tests: the 7b memory/speed cases (lts disabled, see the
  # inline reasons), the zp_z3 case, and the modelopt distillation case.
  - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      # - environment: [lts]
      #   scope: [nightly] # OOM: #434
  - test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      # - environment: [lts]
      #   scope: [nightly] # OOM: #434
  - test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [nightly]
  # NOTE(review): this dev product has no platforms override, unlike the other
  # MR dev products above (dgx_h100) -- confirm this is intended.
  - test_case: [gpt3_mr_mcore_te_tp2_pp1_modelopt_distill_resume]
    products:
      - environment: [dev]
        scope: [mr]
      # - environment: [lts]
      #   scope: [nightly] # Outdated: #502
  # - test_case: [gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr] # Broken: #484
  #     - environment: [lts]
  #       scope: [nightly] # Requires PyT 2.4: #481
  #######################################################################
  # Super important MR tests that run for both DEV and LTS per MR       #
  #######################################################################
  # NOTE(review): the two reruns_persistent cases below run lts at nightly
  # scope (not mr) and their dev product has no platforms override, which does
  # not match the header above -- confirm whether they belong in this section.
  - test_case: [gpt3_mr_mcore_reruns_persistent_1]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [gpt3_mr_mcore_reruns_persistent_2]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  # Besides mr for both environments, this case also lists dev under the
  # mr-slim scope.
  - test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G]
    products:
      - environment: [lts]
        scope: [mr]
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [dev]
        scope: [mr-slim]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [mr]
  # Also listed under the mr-slim scope for dev.
  - test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
    products:
      - environment: [lts]
        scope: [mr]
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [dev]
        scope: [mr-slim]
        platforms: [dgx_h100]
  - test_case: [gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
        platforms: [dgx_h100]
      - environment: [lts]
        scope: [mr]
  # Disabled frozen-resume MR tests (no reason is recorded here). Their dev
  # checkpoints are still listed under spec.artifacts.
  # - test_case: [gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr]
  #       platforms: [dgx_h100]
  # - test_case: [gpt3_mr_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr]
  #       platforms: [dgx_h100]
  # - test_case: [gpt3_mr_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr]
  #       platforms: [dgx_h100]
  # - test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev]
  #       scope: [mr]
  #       platforms: [dgx_a100, dgx_h100]
  #######################################################################
  # Weekly tests: DEV only; the dgx_b200 variants are disabled          #
  #######################################################################
  # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_b200]
  # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_b200]
  # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_b200]
  # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_mxfp8_tp_sp_cp]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_b200]
  # - test_case: [gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_b200]
  - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap]
    products:
      - environment: [dev]
        scope: [weekly]
        platforms: [dgx_h100]
  # - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_fsdp]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_h100]
  # - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap]
  #   products:
  #     - environment: [dev]
  #       scope: [weekly]
  #       platforms: [dgx_h100]
  - test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap]
    products:
      - environment: [dev]
        scope: [weekly]
        platforms: [dgx_h100]
  # Disabled MR-scope test (not a weekly test):
  # - test_case: [gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G]
  #   products:
  #     - environment: [dev, lts]
  #       scope: [mr] # Non-deterministic: #483
