# ---------------------------------------------------------------------------- ##            Fixed
#   Purpose of this run:                                                        #            Fixed
#   Ablation Study 01: Should we use router tokens?                             # <1       > Operational settings
# ---------------------------------------------------------------------------- ##            Fixed
# Operational Settings                                                          #            Fixed
# Identifies the project/run and where its files live.
# NOTE(review): YAML does not expand `$EXTERNAL_STORAGE`; the loader must do
# so (presumably from the environment) -- confirm it is set before launching.
# NOTE(review): run_name "A1-02" presumably encodes "Ablation Study 01" from
# the file header plus a variant index -- confirm the naming scheme.
project_directory:              "$EXTERNAL_STORAGE/projects/moe"                #            Fixed
project_entity:                 null                                            #            Fixed
project_name:                   "my_project"                                    #            Fixed
run_name:                       "A1-02"                                         # <1       > Operational settings
use_diagnostic_mode:            false                                           #            Fixed
# Basic Settings                                                                #            Fixed
# Dataset, model-size and batch-size values for this run.
# NOTE(review): `data_dir` carries a literal `$EXTERNAL_STORAGE`; YAML leaves
# it unexpanded, so the consumer has to resolve it -- confirm.
data_name:                      "FineWebEdu10B"                                 #            Fixed
data_dir:                       "$EXTERNAL_STORAGE/datasets/fineweb_edu_10b"    #            Fixed
# vocab_size and num_class hold the same value (50304) in this file.
vocab_size:                     50304                                           #            Fixed
num_class:                      50304                                           #            Fixed
context_window:                 2048                                            #            Fixed
num_block:                      12                                              #            Fixed
emb_size:                       1024                                            #            Fixed
# 4 (num_gpu) x 5 (accu_steps) x 16 (batch_size_fwd) = 320 (batch_size).
# NOTE(review): presumably these four must keep that relation when any one
# of them is changed -- confirm against the consumer.
num_gpu:                        4                                               # < 2      > Batch size
accu_steps:                     5                                               # < 2      > Batch size
batch_size_fwd:                 16                                              #            Fixed
batch_size:                     320                                             #            Fixed
num_batch_override:             3757                                            # NOTE(review): untagged; confirm Fixed vs. varied
# Feedforward Settings                                                          #            Fixed
# ffwd_num_head (8) x ffwd_head_size (128) = 1024, the same as emb_size.
# NOTE(review): presumably a required relation -- confirm before changing one.
# ffwd_hid_size is null while the expert keys are set; per the tags at right
# it is the dense-MLP knob and ffwd_num_expert the sparse-MoE one.
# NOTE(review): presumably only one of the two applies per run -- confirm.
ffwd_name:                      "MHMoEHP"                                       # <  3     > Basic architecture design
ffwd_num_head:                  8                                               # <   4    > Multi-head
ffwd_head_size:                 128                                             # <   4    > Multi-head
ffwd_hid_size:                  null                                            # <    5   > Hidden layer (Dense  MLP)
ffwd_num_expert:                1536                                            # <    5   > Hidden layer (Sparse MoE)
ffwd_num_expert_active:         4                                               #            Fixed
ffwd_expert_size:               256                                             #            Fixed
# Attention Settings                                                            #            Fixed
# attn_num_head (8) x attn_head_size (128) = 1024, the same as emb_size.
# NOTE(review): presumably this product must equal emb_size -- confirm.
attn_name:                      "SelfAttention"                                 #            Fixed
attn_num_head:                  8                                               #            Fixed
attn_head_size:                 128                                             #            Fixed
# Normalization Settings                                                        #            Fixed
# Both norm_use_affine and norm_use_bias are switched off for this run.
# Keep the `1.0e-5` spelling (decimal point + signed exponent): YAML 1.1
# loaders such as PyYAML resolve `1e-5` as a string, not a float.
norm_name:                      "RMSNorm"                                       #            Fixed
norm_use_affine:                false                                           #            Fixed
norm_use_bias:                  false                                           #            Fixed
norm_eps:                       1.0e-5                                          #            Fixed
# LR Schedule Settings                                                          #            Fixed
# NOTE(review): max_lr and min_lr are both 1.0e-4, so a decay between them
# would be a no-op -- presumably intentional for this run; confirm.
# NOTE(review): warmup (2000) + decay (500) = 2500; if num_batch_override
# (3757) counts the same steps, confirm how the remaining ones are scheduled.
lrsched_max_lr:                 1.0e-4                                          # <     6  > Learning rate schedule
lrsched_min_lr:                 1.0e-4                                          # <     6  > Learning rate schedule
lrsched_warmup_steps:           2000                                            # <     6  > Learning rate schedule
lrsched_decay_steps:            500                                             # <     6  > Learning rate schedule
# AdamW Settings                                                                #            Fixed
# NOTE(review): perf_use_8bit_adamw is true under Performance Settings;
# confirm the adamw_* values here are what the 8-bit optimizer receives.
# Keep `1.0e-8` with its decimal point and signed exponent so YAML 1.1
# loaders (e.g. PyYAML) resolve it as a float rather than a string.
adamw_beta_1:                   0.9                                             #            Fixed
adamw_beta_2:                   0.95                                            #            Fixed
adamw_eps:                      1.0e-8                                          #            Fixed
adamw_weight_decay:             0.1                                             #            Fixed
# Gradient Clipping Settings                                                    #            Fixed
# NOTE(review): norm_type 2.0 presumably selects the L2 norm -- confirm.
gradclip_enabled:               true                                            #            Fixed
gradclip_max_norm:              1.0                                             #            Fixed
gradclip_norm_type:             2.0                                             #            Fixed
# Evaluation Settings                                                           #            Fixed
# eval_evaluators is a YAML sequence; list further evaluators as additional
# `- "<name>"` items at the same indentation as the existing entry.
eval_enable_validation:         true                                            # <      7 > Evaluation settings
eval_evaluators:                                                                #            Fixed
    - "hellaswag"                                                               #            Fixed
# Performance Settings                                                          #            Fixed
# NOTE(review): perf_use_8bit_adamw presumably swaps in an 8-bit AdamW that
# still reads the adamw_* keys from AdamW Settings -- confirm.
perf_use_profiler:              false                                           # <       8> Other settings
perf_use_8bit_adamw:            true                                            #            Fixed
# Dataloader Settings                                                           #            Fixed
# NOTE(review): unclear from this file whether dataloader_num_worker is per
# process or global -- confirm before scaling num_gpu.
dataloader_num_worker:          4                                               #            Fixed
dataloader_pin_memory:          true                                            #            Fixed
# Reproducibility Settings                                                      #            Fixed
# NOTE(review): `repro_use_random_seed: true` with value 42 presumably means
# "seed the RNGs with 42", not "draw a fresh random seed" -- confirm.
repro_use_random_seed:          true                                            #            Fixed
repro_random_seed_value:        42                                              #            Fixed
# Checkpoint Settings                                                           #            Fixed
# Checkpointing is disabled for this run.
ckpt_enabled:                   false                                           # <       8> Other settings
# Runtime Variables                                                             #            Fixed
# `{}` is an explicit empty mapping (a bare `runtime:` would load as null).
# NOTE(review): presumably filled in by the program at run time -- confirm.
runtime:                        {}                                              #            Fixed
# ---------------------------------------------------------------------------- ##            Fixed
