variables:
  data_local: ./my-copy-c4
  data_remote:  # If blank, files must be present in data_local
  max_seq_len: 128
  global_seed: 17

  # Run Name
  run_name:  # If left blank, will be read from env var $RUN_NAME

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

# Model
model:
  name: mpt_causal_lm
  init_device: meta
  d_model: 128
  ffn_config:
    ffn_type: mb_dmoe
    memory_optimized_mlp: true
    mlp_impl: grouped
    moe_lbl_in_fp32: false
    moe_loss_weight: 0.01
    moe_num_experts: 4
    moe_top_k: 2
    moe_world_size: 1
    uniform_expert_assignment: false
  n_heads: 2
  n_layers: 2
  expansion_ratio: 1
  max_seq_len: ${variables.max_seq_len}
  vocab_size: 50368
  attn_config:
    attn_impl: torch
  loss_fn: torch_crossentropy

# Tokenizer
tokenizer:
  name: EleutherAI/gpt-neox-20b
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: text
  dataset:
    local: ${variables.data_local}
    remote: ${variables.data_remote}
    split: train
    shuffle: true
    max_seq_len: ${variables.max_seq_len}
    shuffle_seed: ${variables.global_seed}
  drop_last: true
  num_workers: 8

eval_loader:
  name: text
  dataset:
    local: ${variables.data_local}
    remote: ${variables.data_remote}
    split: val
    shuffle: false
    max_seq_len: ${variables.max_seq_len}
    shuffle_seed: ${variables.global_seed}
  drop_last: false
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 200ba
eval_interval: 100ba
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 256

# System
seed: ${variables.global_seed}
device_eval_batch_size: 16
device_train_microbatch_size: 16
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: false
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
