# Git repository integration for this run.
integrations:
- integration_type: 'git_repo'
  git_repo: 'mosaicml/llm-foundry'
  # Branch to check out. Alternatively, pin an exact revision by using
  # `git_commit` with your commit hash instead.
  git_branch: 'v0.18.0'
  # git_commit: '<your commit hash>'
  pip_install: '.[gpu]'
  # Should be true if using a private repo.
  ssh_clone: false

# We are fetching and converting the 'train_small' and 'val_small' splits of C4,
# then training on 'train_small' and evaluating on 'val_small'. These splits are
# small and quick to get going for this demo.
# For real training runs, follow the instructions in `llm-foundry/scripts/train/README.md`
# to convert and host the full 'train' dataset.
#
# The conversion flags line up with `parameters` below:
#   --out_root       is the directory `data_local` points at (./my-copy-c4)
#   --concat_tokens  equals `max_seq_len` (8192)
#   --tokenizer      equals `tokenizer.name` (EleutherAI/gpt-neox-20b)
# Keep them in sync when changing either side.
#
# NOTE(review): the steps below are not chained with `&&` and the script does not
# `set -e`, so whether a failed conversion stops the run before training starts
# depends on how the platform invokes this script -- confirm.
command: |
  cd llm-foundry/scripts
  python data_prep/convert_dataset_hf.py \
    --dataset allenai/c4 --data_subset en \
    --out_root ./my-copy-c4 --splits train_small val_small \
    --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
  composer train/train.py /mnt/config/parameters.yaml
# Container image for the run, and the run's name. Per the note on
# `parameters.run_name`, this `name` is used when `run_name` is left blank.
image: 'mosaicml/llm-foundry:2.6.0_cu124-latest'
name: 'mpt-1b-ctx-8k-gpus-8'

# Compute resources requested for this run.
compute:
  gpus: 8  # Number of GPUs to use (matches the `gpus-8` suffix of `name`)

  ## These configurations are optional; uncomment and fill in to use them
  # cluster: TODO # Name of the cluster to use for this run
  # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments

# Everything under `parameters` is injected as the YAML file
# /mnt/config/parameters.yaml, which the `command` above passes to train.py.
# This is where hparams like `max_seq_len=8192` are customized at runtime.
parameters:
  # If left null, will be read from the top-level YAML `name`
  # for W&B logging and checkpointing.
  run_name: null

  data_local: './my-copy-c4'
  # If null, files must already be present in `data_local`.
  data_remote: null
  max_seq_len: 8192
  global_seed: 17

  # Model
  model:
    name: mpt_causal_lm
    init_device: meta
    d_model: 2048
    # d_head = d_model / n_heads = 2048 / 16 = 128
    n_heads: 16  # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
    n_layers: 24
    expansion_ratio: 4
    max_seq_len: ${max_seq_len}  # Interpolated from the top-level `max_seq_len` (8192)
    vocab_size: 50368
    attn_config:
      attn_impl: flash

  # Tokenizer
  # Same tokenizer name that `command` passes to convert_dataset_hf.py via
  # `--tokenizer`; change both together.
  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}  # Interpolated from the top-level `max_seq_len` (8192)

  # Dataloaders
  # `local`, `remote`, `max_seq_len` and `shuffle_seed` are all interpolated
  # from the variables at the top of `parameters`.
  train_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      # One of the two splits that `command` converts with `--splits`.
      split: train_small  # Using 'train_small' for this demo
      shuffle: true
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: true
    num_workers: 8

  # Evaluation reads the `val_small` split, the second split that `command`
  # converts with `--splits`, from the same local/remote locations as training.
  eval_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: val_small
      # NOTE(review): shuffling is off here, so `shuffle_seed` below presumably
      # has no effect for evaluation -- confirm.
      shuffle: false
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: false
    num_workers: 8

  # Optimization
  # Learning-rate schedule; `t_warmup` uses the same `ba` suffix as
  # `max_duration` below.
  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  # Optimizer hyperparameters.
  optimizer:
    name: decoupled_adamw
    lr: 2.0e-4
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 0.0

  # Gradient clipping by norm, with a threshold of 1.0.
  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  # Run length and evaluation cadence.
  # 24800 batches x 1,048,576 tokens per batch ~= 26.0B tokens.
  max_duration: 24800ba  # ~ 26B tokens
  eval_interval: 2000ba
  eval_first: false
  # NOTE(review): -1 presumably means "evaluate on every batch of the eval set" -- confirm.
  eval_subset_num_batches: -1
  # 128 sequences x 8192 tokens (`max_seq_len`) = 1,048,576 tokens per batch.
  global_train_batch_size: 128  # ~1M tokens

  # System
  # Derive the run seed from `global_seed` (17), like the dataloaders'
  # `shuffle_seed`, so that editing `global_seed` changes every seed at once
  # instead of leaving this one behind.
  seed: ${global_seed}
  device_eval_batch_size: 1
  device_train_microbatch_size: 1
  # device_train_microbatch_size: auto
  precision: amp_bf16

  # FSDP
  # Sharding and mixed-precision settings. Activation checkpointing and
  # activation CPU offload are both turned off in this configuration.
  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: false
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true

  # Logging
  # Progress bar is disabled; console logging is enabled with an interval of
  # one batch (`1ba`).
  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  # Callbacks written as `{}` are listed without any explicit options;
  # `speed_monitor` is the only one configured here (window_size: 10).
  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}
#   loggers:
#     wandb: {}

#   Checkpoint to local filesystem or remote object store
#   save_interval: 2000ba
#   save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
#   save_folder: ./{run_name}/checkpoints
#   save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

#   Load from local filesystem or remote object store
#   load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
#   load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt
