# Code integration: clone the llm-foundry repository and pip-install it.
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  # Should be true if using a private repo.
  ssh_clone: false
  # Pin a branch/tag here, OR use `git_commit` with your commit hash instead.
  git_branch: 'v0.18.0'
  # git_commit:
  pip_install: '.[gpu]'

# Run name and the image the run executes in.
name: llama3.1-70b-finetune
image: mosaicml/llm-foundry:2.6.0_cu124-latest

# Shell script for the run: from the cloned repo's scripts directory, launch
# train/train.py with Composer, passing the injected parameters file.
command: |
  cd llm-foundry/scripts
  export HF_HUB_ENABLE_HF_TRANSFER=1
  composer train/train.py /mnt/config/parameters.yaml

# Compute resources requested for the run.
compute:
  # Note: Fine-tuning the 70B model requires at least 16 GPUs with 80GB of memory each.
  gpus: 16  # Number of GPUs to use
  # Optional settings (uncomment to use):
  # cluster: TODO  # Name of the cluster to use for this run
  # gpu_type: h100_80gb  # Type of GPU to use. We use h100_80gb in our experiments

# Everything under `parameters` is injected as a YAML file at
# /mnt/config/parameters.yaml (the path handed to train.py in `command`).
parameters:
  # Shared values, referenced elsewhere in this file as ${variables.<name>}.
  variables:
    max_seq_len: 4096
    global_seed: 17
    tokenizer_name: meta-llama/Llama-3.1-70B-Instruct

  # Run name. If left null (blank), it will be read from env var $RUN_NAME.
  run_name: null

  max_seq_len: ${variables.max_seq_len}

  # Set to avoid NCCL timeouts.
  dist_timeout: 3600
  max_split_size_mb: 512

  # Model: start from the pretrained meta-llama/Llama-3.1-70B-Instruct weights.
  model:
    name: hf_causal_lm
    pretrained: true
    pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct
    # Note: the HF_TOKEN environment variable must be set, and you must have
    # access to the Llama 3.1 models.
    use_auth_token: true
    init_device: mixed
    use_flash_attention_2: true

  # Tokenizer: both the name and the maximum length come from the `variables`
  # block, so model_max_length always matches the training max_seq_len.
  tokenizer:
    name: ${variables.tokenizer_name}
    kwargs:
      model_max_length: ${variables.max_seq_len}
  # Dataloaders
  # Training data: the `train` split of mosaicml/dolly_hhrlhf, shuffled.
  train_loader:
    name: finetuning
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: train
      max_seq_len: ${variables.max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      shuffle: true
      # Optional sequence packing (disabled unless the last line is uncommented):
      # Use packing_ratio: 'auto' to automatically profile and select the highest
      # observed packing ratio with zero waste. In practice, this may still result
      # in > 0 waste because profiling is done on only a portion of the dataset.
      # Alternatively, run
      # `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
      # to profile this run's optimal packing_ratio, as it depends on GPU count,
      # batch size, and sequence length.
      # packing_ratio: auto
    drop_last: true
    num_workers: 8
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 0

  # Evaluation data: the `test` split of mosaicml/dolly_hhrlhf, read without shuffling.
  eval_loader:
    name: finetuning
    # Loader settings
    drop_last: true
    num_workers: 8
    prefetch_factor: 2
    persistent_workers: true
    pin_memory: false
    timeout: 0
    # Dataset settings
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: test
      shuffle: false
      max_seq_len: ${variables.max_seq_len}
      decoder_only_format: true
      allow_pad_trimming: false
      # packing_ratio:

  # Optimization
  # Learning-rate schedule: cosine_with_warmup.
  # NOTE(review): `100ba` is presumably a warmup of 100 batches, and alpha_f the
  # final learning-rate multiplier — confirm against the scheduler's documentation.
  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  # Note: consider adjusting the learning rate, betas, and weight decay for
  # your own use case.
  optimizer:
    name: decoupled_lionw
    lr: 5.0e-7
    weight_decay: 0.0
    betas: [0.9, 0.95]

  # Algorithms applied during training: gradient clipping of type `norm` with
  # a threshold of 1.0.
  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  # Training duration
  max_duration: 1ep

  # Evaluation schedule
  eval_interval: 1ep
  eval_first: false
  eval_subset_num_batches: -1

  # Batch sizes
  global_train_batch_size: 16
  device_train_microbatch_size: 1
  device_eval_batch_size: 1

  # System
  precision: amp_bf16
  seed: ${variables.global_seed}

  # FSDP: FULL_SHARD strategy with activation checkpointing enabled
  # (non-reentrant) and activation CPU offload disabled.
  fsdp_config:
    state_dict_type: sharded  # Note: we enable sharded checkpointing to avoid GPU OOM
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: true
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true

  # Logging: the progress bar is disabled; console logging is enabled with a
  # console_log_interval of 1ba.
  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  # Callbacks enabled for the run. An empty mapping (`{}`) means no options
  # are set for that callback in this file.
  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  # NOTE(review): no `load_path` is set in the active config (the examples at
  # the end of this file are commented out), so this flag presumably only takes
  # effect once a `load_path` is supplied — confirm.
  load_weights_only: true  # Only load the weights, not the optimizer state, LR schedule, etc

#   loggers:
#     wandb: {}

#   Checkpoint to the local filesystem or a remote object store (uncomment only one save_folder)
#   save_interval: 2000ba
#   save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
#   save_folder: ./{run_name}/checkpoints
#   save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

#   Load from the local filesystem or a remote object store (uncomment only one load_path)
#   load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
#   load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt
