max_seq_len: 2048
global_seed: 17

# Run Name
run_name:  # If left blank, will be read from env var $COMPOSER_RUN_NAME

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: flash
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false
  # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
  use_auth_token: true

# Tokenizer
tokenizer:
  name: meta-llama/Meta-Llama-3-8B
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    hf_name: json
    hf_kwargs:
      # Note: absolute paths for data_dir are more reliable;
      # relative paths will be interpreted relative to whatever your
      # working directory is when you run `train.py`
      data_dir: finetune_example
    # Note: `scripts/train` will be the working directory when resolving
    # the preprocessing_fn import path
    preprocessing_fn: finetune_example.preprocessing:multiple_choice
    split: train
    ############
    shuffle: true
    max_seq_len: ${max_seq_len}
    decoder_only_format: true
    # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
    # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
    # # of the dataset.
    # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    # packing_ratio:
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 500ba
# save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
# save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

# Load from local filesystem or remote object store
# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt
# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt
