# Shared values. Other sections of this file pull them in through
# ${variables.<key>} interpolation, so each one is defined only here.
variables:
  tokenizer_name: mosaicml/mpt-30b
  global_seed: 17

  # Sequence lengths: `max_seq_len` is referenced by the model, tokenizer and
  # dataloader sections; `icl_max_seq_len` is referenced by the ICL tasks.
  max_seq_len: 8192
  icl_max_seq_len: 2048

  # Run Name
  # If left null (blank), will be read from env var $COMPOSER_RUN_NAME
  run_name: null

# Top-level settings that mirror entries of the `variables` section through
# ${...} interpolation; edit the values there, not here.
max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

# Model
# Pretrained causal LM identified by `pretrained_model_name_or_path`.
model:
  name: hf_causal_lm
  pretrained: true
  # NOTE(review): hard-coded, and currently the same string as
  # variables.tokenizer_name; update both if the base model changes.
  pretrained_model_name_or_path: mosaicml/mpt-30b
  init_device: mixed
  # Presumably applied on top of the pretrained model's own config -- confirm.
  config_overrides:
    # Same context length the tokenizer and dataloaders use.
    max_seq_len: ${variables.max_seq_len}
    attn_config:
      attn_impl: flash
      # Note: we still use packing, but turn this off for memory.
      # Initial tests suggest this does not hurt performance too much.
      attn_uses_sequence_id: false

# Tokenizer
# Name comes from variables.tokenizer_name.
tokenizer:
  name: ${variables.tokenizer_name}
  kwargs:
    # Tokenizer length limit tracks the training context length.
    model_max_length: ${variables.max_seq_len}

# Dataloaders
# Training data: the `train` split of mosaicml/instruct-v3.
train_loader:
  name: finetuning
  dataset:
    hf_name: mosaicml/instruct-v3
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # Sequence packing is in use (see the note in the model section);
    # presumably about 9 examples are packed per sequence -- TODO confirm.
    packing_ratio: 9
    shuffle: true
  # The keys below sit beside `dataset`, not inside it -- presumably
  # loader-level options rather than dataset options; TODO confirm.
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Evaluation data: the `test` split of the same dataset, with every other
# setting identical to train_loader.
eval_loader:
  name: finetuning
  dataset:
    hf_name: mosaicml/instruct-v3
    split: test
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    packing_ratio: 9
    # NOTE(review): shuffling is enabled on the eval split as well -- confirm
    # this is intended rather than carried over from the training settings.
    shuffle: true
  # NOTE(review): presumably drops a trailing partial batch, so some eval
  # examples may never be scored -- confirm this is intended.
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  # Linear decay preceded by a warmup of `t_warmup` (50ba). For comparison:
  # linear decay with no warmup is the HF default, which dolly used.
  name: linear_decay_with_warmup
  t_warmup: 50ba
  # Presumably the final learning-rate multiplier (decay all the way to 0)
  # -- confirm against the scheduler documentation.
  alpha_f: 0

# Optimizer: hyperparameters chosen to mimic InstructGPT.
optimizer:
  name: decoupled_adamw
  lr: 9.65e-6
  betas: [0.9, 0.95]
  eps: 1.0e-8
  weight_decay: 0

# Training algorithms: gradient clipping by norm with a threshold of 1.0.
algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

# Training length and evaluation cadence. Durations carry a unit suffix
# (`ep` here, `ba` elsewhere in this file) -- presumably epochs and batches;
# TODO confirm.
max_duration: 8ep
eval_interval: 1ep
# Presumably also evaluates once before training starts -- confirm.
eval_first: true
# NOTE(review): presumably must be divisible by the number of devices -- confirm.
global_train_batch_size: 72

# System
# Seed comes from variables.global_seed.
seed: ${variables.global_seed}
device_eval_batch_size: 4
# NOTE(review): with a microbatch of 1, gradients are presumably accumulated
# to reach global_train_batch_size -- confirm.
device_train_microbatch_size: 1
precision: amp_bf16

# FSDP
# Sharding settings for the run. Activation checkpointing is enabled
# (with the reentrant variant turned off); activation CPU offload is disabled.
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  sync_module_states: true

# Logging
# Progress bar is off and console logging is on, at the interval given by
# `console_log_interval` (20ba).
progress_bar: false
log_to_console: true
console_log_interval: 20ba

# Callbacks enabled for the run. An empty mapping (`{}`) means no options are
# set for that callback in this file.
callbacks:
  speed_monitor:
    window_size: 10
  runtime_estimator: {}
  lr_monitor: {}

# loggers:
#   wandb: {}

# save_folder:
# save_interval: 3ep
# save_num_checkpoints_to_keep: 1

icl_max_seq_len: ${variables.icl_max_seq_len}

# In-context-learning (ICL) evaluation tasks.
# YOU MUST ADD YOUR OWN DATASET URIs
# This section can be removed if you do not want to track these metrics.
#
# Fields shared by every task below:
#   prompt_string           goes at the beginning of each input
#   example_delimiter       goes between fewshot examples
#   continuation_delimiter  separates questions from answers
icl_tasks:
- label: piqa
  dataset_uri: null  # ADD YOUR OWN DATASET URI
  num_fewshot: [0]
  batch_size: 4
  max_seq_len: ${variables.icl_max_seq_len}
  icl_task_type: multiple_choice
  metric_names: [InContextLearningMultipleChoiceAccuracy]
  prompt_string: ''
  example_delimiter: "\n"  # double quotes so that \n is parsed as a newline
  continuation_delimiter: ' '
- label: hellaswag
  dataset_uri: null  # ADD YOUR OWN DATASET URI
  num_fewshot: [0]
  batch_size: 4
  max_seq_len: ${variables.icl_max_seq_len}
  icl_task_type: multiple_choice
  metric_names: [InContextLearningMultipleChoiceAccuracy]
  prompt_string: ''
  example_delimiter: "\n"  # double quotes so that \n is parsed as a newline
  continuation_delimiter: ' '
- label: arc_easy
  dataset_uri: null  # ADD YOUR OWN DATASET URI
  num_fewshot: [0]
  batch_size: 4
  max_seq_len: ${variables.icl_max_seq_len}
  icl_task_type: multiple_choice
  metric_names: [InContextLearningMultipleChoiceAccuracy]
  prompt_string: ''
  example_delimiter: "\n"  # double quotes so that \n is parsed as a newline
  continuation_delimiter: ' '
