# Sequence length shared across the file: interpolated as ${max_seq_len} into
# tokenizer.kwargs.model_max_length and train_loader.dataset.max_seq_len.
max_seq_len: 512

load_path:  # set via bash script to be absolute path to your sparse checkpoint
precision: amp_bf16

# Placeholder: intentionally empty (parses as null) until a value is supplied.
max_duration: # TODO
# "1ep" is presumably a time string meaning "once per epoch" — confirm against
# the trainer's time-unit documentation.
eval_interval: 1ep
eval_first: false
# Same value as algorithms.data_seeder.seed further down in this file.
seed: 42

# Placeholder: intentionally empty (parses as null) until a value is supplied.
global_train_batch_size: #TODO
# Reference per-device microbatch sizes recorded for specific hardware setups.
# for mpt-7b dense:
# 4 x A100_80GB = "device_train_microbatch_size: 12"
# 8 x A6000_48GB = "device_train_microbatch_size: 6"

# for mpt-7b sparse (with masks):
# 8 x A6000_48GB = "device_train_microbatch_size: 4"
# NOTE(review): the active value (16) is larger than every reference value
# listed above — presumably chosen for a different model/hardware; confirm.
device_train_microbatch_size: 16
device_eval_batch_size: 16

# Run Name
# Also interpolated as ${run_name} into callbacks.hq_hf_checkpointer.save_folder.
run_name: # If left blank, will be read from env var $RUN_NAME

# Model
# Selects the `hf_causal_lm` model type with `pretrained: true`; the actual
# checkpoint/model id is a placeholder that must be filled in.
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: #TODO
  # Disabled options, kept for reference:
  # max_seq_len: ${max_seq_len}
  # output_hidden_states: true
  master_weights_dtype: bfloat16
# Disabled attention-implementation override, kept for reference:
#  config_overrides:
#    attn_config:
#      attn_impl: sdpa

# Tokenizer
# `name` is a placeholder that must be filled in; presumably it should match
# the model chosen in model.pretrained_model_name_or_path — confirm.
tokenizer:
  name: #TODO
  kwargs:
    # Kept in sync with the top-level max_seq_len via interpolation.
    model_max_length: ${max_seq_len}

# Training data loader (`finetuning` loader type).
# The dataset uses `hf_name: json` with files given by `hf_kwargs.data_files`
# (placeholder, must be filled in) and is preprocessed by
# preprocessing:gsm8k_preprocessing_function.
train_loader:
  name: finetuning
  dataset:
    hf_name: json
    split: train
    hf_kwargs:
      data_files: #TODO
    preprocessing_fn: preprocessing:gsm8k_preprocessing_function
    # Kept in sync with the top-level max_seq_len via interpolation.
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # NOTE(review): shuffling is disabled for the training split — presumably
    # intentional (ordering may be driven by algorithms.data_seeder); confirm.
    shuffle: false
  drop_last: false
  num_workers: 0
  pin_memory: false
  # Written as an explicit null rather than a bare empty value so it is clear
  # that "no prefetch factor" is intended and not a forgotten setting.
  prefetch_factor: null
  persistent_workers: false
  timeout: 0


# Optimization
# Learning-rate schedule: `linear_decay_with_warmup`.
scheduler:
  name: linear_decay_with_warmup
  # "20ba" is presumably a time string meaning 20 batches of warmup — confirm.
  t_warmup: 20ba
  # Presumably the final LR multiplier (0 = decay all the way to zero) — confirm.
  alpha_f: 0

# Optimizer: `decoupled_adamw` with weight decay disabled (0.0).
# The learning rate is a placeholder and must be supplied before use.
optimizer:
  name: decoupled_adamw
  lr: # TODO
  # Two-element list, written in flow style because it is a short leaf value.
  betas: [0.9, 0.999]
  eps: 1.0e-8
  weight_decay: 0.0

# Training algorithms enabled for this run.
# NOTE(review): `data_seeder` and `loss_weighter` look like project-specific
# algorithms; confirm they are registered by the training script.
algorithms:
  data_seeder:
    # Same value as the top-level `seed`.
    seed: 42
  loss_weighter:
    # NOTE(review): meaning of `fake: true` is not visible from this file —
    # confirm against the loss_weighter implementation.
    fake: true

# FSDP
# Sharded-training settings: full sharding, with activation checkpointing
# enabled (reentrant variant) and activation CPU offload disabled.
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: FULL
  activation_checkpointing: true
  activation_checkpointing_reentrant: true
  forward_prefetch: true
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false

# Logging
# Progress bar is off; console logging is on.
progress_bar: false
log_to_console: true
# "1ba" is presumably a time string meaning "every batch" — confirm.
console_log_interval: 1ba

# Callbacks attached to the run; `{}` means "enabled with no explicit options".
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
  # NOTE(review): `hq_hf_checkpointer` looks like a project-specific callback;
  # confirm it is registered by the training script.
  hq_hf_checkpointer:
    overwrite: true
    # Placeholder: intentionally empty (parses as null) until supplied.
    precision: # TODO
    # Output directory is derived from the top-level run_name.
    save_folder: checkpoints/${run_name}
    # "1dur" is presumably a time string meaning "once, at the end of the full
    # training duration" — confirm.
    save_interval: 1dur

# Logger destinations; `wandb` is enabled with no explicit options.
loggers:
  wandb: {}

# # Checkpoint to local filesystem or remote object store (currently disabled: every key below is commented out)
# save_interval: 1ep
# save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
# save_folder: output_dir/{run_name}/checkpoints
# save_overwrite: true
