# Label for this benchmark configuration. The name appears to encode the
# framework, hardware (1xA100-80GB), dtype (bf16) and model (GPT-1B) — TODO confirm.
# NOTE(review): the name contains "Deepspeed" while `use_deepspeed` further down
# is false — confirm which of the two is intended.
benchmark_name: MosaicDeepspeed_1xA100-80GB-bf16-GPT-1B

# Shared values referenced elsewhere in this file through `${...}` references:
# `max_seq_len` is used by tokenizer.kwargs.model_max_length and model.max_seq_len,
# `tokenizer_name` is used by tokenizer.name.
max_seq_len: 2048
tokenizer_name: EleutherAI/gpt-neox-20b

# Tokenizer: both the name and the maximum length refer to the top-level
# `tokenizer_name` and `max_seq_len` keys instead of repeating their values.
# NOTE(review): the `${...}` references are presumably resolved by the config
# loader (e.g. OmegaConf-style interpolation) — confirm.
tokenizer:
  name: ${tokenizer_name}
  kwargs:
    model_max_length: ${max_seq_len}

# Model definition. With d_model 2048 and n_heads 16 the per-head dimension is
# 2048 / 16 = 128, which is the value the n_heads comment below refers to.
model:
  name: mpt_causal_lm
  init_device: cpu
  d_model: 2048
  n_heads: 16  # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
  n_layers: 24
  expansion_ratio: 4
  # Refers to the top-level `max_seq_len` key, the same one the tokenizer uses.
  max_seq_len: ${max_seq_len}
  # NOTE(review): 50368 = 787 * 64; presumably the tokenizer vocabulary padded
  # up to a multiple of 64 — confirm.
  vocab_size: 50368
  init_std: 0.02
  # The three *_pdrop values (presumably dropout probabilities) are all zero.
  attn_pdrop: 0.0
  resid_pdrop: 0.0
  emb_pdrop: 0.0
  attn_config:
    # Selects the `flash` attention implementation (see the n_heads note above).
    attn_impl: flash

# Runtime settings: the model dtype is bf16 and DeepSpeed is disabled.
# NOTE(review): `device` and `autocast_dtype` are null; presumably the benchmark
# script then falls back to its own defaults — confirm against the consumer.
device: null
model_dtype: bf16
autocast_dtype: null
use_deepspeed: false

# Sweep parameters: seven batch sizes (powers of two from 1 to 64), a single
# input length (128) and a single output length (8).
# NOTE(review): presumably every batch_size x input_length x output_length
# combination is benchmarked and the lengths are measured in tokens — confirm.
batch_sizes: [1, 2, 4, 8, 16, 32, 64]
input_lengths: [128]
output_lengths: [8]
# NOTE(review): presumably enables the key/value cache during generation — confirm.
use_cache: true

# Batch counts per benchmarked configuration.
# NOTE(review): it is not visible here whether the 3 warmup batches are counted
# inside the 5 batches (leaving 2 measured) or run in addition to them — confirm
# against the benchmark script before changing either value.
num_batches: 5
num_warmup_batches: 3
