# Model configuration: an entry named "gpt2" whose kwargs select
# model_type "llama".
# NOTE(review): the name says gpt2 while model_type is llama and the settings
# below use rmsnorm / swiglu / rope with no bias -- presumably a GPT-2-sized
# model with a Llama-style architecture; confirm the naming is intentional.
name: gpt2
kwargs:
  model_type: llama
  # Special-token ids.
  # NOTE(review): presumably these must agree with the tokenizer that
  # provides the 32000-entry vocabulary below -- verify against the consumer.
  bos_token_id: 1
  eos_token_id: 2
  # Dimension settings; attn_dim is set equal to hidden_dim here.
  hidden_dim: 768
  attn_dim: 768
  ffn_dim: 2048  # 8/3 * hidden_dim (768 * 8 / 3 = 2048)
  # Query and key/value head counts are equal in this config.
  # NOTE(review): presumably 768 / 12 = 64 dims per head -- confirm how the
  # consumer derives the per-head size.
  num_q_heads: 12
  num_kv_heads: 12
  num_layers: 12
  # Presumably dropout probabilities; both are set to 0.0.
  hidden_drop: 0.0
  embd_drop: 0.0
  max_position_embeddings: 1024  # TODO: consider using 2048 instead
  vocab_size: 32000
  tie_word_embeddings: false
  # Normalization, activation, and bias choices.
  ln: rmsnorm
  act: swiglu
  bias: false
  scale_attn_by_inverse_layer_idx: false
  # Positional-embedding settings (rope); no sequence-length interpolation
  # factor is set (null).
  pos_emb:
    name: rope
    rotary_interleaved: false
    seq_len_interpolation_factor: null
    rotary_base: 10000
  # Weight-initialization settings.
  # NOTE(review): presumably initializer_range is the std of the initial
  # weight distribution under the "fixed" scheme -- confirm in the consumer.
  init:
    weight_init: fixed
    initializer_range: 0.02
