# Configuration entry named `gpt2`: a 12-layer model with hidden width 384.
# NOTE(review): the entries under `kwargs` are presumably forwarded as keyword
# arguments to the model/config constructor - confirm against the loader.
name: gpt2
kwargs:
  model_type: gpt2
  # BOS and EOS share the same token id (50256).
  bos_token_id: 50256
  eos_token_id: 50256
  # Width settings: attn_dim equals hidden_dim, and ffn_dim is 4 * hidden_dim
  # (1536 = 4 * 384).
  hidden_dim: 384
  attn_dim: 384
  ffn_dim: 1536
  # Query and key/value head counts are equal (6 each). 384 / 6 = 64 per head,
  # assuming attn_dim is split evenly across heads - TODO confirm.
  num_q_heads: 6
  num_kv_heads: 6
  num_layers: 12
  # Two dropout rates, both 0.1. Presumably hidden_drop applies to hidden
  # states and embd_drop to the embedding output - verify in the model code.
  hidden_drop: 0.1
  embd_drop: 0.1
  max_position_embeddings: 1024
  # vocab_size exceeds the BOS/EOS id above, so that id is in range.
  # NOTE(review): 50304 is a multiple of 64 (786 * 64); presumably a padded
  # vocabulary size - confirm against the tokenizer.
  vocab_size: 50304
  # NOTE(review): presumably shares the input embedding matrix with the output
  # projection - confirm.
  tie_word_embeddings: true
  # Normalization type, activation and bias flag. The set of accepted values
  # is defined by the consumer and is not visible in this file.
  ln: layernorm
  act: gelu
  bias: true
  # Per-layer attention scaling is switched off.
  scale_attn_by_inverse_layer_idx: false
  # Positional-embedding settings: `rope` with base 10000, non-interleaved.
  # NOTE(review): the entry is named gpt2 but selects rope rather than learned
  # absolute positions - presumably intentional; confirm. How
  # max_position_embeddings interacts with rope is not visible here.
  pos_emb:
    name: rope
    rotary_interleaved: false
    # Explicit null: no interpolation factor is set.
    seq_len_interpolation_factor: null
    rotary_base: 10000
  # Weight-initialization settings.
  # NOTE(review): `fixed` with initializer_range 0.02 presumably means a
  # constant init scale of 0.02 for all weights - confirm in the init code.
  init:
    weight_init: fixed
    initializer_range: 0.02