# Model configuration: a `name` label plus the hyperparameters under `kwargs`.
# NOTE(review): `name` is gpt2 while `kwargs.model_type` is llama. Presumably
# this is a GPT-2-style setup (layernorm, gelu, bias, tied embeddings) run
# through a llama-type model implementation — confirm this is intentional.
name: gpt2
kwargs:
  model_type: llama
  # Special-token ids; presumably these must match the tokenizer in use —
  # TODO confirm.
  bos_token_id: 1
  eos_token_id: 2
  # Layer widths: attn_dim equals hidden_dim, and ffn_dim is 4 x hidden_dim.
  hidden_dim: 1024
  attn_dim: 1024
  ffn_dim: 4096
  # Query and key/value head counts are equal (16 each).
  num_q_heads: 16
  num_kv_heads: 16
  num_layers: 24
  # Both dropout rates are set to 0.0.
  hidden_drop: 0.0
  embd_drop: 0.0
  max_position_embeddings: 1024
  vocab_size: 32000
  tie_word_embeddings: true
  # Normalization, activation, and bias choices; the accepted values are
  # defined by the code that reads this file (not visible here).
  ln: layernorm
  act: gelu
  bias: true
  scale_attn_by_inverse_layer_idx: false
  # Positional-embedding settings. `name: rope` presumably selects rotary
  # position embeddings — verify against the code that reads this key.
  pos_emb:
    name: rope
    rotary_interleaved: false
    # Explicit null: no interpolation factor is configured.
    seq_len_interpolation_factor: null
    rotary_base: 10000
  # Weight-initialization settings.
  # NOTE(review): `fixed` presumably means one constant scale, given by
  # `initializer_range`, is applied to all weights — TODO confirm.
  init:
    weight_init: fixed
    initializer_range: 0.02
# Top-level key (a sibling of `kwargs`, not nested inside it).
# The original note mentions mix1 and mix2 — presumably the two accepted
# values; TODO confirm against the code that reads `mlp_policy`.
mlp_policy: mix1
