# Model hyperparameters, grouped by purpose.
# NOTE(review): key names (untie_r, mem_len, clamp_len, ...) look like the
# Transformer-XL naming scheme, with cmem_len/c_ratio suggesting a
# compressed-memory variant -- confirm against the code that loads this file.
model:
    # Architecture switches. Booleans use the canonical lowercase spelling so
    # every YAML loader (1.1 and 1.2) resolves them identically.
    same_length: false
    pre_lnorm: true
    untie_r: true
    # Model dimensions. n_head * d_head (8 * 64) equals d_model (512).
    vocab_size: 50265
    d_model: 512
    n_layer: 12
    n_head: 8
    d_head: 64
    d_inner: 2048
    # Dropout rates; both are disabled (0.0) in this configuration.
    dropout: 0.0
    dropatt: 0.0
    # Length settings for the regular memory and the processed sequence.
    # NOTE(review): presumably measured in tokens -- confirm.
    mem_len: 16
    tgt_len: 64
    ext_len: 0
    # Compressed-memory settings.
    # NOTE(review): c_ratio is presumably the compression rate applied to
    # produce cmem_len compressed slots -- verify against the model code.
    cmem_len: 16
    c_ratio: 4
    # Weight initialization and numerical settings.
    init_std: 0.02
    # NOTE(review): -1 presumably means "no clamping" -- confirm.
    clamp_len: -1
    # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
    # resolve a bare `1e-5` as the string "1e-5" rather than a float.
    layer_norm_epsilon: 1.0e-5
    # Special token ids.
    # NOTE(review): must match the tokenizer that yields vocab_size above.
    pad_token_id: 1
    sep_token_id: 2