# Model hyperparameters; every setting lives under the top-level `model` key.
# NOTE(review): the meanings noted below are inferred from the key names and
# values only; confirm them against the code that consumes this config.
model:
    # layer counts
    num_encoder_layers: 4
    num_decoder_layers: 8
    # cache / memory lengths
    # NOTE(review): 0 presumably turns the decoder cache off; confirm.
    decoder_cache_len: 0
    encoder_memory_len: 32
    # relative position
    # NOTE(review): looks like bucketed relative-position attention settings
    # (bucket count and maximum distance); confirm with the consumer.
    num_attn_buckets: 64
    max_attn_distance: 256
    # vocabulary
    # NOTE(review): 50265 together with pad=1 / sep=2 below matches the
    # RoBERTa/BART tokenizer convention; confirm which tokenizer is used.
    vocab_size: 50265
    # model dimensions
    dim_embed: 512
    dim_model: 512
    # num_heads * dim_head = 8 * 64 = 512, the same value as dim_model.
    num_heads: 8
    dim_head: 64
    # 2048 = 4 * dim_model.
    dim_ff_inner: 2048
    # regularization
    # Both rates are 0.0. NOTE(review): presumably disables dropout; confirm.
    dropout: 0.0
    dropattn: 0.0
    # normalization / activation
    layer_norm_type: layer_norm
    act_type: swish
    # special token ids
    pad_token_id: 1
    sep_token_id: 2