# Encoder component configuration.
# `type` names the component to build and `args` carries its settings.
# NOTE(review): the key names under `args` match Hugging Face `BertConfig`
# fields; presumably they are forwarded as keyword arguments -- confirm
# against the code that loads this file.
type: BertEncoder
args:
  # Vocabulary and sequence-length limits.
  vocab_size: 256
  max_position_embeddings: 700

  # Network dimensions.
  # 256 / 8 = 32, so hidden_size divides evenly by num_attention_heads.
  hidden_size: 256
  num_hidden_layers: 3
  num_attention_heads: 8
  intermediate_size: 2048

  # Activation function name.
  hidden_act: gelu

  # Dropout probabilities.
  # NOTE(review): both values are well above the 0.1 defaults of
  # Hugging Face `BertConfig` -- confirm they are intentional.
  hidden_dropout_prob: 0.4
  attention_probs_dropout_prob: 0.5