# Settings for the configuration named 'ed'.
# NOTE(review): the `???` values below look like OmegaConf/Hydra
# mandatory-value markers that the caller must override -- confirm against
# the code that loads this file.
name: 'ed'

# Depth and width settings.
layers: 4
token_dim: 128
hidden_dim: 512
# NOTE(review): 64 heads with hidden_dim 512 would leave 8 dims per head if
# the heads split hidden_dim -- confirm this is intended.
num_heads: 64
ff_dim: 2048

# Disabled option, kept for reference:
# pre_norm: true
ctx_len: ???
batch_size: 256
# Written with an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML's
# default loaders) read a bare `2e-4` as the string "2e-4", not a float.
lr: 2.0e-4
mask_ratio: 0.15
clip_grad_norm: 5
mem_len: ???