# Hyperparameters for the configuration named 'ad'.
name: 'ad'

# Model size settings.
# NOTE(review): whether hidden_dim / token_dim must be divisible by
# num_heads depends on the consuming model code - confirm.
layers: 4
token_dim: 128
hidden_dim: 512
num_heads: 64
ff_dim: 2048

# No value is provided here. '???' looks like the OmegaConf/Hydra
# "mandatory value" marker, so ctx_len is presumably supplied by an
# override or a composing config - verify against the loader.
ctx_len: ???

# Training settings.
batch_size: 256
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `2e-4` as the string '2e-4' instead of a float.
lr: 2.0e-4
mask_ratio: 0.15
clip_grad_norm: 5
reduced: false