# Config node named `model`. Its `layer` list holds two sub-configs, in order:
# an `s3` entry followed by an `ff` entry.
_name_: model
layer:
  - _name_: s3
    # NOTE(review): differs from the top-level `d_model` (768) in this file;
    # presumably overridden when the model is built — confirm.
    d_model: 64
    measure: legs
    rank: 1
    dt_min: 0.001
    dt_max: 0.1
    # Per-parameter integer flags.
    # NOTE(review): the meaning of each value (e.g. why B is 2 while the
    # others are 1) is defined by the s3 layer code — confirm there.
    trainable:
      A: 1
      B: 2
      C: 1
      dt: 1
    # Per-parameter learning rates. C is null;
    # presumably that falls back to a default rate — TODO confirm.
    lr:
      dt: 0.0005
      A: 0.0005
      B: 0.0005
      C: null
    cache: false
    weight_decay: 0.0
    activation: gelu
    postact: glu
    initializer: null
    # Absolute interpolation: reads `dataset.__l_max` from the composed config.
    l_max: ${dataset.__l_max} # TODO temporary; use hydra interpolation for now
    # Relative interpolation: resolves to the `dropout` key of the mapping
    # that contains `layer` (the top-level `dropout` in this file).
    # Upstream note: "Same as null" — presumably a null value makes the layer
    # inherit the model dropout; confirm against the consumer.
    dropout: ${...dropout}
  - _name_: ff
    expand: 4
    activation: gelu
    # Relative interpolation: resolves to the `dropout` key of the mapping
    # that contains `layer` (the top-level `dropout` in this file).
    # Upstream note: "Same as null" — presumably a null value makes the layer
    # inherit the model dropout; confirm against the consumer.
    dropout: ${...dropout}
# Top-level settings of the `model` config.
prenorm: true
transposed: true
n_layers: 12
d_model: 768
residual:
  _name_: residual
  alpha: 1.0
  beta: 1.0
  # Disabled optional settings, kept for reference:
  # gamma: 0.0
  # scalar: True
pool: null
norm: layer
# Disabled option kept from upstream:
#   scalar: True  (only applicable if transposed=True)
# NOTE(review): this was indented as if nested under `norm`, but `norm` is a
# plain scalar here, so uncommenting it in that position would not parse.
# Confirm which key it belongs to before re-enabling.

# Referenced by the `${...dropout}` interpolations in the `layer` entries.
dropout: 0.1
