# Root-level setting, outside any table. Presumably the random seed for this
# run — confirm against the code that loads this file.
seed = 4

# Dataset selection and preprocessing.
[data]
# Name of the normalization scheme; how and where it is applied is decided by
# the consumer of this file (presumably to the input features — confirm).
normalization = "quantile"
# Relative path: it resolves against whatever base directory the consumer uses.
path = "data/jannis"

# Model hyperparameters.
# NOTE(review): the key set (d_token, n_heads, attention_dropout, ...) looks
# like a Transformer-style architecture — confirm against the code that reads
# this table. The full-precision floats look machine-generated (e.g. by a
# hyperparameter search), so avoid rounding them by hand.
[model]
activation = "reglu"
attention_dropout = 0.09963966666531431
d_ffn_factor = 2.373331606275802
# 344 = 8 * 43, i.e. evenly divisible by n_heads below. Presumably required
# for splitting tokens across heads — keep it divisible if either is edited.
d_token = 344
ffn_dropout = 0.4139876834686211
initialization = "kaiming"
n_heads = 8
n_layers = 1
prenormalization = true
residual_dropout = 0.1302411059958489
token_bias = false
token_cat_bias = false

# Training-loop settings.
[training]
batch_size = 512
eval_batch_size = 8192
lr = 3.590826817474381e-05
# One billion epochs is effectively unbounded; presumably `patience` is what
# actually ends training (confirm against the training loop).
n_epochs = 1_000_000_000
optimizer = "adamw"
patience = 16
weight_decay = 1.456180042807087e-05
