# Seed value for this configuration.
# NOTE(review): presumably used to seed the RNGs of the consuming script —
# confirm against the code that loads this file.
seed = 8

[data]
# NOTE(review): presumably selects the feature-normalization scheme applied by
# the consuming data loader — confirm the accepted values there.
normalization = "quantile"
# Relative path; what it is resolved against depends on the consumer.
path = "data/california_housing"
# NOTE(review): looks like the policy for transforming the target (y), likely
# standardization by mean and std — confirm.
y_policy = "mean_std"

[model]
# NOTE(review): the key names suggest a Transformer-style model with attention
# heads and a feed-forward (FFN) sub-block; exact semantics are defined by the
# consuming code — confirm there before editing.
# The full-precision floats look machine-generated (e.g. tuning output); avoid
# rounding them by hand if exact reproduction matters.
activation = "reglu"
attention_dropout = 0.4518858598707761
# Fractional multiplier; presumably FFN hidden width is derived from
# d_ffn_factor * d_token by the consumer — TODO confirm.
d_ffn_factor = 2.342425545888827
# 272 is evenly divisible by n_heads (8): 34 per head, should the consumer
# require divisibility.
d_token = 272
ffn_dropout = 0.1462394754853018
initialization = "kaiming"
n_heads = 8
n_layers = 3
prenormalization = false
# 0.0 presumably disables this dropout entirely — confirm.
residual_dropout = 0.0

[training]
batch_size = 256
# 32x batch_size. NOTE(review): presumably a larger batch is affordable at
# evaluation time — confirm against the consumer.
eval_batch_size = 8192
lr = 9.230462560646988e-05
# One billion epochs is effectively "no limit"; presumably training is ended
# by early stopping (see patience below) rather than by this cap — confirm.
n_epochs = 1_000_000_000
optimizer = "adamw"
# NOTE(review): presumably the number of epochs without improvement tolerated
# before training stops — confirm.
patience = 16
weight_decay = 2.242573649081314e-06
