# Run-level settings. These keys appear before the first table header, so
# they belong to the document's root table, not to [data]/[model]/[training].
# presumably the random seed for the run — confirm against the loader
seed = 0
# presumably the number of clusters used by a clustering step — TODO confirm
n_clusters = 20
# NOTE(review): the key is spelled `eplison` (likely meant `epsilon`). Key
# names are read at runtime, so rename it only together with the consuming
# code. What the 0.1 threshold/tolerance applies to is not visible here.
eplison = 0.1
# presumably an iteration cap for an iterative procedure — TODO confirm which
max_iter = 200
# NOTE(review): the trailing underscore is part of the key name. What this
# 0.5 weight balances is not evident from this file — confirm with the consumer.
weight_ = 0.5

# Dataset selection and preprocessing.
[data]
# Name of the normalization scheme; the set of accepted names is defined by
# the consuming code, not by this file.
normalization = "quantile"
# Relative path. NOTE(review): the directory it is resolved against (working
# directory vs. project root) depends on the loader — confirm.
path = "data/helena"

# Model hyperparameters.
[model]
# NOTE(review): presumably the number of numerical input features. Verify
# that 784 matches the dataset referenced by [data].path ("data/helena").
d_numerical = 784
# Empty array. presumably per-feature cardinalities of categorical inputs,
# with [] meaning "no categorical features" — TODO confirm
categories = []
n_layers = 3
# 192 is evenly divisible by n_heads (192 / 8 = 24); keep it that way if
# either value is changed, assuming the attention code splits d_token
# across heads — confirm.
d_token = 192
n_heads = 8
# Both dropout settings are 0.1; presumably probabilities in [0, 1] — confirm
attention_dropout = 0.1
residual_dropout = 0.1
activation = "relu"
prenormalization = false
initialization = "xavier"
# Same value as d_token. NOTE(review): looks like the model emits a
# 192-dimensional output rather than class scores — confirm with the consumer.
d_out = 192


# Optimization and training-loop settings.
[training]
batch_size = 128
# Much larger than batch_size; presumably safe because evaluation needs no
# gradients — confirm memory headroom on the target device.
eval_batch_size = 8192
lr = 0.0001
# Same value as the root-level max_iter (200). NOTE(review): whether the two
# are meant to stay in sync is not visible from this file.
n_epochs = 200
optimizer = "adamw"
# presumably the number of epochs without improvement before early stopping
# — TODO confirm
patience = 16
weight_decay = 1e-5
