# Seed value for this run.
# NOTE(review): presumably used by the consuming script to initialise its
# random number generators — confirm against the code that reads this file.
seed = 4

[data]
# Relative path. The directory it is resolved against is decided by the
# consuming script, not by this file — TODO confirm before moving the config.
path = "tabular_data/epsilon"

# Model hyperparameters.
# NOTE(review): the key set (d_token, n_heads, kv_compression, ...) looks like
# an attention-based model — confirm against the code that reads this table.
[model]
activation = "reglu"
attention_dropout = 0.2
# Approximately 4/3, kept at the full written precision; do not round.
d_ffn_factor = 1.333333333333333
d_token = 192
ffn_dropout = 0.1
initialization = "kaiming"
# NOTE(review): fractional value — presumably a compression ratio for the
# attention keys/values; confirm with the model code.
kv_compression = 0.064
kv_compression_sharing = "headwise"
n_heads = 8
n_layers = 3
prenormalization = true
# Written as a float on purpose: a bare 0 would parse as a TOML integer.
residual_dropout = 0.0

# Optimisation settings for the training run.
[training]
batch_size = 1024
eval_batch_size = 8192
lr = 0.0001
# NOTE(review): 0 presumably means no learning-rate decay steps — confirm.
lr_n_decays = 0
# One billion: effectively "no epoch limit". Digit separators make the
# magnitude readable; the parsed integer is unchanged.
# NOTE(review): presumably the run is ended by `patience` rather than by this
# limit — confirm against the training loop.
n_epochs = 1_000_000_000
optimizer = "adamw"
patience = 16
weight_decay = 1e-05
