# Path of the script this configuration targets; presumably the tuning driver
# launches it once per trial - TODO confirm against the driver.
program = 'bin/ft_transformer.py'

# Fixed settings. The [optimization.space.*] tables in this file list the keys
# that are searched instead; presumably each trial's sampled values are merged
# into this table - TODO confirm against the tuning driver.
[base_config]
seed = 0

[base_config.data]
normalization = 'quantile'
path = 'data/jannis'

[base_config.model]
activation = 'reglu'
initialization = 'kaiming'
n_heads = 8
prenormalization = true
token_bias = false

[base_config.training]
batch_size = 512
eval_batch_size = 8192
# Very large on purpose, presumably so that `patience` (early stopping) rather
# than the epoch limit ends training - TODO confirm.
n_epochs = 1_000_000_000
optimizer = 'adamw'
patience = 16

# Options for the hyperparameter search. n_trials is presumably the number of
# trials to run - confirm against the tuning driver.
[optimization.options]
n_trials = 100

# Settings for the search sampler. This seed is a separate key from
# base_config.seed; both are 0 in this file.
[optimization.sampler]
seed = 0

# Search space for model settings. Each entry is an array whose first element
# names a distribution and whose remaining elements are its numeric arguments
# (presumably bounds - TODO confirm against the code that reads this file).
[optimization.space.model]
attention_dropout = ['uniform', 0.0, 0.5]
# TOML performs no interpolation, so '$d_ffn_factor' and '$d_token' reach the
# consumer as literal strings; presumably they name custom distributions -
# verify against the tuning driver.
d_ffn_factor = ['$d_ffn_factor', 1.0, 4.0]
d_token = ['$d_token', 64, 512]
ffn_dropout = ['uniform', 0.0, 0.5]
n_layers = ['int', 1, 4]
# '?uniform' carries one more number than 'uniform'; presumably the first is a
# fallback used when the value is not sampled - TODO confirm.
residual_dropout = ['?uniform', 0.0, 0.0, 0.2]

# Search space for training settings. Both entries use 'loguniform' and their
# numeric arguments span several orders of magnitude.
[optimization.space.training]
lr = ['loguniform', 1e-5, 1e-3]
weight_decay = ['loguniform', 1e-6, 1e-3]
