# Script this configuration is written for.
# NOTE(review): the path is presumably relative to the repository root — confirm against the launcher.
program = 'bin/ft_transformer.py'

# Fixed (non-searched) settings.
# NOTE(review): presumably the values sampled from [optimization.space.*] are
# merged into these tables for each trial — confirm against the tuning script.
[base_config]
seed = 0

[base_config.data]
normalization = 'quantile'
path = 'data/microsoft'
y_policy = 'mean_std'

[base_config.model]
activation = 'reglu'
# Numerically close to 4/3.
d_ffn_factor = 1.333333333333333
initialization = 'kaiming'
n_heads = 8
prenormalization = true
residual_dropout = 0.0
token_bias = false

[base_config.training]
batch_size = 1024
eval_batch_size = 8192
# One billion epochs, i.e. effectively no epoch limit.
# NOTE(review): presumably `patience` (early stopping) is what ends training
# in practice — confirm.
n_epochs = 1_000_000_000
optimizer = 'adamw'
patience = 16

# NOTE(review): presumably the number of tuning trials to run — confirm against the tuning script.
[optimization.options]
n_trials = 50

# Sampler seed; configured separately from base_config.seed (both are 0 in this file).
[optimization.sampler]
seed = 0

# Search space for model hyperparameters.
# NOTE(review): each entry looks like [distribution, low, high]; '$d_token' is
# not an ordinary distribution name and is presumably handled specially by the
# tuning script — confirm.
[optimization.space.model]
attention_dropout = ['uniform', 0.0, 0.5]
d_token = ['$d_token', 64, 512]
ffn_dropout = ['uniform', 0.0, 0.5]
n_layers = ['int', 1, 6]

# Search space for optimizer hyperparameters.
# NOTE(review): each entry looks like [distribution, low, high] — confirm
# against the tuning script.
[optimization.space.training]
lr = ['loguniform', 3e-5, 3e-4]
weight_decay = ['loguniform', 1e-6, 1e-3]
