# Root-table keys (they precede every table header).
# NOTE(review): a second `seed` key exists under [space]; presumably this one
# seeds the tuning process itself — confirm against the consumer.
seed = 0
# NOTE(review): looks like a dotted path to the entry point that is run for
# each configuration — confirm against the consumer.
function = "bin.t2g.main"
# NOTE(review): presumably the number of tuning trials — confirm.
n_trials = 100

# Empty table: this file sets no keys under `sampler`.
[sampler]

# Keys set directly on `space`; its sub-tables (data, optimizer, model, bins)
# are defined further down in this file.
[space]
# Same value as the root-level `seed`.
seed = 0
batch_size = 512
# NOTE(review): -1 is presumably a sentinel (e.g. "no fixed epoch limit") —
# confirm how the consumer interprets a negative value.
n_epochs = -1

# Dataset settings.
[space.data]
# NOTE(review): presumably enables caching of the loaded dataset — confirm.
cache = true
# Relative path; NOTE(review): the base directory it is resolved against is
# not visible here — confirm.
path = "data/otto"

# Optimizer settings.
# NOTE(review): arrays whose first element is "_tune_" appear to be search-space
# specs of the form [marker, distribution, low, high] — confirm against the tuner.
[space.optimizer]
type = "AdamW"
lr = ["_tune_", "loguniform", 1e-05, 0.001]
weight_decay = ["_tune_", "loguniform", 1e-06, 0.001]

# Model settings: a mix of fixed values and tunable entries.
# NOTE(review): arrays whose first element is "_tune_" appear to be search-space
# specs of the form [marker, distribution, args...] — confirm against the tuner.
# Key order is kept as in the original in case the consumer reads keys in order.
[space.model]
token_bias = true
n_layers = ["_tune_", "int", 1, 4]
d_token = ["_tune_", "categorical", [8, 16, 32, 64, 128]]
# NOTE(review): "?uniform" carries three numbers, unlike the two-number
# "uniform" entries below; presumably the first is a default/fallback value and
# the remaining two are the range — confirm against the tuner.
residual_dropout = ["_tune_", "?uniform", 0.0, 0.0, 0.2]
attention_dropout = ["_tune_", "uniform", 0.0, 0.5]
ffn_dropout = ["_tune_", "uniform", 0.0, 0.5]
# Bounds are approximately 2/3 and 8/3.
d_ffn_factor = ["_tune_", "uniform", 0.6666666666666667, 2.6666666666666665]
activation = "reglu"
prenormalization = false
initialization = "kaiming"
sym_weight = true
sym_topology = false
nsi = true
n_heads = 8
frozen_switch = ["_tune_", "categorical", [true, false]]

# Sub-table of [space.model] naming the numerical-embedding type.
# NOTE(review): "GGPLEmbeddings" is presumably resolved by name in the model
# code — confirm the identifier exists there.
[space.model.num_embeddings]
type = "GGPLEmbeddings"

# Binning settings.
# NOTE(review): the "_tune_" array appears to be a search-space spec of the form
# [marker, distribution, low, high]; whether the bounds are inclusive is not
# visible here — confirm against the tuner.
[space.bins]
n_bins = ["_tune_", "int", 2, 48]

# Sub-table of [space.bins].
# NOTE(review): "xgb_global" presumably selects a tree-based binning variant by
# name — confirm against the binning code.
[space.bins.tree_kwargs]
type = "xgb_global"
