# Training / optimisation settings.
#
# Entry shapes used below:
#   - a bare scalar is a single fixed value;
#   - a mapping with `values` lists discrete candidates;
#   - a mapping with `distribution` / `min` / `max` describes a sampled range.
# NOTE(review): the distribution names look like Weights & Biases sweep
# syntax, and `default` is presumably the value used when no search is run --
# confirm both against the code that loads this file.
batch_size:
  values: [256, 512, 1024]
  default: 512
max_epochs: 300
optimizer: adamw

# Floats keep an explicit decimal point and a signed exponent: YAML 1.1
# loaders such as PyYAML load `1e-5` as a string rather than a number.
lr:
  distribution: log_uniform_values
  min: 1.0e-5
  max: 1.0e-3
  default: 1.0e-4
weight_decay:
  distribution: log_uniform_values
  min: 1.0e-6
  max: 1.0e-3
  default: 1.0e-5

# Booleans use the canonical lowercase spelling so every loader agrees.
lr_scheduler:
  values: [true, false]
  default: true
# NOTE(review): both patience values are presumably counted in epochs --
# confirm against the training loop.
lr_scheduler_patience: 30
early_stopping_patience: 40


# Model architecture settings.
#
# A bare scalar is a fixed value; a mapping with `values` or with
# `distribution` / `min` / `max` describes a set or range of candidates, and
# carries its own `default`.
# NOTE(review): the key names (d_token, kv_compression, d_ffn_factor, ...)
# suggest a transformer model for tabular data -- confirm with the model code.

# NOTE(review): no `q` step is given for this q_uniform range while n_heads is
# fixed at 8 below; if the model requires d_token to be divisible by n_heads,
# confirm that the loader enforces or rounds to a valid value.
d_token:
  distribution: q_uniform
  min: 64
  max: 512
  default: 192
activation: reglu
# Booleans use the canonical lowercase spelling so every loader agrees.
token_bias: true
prenormalization: true
kv_compression:
  values: [true, false]
  default: true
# NOTE(review): presumably only meaningful when kv_compression is true --
# confirm.
kv_compression_sharing:
  values: [headwise, key-value]
  default: headwise
initialization: kaiming
n_layers:
  distribution: q_uniform
  min: 1
  max: 6
  default: 3
n_heads: 8
# NOTE(review): presumably the feed-forward width as a multiple of d_token --
# confirm.
d_ffn_factor:
  distribution: uniform
  min: 0.667
  max: 2.333
  default: 1.333

# Dropout settings; all three share the same 0.0-0.5 range and differ only in
# their defaults.
ffn_dropout:
  distribution: uniform
  min: 0.0
  max: 0.5
  default: 0.1
attention_dropout:
  distribution: uniform
  min: 0.0
  max: 0.5
  default: 0.2
residual_dropout:
  distribution: uniform
  min: 0.0
  max: 0.5
  default: 0.0