# Baseline model hyperparameters.
# Every key below that is not null also appears as a `model_config.*` entry
# under `search_space` in this file.
model_config:
  token_bias: false
  n_layers: 1
  d_token: 32
  n_heads: 2
  d_ffn_factor: 1.333
  attention_dropout: 0.1
  ffn_dropout: 0.1
  residual_dropout: 0.0
  prenormalization: false
  initialization: kaiming
  # The six null keys below have no `search_space` entry. They are presumably
  # filled in at runtime (e.g. derived from the dataset) — TODO confirm.
  d_out: null
  data_type: null
  categories: null
  d_numerical: null
  kv_compression: null
  kv_compression_sharing: null
  activation: relu
  residual: true
  mask_type: random
  random_mask_alpha: 0.8
# Baseline training settings.
# optimizer_config.name, optimizer_config.lr, epochs, dataset, normalization,
# cat_nan_policy, cat_policy, cat_min_frequency and dataset_seed also appear as
# `train_config.*` entries under `search_space` in this file.
train_config:
  optimizer_config:
    name: adam
    lr: 0.01
    weight_decay: 0.0  # no search_space entry
  dataset: helena
  dataset_root: data  # no search_space entry
  normalization: quantile
  cat_nan_policy: new
  cat_policy: indices
  cat_min_frequency: 0.0
  dataset_seed: 0
  batch_size: 128  # no search_space entry
  epochs: 10
  scheduler_config: null
  rand_weights_init: true
# Top-level run settings.
experiment_dir: null
random_seed: null  # no seed is fixed in this file
keep_n_checkpoints: 3
tensorboard: true
amp: true
device: cuda
verbose: console
eval_subsample: 1.0
metrics_n_batches: 32
metrics_mb_limit: 100
early_stopping_iter: null
# NOTE(review): eval_epoch, log_epoch and warm_up_epochs are written as floats,
# so they presumably accept fractional epochs — confirm against the trainer.
eval_epoch: 1.0
log_epoch: 1.0
init_chkpt: null
warm_up_epochs: 1.0
divergence_factor: 100.0
# Search budget: presumably the total number of trials and how many may run
# at once — verify against the tuner.
total_trials: 2000
concurrent_trials: 100
# Tunable parameters, keyed by dotted paths that match keys in `model_config`
# and `train_config`. Each entry declares either `categorical_values` or a
# two-element `value_range`.
search_space:
  # Training settings
  train_config.optimizer_config.name:
    categorical_values:
      - adam
      - adamw
      - adabelief
      - radam
      - sgd
  train_config.optimizer_config.lr:
    # No value_type is declared for this range, unlike every other range in
    # this mapping. NOTE(review): confirm the tuner's default type/scale.
    value_range: [0.0001, 0.01]
  train_config.epochs:
    value_range: [5, 20]
    value_type: int
  train_config.dataset:
    categorical_values:
      - year
      - yahoo
      - helena
      - covtype
      - epsilon
      - jannis
      - adult
      - aloi
      - higgs_small
      - microsoft
      - california_housing
  train_config.normalization:
    categorical_values:
      - standard
      - quantile
  train_config.cat_nan_policy:
    categorical_values:
      - new
      - most_frequent
  train_config.cat_policy:
    categorical_values:
      - ohe
      - indices
      - counter
  train_config.cat_min_frequency:
    value_range: [0, 0.2]
    value_type: float
  train_config.dataset_seed:
    value_range: [0, 100]
    value_type: int
  # Model settings
  model_config.token_bias:
    categorical_values: [true, false]
  model_config.n_layers:
    value_range: [1, 10]
    value_type: int
  # NOTE(review): d_token and n_heads are declared as independent ranges. If
  # the model requires d_token to be divisible by n_heads, some combinations
  # will be invalid — confirm how such trials are handled.
  model_config.d_token:
    value_range: [8, 128]
    value_type: int
  model_config.n_heads:
    value_range: [1, 12]
    value_type: int
  # Declared as float so the search can cover non-integer factors such as the
  # 1.333 baseline in `model_config.d_ffn_factor`; an int-typed range over
  # [1, 5] could never produce that value.
  model_config.d_ffn_factor:
    value_range: [1, 5]
    value_type: float
  # Dropout settings: all three share the same float range.
  model_config.attention_dropout:
    value_range: [0, 0.3]
    value_type: float
  model_config.ffn_dropout:
    value_range: [0, 0.3]
    value_type: float
  model_config.residual_dropout:
    value_range: [0, 0.3]
    value_type: float
  # Architecture switches and categorical choices.
  model_config.prenormalization:
    categorical_values: [true, false]
  model_config.initialization:
    categorical_values:
      - xavier
      - kaiming
  model_config.mask_type:
    categorical_values:
      - mix
      - global
      - full
      - random
  model_config.activation:
    categorical_values:
      - relu
      - gelu
      - geglu
      - reglu
      - leaky_relu
      - sigmoid
  model_config.residual:
    categorical_values: [true, false]
  model_config.random_mask_alpha:
    value_range: [0.5, 1]
    value_type: float
# Search objective: metric name mapped to `min` — presumably the direction in
# which the tuner optimizes it; confirm.
optim_metrics:
  val_loss: min
# Per-trial resource requests (names suggest GPU memory in MB and CPU count).
# NOTE(review): cpus_per_experiment is fractional (0.01) — confirm this is an
# intended scheduler share rather than a typo.
gpu_mb_per_experiment: 1000
cpus_per_experiment: 0.01
search_algo: tpe
ignore_invalid_params: false
remote_config: null
gcp_config: null
