# Model hyperparameters.
# NOTE(review): `activation`, `residual` and `random_mask_alpha` are also
# listed under `search_space` in this file, so the values given here are
# presumably per-trial defaults that the search overrides — confirm against
# the consumer.
model_config:
  token_bias: false
  n_layers: 1
  d_token: 32
  n_heads: 2
  d_ffn_factor: 1.333
  attention_dropout: 0.1
  ffn_dropout: 0.1
  residual_dropout: 0.0
  prenormalization: false
  initialization: kaiming
  # The following fields are left null in this file; presumably they are
  # filled in at runtime (e.g. derived from the dataset) — TODO confirm.
  d_out: null
  data_type: null
  categories: null
  d_numerical: null
  kv_compression: null
  kv_compression_sharing: null
  activation: relu
  residual: true
  mask_type: random
  random_mask_alpha: 0.8
# Training settings: optimizer, dataset selection/preprocessing, and loop size.
train_config:
  # NOTE(review): `optimizer_config.name` is also listed under `search_space`
  # in this file, so `adam` here is presumably only a default — confirm.
  optimizer_config:
    name: adam
    lr: 0.01
    weight_decay: 0.0
  dataset: helena
  # Relative path; what it is resolved against is not visible here — verify.
  dataset_root: data
  normalization: quantile
  cat_nan_policy: new
  cat_policy: indices
  cat_min_frequency: 0.0
  dataset_seed: 0
  batch_size: 128
  epochs: 10
  # No scheduler is configured in this file.
  scheduler_config: null
  rand_weights_init: true
# Top-level run settings (output location, device, logging/evaluation cadence,
# and trial counts).
# `experiment_dir` and `random_seed` are unset in this file; presumably they
# are supplied or generated at launch time — TODO confirm.
experiment_dir: null
random_seed: null
keep_n_checkpoints: 3
tensorboard: true
amp: true
device: cuda
verbose: console
eval_subsample: 1.0
metrics_n_batches: 32
metrics_mb_limit: 100
early_stopping_iter: null
# NOTE(review): the `*_epoch(s)` values below are floats, which suggests
# fractional epochs are accepted — confirm against the consumer.
eval_epoch: 1.0
log_epoch: 1.0
init_chkpt: null
warm_up_epochs: 1.0
divergence_factor: 100.0
# Trial budget for the search; presumably `concurrent_trials` caps how many of
# the `total_trials` run at once — verify.
total_trials: 100
concurrent_trials: 10
# Hyperparameter search space. Each key is a dotted path to a setting defined
# elsewhere in this file (e.g. `model_config.activation`); each entry gives
# either a list of `categorical_values` or a `value_range` with a `value_type`.
search_space:
  train_config.optimizer_config.name:
    categorical_values: ["adam", "adamw", "adabelief", "radam", "sgd"]
  model_config.activation:
    categorical_values: ["relu", "gelu", "geglu", "reglu", "leaky_relu", "sigmoid"]
  model_config.residual:
    # Canonical lowercase booleans: `True`/`False` resolve differently across
    # YAML loaders/schemas, while `true`/`false` are booleans everywhere.
    categorical_values: [true, false]
  model_config.random_mask_alpha:
    value_range: [0.5, 1]
    value_type: float
# Search objective and per-experiment resource settings.
# `val_loss: min` — presumably the search minimises validation loss; confirm
# against the consumer.
optim_metrics:
  val_loss: min
# NOTE(review): units inferred from the key names (MB of GPU memory, CPU
# share per experiment) — verify.
gpu_mb_per_experiment: 1000
cpus_per_experiment: 0.01
search_algo: tpe
ignore_invalid_params: false
# No remote or GCP configuration is set in this file.
remote_config: null
gcp_config: null
