# Model hyperparameters.
# NOTE(review): key names (tokens, attention heads, FFN, kv_compression)
# suggest a transformer-style model over tabular features — confirm against
# the code that consumes this file.
model_config:
  token_bias: false
  n_layers: 1
  d_token: 32
  n_heads: 2
  # Non-integer factor; presumably the FFN hidden width is derived from
  # d_token * d_ffn_factor by the consumer — TODO confirm rounding behavior.
  d_ffn_factor: 1.333
  # Three separate dropout rates; residual dropout is disabled (0.0) here.
  attention_dropout: 0.1
  ffn_dropout: 0.1
  residual_dropout: 0.0
  prenormalization: false
  initialization: kaiming
  # The following keys are explicitly null in this file. Presumably they are
  # filled in at runtime (e.g. from the dataset) — TODO confirm.
  d_out: null
  data_type: null
  categories: null
  d_numerical: null
  # Both kv_compression settings are null here.
  kv_compression: null
  kv_compression_sharing: null
  activation: relu
  residual: true
  # Masking settings. Presumably random_mask_alpha is only read when
  # mask_type is "random" — verify against the consumer.
  mask_type: random
  random_mask_alpha: 0.8
# Training setup: optimizer, dataset selection/preprocessing, and loop sizes.
train_config:
  # Optimizer selection and its hyperparameters; weight decay is 0.0 here.
  optimizer_config:
    name: adam
    lr: 0.01
    weight_decay: 0.0
  # Dataset name and the root directory it is looked up under.
  # NOTE(review): "data" looks like a relative path — confirm what it is
  # resolved against.
  dataset: helena
  dataset_root: data
  # Preprocessing options for numerical (normalization) and categorical
  # (cat_*) features; exact semantics are defined by the consumer — TODO
  # confirm the accepted values.
  normalization: quantile
  cat_nan_policy: new
  cat_policy: indices
  cat_min_frequency: 0.0
  dataset_seed: 0
  batch_size: 128
  epochs: 10
  # No learning-rate scheduler is configured (explicit null).
  scheduler_config: null
  rand_weights_init: true
# Top-level run settings (output location, seeding, logging, evaluation).
# experiment_dir and random_seed are explicitly null; how the consumer
# handles null (auto-generated directory / unseeded run) is not visible
# here — TODO confirm.
experiment_dir: null
random_seed: null
keep_n_checkpoints: 3
tensorboard: true
# NOTE(review): amp presumably means automatic mixed precision, which would
# pair with device: cuda — confirm behavior if the device is changed.
amp: true
device: cuda
verbose: console
# Evaluation/metrics limits; units and exact meaning (fraction, batch count,
# megabytes) are inferred from the key names only — verify against the
# consumer.
eval_subsample: 1.0
metrics_n_batches: 32
metrics_mb_limit: 100
# Early stopping is not configured (explicit null).
early_stopping_iter: null
# These are floats rather than integers; presumably fractional epochs are
# accepted for evaluation/logging/warm-up intervals — TODO confirm.
eval_epoch: 1.0
log_epoch: 1.0
# No initial checkpoint is specified (explicit null).
init_chkpt: null
warm_up_epochs: 1.0
divergence_factor: 100.0
