# Model size and task settings.
# NOTE(review): the meaning of each key is inferred from its name only;
# confirm against the code that consumes this config.
dim: 512
# Explicitly null; presumably the consumer derives or skips the embedding
# width in that case - TODO confirm.
dim_embedding: null
dim_output: 10
n_layers: 12
n_heads: 4
task: CLASSIFICATION

# Same value as dim_output above; presumably the two must stay in sync for a
# classification task - verify before changing either one.
n_classes: 10

# Upper bounds on sample counts for the "support" and "query" sets.
# NOTE(review): how the loader applies these caps (truncate vs. subsample)
# is not visible here - confirm.
max_samples_support: 8192
max_samples_query: 1024

# Optimisation settings.
# Keys holding a nested mapping (`distribution`/`min`/`max` or `values`, plus
# `default`) look like hyperparameter-search specs; plain scalars are fixed.
# NOTE(review): confirm with the sweep/config loader how `default` is used.
max_epochs: 300
optimizer: adamw
# NOTE(review): `task` in this file is CLASSIFICATION, so this loss is
# presumably unused for this run - confirm.
regression_loss: MSE
lr:
  distribution: log_uniform_values
  # Keep the decimal point in these literals: YAML 1.1 loaders such as PyYAML
  # read `1e-6` (no dot) as a string rather than a float.
  min: 1.0e-6
  max: 1.0e-3
  # Equal to `min`, i.e. the lowest learning rate in the search range.
  default: 1.0e-6
weight_decay:
  values: [0.0, 1.0e-1, 1.0e-2]
  default: 0.0
lr_scheduler:
  values: [true, false]
  default: false
# NOTE(review): presumably only relevant when lr_scheduler is enabled.
lr_scheduler_patience: 25
warmup_steps:
  values: [0, 10]
  default: 0

# Early-stopping settings.
# NOTE(review): the unit of the patience value (epochs vs. evaluation rounds)
# and the set of valid split names are not visible here - confirm.
early_stopping_patience: 40
early_stopping_data_split: VALID
early_stopping_max_samples: 2048

# Numeric precision and gradient-scaler settings.
precision: bfloat16
# The scaler is switched off here, so the grad_scaler_* values below are
# presumably ignored for this run - confirm.
grad_scaler_enabled: false
# 65536 = 2^16. Initial and minimum scale are set to the same value.
grad_scaler_scale_init: 65536.0
grad_scaler_scale_min: 65536.0
grad_scaler_growth_interval: 1000

# 0.0 presumably disables label smoothing - confirm against the loss setup.
label_smoothing: 0.0

# Pretrained-weight loading.
use_pretrained_weights: false
# Relative path into a timestamped run directory, so it is specific to one
# earlier run. Presumably only read when use_pretrained_weights is true.
path_to_weights: outputs/runs/2024-08-08/23-58-03/weights/model_step_12000.pt

# Data preprocessing / augmentation switches (plain booleans).
# NOTE(review): the exact effect of each flag lives in the data pipeline,
# which is not visible from this file - confirm there before toggling.
use_quantile_transformer: false
use_feature_count_scaling: false
shuffle_classes: true
shuffle_features: false
random_mirror_x: true
# NOTE(review): `task` in this file is CLASSIFICATION, so this flag is
# presumably a no-op for this run - confirm.
random_mirror_regression: true