
# Configuration for SAN models (pairwise, HSA)

# Training-loop settings.
epochs: 300     # Number of training epochs
batch_size: 40  # Samples per mini-batch (per-device vs. global: TODO confirm)
eval_every: 10  # Evaluation interval (presumably in epochs; TODO confirm)

# Model, optimizer, scheduler, and loss
encoder:
  # Self-attention type
  sa_type: pair_hier
  # Bottleneck (attention) layer count, one entry per encoder block
  layers: [2, 2, 2, 2]
  # Bottleneck-layer kernel size, one entry per encoder block
  kernels: [3, 3, 3, 3]
  # Number of classes in the dataset (CIFAR-10 has 10)
  num_classes: 10

# Optimizer selection and hyperparameters.
optimizer:
  name: sgd
  lr: 0.1
  # Keep both the decimal point and the signed exponent: YAML 1.1 loaders
  # such as PyYAML resolve a bare `1e-4` as a string, not a float.
  weight_decay: 1.0e-04

# Learning-rate schedule, selected by name.
scheduler:
  name: cosine
  warmup_epochs: 10  # Warm-up length in epochs (warm-up shape is not set here)

# Loss settings.
criterion:
  smoothing: 0.1  # 10% label smoothing

# Data
dataset:
  name: cifar10
  root: data/cifar10
  # Training-time preprocessing. Each key names a transform and its value
  # holds that transform's arguments (null = none). Presumably applied in
  # the order listed; confirm against the data-loading code.
  train_transform:
    random_flip: null
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
  # Validation preprocessing: same entries as training minus random_flip.
  val_transform:
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
