
# Configuration for ResNet-18 on CIFAR-10 (resnet does not use self-attention)

# Run length and batching.
epochs: 300
batch_size: 100
# Number of epochs after which evaluation is done.
eval_every: 10

encoder:
  # resnet doesn't use attention; this value is present only so that the
  # logging routine works.
  attention: resnet

resnet:
  # ResNet model to use (choices: resnet18, resnet50, resnet101).
  name: resnet18
  # Number of classes in dataset (CIFAR-10 has 10).
  classes: 10
  # Initialization for some layers.
  zero_init_residual: false

optimizer:
  name: sgd
  lr: 0.1
  # 0.0001. Keep the decimal point and the signed exponent: YAML 1.1 loaders
  # only resolve scientific notation to a float when both are present.
  weight_decay: 1.0e-04

# Schedule settings; presumably applied to the optimizer's learning rate
# (TODO confirm against the training code).
scheduler:
  name: cosine
  # Number of warmup epochs (warmup behaviour is defined by the consumer).
  warmup_epochs: 10


dataset:
  name: cifar10
  root: data/cifar10

  # Best results were seen for this sequence of augmentations.
  # Transforms that take no parameters are written as an explicit null.
  # NOTE(review): the sequence is expressed as mapping keys, so it presumably
  # relies on the loader preserving key order - confirm before reordering.
  train_transform:
    random_crop:
      size: 32
      padding: 4
    random_flip: null
    random_rotate:
      degrees: 5
    color_jitter:
      brightness: 0.4
      contrast: 0.4
      saturation: 0.4
      hue: 0.1
      apply_prob: 0.8
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]

  # Validation data gets no augmentation; it uses the same normalization
  # statistics as train_transform.
  val_transform:
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
