# Configuration for Bert Self attention

# Training-loop settings (top-level scalars).
# Presumably the total number of training epochs — confirm against the trainer.
epochs: 100
# Presumably the number of samples per batch.
batch_size: 32
# Evaluation interval; the unit (epochs vs. steps) is not visible here — TODO confirm.
eval_every: 10

# Models
encoder:
  # Encoder hyperparameters. Booleans use the canonical lowercase
  # `true`/`false` so they resolve to booleans under every YAML schema.
  num_encoder_blocks: 6
  # NOTE(review): model_dim (400) is not evenly divisible by num_heads (9).
  # Confirm the attention implementation does not require an even per-head
  # split; otherwise pick a head count that divides model_dim (e.g. 8 or 10).
  num_heads: 9
  # Same value as patcher.model_dim and clf_head.model_dim in this file.
  model_dim: 400
  ff_dim: 512
  pre_norm: true
  attention_dropout_prob: 0.1
  attention_norm: false
  hierarchical_weight_sharing: true

patcher:
  # Patch-embedding settings. Booleans use the canonical lowercase `true`
  # so they resolve to booleans under every YAML schema.
  patch_grid_size: 16
  # Same value as encoder.model_dim and clf_head.model_dim in this file.
  model_dim: 400
  pre_norm: true
  # Matches the random_crop size (32) in dataset.train_transform; the axis
  # order is presumably (height, width) — TODO confirm.
  image_size: [32, 32]
  resnet_pooling: true
  resnet_name: resnet18
  resnet_blocks: 1

clf_head:
  # Classification head settings.
  # Same value as encoder.model_dim and patcher.model_dim in this file.
  model_dim: 400
  # Presumably the class count of the dataset selected under `dataset`
  # (cifar10) — keep in sync if the dataset changes.
  num_classes: 10

optimizer:
  # Optimizer choice and its hyperparameters.
  name: sgd
  lr: 0.01
  # Written as a plain decimal (equal to 1e-4).
  weight_decay: 0.0001

scheduler:
  # Learning-rate schedule settings; `name` presumably selects the schedule type.
  name: cosine
  # Presumably the number of initial epochs spent on warmup (out of the
  # top-level `epochs`) — confirm against the trainer.
  warmup_epochs: 5

# Data
dataset:
  # Dataset identifier and the directory it is stored under.
  name: cifar10
  root: data/cifar10
  # Training-time transforms. Entries set to `null` carry no arguments.
  # NOTE(review): application order presumably follows the order listed here
  # — confirm the loader preserves mapping order.
  train_transform:
    random_crop:
      size: 32
      padding: 4
    random_flip: null
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
  # Validation-time transforms: no augmentation entries, and the same
  # normalize statistics as train_transform.
  val_transform:
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
