
# Configuration for learned 2D positional encoding setup

epochs: 300
batch_size: 100
# Validation is run once every `eval_every` training epochs.
eval_every: 10

# Models
encoder:
  # Learned 2D relative positional encoding, as the paper calls it.
  attention: learned_2d
  # Activation for the hidden layer of the MLP within the attention layer.
  ff_activation: relu
  # Number of self-attention layers.
  num_encoder_blocks: 6
  # Number of heads for the self-attention layers.
  num_heads: 9
  # Working size for the encoder.
  model_dim: 400
  # Size of the hidden layer of the MLP.
  ff_dim: 512
  # Whether LayerNorm is applied before the residual connection
  # (false selects post-norm, the default).
  pre_norm: false
  # Whether to use content-content attention.
  use_attention_data: true
  # Whether to use positional attention for query pixels.
  query_positional_score: true
  # Helps in setting up the location map for the positional encoding.
  max_position_embedding: 16
  # -1 (the default) makes the embedding size equal to model_dim.
  position_embedding_size: -1
  # Dropout probability applied to the attention probabilities.
  attention_dropout_prob: 0.1
  # Enable/disable hierarchical self-attention.
  hierarchical_weight_sharing: true

feature_pooling:
  # Pooling is done with 2x2 downsampling (pool_with_resnet is false),
  # so this value is ignored.
  resnet: resnet50
  # Number of BasicBlocks of the resnet to include, if it is being used.
  block: 1
  # Whether the resnet is pretrained.
  pretrained: false
  # Originally described as the "working size for the encoder".
  # NOTE(review): this is 256 while encoder.model_dim is 400 in this file;
  # confirm the two are meant to differ.
  model_dim: 256
  # Not pooling with the resnet by default.
  pool_with_resnet: false
  # The image is reversibly downsampled by a factor of 2 (default).
  pool_downsample_size: 2

clf_head:
  # in_features of the classification head.
  # NOTE(review): encoder.model_dim is 400 in this file; confirm that 256 is
  # the intended input width here.
  model_dim: 256
  # CIFAR-10 has 10 classes.
  num_classes: 10

optimizer:
  name: sgd
  lr: 0.1
  # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
  # PyYAML read a bare `1e-04` as a string rather than a float.
  weight_decay: 1.e-04

scheduler:
  # Scheduler is selected by name.
  name: cosine
  # Number of warmup epochs.
  warmup_epochs: 15

# Data
dataset:
  name: cifar10
  root: data/cifar10
  # Training-time transformations, chosen to be the same as in the paper.
  # The keys are kept in their original order.
  train_transform:
    random_crop:
      size: 32
      padding: 4
    # The next two entries carry no parameters; their value is null.
    random_flip: null
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
  # Validation lists only tensor conversion and normalization, with the same
  # mean/std constants as the training transform.
  val_transform:
    to_tensor: null
    normalize:
      mean: [0.4914, 0.4822, 0.4465]
      std: [0.2023, 0.1994, 0.2010]
