# Target settings: a model name plus a variant string.
# NOTE(review): presumably selects the CLIP ViT-B/32 model — confirm against
# the code that consumes this section.
target:
  name: 'clip'
  variant: 'ViT-B/32'
# Surrogate settings.
# NOTE(review): output_dim presumably has to match the width of the target's
# embeddings — confirm before changing it.
surrogate:
  output_dim: 512
# Dataset selection and the locations of its precomputed artifacts.
dataset:
  name: imagenet
  split: val
  # NOTE(review): what is sampled (and how) is decided by the consumer of this
  # flag, which is not visible from this file.
  sampling: true
  # NOTE(review): the .pkl extension suggests a pickle file; only point this
  # at files from a trusted source.
  encoding_path: outputs/imagenet/average_encoding.pkl
  # Relative, like the other outputs/imagenet/ paths in this file. A leading
  # '/' would turn this into an absolute path at the filesystem root.
  caption_path: outputs/imagenet/captions.json
# Output location.
# NOTE(review): the ckpts/ suffix suggests checkpoints are written here —
# confirm against the training code.
output:
  path: 'outputs/imagenet/ckpts/'
# Loss configuration, split into a training entry and a validation entry.
loss:
  # Training entry: a name followed by its numeric settings.
  # NOTE(review): the meaning of gamma1/gamma2 and the two temperatures is
  # defined by whatever implements SymmetricKDLoss, which is not visible here.
  train:
    name: 'SymmetricKDLoss'
    gamma1: 0.9
    gamma2: 0.9
    temp_student: 0.1
    temp_teacher: 0.1
  # Validation entry: only a name is given.
  val:
    name: 'accuracy'
# Optimizer settings.
# NOTE(review): the keys look like torch.optim.AdamW arguments — confirm how
# `name` is resolved and how the remaining keys are passed on.
optimizer:
  name: 'AdamW'
  # Keep the decimal point and the signed exponent on these values: some
  # YAML 1.1 parsers read forms such as 1e-6 as strings rather than floats.
  lr: 2.5e-3
  weight_decay: 1.0e-6
  amsgrad: true
# Training run settings: one dataloader section per phase plus the epoch count.
# NOTE(review): the dataloader keys look like torch.utils.data.DataLoader
# arguments — confirm how they are forwarded.
training:
  train_dataloader:
    batch_size: 1024
    num_workers: 3
    prefetch_factor: 2
    # Drop the final, incomplete batch: smaller batches are 'easier' to learn
    # and therefore cause spikes in the loss.
    drop_last: true
  # Unlike train_dataloader, no drop_last is set for validation.
  val_dataloader:
    batch_size: 500
    num_workers: 3
    prefetch_factor: 2
  epochs: 150