# Augmentation.
aa: rand-m9-mstd0.5
color_jitter: 0.4

# Batch size of 64, multiplied by 16 GPUs, gives a total batch of 1024.
batch_size: 64

# Decay settings (NOTE(review): presumably only read by step-style
# schedules -- confirm against the trainer).
decay_by_epoch: false
decay_epochs: 3
decay_rate: 0.967

# Dropout.
drop: 0.0
drop_path_rate: 0.2

# Run length and logging cadence.
epochs: 300
log_interval: 50

# Learning rates. Keep both the decimal point and the signed exponent:
# YAML 1.1 loaders such as PyYAML resolve forms like `1e-3` to a string
# rather than a float.
lr: 1.0e-03
min_lr: 5.0e-06

# Optimizer.
opt: adamw
opt_betas: null
opt_eps: 1.0e-08
momentum: 0.9

# Model EMA (disabled). Booleans are written lowercase so that every YAML
# schema resolves them as booleans, never as the strings "True"/"False".
model_ema: false
model_ema_decay: 0.999

# Gradient clipping.
clip_grad_norm: true
clip_grad_max_norm: 5.0

# Interpolation mode.
interpolation: bicubic

# Random erasing.
reprob: 0.25
remode: pixel

# Mixup and CutMix settings.
mixup_mode: batch
mixup: 0.8
cutmix: 1.0
mixup_prob: 1.0
mixup_switch_prob: 0.5

# Seed and worker count.
seed: 42
workers: 16

# Schedule and warmup. The float keeps its decimal point and signed
# exponent so that YAML 1.1 loaders read it as a number.
sched: cosine
warmup_epochs: 20
warmup_lr: 5.0e-07

# Weight decay.
weight_decay: 0.04

# Knowledge distillation (KD).
# The `kd` key is left commented out in this file, so it is not set here:
# kd: 'dist'
ori_loss_weight: 1.0
kd_loss_weight: 1.0

# Teacher network. Booleans are written lowercase so that every YAML schema
# resolves them as booleans, never as the strings "True"/"False".
teacher_model: timm_swin_large_patch4_window7_224
teacher_pretrained: true

# Extra options for the KD loss (NOTE(review): presumably passed through as
# keyword arguments to the loss -- confirm against the trainer).
kd_loss_kwargs:
  use_ae: true
  ae_channels: 1024
  tau: 1