# Training configuration.
# Keys are grouped by apparent purpose (inferred from their names); the code
# that consumes this file is not visible here, so treat the group headings as
# a reading aid rather than a specification.

# --- Dataset and input normalization ---
dataset: cifar100
# Three-component normalization statistics (presumably per-channel; confirm).
image_mean: [0.5071, 0.4867, 0.4408]
image_std: [0.2675, 0.2565, 0.2761]

# --- Run / data loading ---
epochs: 240
batch_size: 64
workers: 4
seed: 42
log_interval: 50

# --- Augmentation and regularization (all zero / null in this config) ---
aa: null
color_jitter: 0.0
cutout_length: 0
remode: const
reprob: 0.0
smoothing: 0.0
drop: 0.0

# --- Optimizer ---
opt: sgd
lr: 0.05
momentum: 0.9
weight_decay: 5.0e-04
# NOTE(review): opt_betas / opt_eps are presumably unused with opt: sgd.
opt_betas: null
opt_eps: 1.0e-08
sgd_no_nesterov: true
opt_no_filter: true

# --- Learning-rate schedule ---
sched: step
decay_by_epoch: true
decay_epochs: 30
decay_rate: 0.1
min_lr: 1.0e-06
# NOTE(review): warmup_lr equals lr (0.05) and warmup_epochs is 120 of the 240
# total epochs. Presumably this keeps the rate constant before step decay
# begins; confirm against the scheduler implementation.
warmup_epochs: 120
warmup_lr: 0.05

# --- Model EMA ---
model_ema: false
model_ema_decay: 0.9998
# --- Distillation (kd) settings ---
kd: 'dskd'
# null: no value set for the additional loss names.
addloss_name: null
addloss_name_2: null
alpha: 0.1
# Weights for the original loss and the kd loss (both 1.0 here).
ori_loss_weight: 1.0
kd_loss_weight: 1.0
teacher_model: cifar_wrn_40_2
teacher_pretrained: true
# Nested options; presumably forwarded as keyword arguments to the kd loss
# (inferred from the key name; confirm against the consumer).
kd_loss_kwargs:
  use_ae: true
  ae_channels: 256
  tau: 4
  cond: true
  # null: no value set.
  logits: null