# Top-level model selection.
MODEL:
  # Architecture identifier; the TRANSFORMER and CVT sections of this file
  # hold the matching hyper-parameters.
  TYPE: CvT
  # 100 output classes, in line with the cifar100 dataset named under
  # TRAIN/TEST.
  NUM_CLASSES: 100
# Transformer hyper-parameters. Every list below has exactly three entries,
# presumably one per model stage (TODO confirm against the config schema).
TRANSFORMER:
  # Patch-embedding geometry: size, stride and padding for each entry.
  PATCH_SIZE: [7, 3, 3]
  PATCH_STRIDE: [4, 2, 2]
  PATCH_PADDING: [2, 1, 1]
  # Each HIDDEN_DIM entry equals 64 times the matching NUM_HEADS entry below.
  HIDDEN_DIM: [64, 192, 384]
  # Entries sum to 13 (1 + 2 + 10).
  DEPTH: [1, 2, 10]
  NUM_HEADS: [1, 3, 6]
  MLP_RATIO: [4, 4, 4]
  # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
  # PyYAML read a bare `1e-5` as a string rather than a float.
  LN_EPS: 1.0e-5
  DROP_RATE: [0.0, 0.0, 0.0]
  # Only the last entry is non-zero.
  DROP_PATH_RATE: [0.0, 0.0, 0.1]
  ATTENTION_DROP_RATE: [0.0, 0.0, 0.0]
# CvT-specific options. Each list has three entries, the same length as the
# lists under TRANSFORMER (presumably one per stage -- TODO confirm).
CVT:
  # Only the last entry is true.
  WITH_CLS_TOKEN: [false, false, true]
  # Same projection method for all three entries.
  QKV_PROJ_METHOD: [dw_bn, dw_bn, dw_bn]
  KERNEL_QKV: [3, 3, 3]
  # Key/value entries use stride 2 while query entries use stride 1;
  # padding is 1 for both.
  STRIDE_KV: [2, 2, 2]
  STRIDE_Q: [1, 1, 1]
  PADDING_KV: [1, 1, 1]
  PADDING_Q: [1, 1, 1]
# CNN settings. NOTE(review): presumably these describe the distillation
# teacher (DISTILLATION.TEACHER_MODEL is ResNet and the teacher weights path
# contains "r-56") -- confirm against the code that builds the teacher.
CNN:
  DEPTH: 56
# ResNet-specific settings. NOTE(review): presumably consumed when the
# ResNet named in DISTILLATION.TEACHER_MODEL is built -- confirm.
RESNET:
  # Name of the residual transformation to use; valid names are defined by
  # the consuming code, not by this file.
  TRANS_FUN: basic_transform
# Knowledge-distillation settings for the teacher named in TEACHER_MODEL.
DISTILLATION:
  ENABLE_INTER: true
  INTER_TRANSFORM: linear
  # Both index lists have three entries. Presumably entry i of the teacher
  # list is paired with entry i of the student list -- TODO confirm.
  # The largest student index (11) is below the TRANSFORMER.DEPTH total (13).
  INTER_TEACHER_INDEX: [0, 1, 2]
  INTER_STUDENT_INDEX: [0, 6, 11]
  INTER_WEIGHT: 2.5
  TEACHER_MODEL: ResNet
  # Relative path; how it is resolved depends on the loading code.
  TEACHER_WEIGHTS: work_dirs/r-56_c100/model.pyth
  TEACHER_IMG_SIZE: 32
# Optimizer and learning-rate schedule settings.
OPTIM:
  OPTIMIZER: adamw
  # MIN_LR is BASE_LR / 100. Keep the decimal point and the signed exponent:
  # YAML 1.1 loaders such as PyYAML read a bare `5e-4` as a string, not a
  # float.
  BASE_LR: 5.0e-4
  MIN_LR: 5.0e-6
  LR_POLICY: cos
  MAX_EPOCH: 300
  WEIGHT_DECAY: 0.05
  # Warm-up settings. NOTE(review): presumably the learning rate starts at
  # WARMUP_FACTOR * BASE_LR and ramps up over the first WARMUP_EPOCHS epochs
  # -- confirm in the scheduler code.
  WARMUP_FACTOR: 0.001
  WARMUP_EPOCHS: 20
# Training data: dataset name, split and batch size.
TRAIN:
  DATASET: cifar100
  SPLIT: train
  # NOTE(review): whether this is a per-GPU or a total batch size is decided
  # by the training code; NUM_GPUS is 1 in this file, so both coincide here.
  BATCH_SIZE: 128
# Evaluation data: same dataset as TRAIN, but the test split and a larger
# batch size (200 versus 128).
TEST:
  DATASET: cifar100
  SPLIT: test
  BATCH_SIZE: 200
# Number of GPUs for this run (single-GPU configuration).
NUM_GPUS: 1
# Data-loading settings.
DATA_LOADER:
  # NOTE(review): presumably the number of loader worker processes --
  # confirm whether this is per GPU or in total.
  NUM_WORKERS: 4
# cuDNN settings.
CUDNN:
  # Benchmark flag is switched off for this run.
  BENCHMARK: false
