# Model selection: the architecture name and the size of its output layer.
MODEL:
  # 200 is the class count of Tiny ImageNet, the dataset named in the
  # TRAIN and TEST sections of this file.
  NUM_CLASSES: 200
  TYPE: PiT
# Transformer backbone hyper-parameters.
# HIDDEN_DIM, DEPTH and NUM_HEADS are parallel three-element lists.
# NOTE(review): presumably one entry per PiT stage; confirm against the
# model builder.
TRANSFORMER:
  PATCH_SIZE: 16
  # Each HIDDEN_DIM entry is 32 times the NUM_HEADS entry at the same index.
  HIDDEN_DIM:
    - 64
    - 128
    - 256
  NUM_HEADS:
    - 2
    - 4
    - 8
  DEPTH:
    - 2
    - 6
    - 4
  MLP_RATIO: 4
  # Drop rates, grouped together; only DROP_PATH_RATE is non-zero.
  DROP_RATE: 0.0
  ATTENTION_DROP_RATE: 0.0
  DROP_PATH_RATE: 0.1
# Options specific to the PiT architecture (MODEL.TYPE in this file is PiT).
PIT:
  # NOTE(review): presumably the stride of the patch-embedding step. It is
  # smaller than TRANSFORMER.PATCH_SIZE (16), which would mean overlapping
  # patches; confirm against the model code.
  STRIDE: 8
# Optimisation settings, grouped as: optimiser, run length, learning-rate
# schedule, warm-up.
OPTIM:
  OPTIMIZER: adamw
  WEIGHT_DECAY: 0.05
  MAX_EPOCH: 300
  # Learning-rate schedule. MIN_LR is BASE_LR divided by 100.
  # Keep the decimal point and the signed exponent in these literals: YAML 1.1
  # loaders (PyYAML, for example) read a form like `5e-4` as a string, not a
  # float.
  LR_POLICY: cos
  BASE_LR: 5.0e-4
  MIN_LR: 5.0e-6
  # Warm-up phase.
  # NOTE(review): WARMUP_FACTOR presumably scales the learning rate at the
  # start of warm-up; confirm against the scheduler.
  WARMUP_EPOCHS: 20
  WARMUP_FACTOR: 0.001
# Training data: batch size, dataset name and the split to read.
TRAIN:
  # NOTE(review): not visible here whether this is per GPU or total; with
  # NUM_GPUS set to 1 in this file the two coincide.
  BATCH_SIZE: 128
  DATASET: tiny_imagenet
  SPLIT: train
# Evaluation data: same dataset as TRAIN, read from the `val` split.
TEST:
  # Larger than TRAIN.BATCH_SIZE (128).
  BATCH_SIZE: 200
  DATASET: tiny_imagenet
  SPLIT: val
# GPU count for this run (single GPU).
# NOTE(review): how batch sizes are divided across GPUs is not visible here;
# re-check TRAIN/TEST BATCH_SIZE if this is raised.
NUM_GPUS: 1
# Data-loading settings.
DATA_LOADER:
  # NOTE(review): presumably the number of loader worker processes (per
  # process/GPU); confirm against the loader construction code.
  NUM_WORKERS: 4
# cuDNN backend settings.
CUDNN:
  # Written as the canonical lowercase boolean. `False` is only a boolean
  # under YAML 1.1 rules; a YAML 1.2 core-schema loader reads it as the
  # string "False". `false` is a boolean under both.
  # NOTE(review): presumably forwarded to the framework's cuDNN benchmark
  # (autotuner) switch; confirm where this key is consumed.
  BENCHMARK: false
