# Dataset selection and input image size.
DATA:
  IMG_SIZE: 224
  DATASET: imagenet
# Model selection and the SWIN_MOE hyperparameters.
MODEL:
  TYPE: swin_cosa
  # NOTE(review): the name ends in "epoch60" but TRAIN.EPOCHS in this file
  # is 90 — confirm which one is intended before relying on either.
  NAME: baseline_top2_epoch60
  DROP_PATH_RATE: 0.2
  SWIN_MOE:
    # NOTE(review): DEPTHS, NUM_HEADS, MOE_BLOCKS and COSA_POSITIONS each
    # have four entries — presumably one per stage; confirm in the model code.
    EMBED_DIM: 96
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [3, 6, 12, 24]
    WINDOW_SIZE: 7

    MLP_FC2_BIAS: False
    INIT_STD: 0.005

    # MoE-related keys. The listed indices are presumably block positions
    # within each DEPTHS entry, with -1 meaning "none" — TODO confirm.
    MOE_BLOCKS:
      - [-1]
      - [-1]
      - [1, 3, 5, 7, 9, 11, 13, 15, 17]
      - [1]
    NUM_LOCAL_EXPERTS: 4
    TOP_VALUE: 2
    CAPACITY_FACTOR: 1.25
    IS_GSHARD_LOSS: False
    MOE_DROP: 0.1
    AUX_LOSS_WEIGHT: 0.01

    # COSA-related keys: the flag is off and every position list is [-1].
    USE_COSA: False
    COSA_POSITIONS:
      - [-1]
      - [-1]
      - [-1]
      - [-1]
# Training schedule, learning rates, and regularisation settings.
TRAIN:
  # NOTE(review): MODEL.NAME in this file ends in "epoch60" while EPOCHS
  # is 90 — confirm which one is intended.
  EPOCHS: 90
  WARMUP_EPOCHS: 10

  BASE_LR: 1.25e-4  # 4096 batch-size
  WARMUP_LR: 1.25e-7
  MIN_LR: 1.25e-6

  WEIGHT_DECAY: 0.1
  CLIP_GRAD: 3.0

  AUTO_RESUME: True
# Evaluation-time settings.
TEST:
  # NOTE(review): shuffling is enabled for the test split — presumably
  # deliberate; confirm against the data-loader code that reads this key.
  SHUFFLE: True