# Model definition: selects the `cls_cvt` model and supplies its SPEC.
# Every list under SPEC has exactly NUM_STAGES (3) entries -- presumably one
# value per stage, in stage order; confirm against the code that reads SPEC.
MODEL:
  NAME: cls_cvt
  SPEC:
    INIT: 'trunc_norm'
    NUM_STAGES: 3

    # Patch-embedding geometry (size / stride / padding).
    # The strides multiply to 4 * 2 * 2 = 16.
    PATCH_SIZE: [7, 3, 3]
    PATCH_STRIDE: [4, 2, 2]
    PATCH_PADDING: [2, 1, 1]

    # Width, head count and depth. DIM_EMBED / NUM_HEADS is 64 for every
    # entry (64/1, 192/3, 384/6) -- keep the two lists divisible when editing.
    DIM_EMBED: [64, 192, 384]
    NUM_HEADS: [1, 3, 6]
    DEPTH: [1, 2, 10]
    MLP_RATIO: [4.0, 4.0, 4.0]

    # Regularisation rates; only the last DROP_PATH_RATE entry is non-zero.
    ATTN_DROP_RATE: [0.0, 0.0, 0.0]
    DROP_RATE: [0.0, 0.0, 0.0]
    DROP_PATH_RATE: [0.0, 0.0, 0.1]

    # Boolean switches, written as canonical lowercase `true`/`false` so they
    # resolve to booleans under every YAML schema, not only YAML 1.1 loaders.
    QKV_BIAS: [true, true, true]
    CLS_TOKEN: [false, false, true]
    POS_EMBED: [false, false, false]

    # Q/K/V projection settings. K/V use stride 2 while Q uses stride 1.
    QKV_PROJ_METHOD: ['dw_bn', 'dw_bn', 'dw_bn']
    KERNEL_QKV: [3, 3, 3]
    PADDING_KV: [1, 1, 1]
    STRIDE_KV: [2, 2, 2]
    PADDING_Q: [1, 1, 1]
    STRIDE_Q: [1, 1, 1]

  # Training settings; the visible portion configures only the LR scheduler.
  # NOTE(review): TRAIN is indented under MODEL, so it parses as MODEL.TRAIN
  # rather than as a top-level TRAIN section -- confirm the consumer expects
  # this nesting before relying on these values.
  TRAIN:
    LR_SCHEDULER:
      # Presumably selects the timm scheduler factory -- confirm in the trainer.
      METHOD: 'timm'
      # Scheduler arguments. Keys are lowercase, unlike the rest of the file;
      # presumably they are forwarded verbatim to the scheduler, so verify
      # before renaming any of them.
      ARGS:
        sched: 'cosine'
        # Warmup is set to 0 epochs.
        # NOTE(review): warmup_lr is likely unused with no warmup -- confirm.
        warmup_epochs: 0
        # Small rates are kept in plain decimal form on purpose: exponent
        # spellings are typed inconsistently across YAML parsers.
        warmup_lr: 0.000001
        min_lr: 0.00001
        cooldown_epochs: 10
        # NOTE(review): decay_rate may not apply to a 'cosine' schedule -- confirm.
        decay_rate: 0.1
        epochs: 200