# Training section: on/off switches, dataset, batch size, and the
# evaluation/checkpoint cadence.
TRAIN:
  ENABLE: True
  AUTO_RESUME: True
  DATASET: kinetics
  BATCH_SIZE: 16
  # Both periods use the same value. The unit is presumably epochs —
  # confirm against the consuming framework's config defaults.
  CHECKPOINT_PERIOD: 10
  EVAL_PERIOD: 10
# Input pipeline section: decoding, temporal sampling, and spatial
# crop/jitter settings.
DATA:
  # Dataset location is left commented out in this file; set it before use.
  # PATH_TO_DATA_DIR: path-to-k400-or-k600-dir
  DECODING_BACKEND: torchvision
  INPUT_CHANNEL_NUM: [3]
  # Temporal sampling.
  NUM_FRAMES: 32
  SAMPLING_RATE: 3
  USE_OFFSET_SAMPLING: True
  # Spatial crop sizes (train and test use the same value).
  TRAIN_CROP_SIZE: 224
  TEST_CROP_SIZE: 224
  # Train-time jitter ranges, each written as a two-element [low, high] list.
  TRAIN_JITTER_SCALES: [256, 320]
  TRAIN_JITTER_SCALES_RELATIVE: [0.08, 1.0]
  TRAIN_JITTER_ASPECT_RELATIVE: [0.75, 1.3333]
# MViT backbone section.
# Every key appears exactly once: duplicate mapping keys are invalid YAML.
# Strict parsers/linters reject them, and lenient loaders silently keep only
# the last occurrence.
MVIT:
  ZERO_DECAY_POS_CLS: False
  SEP_POS_EMBED: True
  # DEPTH was previously declared twice (16, then 24). 24 is kept because it
  # was the later entry (the value last-wins loaders actually used) and because
  # the tables below reference index 21 — presumably a block index, which
  # could not exist in a 16-block model. Confirm against the model code.
  DEPTH: 24
  NUM_HEADS: 1
  EMBED_DIM: 96
  # Parenthesised values are plain strings to YAML. They are left unquoted and
  # unchanged so the consumer receives the same text as before.
  PATCH_KERNEL: (3, 7, 7)
  PATCH_STRIDE: (2, 4, 4)
  PATCH_PADDING: (1, 3, 3)
  MLP_RATIO: 4.0
  QKV_BIAS: True
  DROPPATH_RATE: 0.3
  NORM: "layernorm"
  MODE: "conv"
  # The three tables below share the same leading indices (2, 5, 21).
  POOL_Q_STRIDE: [[2, 1, 2, 2], [5, 1, 2, 2], [21, 1, 2, 2]]
  DIM_MUL: [[2, 2.0], [5, 2.0], [21, 2.0]]
  HEAD_MUL: [[2, 2.0], [5, 2.0], [21, 2.0]]
  POOL_KV_STRIDE_ADAPTIVE: [1, 8, 8]
# Augmentation section.
AUG:
  ENABLE: True
  NUM_SAMPLE: 2
  COLOR_JITTER: 0.4
  INTERPOLATION: bicubic
  AA_TYPE: rand-m7-n4-mstd0.5-inc1
  # RE_* keys kept together (presumably random-erasing options — confirm
  # against the consuming framework's config defaults).
  RE_MODE: pixel
  RE_COUNT: 1
  RE_SPLIT: False
# Mixup section: mixup/cutmix parameters plus the label-smoothing value.
MIXUP:
  ENABLE: True
  # Probabilities.
  PROB: 1.0
  SWITCH_PROB: 0.5
  # Alpha parameters.
  ALPHA: 0.8
  CUTMIX_ALPHA: 1.0
  LABEL_SMOOTH_VALUE: 0.1
# Solver section: optimizer choice and learning-rate schedule.
# Exponent literals without a decimal point (1e-4, 1e-6, 5e-2) are kept exactly
# as written: some YAML 1.1 loaders read that form as a string rather than a
# float, so respelling them could change the type the consumer receives.
SOLVER:
  # Optimizer.
  OPTIMIZING_METHOD: adamw
  MOMENTUM: 0.9
  WEIGHT_DECAY: 5e-2
  ZERO_WD_1D_PARAM: True
  CLIP_GRAD_L2NORM: 1.0
  # Learning-rate schedule.
  MAX_EPOCH: 200
  BASE_LR: 1e-4
  BASE_LR_SCALE_NUM_SHARDS: True
  LR_POLICY: cosine
  COSINE_AFTER_WARMUP: True
  COSINE_END_LR: 1e-6
  # Warmup.
  WARMUP_EPOCHS: 30.0
  WARMUP_START_LR: 1e-6
# Model section: architecture selection, loss, and class count.
MODEL:
  ARCH: mvit
  MODEL_NAME: MViT
  NUM_CLASSES: 400  # or 600 for K600
  LOSS_FUNC: soft_cross_entropy
  DROPOUT_RATE: 0.5
# Test section. Uses the same dataset name as TRAIN with its own batch size.
TEST:
  ENABLE: True
  DATASET: kinetics
  NUM_SPATIAL_CROPS: 1
  BATCH_SIZE: 64
# Data-loader section: worker count and pinned-memory switch.
DATA_LOADER:
  PIN_MEMORY: True
  NUM_WORKERS: 8
# Top-level run settings.
RNG_SEED: 0
NUM_GPUS: 8
# Quoted so the single dot is unambiguously a string (same parsed value).
OUTPUT_DIR: '.'
