# Training-phase settings.
TRAIN:
  ENABLE: True
  # Same dataset name as TEST.DATASET.
  DATASET: kinetics
  # NOTE(review): not visible here whether this is a per-GPU or a global
  # batch size (NUM_GPUS is 8) -- confirm against the config schema.
  BATCH_SIZE: 16
  # Evaluation and checkpointing share the same period (10); presumably
  # counted in epochs (SOLVER.MAX_EPOCH is 200) -- TODO confirm.
  EVAL_PERIOD: 10
  CHECKPOINT_PERIOD: 10
  AUTO_RESUME: True
# Input decoding, temporal sampling and spatial crop/jitter settings.
DATA:
  USE_OFFSET_SAMPLING: True
  DECODING_BACKEND: torchvision
  # 16 frames with a sampling rate of 4; presumably every 4th frame is kept,
  # so one clip would span about 64 source frames -- TODO confirm.
  NUM_FRAMES: 16
  SAMPLING_RATE: 4
  TRAIN_JITTER_SCALES: [256, 320]
  # Train and test crops use the same size (224).
  TRAIN_CROP_SIZE: 224
  TEST_CROP_SIZE: 224
  # One entry with value 3; presumably a single RGB input pathway -- TODO
  # confirm.
  INPUT_CHANNEL_NUM: [3]
  # The dataset path below is commented out, so this file does not set it.
  # NOTE(review): presumably it must be supplied elsewhere (for example as an
  # override at launch time) -- confirm.
  # PATH_TO_DATA_DIR: path-to-k400-dir
  TRAIN_JITTER_SCALES_RELATIVE: [0.08, 1.0]
  # 0.75 and 1.3333 are approximately 3/4 and 4/3.
  TRAIN_JITTER_ASPECT_RELATIVE: [0.75, 1.3333]
# MViT architecture hyper-parameters.
MVIT:
  ZERO_DECAY_POS_CLS: False
  SEP_POS_EMBED: True
  DEPTH: 16
  NUM_HEADS: 1
  EMBED_DIM: 96
  # The parenthesised values below are plain strings to a YAML parser, not
  # sequences. Presumably the config loader converts them to tuples -- TODO
  # confirm before reformatting them as [a, b, c].
  PATCH_KERNEL: (3, 7, 7)
  PATCH_STRIDE: (2, 4, 4)
  PATCH_PADDING: (1, 3, 3)
  MLP_RATIO: 4.0
  QKV_BIAS: True
  DROPPATH_RATE: 0.2
  NORM: "layernorm"
  MODE: "conv"
  CLS_EMBED_ON: True
  # DIM_MUL and HEAD_MUL hold identical [index, factor] pairs: a factor of
  # 2.0 at indices 1, 3 and 14. Presumably the indices refer to blocks
  # (DEPTH is 16) -- TODO confirm.
  DIM_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
  HEAD_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
  POOL_KVQ_KERNEL: [3, 3, 3]
  POOL_KV_STRIDE_ADAPTIVE: [1, 8, 8]
  # Each entry starts with one of the same indices used above (1, 3, 14),
  # followed by three values; presumably [index, t, h, w] strides -- TODO
  # confirm.
  POOL_Q_STRIDE: [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]]
  # NOTE(review): this is 0.0 while MODEL.DROPOUT_RATE is 0.5; they are
  # separate keys -- confirm which one the MViT model actually reads.
  DROPOUT_RATE: 0.0
# Data augmentation settings.
AUG:
  # NOTE(review): presumably the number of augmented samples drawn per
  # video, which would multiply the effective batch size -- confirm.
  NUM_SAMPLE: 2
  ENABLE: True
  COLOR_JITTER: 0.4
  # Policy string; presumably parsed by the augmentation code as
  # RandAugment with magnitude 7, 4 ops, magnitude std 0.5 -- TODO confirm.
  AA_TYPE: rand-m7-n4-mstd0.5-inc1
  INTERPOLATION: bicubic
  # The RE_* keys share a prefix; presumably "random erasing" with
  # probability 0.25 -- TODO confirm.
  RE_PROB: 0.25
  RE_MODE: pixel
  RE_COUNT: 1
  RE_SPLIT: False
# Mixup / CutMix settings.
MIXUP:
  ENABLE: True
  ALPHA: 0.8
  CUTMIX_ALPHA: 1.0
  # PROB is 1.0 and SWITCH_PROB is 0.5; presumably mixing is always applied
  # and CutMix is chosen over mixup half of the time -- TODO confirm.
  PROB: 1.0
  SWITCH_PROB: 0.5
  # MODEL.LOSS_FUNC is soft_cross_entropy; presumably required because the
  # mixed, smoothed targets are no longer one-hot -- TODO confirm.
  LABEL_SMOOTH_VALUE: 0.1
# Batch-norm statistics settings.
BN:
  USE_PRECISE_STATS: False
  # NOTE(review): presumably unused while USE_PRECISE_STATS is False --
  # confirm.
  NUM_BATCHES_PRECISE: 200
# Optimizer and learning-rate schedule settings.
SOLVER:
  ZERO_WD_1D_PARAM: True
  CLIP_GRAD_L2NORM: 1.0
  # NOTE(review): presumably scales BASE_LR by NUM_SHARDS (1 in this file)
  # -- confirm.
  BASE_LR_SCALE_NUM_SHARDS: True
  BASE_LR: 0.0001
  COSINE_AFTER_WARMUP: True
  # Written with a decimal point in the mantissa: YAML 1.1 resolvers such as
  # PyYAML load a bare `1e-6` as the string "1e-6", whereas `1.0e-6` is a
  # float under both YAML 1.1 and YAML 1.2.
  COSINE_END_LR: 1.0e-6
  WARMUP_START_LR: 1.0e-6
  # Warm-up covers the first 30 of the 200 epochs set by MAX_EPOCH.
  WARMUP_EPOCHS: 30.0
  LR_POLICY: cosine
  MAX_EPOCH: 200
  # NOTE(review): OPTIMIZING_METHOD is adamw; presumably MOMENTUM only
  # applies to SGD-style optimizers -- confirm.
  MOMENTUM: 0.9
  WEIGHT_DECAY: 0.05
  OPTIMIZING_METHOD: adamw
# Model selection and loss settings.
MODEL:
  # 400 classes; the commented-out data path in DATA mentions "k400",
  # presumably Kinetics-400 -- TODO confirm.
  NUM_CLASSES: 400
  ARCH: mvit
  MODEL_NAME: MViT
  LOSS_FUNC: soft_cross_entropy
  # NOTE(review): this is 0.5 while MVIT.DROPOUT_RATE is 0.0; they are
  # separate keys -- confirm which one the MViT model actually reads.
  DROPOUT_RATE: 0.5
# Test-phase settings.
TEST:
  ENABLE: True
  # Same dataset name as TRAIN.DATASET.
  DATASET: kinetics
  # Four times TRAIN.BATCH_SIZE (16).
  BATCH_SIZE: 64
  NUM_SPATIAL_CROPS: 1
# Data-loader settings.
DATA_LOADER:
  # NOTE(review): presumably the worker-process count per data loader;
  # whether that is per GPU or per job is not visible here -- confirm.
  NUM_WORKERS: 8
  PIN_MEMORY: True
# Top-level run settings: 8 GPUs, 1 shard.
# NOTE(review): presumably a shard is one machine/node -- confirm.
NUM_GPUS: 8
NUM_SHARDS: 1
RNG_SEED: 0
# "." is a relative path. NOTE(review): presumably resolved against the
# process working directory, so outputs land wherever the job is launched
# -- confirm.
OUTPUT_DIR: .
