# Model entry named MViT. `cfg` groups its settings into the MODEL, MVIT and DATA sections.
# Values written as `${...}` are interpolations filled in by the config loader
# (presumably OmegaConf/Hydra — confirm against the loading code).
name: MViT
cfg:
  # Parameters from https://github.com/facebookresearch/SlowFast/blob/master/configs/Kinetics/MVIT_B_16x4_CONV.yaml
  MODEL:
    # In self-supervised learning the "number of classes" is the output feature size,
    # so it is taken from the model-level `feature_size` setting.
    NUM_CLASSES: '${model.feature_size}'
    DROPOUT_RATE: 0.5
    HEAD_ACT: 'softmax'
  MVIT:
    ZERO_DECAY_POS_CLS: false
    SEP_POS_EMBED: true
    DEPTH: 16
    NUM_HEADS: 1
    EMBED_DIM: 96
    PATCH_KERNEL: [3, 7, 7]
    PATCH_STRIDE: [2, 4, 4]
    PATCH_PADDING: [1, 3, 3]
    # Dimension ratio for the MLP layers (presumably the hidden width relative to the
    # embedding width — confirm against the model code).
    MLP_RATIO: 4.0
    QKV_BIAS: true
    DROPPATH_RATE: 0.2
    NORM: 'layernorm'
    MODE: 'conv'
    CLS_EMBED_ON: true
    # Dimension multiplication at layer i. With a ratio of 2.0, the next block doubles
    # the dimension. Format: [[depth_i, mul_dim_ratio], ...]
    DIM_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
    # Head-count multiplication at layer i. With a ratio of 2.0, the next block doubles
    # the number of heads. Format: [[depth_i, head_mul_ratio], ...]
    HEAD_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
    # If not None, POOL_KVQ_KERNEL overrides the kernel size of both the KV and Q pooling.
    # Otherwise the kernel size is [s + 1 if s > 1 else s for s in stride_size].
    POOL_KVQ_KERNEL: [3, 3, 3]
    # Stride of the KV pooling at layer i.
    # Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...]
    # NOTE(review): DEPTH is 16 but layers 14 and 15 have no entry here — confirm the
    # omission is intentional.
    POOL_KV_STRIDE:
      - [0, 1, 8, 8]
      - [1, 1, 4, 4]
      - [2, 1, 4, 4]
      - [3, 1, 2, 2]
      - [4, 1, 2, 2]
      - [5, 1, 2, 2]
      - [6, 1, 2, 2]
      - [7, 1, 2, 2]
      - [8, 1, 2, 2]
      - [9, 1, 2, 2]
      - [10, 1, 2, 2]
      - [11, 1, 2, 2]
      - [12, 1, 2, 2]
      - [13, 1, 2, 2]
    # Stride of the Q pooling at layer i.
    # Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...]
    POOL_Q_STRIDE: [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]]
    DROPOUT_RATE: 0.0
    PATCH_2D: false
    NORM_STEM: false
  DATA:
    # Each dataset-derived value is passed to the `assert_value` resolver together with
    # the constant this model configuration was written for (presumably the resolver
    # rejects a mismatch — verify against the resolver's definition).
    TRAIN_CROP_SIZE: '${assert_value:${dataset.dataset_params.resolution},224}'
    NUM_FRAMES: '${assert_value:${dataset.dataset_params.num_frames},16}'
    INPUT_CHANNEL_NUM: 3
    SAMPLING_RATE: '${assert_value:${dataset.dataset_params.video_sampling_rate},4}'
