# Spatial-Temporal Transformer Configuration
# This module encodes visual input into intermediate features
# Hydra-style `_target_`: dotted import path of the class to construct.
# Presumably the remaining keys are passed to its constructor as keyword
# arguments (standard `hydra.utils.instantiate` behavior) -- confirm against
# the code that loads this file.
_target_: models.hierarchical_model_Dinov2.STTransformer
# Feature widths. Presumably `intermediate_dim` is the encoder's output width
# and `channels` its input/token width; both are 768 here -- TODO confirm
# which is which against STTransformer's signature.
intermediate_dim: 768
channels: 768
# Presumably the spatial grid height/width and the patch edge length used to
# split that grid into tokens; with patch_size 1 a 16 x 16 grid would give
# 256 spatial positions -- verify units (pixels vs. feature cells).
H: 16
W: 16
patch_size: 1
# Presumably the number of spatial and temporal layers/blocks, respectively.
spatial_depth: 4
temporal_depth: 4
# Attention sizing. NOTE(review): heads * dim_head = 8 * 64 = 512, which is
# not equal to `channels` (768); that only works if the attention layers
# project to a separate inner width -- confirm this is intentional.
dim_head: 64
heads: 8
# OmegaConf interpolation: takes its value from the `batch_first` key at the
# root of the composed config. That key is not defined in this file, so it
# must be supplied elsewhere (e.g. by the primary config) for this to resolve.
batch_first: ${batch_first}