[
  {
    "label": "Events Input Sequence"
  },
  {
    "label": "Self-Attention Layer"
  },
  {
    "label": "Multi-Layer Perceptron"
  },
  {
    "label": "Temporal Context Encoder Block"
  },
  {
    "label": "Image Input Sequence"
  },
  {
    "label": "Spatial Transformer Layer"
  },
  {
    "label": "Feedforward Layer"
  },
  {
    "label": "Spatial Context Encoder Block"
  },
  {
    "label": "Events Input V^{Tk+1}_{Tk}"
  },
  {
    "label": "Image Input I_{Tk}"
  },
  {
    "label": "Feature"
  },
  {
    "label": "Selector Mode: Train/Eval"
  },
  {
    "label": "Concat/Attention"
  },
  {
    "label": "HW × D"
  },
  {
    "label": "MLP Mixer"
  },
  {
    "label": "Dropout"
  },
  {
    "label": "Conv 1×1"
  },
  {
    "label": "Conv 3×3"
  },
  {
    "label": "ReLU"
  },
  {
    "label": "BatchNorm"
  },
  {
    "label": "Residual Add"
  },
  {
    "label": "Sum"
  },
  {
    "label": "LayerNorm"
  },
  {
    "label": "Spatio-temporal Context Feature Representation"
  }
]