[
  {
    "label": "Events Input Sequence"
  },
  {
    "label": "Image Input Sequence"
  },
  {
    "label": "Temporal Context Encoder Block"
  },
  {
    "label": "Spatial Context Encoder Block"
  },
  {
    "label": "HW  D"
  },
  {
    "label": "HW  D"
  },
  {
    "label": "Concat/Attention"
  },
  {
    "label": "Conv 11"
  },
  {
    "label": "Conv 33"
  },
  {
    "label": "ReLU"
  },
  {
    "label": "Conv 11"
  },
  {
    "label": "BatchNorm"
  },
  {
    "label": "Residual Add"
  },
  {
    "label": "Sum"
  },
  {
    "label": "Spatiotemporal Context Feature Representation"
  },
  {
    "label": "MLP Mixer"
  },
  {
    "label": "Dropout"
  },
  {
    "label": "Conv 11"
  },
  {
    "label": "LayerNorm"
  },
  {
    "label": "Events Input\nV^{Tk+1}_{Tk}"
  },
  {
    "label": "Image Input\nI_{Tk}"
  },
  {
    "label": "Feature Selector\nMode: Train/Eval"
  },
  {
    "label": "Self-Attention Layer"
  },
  {
    "label": "Multi-Layer Perceptron"
  },
  {
    "label": "Spatial Transformer Layer"
  },
  {
    "label": "Feedforward Layer"
  }
]