[
  {
    "label": "Events Input Sequence"
  },
  {
    "label": "Image Input Sequence"
  },
  {
    "label": "Temporal Context Encoder Block"
  },
  {
    "label": "Spatial Context Encoder Block"
  },
  {
    "label": "Self-Attention Layer"
  },
  {
    "label": "Multi-Layer Perceptron"
  },
  {
    "label": "Spatial Transformer Layer"
  },
  {
    "label": "Feedforward Layer"
  },
  {
    "label": "HW × D"
  },
  {
    "label": "HW × D"
  },
  {
    "label": "Conv 1×1"
  },
  {
    "label": "Conv 3×3"
  },
  {
    "label": "Relu"
  },
  {
    "label": "Conv 1×1"
  },
  {
    "label": "BN"
  },
  {
    "label": "Sum"
  },
  {
    "label": "Conv 1×1"
  },
  {
    "label": "LayerNorm"
  },
  {
    "label": "Conv 1×1"
  },
  {
    "label": "LayerNorm"
  },
  {
    "label": "Spatiotemporal Context Feature Representation"
  },
  {
    "label": "Events Input Sequence"
  },
  {
    "label": "V(Tk+1,Tk)"
  },
  {
    "label": "Image Input Sequence"
  },
  {
    "label": "I(Tk)"
  },
  {
    "label": "Feature Selector Mode: Train/Eval"
  }
]