\begin{figure}[t]
    % Pipeline overview: proposals -> memory bank -> identity / state branches
    % -> decoder.
    % Preamble requirements: graphicx (for \resizebox) and
    %   \usetikzlibrary{positioning, shapes.geometric, fit, backgrounds}
    % \modelcode (used in the caption) is not defined in this figure and must be
    % provided elsewhere in the document.
    \centering
    % Scale to \linewidth rather than \textwidth so the diagram also fits inside
    % one column of a two-column layout (the two lengths coincide otherwise).
    \resizebox{\linewidth}{!}{%
    \begin{tikzpicture}[
        node distance=1.3cm,
        every node/.style={font=\sffamily\small},
        % Generic processing step.
        process/.style={rectangle, minimum width=2.5cm, minimum height=0.9cm,
                        text centered, draw=black, fill=blue!10, rounded corners},
        % Memory-bank related step.
        memory/.style={rectangle, minimum width=3.0cm, minimum height=1.0cm,
                       text centered, draw=black, fill=purple!10, rounded corners},
        % Encoder / decoder: the same trapezium with its border rotated in
        % opposite directions (270 vs. 90), so they taper opposite ways.
        encoder/.style={trapezium, trapezium stretches=true,
                        trapezium left angle=75, trapezium right angle=75,
                        shape border rotate=270,
                        text centered, draw=black, fill=blue!10, rounded corners},
        decoder/.style={trapezium, trapezium stretches=true,
                        trapezium left angle=75, trapezium right angle=75,
                        shape border rotate=90,
                        text centered, draw=black, fill=blue!10, rounded corners},
        % Input / output data (parallelogram: angles 70 and 110).
        io/.style={trapezium, trapezium left angle=70, trapezium right angle=110,
                   minimum width=1.6cm, minimum height=0.9cm,
                   text centered, draw=black, fill=orange!10},
        % Latent variables (dashed outline).
        latent/.style={rectangle, minimum width=1.8cm, minimum height=0.8cm,
                       text centered, draw=black, fill=red!10, dashed},
        arrow/.style={thick,->,>=stealth}
    ]

    % =========================
    % Shared proposal and memory stage
    % =========================
    \node (video) [io, text width=1.9cm]
        {$\mathcal{X}$\\[-1mm]\scriptsize$\{x_t\}_{t=1}^{T}$};

    \node (props) [process, right=0.8cm of video, text width=2.7cm]
        {(1) Object\\Proposals};

    \draw [arrow] (video) -- (props);

    \node (memory) [memory, right=0.9cm of props, text width=3.4cm]
        {(2) Memory Bank\\Tracking + Tube Construction\\$o_{t,k}\rightarrow M_i$};

    \draw [arrow] (props) -- (memory);

    % =========================
    % Identity branch
    % =========================
    \node (idenc) [encoder, above right=1.6cm and 1.0cm of memory, text width=3.0cm]
        {(3) Identity Encoder\\$\mathcal{V}^{(i)} \rightarrow \bar{z}^{(i)}$};

    \draw [arrow] (memory.north) |- (idenc.west);

    \node (lfq) [process, right=0.9cm of idenc, text width=2.5cm]
        {(4) LFQ\\Identity Tokenization};

    \draw [arrow] (idenc) -- (lfq);

    \node (zid) [latent, right=0.9cm of lfq, text width=1.9cm]
        {$z_{\mathrm{id}}^{(i)}$};

    \draw [arrow] (lfq) -- (zid);

    \node (store) [memory, below=1.0cm of zid, text width=3.0cm]
        {(5) Store Identity\\in Memory Slot $M_i$};

    \draw [arrow] (zid) -- (store);

    % =========================
    % State branch
    % =========================
    \node (stenc) [encoder, below right=1.6cm and 1.0cm of memory, text width=3.3cm]
        {(6) State Encoder\\$(x_t,o_{t,i})\rightarrow s^{(t,i)}$};

    % The two inputs end at slightly different heights on the encoder's west
    % side; with a shared end point the solid (frame) and dashed (proposal)
    % connectors would coincide on their final horizontal run.
    \draw [arrow] (video.south) |- ([yshift=-2mm]stenc.west);
    \draw [arrow, dashed]
        (memory.south) |- node[pos=0.25, left, font=\scriptsize]{$o_{t,i}$ only}
        ([yshift=2mm]stenc.west);

    \node (state) [latent, right=0.9cm of stenc, text width=1.9cm]
        {$s^{(t,i)}$};

    \draw [arrow] (stenc) -- (state);

    % =========================
    % Decoder
    % =========================
    \node (dec) [decoder, right=1.6cm of state,
                 minimum height=3.0cm, text width=3.2cm]
        {(7) Decoder\\$[z_{\mathrm{id}}^{(i)};s^{(t,i)}]$};

    \draw [arrow] (state) -- (dec);

    % Leave the store node through its bottom edge, drop clear of the identity
    % group box, then jog over to the decoder. This route does not depend on the
    % decoder lying to the right of the store node. The label is attached to the
    % final vertical segment (pos > 0.5 on a -| path) so it sits beside the line
    % rather than on top of it.
    \draw [arrow]
        (store.south) -- ++(0,-0.8cm) -|
        node[pos=0.75, right, font=\scriptsize]{same memory index $i$}
        (dec.north);

    \node (recon) [io, right=0.9cm of dec, text width=1.6cm]
        {$\hat{x}_t$};

    \draw [arrow] (dec) -- (recon);

    % =========================
    % Group boxes
    % =========================
    % Drawn on the background layer so the fills sit behind nodes and arrows.
    % The label values are braced because the label text itself contains a colon.
    \begin{scope}[on background layer]
        \node [fit=(idenc) (lfq) (zid) (store),
              draw, dashed, fill=blue!5, inner sep=0.4cm,
              label={above:\textbf{Identity Branch: Time-Invariant}}] {};
    \end{scope}

    \begin{scope}[on background layer]
        \node [fit=(stenc) (state),
              draw, dashed, fill=green!5, inner sep=0.4cm,
              label={below:\textbf{State Branch: Time-Varying}}] {};
    \end{scope}

    % The trailing % keeps the end-of-line space out of the \resizebox argument.
    \end{tikzpicture}%
    }
    \caption{%
        \textbf{\modelcode}: high-level pipeline for disentangled object-centric
        identity and state representation.
        The input video is first converted into object proposals. A dynamic memory bank
        tracks proposals across frames and groups them into persistent object tubes
        indexed by memory slots $M_i$.
        The identity branch encodes each tube into a time-invariant representation
        and maps it through LFQ to a discrete identity code $z_{\mathrm{id}}^{(i)}$,
        which is stored in the corresponding memory slot.
        The state branch separately encodes the current frame and the proposal
        associated with memory slot $i$ into a time-varying state latent
        $s^{(t,i)}$.
        The state encoder receives the current proposal $o_{t,i}$ but not the identity
        code, preventing identity information from being directly injected into the
        state representation.
        The decoder reconstructs the frame from the memory-paired representation
        $[z_{\mathrm{id}}^{(i)};s^{(t,i)}]$, where pairing is determined by the shared
        memory index $i$.
    }
    \label{fig:grounded-identity-state}
\end{figure}