\clearpage
\section{Appendix: Algorithms}

\begin{algorithm}[t]
\scriptsize
\caption{Memory-Guided Identity Encoding with LFQ Tokenization}
\label{alg:identity_lfq}
\begin{algorithmic}[1]

\State \textbf{Input:} Video $\mathcal{V}=\{x_t\}_{t=1}^{T}$
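\State \textbf{Parameters:} LFQ temperature $\tau$, commitment weight $\beta$, entropy weight $\lambda_{\mathrm{entropy}}$, matching-cost weights $\eta_{\mathrm{app}},\eta_{\mathrm{box}},\eta_{\mathrm{mask}}$, assignment threshold $\rho$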
\State \textbf{Initialize:} Class-agnostic segmenter $\mathrm{Seg}$, identity encoder $q_{\phi_z}$, LFQ head $g_{\phi_z}$, identity projection $(W_{\mathrm{id}},c_{\mathrm{id}})$, memory bank $\mathcal{B}\leftarrow\emptyset$

\Statex
\State \textcolor{blue}{\textit{// 1. Extract object proposals and proposal-level identity evidence}}
\For{$t = 1$ to $T$}
    \State $\mathcal{O}_t \leftarrow \mathrm{Seg}(x_t)$
    \Comment{$\mathcal{O}_t=\{o_{t,k}\}_{k=1}^{K_t}$}
    \For{each proposal $o_{t,k}=(b_{t,k},m_{t,k},v_{t,k}) \in \mathcal{O}_t$}
        \State $x_{t,k}^{\alpha} \leftarrow \mathrm{RGB\text{-}A}(x_t,m_{t,k})$
        \Comment{object-focused input}
        \State $z_{t,k} \leftarrow q_{\phi_z}(x_{t,k}^{\alpha})$
        \Comment{continuous identity evidence}
    \EndFor
\EndFor

\Statex
\State \textcolor{blue}{\textit{// 2. Dynamic memory assignment of proposals to persistent object indices}}
\For{$t = 1$ to $T$}
    \For{each proposal $o_{t,k}$ and each active memory $M_i\in\mathcal{B}$}
        \State $C_{k,i}^{(t)} \leftarrow
        \eta_{\mathrm{app}}\big(1-\cos(z_{t,k},\mu_i^z)\big)
        +\eta_{\mathrm{box}}\big(1-\mathrm{IoU}(b_{t,k},b_i^{\mathrm{last}})\big)
        +\eta_{\mathrm{mask}}\big(1-\mathrm{MaskIoU}(m_{t,k},m_i^{\mathrm{last}})\big)$
    \EndFor

    \State $\mathcal{A}_t \leftarrow \mathrm{HungarianAssign}(C^{(t)}, \rho)$
    \Comment{accept matches below threshold $\rho$}

    \For{each matched pair $(k,i)\in\mathcal{A}_t$}
        \State Assign proposal $o_{t,k}$ to memory slot $M_i$
        \State Append $x_{t,k}^{\alpha}$ to video tube $\mathcal{V}^{(i)}$
        \State Store indexed proposal $o_{t,i}\leftarrow o_{t,k}$
        \State Update memory summary $\mu_i^z$, $b_i^{\mathrm{last}}$, and $m_i^{\mathrm{last}}$
    \EndFor

    \For{each unmatched proposal $o_{t,k}$}
        \State Create new memory slot $M_j$ with a fresh index $j$ not yet used in $\mathcal{B}$
        \State $\mathcal{V}^{(j)} \leftarrow \{x_{t,k}^{\alpha}\}$
        \State Store indexed proposal $o_{t,j}\leftarrow o_{t,k}$
        \State Initialize $\mu_j^z\leftarrow z_{t,k}$, $b_j^{\mathrm{last}}\leftarrow b_{t,k}$, $m_j^{\mathrm{last}}\leftarrow m_{t,k}$
        \State Insert $M_j$ into $\mathcal{B}$
    \EndFor
\EndFor

\Statex
\State \textcolor{blue}{\textit{// 3. Track-wise LFQ identity tokenization}}
\State $\mathcal{L}_{\mathrm{commit}} \leftarrow 0$
\For{each memory slot $M_i\in\mathcal{B}$}
    \State $\bar{z}^{(i)} \leftarrow
    \frac{1}{|\mathcal{V}^{(i)}|}
    \sum_{x_{t,k}^{\alpha}\in\mathcal{V}^{(i)}}
    q_{\phi_z}(x_{t,k}^{\alpha})$
    \Comment{aggregate object evidence}

    \State $a^{(i)} \leftarrow g_{\phi_z}(\bar{z}^{(i)})$
    \Comment{$a^{(i)}\in\mathbb{R}^{B}$}

    \State $b^{(i)} \leftarrow 2\mathbf{1}[a^{(i)}\geq 0]-1$
    \Comment{binary LFQ code}

    \State $\tilde{b}^{(i)} \leftarrow
    a^{(i)}+\mathrm{sg}\!\left[b^{(i)}-a^{(i)}\right]$
    \Comment{straight-through estimator}

    \State $z_{\mathrm{id}}^{(i)} \leftarrow
    W_{\mathrm{id}}\tilde{b}^{(i)}+c_{\mathrm{id}}$

    \State $\mathcal{L}_{\mathrm{commit}} \leftarrow
    \mathcal{L}_{\mathrm{commit}}
    +\left\|a^{(i)}-\mathrm{sg}[b^{(i)}]\right\|_2^2$
\EndFor

\State $\mathcal{L}_{\mathrm{commit}} \leftarrow
|\mathcal{B}|^{-1}\mathcal{L}_{\mathrm{commit}}$

\Statex
\State \textcolor{blue}{\textit{// 4. LFQ entropy code-utilization regularization}}
\For{each memory slot $M_i\in\mathcal{B}$}
    \State $p^{(i)} \leftarrow \sigma(a^{(i)}/\tau)$
\EndFor
\State $\bar{p} \leftarrow |\mathcal{B}|^{-1}\sum_i p^{(i)}$
\State $\mathcal{L}_{\mathrm{entropy}} \leftarrow
|\mathcal{B}|^{-1}\sum_i\sum_{r=1}^{B}h(p_r^{(i)})
-
\sum_{r=1}^{B}h(\bar{p}_r)$
\Comment{$h(p)=-p\log p-(1-p)\log(1-p)$}

\Statex
\State \textbf{Output:} Memory-indexed proposals $\{o_{t,i}\}$, video tubes $\{\mathcal{V}^{(i)}\}$, identity codes $\{z_{\mathrm{id}}^{(i)}\}$, $\mathcal{L}_{\mathrm{commit}}$, $\mathcal{L}_{\mathrm{entropy}}$

\end{algorithmic}
\end{algorithm}


\begin{algorithm}[t]
\scriptsize
\caption{Proposal-Conditioned State Encoding, Memory-Paired Decoding, and Optimization}
\label{alg:training_lfq}
\begin{algorithmic}[1]

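\State \textbf{Input:} Video $\mathcal{V}=\{x_t\}_{t=1}^{T}$, memory-indexed proposals $\{o_{t,i}\}$, identity codes $\{z_{\mathrm{id}}^{(i)}\}$, and LFQ losses $\mathcal{L}_{\mathrm{commit}}$, $\mathcal{L}_{\mathrm{entropy}}$ from Algorithm~\ref{alg:identity_lfq}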
\State \textbf{Parameters:} $\lambda_s$, $\beta$, $\lambda_{\mathrm{entropy}}$, $\lambda_{\mathrm{con}}$, margin $m$
\State \textbf{Initialize:} Frozen visual backbone $F_{\psi}$, geometry encoder $\gamma$, state encoder $E_{\mathrm{st}}$ (parameterizing the state posterior $q_{\phi_s}$), decoder $D_{\theta}$, projection heads $r_s$ and $r_{\mathrm{id}}$

\While{not converged}

    \State \textcolor{blue}{\textit{// 1. Sample training frames}}
    \State Sample sub-sequence $\tau_s \subset \{1,\dots,T\}$
    \State $\mathcal{L}_{\mathrm{rec}}\leftarrow 0$, $\mathcal{L}_{\mathrm{KL}}\leftarrow 0$, $\mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}\leftarrow 0$

    \Statex
    \State \textcolor{blue}{\textit{// 2. Proposal-conditioned state encoding}}
    \For{$t\in\tau_s$}
        \State $H_t \leftarrow F_{\psi}(x_t)$
        \Comment{dense ViT/DINOv2 feature map}

        \For{each memory slot $i$ visible in frame $t$}
            \State $(b_{t,i},m_{t,i},v_{t,i})\leftarrow o_{t,i}$
            \Comment{unpack memory-indexed proposal}

            \State $G_{t,i}\leftarrow
            \mathrm{RoIAlign}(H_t,b_{t,i})$
            \Comment{$G_{t,i}\in\mathbb{R}^{R\times R\times D}$}

            \If{mask $m_{t,i}$ is available}
                \State $\tilde{m}_{t,i}\leftarrow
                \mathrm{Resize}(m_{t,i},R,R)$
            \Else
                \State $\tilde{m}_{t,i}\leftarrow \mathbf{1}_{R\times R}$
            \EndIf

            \State $\bar{G}_{t,i}\leftarrow
            \tilde{m}_{t,i}\odot G_{t,i}$
            \Comment{soft proposal gate}

            \State $g_{t,i}\leftarrow \gamma(b_{t,i},m_{t,i})$

            \State $(\mu_{t,i},\log\sigma_{t,i}^{2})
            \leftarrow
            E_{\mathrm{st}}\!\left(
            \mathrm{Pool}(\bar{G}_{t,i}),g_{t,i}
            \right)$

            \State $\epsilon\sim\mathcal{N}(0,I)$
            \State $s^{(t,i)}\leftarrow
            \mu_{t,i}+\sigma_{t,i}\odot\epsilon$

            \State $\mathcal{L}_{\mathrm{KL}}\leftarrow
            \mathcal{L}_{\mathrm{KL}}
            +
            D_{\mathrm{KL}}\!\left(
            q_{\phi_s}(s^{(t,i)}\mid x_t,o_{t,i})
            \,\|\,p(s)
            \right)$

            \Statex
            \State \textcolor{blue}{\textit{// 3. Memory-paired identity--state decoding}}
            \State $u^{(t,i)}\leftarrow
            [z_{\mathrm{id}}^{(i)};s^{(t,i)}]$
            \Comment{paired only by memory index $i$}

            \State $(\hat{v}^{(t,i)},\hat{\alpha}^{(t,i)})
            \leftarrow D_{\theta}(u^{(t,i)})$

            \Statex
            \State \textcolor{blue}{\textit{// 4. Identity--state separation loss}}
            \State $\mathbf{u}_{t,i}\leftarrow
            \frac{r_s(s^{(t,i)})}{\|r_s(s^{(t,i)})\|_2}$

            \State $\mathbf{v}_{i}\leftarrow
            \frac{r_{\mathrm{id}}(z_{\mathrm{id}}^{(i)})}
            {\|r_{\mathrm{id}}(z_{\mathrm{id}}^{(i)})\|_2}$

            \State $\mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}
            \leftarrow
            \mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}
            +
            \left[
            \max\left(
            0,
            \mathbf{u}_{t,i}^{\top}\mathrm{sg}[\mathbf{v}_{i}]
            -
            m
            \right)
            \right]^2$
        \EndFor

        \State $\hat{x}_t\leftarrow
        \sum_i
        \mathrm{softmax}_i
        \left(
        \hat{\alpha}^{(t,i)}
        \right)
        \hat{v}^{(t,i)}$

        \State $\mathcal{L}_{\mathrm{rec}}\leftarrow
        \mathcal{L}_{\mathrm{rec}}
        +
        \|x_t-\hat{x}_t\|_2^2$
    \EndFor

    \Statex
    \State \textcolor{blue}{\textit{// 5. Normalize losses}}
    \State $\mathcal{L}_{\mathrm{KL}}\leftarrow
    N_{\mathrm{vis}}^{-1}\mathcal{L}_{\mathrm{KL}}$
    \Comment{$N_{\mathrm{vis}}$: number of visible $(t,i)$ pairs over $t\in\tau_s$}

    \State $\mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}\leftarrow
    N_{\mathrm{vis}}^{-1}\mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}$

    \Statex
    \State \textcolor{blue}{\textit{// 6. Full optimization objective}}
    \State $\mathcal{L}\leftarrow
    \mathcal{L}_{\mathrm{rec}}
    +
    \lambda_s\mathcal{L}_{\mathrm{KL}}
    +
    \beta\mathcal{L}_{\mathrm{commit}}
    +
    \lambda_{\mathrm{entropy}}\mathcal{L}_{\mathrm{entropy}}
    +
    \lambda_{\mathrm{con}}\mathcal{L}_{\mathrm{con}}^{s,\mathrm{id}}$

    \State Backpropagate $\nabla\mathcal{L}$ and update trainable parameters

\EndWhile

\Statex
\State \textbf{Output:} Trained identity encoder, state encoder, LFQ projection, memory-indexed identity codes, and decoder

\end{algorithmic}
\end{algorithm}