% Switch to centered paragraph alignment for the content that follows in the
% current group.
\centering
% \fontsize{<size>}{<baselineskip>}\selectfont: 8pt type with a 6pt baseline skip.
% NOTE(review): the baseline skip (6pt) is smaller than the font size (8pt), so
% lines will be set very tightly -- confirm this compaction is intentional.
\fontsize{8pt}{6pt}\selectfont % Adjust the font size as needed
% Pseudocode for the Context-GPT training loss.
% Uses algorithmicx/algpseudocode-style commands (\Procedure, \State, \For, \Return).
% \LineComment, \StateComment and \IIStateComment are comment macros that are
% defined outside this snippet.
\begin{algorithmic}[1]
% --- Symbol legend -----------------------------------------------------------
\LineComment{ $\mathbf{T}$: Sequence Length}
\LineComment{ $\mathbf{W}$: Number of Future State Predictions to Make}
% Bracketed superscripts match the indexing used in the procedure body; the
% bare `|' separator was dropped because it prints as an em-dash under OT1.
\LineComment{ $\mathbf{s}^{[1:T]}$: Sequence of States, i.e., $s^{[1]}, s^{[2]}, \ldots, s^{[T]}$}
\LineComment{ $\mathbf{Squeezer}$: Squeezer Network for Intermediate Context Representations}
\LineComment{ $\mathbf{ContextGPT}$: GPT2-like Transformer}
\LineComment{ $\mathbf{ContextDecoders}^{[k]}$: $k^{\mathrm{th}}$ Decoder Network}
\LineComment{ $\mathbf{VICReg}$: Discrepancy Measure between a State and its Prediction}
% --- Procedure: takes the state sequence and returns the accumulated loss ----
\Procedure{ContextGPTLoss}{$s^{[1:T]}$}
    \StateComment{Project state into intermediate context representations.}
    \State $\mathbf{c_{s}}^{[1:T]} \gets \mathbf{Squeezer}(s^{[1:T]})$
    \StateComment{Predict the transition contexts from the sequence of intermediate context representations.}
    \State $\mathbf{c_{tr}}^{[1:T]} \gets \mathbf{ContextGPT}(c_{s}^{[1:T]})$
    \StateComment{Initialize Context-GPT loss.}
    \State $\mathbf{loss} \gets 0$
    \StateComment{Computations of Context-GPT loss across shifted windows.}
    % The window anchor (t - W) runs over 1..T-W, so every predicted index
    % (t - W + k) with k in 1..W stays inside 2..T.
    \For{$t \gets W + 1$ to $T$}
        \For{$k \gets 1$ to $W$}
            \IIStateComment{Relevance of the context at ($t - W$) in predicting the state at ($t - W + k$).}
            % Weight decreases linearly with the horizon: 1 at k = 1 down to 1/W at k = W.
            \State $\mathbf{\rho_{k}} \gets \frac{W - k + 1}{W}$
            \IIStateComment{Predict the state at ($t - W + k$) using the context at ($t - W$).}
            \State $\mathbf{\hat{s}}^{[t - W + k]} \gets \mathbf{ContextDecoders}^{[k]}(c_{tr}^{[t - W]})$
            \IIStateComment{Accumulate the discrepancy measure between $s^{[t - W + k]}$ and $\hat{s}^{[t - W + k]}$.}
            \State $\mathbf{loss} \gets \mathbf{loss} + \rho_{k} \cdot \mathbf{VICReg}(s^{[t - W + k]}, \hat{s}^{[t - W + k]})$
        \EndFor
    \EndFor
    \StateComment{Return the accumulated Context-GPT loss.}
    \State \Return $\mathbf{loss}$
\EndProcedure
\end{algorithmic}

