\documentclass{article} % For LaTeX2e
\usepackage{iclr2024_conference,times}
\usepackage[textsize=tiny]{todonotes}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{pseudocode}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{graphicx} 
\usepackage{subcaption}
\usepackage{url}

% \newcommand{\cmmnt}[1]{\@bsphack\@esphack}
% \newcommand{\todo}[1]{\ignorespaces}
\newcommand{\ignore}[1]{}

\setlength{\marginparwidth}{2.7cm}
\newcommand{\bdz}[1]{\todo[author=Brian,color=red]{#1}}
\newcommand{\js}[1]{\todo[author=Jurat,color=yellow]{#1}}

\title{CaStRL: Context-Aware State Representation Learning with Transformer}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.  Funding acknowledgements go at the end of the paper.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213, USA \\
\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
\And
Ji Q. Ren \& Yevgeny LeNet \\
Department of Computational Neuroscience \\
University of the Witwatersrand \\
Joburg, South Africa \\
\texttt{\{robot,net\}@wits.ac.za} \\
\AND
Coauthor \\
Affiliation \\
Address \\
\texttt{email}
}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract}
Learning a versatile representation from high-dimensional observation data is a crucial stepping stone for building autonomous agents capable of effective decision-making in various downstream tasks. Yet, learning such a representation without additional \ignore{supervision, or labeled data}supervisory signals poses formidable practical challenges. In this work, we introduce Context-Aware State Representation Learning (CaStRL), a novel unsupervised representation pretraining approach that combines the strength of generative autoregressive modeling with the pretraining-finetuning paradigm\ignore{a novel unsupervised representation generative pretraining approach}. To encourage CaStRL to grasp the underlying dynamics information of the environment\ignore{the Markov Decision Process (MDP)}, we enforce it to jointly learn the latent state representation along with the contexts that influence the model's ability to learn a \ignore{universal} generalizable representation for control tasks. In CaStRL, we first employ the Video-Swin Transformer as a vision encoder, customizing it to support auto-regressive modeling through the incorporation of a causal attention mask. Then, we design Context-GPT to learn context from historical sequences of state representation\ignore{which serve as a learning signal to drive model to learn better representation}\ignore{where context encapsulates essential information related to all preceding states}, which drives the model towards capturing global structural patterns by %that leads to a seamless propagation of 
propagating information across extended time horizons. This significantly improves the adaptability of the learned representation for diverse control tasks. By emphasizing reward-free evaluation and \ignore{acknowledging}limited data constraints in both pretraining and fine-tuning stages, we find, across a wide range of Atari experiments, that pretrained representations can substantially facilitate downstream learning efficiency.

\end{abstract}

% \section{Submission of conference papers to ICLR 2024}
\section{Introduction}

% &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

State representations are often crucial for effective Reinforcement Learning (RL), activity recognition, and other downstream tasks \citep{Oord2018RepresentationLW, Han2019VideoRL, Chen2022AnEI}.
A representation that focuses narrowly on relevant information can facilitate learning an effective control policy, while a representation that includes irrelevant noise can be detrimental.
Using hand-crafted features or high-dimensional raw sensory data to define the state are two simple approaches. 
Unfortunately, each has significant disadvantages. 
Handcrafting the state for Atari environments \citep{Mnih2015HumanlevelCT}, for example, requires certain domain expertise and additional human oversight. Often, %Additionally, 
manually designed state representations suffer from drawbacks such as the presence of irrelevant information and insufficient coverage of complex dependencies within the state space. In contrast, raw observation data is typically high-dimensional and includes irrelevant information, which makes it difficult to leverage any synergistic benefits from multiple downstream tasks.

% State representation learning involves learning a mapping from high-dimensional observations to a low-dimensional compact state. %representation, which is 
% Deep Reinforcement Learning (DRL) has enabled learning such representations in an end-to-end (e2e) fashion (see Figure \ref{fig:e2esrl}) in two primary ways: 
% (1) in an implicit manner by directly learning a downstream policy by mapping visual observation to actions; or (2) explicitly extracting a latent state representation from high-dimensional data such as images, and then using it to learn a control policy with the learned representation as input, or for other downstream tasks. Unfortunately, extracting a general-purpose state representation for other downstream tasks using the first approach is limited challenging. The main reason because directly learning \ignore{optimal}control policy from high-dimensional sensory image, oftentimes create practical challenges to RL agent due to nature of sensory images which are high-dimensional and may contain unnecessary noise or irrelevant information for decision making of tasks at hand \citep{Bengio2017TheCP,Scholkopf2019CausalityFM}.
% Motivated by recent work showing that transformer architectures can serve as foundation models for sequential decision modeling \citep{Chen2021DecisionTR} and recent advances in large-scale offline reinforcement learning \citep{Levine2020OfflineRL,Agarwal2019AnOP}, we focus on the latter task.

% \begin{figure}[t]
%     \includegraphics[width=0.8 \textwidth]{CASART-v2-e2e_srl.jpg}
%     \centering
%     \caption{End to End (E2E) learning approach in DRL}
%     \label{fig:e2esrl}
% \end{figure}

% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Deep Reinforcement Learning (DRL) has attracted extensive attention in recent years, especially for its ability to compress high-dimensional observations into a compact representation \citep{Silver2016MasteringTG,Berner2019Dota2W}. However, its wider adoption has been hindered by the sample efficiency problem: it requires a large amount of data, which leads to lengthy training times \citep{DulacArnold2019ChallengesOR}. To address this issue, the focus has shifted to incorporating inductive biases into the DRL framework, such as exploring parameterized neural network architectures \citep{Zambaldi2018DeepRL}. The choice of DRL architecture has been explored for, but not limited to, state representation learning objectives. Yet, what type of DRL architecture is suitable for learning generalizable state representations remains elusive. The rise of the Transformer architecture \citep{Vaswani2017AttentionIA} has revolutionized the learning paradigm in various domains. For instance, the profound success achieved by BERT \citep{Devlin2019BERTPO} and GPT models \citep{radford2018improving} in machine translation and language understanding has sparked great excitement within the machine learning community. As a result, this led to widespread adoption of the pretrain-then-finetune paradigm \citep{Howard2018UniversalLM} as a popular method to progressively enhance performance across a wide range of downstream learning tasks, including imitation learning \citep{Schwarzer2021PretrainingRF}. Recent works have shown that the transformer architecture scales well and is suitable for modeling long-range dependencies, such as those in sequential decision making \citep{Khan2021TransformersIV}. Following this success, there has been an increasing level of interest in adopting transformers in RL, in particular as a top choice for learning state representations.

% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
% -- todo comment: due to space limit, we are going to cut out this paragraph
% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
% In recent line of work have shown that exploiting low dimensional abstract representation, instead of raw sensory data, has become main means to facilitate downstream learning tasks \citep{Lesort2018StateRL}. Notable approaches within this line of research include world model \citep{ha2018world}, DeepMDP \citep{Gelada2019DeepMDPLC}, CURL \citep{Srinivas2020CURLCU}. Despite the effectiveness of above approaches to learning abstract state representation, current methods fails to expedite downstream policy learning to achieve optimal human-like decision making performance. Since the underlying state of the environment cannot be derived directly from the high-dimensional sensory input, encoding raw observation data into low dimensional representation that directly influence the RL agent on decision making has been key focus on designing effective autonomous system. Therefore,Learning compact and generelizable representation from high-dimensional sensory data without substantial supervision has been widely practiced, and such unsupervised learning approach become main objective for State Representation Leanirng (SRL) \citep{Lesort2018StateRL}. Nevertheless, current unsupervised exploration techniques for state representation learning requires extended periods of real-time interaction\ignore{experience}, which might not be feasible \ignore{realistic} for real-world settings \ignore{systems} that are constrained by interaction budgets and bounded costs.
% Therefore, It is crucial to develop pretraining methods that effectively leverage large existing datasets coupled\ignore{in conjunction} with specific hardware configurations \citep{Stooke2020DecouplingRL,Agarwal2019AnOP}.

% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We consider the question: how can learned state representations be made more generalizable for downstream learning tasks? 
%of interest in terms of generalizability to unseen environments. 
To address this question, we introduce Context-Aware State Representation Learning (CaStRL), a novel unsupervised representation pretraining approach. CaStRL not only emphasizes state representation learning, but also integrates context-awareness, where \textbf{context} pertains to features characterized by global structures that enable the propagation of information across extended time horizons. By including contextual information as regularization, our approach enhances the quality and relevance of learned state representations, making them more suitable for downstream control tasks. In our proposed approach, we first employ the Video-Swin Transformer \citep{Liu2021VideoST} as \ignore{our vision}a vision encoder, tailoring it to support auto-regressive modeling through the incorporation of a causal-attention mask. Then, we design Context-GPT (Section~\ref{contLr}), which aims to learn the underlying latent state representation together with its context, where \textbf{contexts} encapsulate information pertaining to all preceding states, enabling the state representation and its context to be learned jointly. Our empirical results demonstrate significant performance gains with CaStRL, showcasing the effectiveness of our unsupervised pretraining framework in enabling the model to rapidly emulate behaviors across multiple environments simultaneously.

\begin{figure}[!h]
    \centering
    \includegraphics[width=1.0 \textwidth]{plots/castrl-best-data-scores.png}
    \caption{Performance of the top 3 CaStRL rollouts, normalized by the best training dataset scores.}
    \label{fig:castrl-data-normalized-scores}
    % \vspace{-2.5mm}
\end{figure}

The CaStRL framework has demonstrated significant performance gains and stability in multi-game settings. We evaluate CaStRL's capability to surpass the best training demonstrations and measure its deviation from random behavior. Figure \ref{fig:castrl-data-normalized-scores} presents the normalized scores using the best DQN scores observed in the training dataset, following the Top-3 Metric as described in \citep{Lee2022MultiGameDT}.
%^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Our contributions are threefold. First, we propose CaStRL, a novel unsupervised representation pretraining approach for jointly learning the latent state representation and its context, which influences the model's ability to learn a versatile representation for control tasks.
Second, we design Context-GPT to learn context from historical sequences of state representation, which drives the model towards capturing global structural patterns by propagating information across extended time horizons. This significantly improves the adaptability of the learned representation for diverse control tasks. 
Third, we conduct extensive experiments on a wide spectrum of 46 Atari 2600 video games \citep{Mnih2015HumanlevelCT}. Using 41 games for training and 5 games for evaluation, we assess the generalizability of the learned representation. By emphasizing reward-free evaluation and limited data constraints in both pretraining and fine-tuning stages, we find, across a wide range of Atari experiments, that pretrained representations can substantially facilitate downstream learning efficiency.

\section{Related Works}
% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Recent work has extensively explored learning representations directly from raw sensory data with deep neural networks \citep{Lange2012AutonomousRL,Bhmer2015AutonomousLO,wahlstrom2015pixels}. Using DRL to learn latent states with minimal supervision or through fully unsupervised approaches has gained significant traction \citep{Radford2015UnsupervisedRL,Anand2019UnsupervisedSR}. The main intuition is not to mandate a model that is directly supervised using ground-truth states since underlying state is inaccessible. Instead, it leverages sequences of observations and actions from offline experience datasets, intensively benefiting from the power of autoregressive models like GPT \citep{radford2018improving} to learn generalizable representations for diverse control tasks. 

% Video Swin (Local 
% @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

\textbf{Reinforcement Learning as Sequence Modeling:}
In light of the significant success achieved by Generative Pre-trained Transformers (GPT) and Large Language Models (LLMs) \citep{brown2020language} across a wide range of tasks, recent research \citep{Chen2021DecisionTR, Shang2021StARformerTW} has extended the concept of generative pretraining to the realm of offline reinforcement learning. This extension involves formulating the RL problem as a sequence modeling challenge, enabling the learning of representations conditioned on rewards or return-to-go (RTG). 

\textbf{Pretraining For Reinforcement Learning:}
Subsequently, \citet{Lee2022MultiGameDT} presented empirical evidence demonstrating the capability of Decision Transformer (DT) \citep{Chen2021DecisionTR} in expert action inference, showing a notable transfer learning performance gain by using multi-game settings for pretraining. Other recent works, such as \citet{Sun2023SMARTSM}, introduce a self-supervised learning approach that incorporates momentum encoders and control transformers as a pretraining framework. However, there remains a scarcity of works dedicated to the comprehensive exploration of state representation learning from sequences of visual observations.

\textbf{Vision Encoders For Control Tasks:} 
Encoding visual observations is a fundamental building block for various sequence modeling problems. In recent works focused on control tasks \citep{Chen2021DecisionTR, Shang2021StARformerTW, Sun2023SMARTSM}, the visual encoder has typically been either a Convolutional Neural Network (CNN) \citep{LeCun1998GradientbasedLA} or a Vision Transformer (ViT) \citep{dosovitskiy2020image}. However, these encoders lack the ability to incorporate temporal information effectively, resulting in a loss of the finer temporal details present in a sequence of observations within their high-dimensional representation spaces. To address this concern, one potential approach is to use a vision transformer model explicitly designed to account for temporal dimensions, like the Video-Swin Transformer \citep{Liu2021VideoST}. While the Video-Swin Transformer has demonstrated remarkable success in learning representations from video clips, its design initially lacks a direct emphasis on addressing control tasks formulated as autoregressive models. This is primarily due to the inherent nature of its underlying attention mechanism---Shifted Window Multi-Head Self-Attention (SW-MSA)---which lacks the causality required for autoregressive modeling with Transformers. Another challenge hindering the Video-Swin Transformer's applicability in control tasks is its lower inductive bias compared to basic convolution encoders, resulting in slower training.

%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
\section{Preliminaries}
% \section{problem statement and preliminaries} \js{this section need revision, working on it}
The main objective of this work is to learn a compact, generalizable state representation to solve downstream tasks where control tasks and environments \ignore{learning such representation is defined in POMDP}are defined by Partially Observable Markov Decision Processes (POMDPs), since the underlying state of the environment is not directly accessible \ignore{obtained} from available high-dimensional sensory observations. A POMDP is described by a tuple $\langle \mathcal{S}, \mathcal{A}, \mathcal{O}, \mathcal{T}, \mathcal{R}, \gamma, E \rangle$, where: $\mathcal{S}$ is the state space; $\mathcal{A}$ is the action space; $\mathcal{O}$ is the observation space, denoting image collections rendered from the environment; $\mathcal{T}: \mathcal{S} {\times} \mathcal{A} {\rightarrow} \mathcal{S}$ is the state transition dynamics, which specifies the probability that action $a_t {\in} \mathcal{A}$ in state $s_t {\in} \mathcal{S}$ will lead to next state $s_{t+1}$; $\mathcal{R}: \mathcal{S} {\times} \mathcal{A} {\rightarrow} \mathbb{R}$ is the reward function; $\gamma$ is the discount factor; and $E$ is the observation probability $E(o|s)$ of perceiving observation $o$ given state $s$. At each timestep $t$, the RL agent observes $o_t$ based on the underlying state $s_t {\in} \mathcal{S}$, takes action $a_t {\in} \mathcal{A}$, receives reward $r_t$, and then transitions to\ignore{MDP will make transition to} the next state $s_{t+1}$. Given sequences of observation-action tuples of length $L$ in the replay buffer, $b_t{=}(o_{t-L}, a_{t-L}, o_{t-L+1}, a_{t-L+1}, \ldots, o_t)$, the agent takes action $a_t$ based on a policy $\pi$, i.e., $a_t{=}\pi(b_t)$. The agent aims to learn an optimal policy $\pi^*$ that maximizes the expected cumulative discounted reward $\mathbb{E}_P [\sum_{t=1}^\infty \gamma^t r_t]$ \citep{Sutton2005ReinforcementLA}. 

% by optimizing $\mathcal{L}_{cgpt}$ \ref{cgpt}. State representation that outputted from context-gpt will be fed into policy network which parameterized by $\omega$ to get action at time step $t$. that is ${a_t = \pi_{\omega} (f_{\theta}(s_t))}$, the objective is to maximize cumulative discounted reward.

\subsection{Good Criteria for State Representation Learning}
The main objective in State Representation Learning (SRL) is to learn an informative and sufficient abstract representation from high-dimensional raw sensory data. We review important facets of what defines a good state representation that facilitates downstream policy learning. From a broader perspective, the state representation should retain sufficient, compact and generic information from the observation in order to solve the RL task and filter out redundant information. To evaluate the quality of a learned state representation, we consider the following dimensions:
\begin{itemize}
  \item \textbf{Compactness:} a state representation should be low-dimensional to directly guide the RL agent in its decision-making process. Such a compact representation should retain essential and meaningful information while ignoring redundant features.
  \item \textbf{Sufficiency:} encoding high-dimensional observation into low-dimensional abstract representation must contain essential and sufficient information to solve downstream RL tasks. State representations that contain insufficient information may lead to sub-optimal policies.
  \item \textbf{Generalizability:} a state representation should generalize across semantically similar environments, including but not limited to new environment states or entirely different environments; this is considered a key characteristic for assessing its effectiveness in downstream learning tasks.
\end{itemize}

Tasks with long time horizon and partial observability are key challenges to representation learning. 

\section{Method}
\label{castRL_method}
CaStRL jointly learns the latent state representation
along with the contexts that influence the model's ability to extract a generalizable representation for control tasks. To learn such a representation, we first employ the Video-Swin Transformer as our vision encoder, tailoring it to support auto-regressive modeling through the incorporation of a causal-attention mask. In the second part of our work, we design Context-GPT, which learns contexts given a sequence of state representations and aims to learn the underlying latent state representation together with its context, where contexts encapsulate information pertaining to all preceding states, enabling the state representation and its context to be learned jointly. 

\begin{figure}[h]
    \includegraphics[width=0.95 \textwidth]{CaStRL_prtrn.png}
    \centering
    \caption{Illustration of the CaStRL pretraining pipeline.}
    \label{fig:castrl_pretr}
\end{figure}

\subsection{Vision Encoder}
\label{visencoder}
In this work, we employ Video Swin Transformer, originally proposed for video understanding tasks \citep{Liu2021VideoST}. We have adapted the Video-Swin Transformer by replacing the Shifted Window Multi-Head-Self-Attention (SW-MSA) with the Causal Shifted-Window Multi-Head-Self-Attention (CSW-MSA) (see details in appendix \ref{appdx:csw-msa}). This tailored attention mechanism better suits the sequence modeling task, particularly in its autoregressive nature. Our primary focus is on leveraging Video-Swin Transformer as a foundational vision encoder capable of effectively learning both spatial and temporal representations. To the best of our knowledge, we are the first to include Causal-Attention into the Video-Swin architecture for state representation learning. 

% We assess the impact of CSW-MSA by incorporating it into the Video-Swin architecture as the vision encoder in proposed CaStRL framework \ref{fig:castrl_pretr}

\begin{figure}[h]
    \includegraphics[width=1.0 \textwidth]{plots/dt-vswin-csw.png}
    \centering
    \caption{Performance of DT + Video Swin Transformer (With/Without CSW-MSA).}
    \label{fig:dt-vswin-csw}
\end{figure}

A fundamental question arises: to what extent does CSW-MSA broaden the scope of applicability of the Video-Swin Transformer in autoregressive modeling-based learning? To address this question, we replaced the convolutional encoder in the DT architecture \citep{Chen2021DecisionTR} with the Video-Swin Transformer. We then evaluated the model's performance in Behavioral cloning (BC) \citep{Pomerleau1988ALVINNAA} tasks across some Atari environments in \textbf{single-game settings}, results shown in Figure \ref{fig:dt-vswin-csw}. Another notable aspect we addressed is the lack of inductive bias in Video-Swin. During training, we simultaneously distilled knowledge from a convolution-based visual encoder (i.e., ResNet34 \citep{He2015DeepRL}) into our transformer-based encoder (Video-Swin). This resulted in the learned features from convolution encoders guiding the transformer encoder in the early stages of training. The knowledge distillation was achieved implicitly \citep{wang2021knowledge}, without the need for any additional modeling objectives. Further details on how this implicit knowledge distillation was achieved are discussed in section \ref{StRL}.

% \ignore{for the decision transformer}. The absence of CSW-MSA results in a significant decline in performance, with the model struggling to learn meaningful information. These results highlight the critical role of CSW-MSA in enabling effective learning. Another notable aspect we address is the lack of inductive bias in Video-Swin. During training, we simultaneously distill knowledge from a CNN encoder (e.g., ResNet34) into our transformer-based encoder (Video-Swin). This results in the learned features from CNN encoders guiding the transformer encoder in the early stages of training.  The knowledge distillation was achieved implicitly, without the need for any additional modeling objectives. Our work places state-representation learning within the context of learning from high-dimensional observations, without explicitly considering the underlying sequence modeling problem. The primary goal is to acquire domain and task-agnostic representations, enabling broader applicability and generalization. 

% \subsection{State Encoding}
\subsection{State Representation Learning}
\label{StRL}
% \subsection{Context-Aware Learaning}
% we adopt Video-Swin (VSwin) Transformer \cite{Liu2021VideoST}, which has shown State-Of-The-Art (SOTA) performance in video understanding tasks. Inspired by its saucers, \ignore{customized Video-Swin with incoporation of casual self-attention mechansim, we implement the module with VSwin as main state encoder}. During the training process, a sequence of historical observations $o_{\leq t}$ and actions $a_{\leq t}$ are sampled from exprience replay buffer \cite{Agarwal2019AnOP}, and we first augment the observations to two different views $o_t$ and $\prime{o_t}$ with random transformation. In order to prevent representation collapsing problem, we use $\mathcal{L}_{VICReg}$ objective \cite{Bardes2021VICRegVR}. 

The full architecture of our proposed Context-Aware State Representation Learning (CaStRL) approach is shown in Figure~\ref{fig:castrl_pretr}. CaStRL contains two learning components: state representation learning and transitions context learning. This section primarily addresses key questions related to the state representation learning component.

\textbf{Choice of Representation Learning Method:} While the Momentum encoder \citep{He2019MomentumCF} has shown effectiveness as a powerful representation learning method \citep{Sun2023SMARTSM}, it does come with significant limitations regarding modeling flexibility. In this work, we emphasize the critical role of flexibility in the learning process. However, as the learning becomes more flexible, ensuring the effectiveness of what is learned becomes increasingly challenging. Issues such as degenerated solutions and overfitting come into play, leading most representation learning methods to incorporate regularization techniques.  For instance, recent work \citep{Oord2018RepresentationLW} places higher requirements on the network to reduce the risk of overfitting, making it an effective regularization method for learning high-quality representations, as recognized in various studies. 

Considering the modeling requirements outlined in Section~\ref{visencoder} for the visual encoder, we employ Variance-Invariance-Covariance Regularization (VICReg) \citep{Bardes2021VICRegVR} as our chosen representation learning objective. Apart from VICReg's inherent flexibility, we chose it as the primary representation learning objective because it introduces two vision encoders that can be completely asymmetric with no shared structure or parameters.  This property empowers our approach and contributes to its effectiveness in the task of representation learning. To achieve our objective of aligning the features learned by the Video-Swin Transformer with those of the CNN encoder during the pretraining phase, we employ both a CNN encoder (i.e., ResNet-like architecture) and a transformer encoder (i.e., Video-Swin) in our approach. Through minimizing the discrepancy between the representations acquired from these encoders, we enable Video-Swin to learn features comparable to those of the CNN encoder, resembling the knowledge distillation process between a teacher and a student network \citep{fan2018learning}. The CNN encoder's inherent locality in its operations provides a substantial inductive bias, which the transformer lacks. Consequently, asserting that the CNN encoder learns features as poorly as the transformer encoder becomes a challenging proposition.

The VICReg objective effectively avoids the representation collapse problem. The steps of incorporating the VICReg objective into CaStRL are described in Appendix~\ref{alg:strl-loss}.

\subsection{Context-Aware Learning}
\label{contLr}
In contrast to GPT \citep{brown2020language} and LLMs, which excel at processing sequences of tokens (typically words) and harness the power of contexts to enable in-context learning \citep{dong2022survey}, deriving context directly from a sequence of visual observations presents significantly greater challenges. To address this, we start by optimizing state representations, then use these optimized representations to learn context. This is a process of state representation optimization, followed by representation refinement guided by a Context-GPT objective. As shown in Figure~\ref{fig:castrl_pretr}, the learned state-representation $Y^\prime$ is fed into a projection layer, which aims to summarize the state-representation in the feature dimension. This summarized output is referred to as the state-context representation or intermediate-context representation $C_{s}$. Next, a sequence of intermediate-context representations from previous steps is utilized as input for an autoregressive model---Context-GPT (CGPT). The autoregressive model summarizes the sequence in the temporal dimension, resulting in the transitions context $C_{tr}= \mathrm{cgpt}({C_{s}}_{\leq t})$. During this process, each transition context at timestep ($k$) is optimized to capture key information about all previous state-representations. The proper context learning approach ensures that the resulting transition context enables predictions in the future or past to be effectively independent of other timesteps, satisfying the Markovian property. As a result, the transitions context serves as a valuable representation, retaining the essential temporal dependencies.

In practice, there are various types of contexts that can induce context awareness. In this work, we explore two primary approaches for learning context:

\begin{enumerate}
    \item Predicting multiple state representations in the future, akin to the way CPC \citep{Oord2018RepresentationLW} learns context. However, we do not employ contrastive losses in our method.
    \item Addressing the Blank-filling problem \citep{raffel2020exploring}, which involves masking some of the tokens and then attempting to retrieve them. This task encourages the model to consider neighboring unmasked tokens to gather information about the missing tokens.
\end{enumerate}

\begin{algorithm}[h]
  % \algsetup{linenosize=\tiny}
    % \scriptsize
    \caption{Context GPT Loss (For Context-Aware StRL: $\textbf{Context Type} \gets \textbf{Next State}$)}
    \label{alg:cgpt-loss-next-state}
    \input{code/cgpt-loss-next-state}
\end{algorithm}

Using these two approaches, we introduce 3 variants of the Context-GPT objective:

\begin{enumerate}
    \item $\textbf{Context Type} \gets \textbf{Next State}$: This variant is tailored to support the learning of features extending over a longer time horizon, as elaborated in pseudocode \ref{alg:cgpt-loss-next-state}.
    \item $\textbf{Context Type} \gets \textbf{Masked State}$: This variant emphasizes context learning, with a focus on local context from neighboring states.
    \item $\textbf{Context Type} \gets \textbf{Masked Action}$: While this variant involves supervised learning, it remains task-agnostic, as the model is not optimized to produce any specific BC policies.
\end{enumerate}

The other variants of the Context-GPT objective are described in pseudocodes \ref{alg:cgpt-loss-masked-state} and \ref{alg:cgpt-loss-masked-action}.


% *****************************************************************
\section{Experiments} 
In our experiments, we focus our attention on using behavior cloning (BC) \citep{Pomerleau1988ALVINNAA} in multi-environment scenarios as a downstream task to assess the effectiveness of CaStRL. Instead of using setups that depend on rewards \citep{Chen2021DecisionTR,Shang2021StARformerTW}, we use BC to see how useful our learned state representation is independently of rewards. %In a nutshell, 
Our experiments focus on:
%\begin{enumerate}
    %\item First, we're 
    examining the extent to which pretraining influences our model's performance;
    %\item In addition, we're conducting an 
    investigating in depth how %the 
    incorporating or excluding specific context information influences our model's performance; %This encompasses scenarios where information regarding previous actions is omitted, as well as variations in pretraining context, as discussed in Section [?].
    %\item Then, 
    %we have reproduced our experimental setup with \cite{Chen2021DecisionTR} \cite{Shang2021StARformerTW}, enabling us to 
    and assessing the performance of existing methods \citep{Chen2021DecisionTR,Shang2021StARformerTW} in comparison to our framework.
%\end{enumerate}

\subsection{Experimental Setup}
Throughout both the pretraining and finetuning phases, training takes several days on 4 V100 32GB GPUs. To manage this computational demand, our studies are based on a limited portion of the Atari offline experience datasets \citep{Agarwal2019AnOP}. In both the pretraining and fine-tuning phases, our experiments were conducted with limited data constraints. This decision serves a twofold purpose: to reduce training time and to gain insights into how our approach performs in situations with limited data availability.

\textbf{Pretraining and Finetuning Dataset:} For the pretraining and finetuning phases, and even the baseline training, we used an existing offline experience dataset \citep{Agarwal2019AnOP} and omitted rewards from our training data, as the primary objective is to evaluate the CaStRL model in reward-free settings. This dataset consists of trajectories collected during the training of a DQN agent.

\textbf{Atari Environment Selection:} In alignment with the environment selection strategy outlined in Multi-Game DT \citep{Lee2022MultiGameDT}, we used 41 environments in the pre-training phase while reserving 5 environments to assess CaStRL's generalizability. This encompasses a total of 46 Atari environments in our study.

\textbf{Dataset Size for Multi-game Experiments:} In all multi-game experiments, a limited subset of offline experience datasets was employed, with each environment accumulating 100$K$ steps, totaling $4.1$ million environment steps across the 41 environments. This is notably smaller than the training set of 
%one employed for training in works such as 
\citep{Lee2022MultiGameDT}, where %precisely 
the same $41$ environments were used with 160 billion environment steps.

\textbf{Data Augmentation:} For 
%Our primary approach to 
data augmentation, %involved treating
we treat 
spatiotemporal aspects separately. Specifically, we applied color and noise-based augmentation individually to each frame while maintaining consistent geometric transformations across frames for each segment of the observation sequence. This strategy was implemented to prevent the model from excessively relying on low-level optical-flow features for learning, aligning with the approach mentioned in \citep{Han2019VideoRL}.

\textbf{Generalization Evaluation:} To assess CaStRL's generalization capabilities, a multi-game experiment is conducted based on the pretrained model. Specifically, we evaluate the model's performance on 5 held-out environments.

\textbf{Pretraining Procedure:} %Implementation Details:} 
We use an embedding size of $96$ for the Video Swin Transformer \citep{Liu2021VideoST}, complemented by our custom attention mechanism, CSW-MSA. In our architecture, we incorporate ResNet34 with minor adjustments \citep{He2015DeepRL}. To facilitate tokenization in the temporal dimension, we introduce an embedding block consisting of a single 3D Convolution layer, followed by Batch Norm and a ReLU activation function. For VICReg, we use an expander network comprising 3 layers, each featuring a linear layer with a dimension of 1024, followed by a GELU activation function. The hyperparameters for VICReg, including $\gamma$, $\beta$, and $\eta$, are set to $1.0$, $0.1$, and $1.0$, respectively. In the GPT module, during both the pretraining and finetuning stages, we employ 6 layers and 4 attention heads. We set a context length of $T = 16$ and a context dimension of $dim_{context} = 192$. During the pretraining phase, we simultaneously optimize two objectives: VICReg and Context-GPT Loss. We conducted the pretraining for a total of $50$ epochs and put an explicit limit on the number of batch samples to $13.5K$ per epoch. This limitation was added to speed up the pretraining process, and CaStRL continued to perform well under these training constraints.

\textbf{Finetuning Procedure:}
%Implementation Details:} 
In the finetuning phase, we further optimize the pretrained model on the BC task for 10 epochs in a multi-game setting. During this process, we employ 41 environments to assess the effectiveness of CaStRL. Additionally, for the purpose of generalization testing, we finetune the unsupervised pretrained version on 5 environments that were not encountered during the pretraining phase. Similar to the pretraining phase, we impose an explicit limit on the number of batch samples, set to $13.5K$ per epoch. It could be problematic if the model forgets what it learned during the pre-training phase. This issue may arise due to a distribution shift, which is a result of not using augmentation during the finetuning phase and incorporating environments not seen during the pretraining phase for generalization testing. To overcome the challenge of catastrophic forgetting, we adopt a simple yet effective approach: we initially freeze all layers except the last Linear layer for the first epoch and subsequently unfreeze all layers, mirroring the approach proposed by \citet{Howard2018UniversalLM}. We finetuned the entire model, as freezing the pretrained model has been shown to be ineffective in handling complex environments \citep{Sun2023SMARTSM}. This decision is justified by the realization that what is learned during the pretraining phase may not always be directly exploitable during the finetuning phase, and some adaptation or refinement would be required.

\textbf{Baselines:} For the DT \citep{Chen2021DecisionTR} and Starformer \citep{Shang2021StARformerTW} baselines, we replicate our finetuning experimental settings. Since training multi-game environments takes a prohibitively long time, we train DT and Starformer from scratch based on just one seed for evaluation. Nevertheless, we carefully replicate our finetuning experimental settings, ensuring an identical subset of the data was used. Additionally, to ensure a fair comparison with CaStRL, we matched the GPT model scale.

\textbf{Evaluation Metrics:} We assess the performance on individual Atari games relative to an assumed lower bound: optimizing a randomly initialized version of our model, i.e., without any pretraining. We then calculate the relative score of each game in reference to this baseline. See Appendix \ref{appdx:castrl-scores} for more results.


\begin{figure}[b]
    \centering
    %\begin{subfigure}[b]{0.75\textwidth}
        \includegraphics[width=1.0 \textwidth]{plots/castrl-variants-iqm.png}
        \centering
        %\caption{This analysis covers diverse initialization and pretraining strategies, all maintaining consistent model size and dataset configurations, showcasing the impact of learned representations and action history utilization during finetuning.}
        %\label{fig:castrl-vars}
     %\end{subfigure}
     %\hfill
    %\begin{subfigure}[b]{0.75\textwidth}
    \caption{Interquartile Mean (IQM) performance across 41 Atari games relative to a ``No Pretrain'' architecture version using diverse initialization and pretraining strategies, all maintaining consistent model size and dataset configurations.} 
    \label{fig:castrl-vars}
\end{figure}

\subsection{Experimental Results}


\begin{figure}[h]
    \centering      \includegraphics[width=1.0 \textwidth]{plots/castrl-vs-baselines-iqm.png}
        \centering
        %\caption{Comparing CaStRL to the baseline, with the GPT model size in DT and Starformer matched to that of CaStRL for a fair evaluation.}
        %\label{fig:castrl-vs-baselines}
     %\end{subfigure}
    \caption{Interquartile Mean (IQM) performance
comparison of CaStRL to the baselines over 41 Atari games, relative to the ``No Pretrain'' architecture version, with the GPT model size in DT and Starformer matched to that of CaStRL for a fair evaluation.}
    \label{fig:castrl-vs-baselines}
\end{figure}

%Our objective extends beyond demonstrating the performance of CaStRL in comparison to the baselines; we also aim to
We first 
evaluate how altering the context being learned affects BC task performance. To achieve this, we pretrained 7 different versions of our model, each targeting a distinct context type, and the results are presented in Figure~\ref{fig:castrl-vars}. The best-performing variant of CaStRL is then compared against the baseline models in the BC task, with the results shown in Figure~\ref{fig:castrl-vs-baselines}.

\begin{figure}[h]
    \includegraphics[width=1.0 \textwidth]{plots/castrl-eval-envs.png}
    \centering
    \caption{Fine-tuning performance on unseen environments after pretraining on a limited dataset of 41 Atari environments. The fully unsupervised pretrained CaStRL notably outperforms DT and Starformer in the ability to generate trajectories with higher returns.}
    \label{fig:castrl-eval-envs}
\end{figure}

The evaluation of CaStRL's scalability is based on how quickly it outperforms DT and Starformer in multi-game settings with BC as the downstream task. As shown in Figure~\ref{fig:castrl-eval-envs}, CaStRL is much more efficient to train. Given that it is a completely unsupervised pre-training framework, its added value is expected to increase with larger-scale training, a direction we will explore in future work.


\section{Conclusions}
In this paper, we introduced Context-aware State Representation Learning (CaStRL), a novel unsupervised state representation learning approach aimed at learning generalizable state representations. Despite explicit limitations in both pretraining and finetuning, which included small dataset sizes, a limited number of epochs, and a focus on reward-free settings, CaStRL demonstrated the adaptability of the resulting state representations, making it possible to seamlessly use the learned representations across multiple environments simultaneously. Future work includes scaling up the training of CaStRL, applying CaStRL to human demonstrations for tasks even more complex than Atari, and exploring its zero-shot transfer capabilities.
%\section{Acknowledgments}
%This research supported in part by the University of Illinois at Chicago and the National Science Foundation (NSF) under grant \textnumero: CNS-1828265 \ignore{for MRI: Acquisition of a Composable Platform as a Service Instrument for Deep Learning and Visualization (COMPaaS DLV)}\citep{Long2022ComposableIF}. We also thank all the anonymous reviewers for their valuable feedback on this work. 

% \section{conclusion}
% this paper delivers how to achieve generelizable representation from high-dimensional observations to facilitate downstream learning tasks under multi-task sequential decision making regimes.\ignore{ We propose CaStRL, a novel unsupervised representation pretraining approach to }




% ICLR requires electronic submissions, processed by
% \url{https://openreview.net/}. See ICLR's website for more instructions.

% If your paper is ultimately accepted, the statement {\tt
%   {\textbackslash}iclrfinalcopy} should be inserted to adjust the
% format to the camera ready requirements.

% The format for the submissions is a variant of the NeurIPS format.
% Please read carefully the instructions below, and follow them
% faithfully.

% \subsection{Style}

% Papers to be submitted to ICLR 2024 must be prepared according to the
% instructions presented here.

%% Please note that we have introduced automatic line number generation
%% into the style file for \LaTeXe. This is to help reviewers
%% refer to specific lines of the paper when they make their comments. Please do
%% NOT refer to these line numbers in your paper as they will be removed from the
%% style file for the final version of accepted papers.

% Authors are required to use the ICLR \LaTeX{} style files obtainable at the
% ICLR website. Please make sure you use the current files and
% not previous versions. Tweaking the style files may be grounds for rejection.

% \subsection{Retrieval of style files}

% The style files for ICLR and other conference information are available online at:
% \begin{center}
%    \url{http://www.iclr.cc/}
% \end{center}
% The file \verb+iclr2024_conference.pdf+ contains these
% instructions and illustrates the
% various formatting requirements your ICLR paper must satisfy.
% Submissions must be made using \LaTeX{} and the style files
% \verb+iclr2024_conference.sty+ and \verb+iclr2024_conference.bst+ (to be used with \LaTeX{}2e). The file
% \verb+iclr2024_conference.tex+ may be used as a ``shell'' for writing your paper. All you
% have to do is replace the author, title, abstract, and text of the paper with
% your own.

% The formatting instructions contained in these style files are summarized in
% sections \ref{gen_inst}, \ref{headings}, and \ref{others} below.

% \section{General formatting instructions}
% \label{gen_inst}

% The text must be confined within a rectangle 5.5~inches (33~picas) wide and
% 9~inches (54~picas) long. The left margin is 1.5~inch (9~picas).
% Use 10~point type with a vertical spacing of 11~points. Times New Roman is the
% preferred typeface throughout. Paragraphs are separated by 1/2~line space,
% with no indentation.

% Paper title is 17~point, in small caps and left-aligned.
% All pages should start at 1~inch (6~picas) from the top of the page.

% Authors' names are
% set in boldface, and each name is placed above its corresponding
% address. The lead author's name is to be listed first, and
% the co-authors' names are set to follow. Authors sharing the
% same address can be on the same line.

% Please pay special attention to the instructions in section \ref{others}
% regarding figures, tables, acknowledgments, and references.


% There will be a strict upper limit of 9 pages for the main text of the initial submission, with unlimited additional pages for citations. 

% \section{Headings: first level}
% \label{headings}

% First level headings are in small caps,
% flush left and in point size 12. One line space before the first level
% heading and 1/2~line space after the first level heading.

% \subsection{Headings: second level}

% Second level headings are in small caps,
% flush left and in point size 10. One line space before the second level
% heading and 1/2~line space after the second level heading.

% \subsubsection{Headings: third level}

% Third level headings are in small caps,
% flush left and in point size 10. One line space before the third level
% heading and 1/2~line space after the third level heading.

% \section{Citations, figures, tables, references}
% \label{others}

% These instructions apply to everyone, regardless of the formatter being used.

% \subsection{Citations within the text}

% Citations within the text should be based on the \texttt{natbib} package
% and include the authors' last names and year (with the ``et~al.'' construct
% for more than two authors). When the authors or the publication are
% included in the sentence, the citation should not be in parenthesis using \verb|\citet{}| (as
% in ``See \citet{Hinton06} for more information.''). Otherwise, the citation
% should be in parenthesis using \verb|\citep{}| (as in ``Deep learning shows promise to make progress
% towards AI~\citep{Bengio+chapter2007}.'').

% The corresponding references are to be listed in alphabetical order of
% authors, in the \textsc{References} section. As to the format of the
% references themselves, any style is acceptable as long as it is used
% consistently.

% \subsection{Footnotes}

% Indicate footnotes with a number\footnote{Sample of the first footnote} in the
% text. Place the footnotes at the bottom of the page on which they appear.
% Precede the footnote with a horizontal rule of 2~inches
% (12~picas).\footnote{Sample of the second footnote}

% \subsection{Figures}

% All artwork must be neat, clean, and legible. Lines should be dark
% enough for purposes of reproduction; art work should not be
% hand-drawn. The figure number and caption always appear after the
% figure. Place one line space before the figure caption, and one line
% space after the figure. The figure caption is lower case (except for
% first word and proper nouns); figures are numbered consecutively.

% Make sure the figure caption does not get separated from the figure.
% Leave sufficient space to avoid splitting the figure and figure caption.

% You may use color figures.
% However, it is best for the
% figure captions and the paper body to make sense if the paper is printed
% either in black/white or in color.
% \begin{figure}[h]
% \begin{center}
% %\framebox[4.0in]{$\;$}
% \fbox{\rule[-.5cm]{0cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
% \end{center}
% \caption{Sample figure caption.}
% \end{figure}

% \subsection{Tables}

% All tables must be centered, neat, clean and legible. Do not use hand-drawn
% tables. The table number and title always appear before the table. See
% Table~\ref{sample-table}.

% Place one line space before the table title, one line space after the table
% title, and one line space after the table. The table title must be lower case
% (except for first word and proper nouns); tables are numbered consecutively.

% \begin{table}[t]
% \caption{Sample table title}
% \label{sample-table}
% \begin{center}
% \begin{tabular}{ll}
% \multicolumn{1}{c}{\bf PART}  &\multicolumn{1}{c}{\bf DESCRIPTION}
% \\ \hline \\
% Dendrite         &Input terminal \\
% Axon             &Output terminal \\
% Soma             &Cell body (contains cell nucleus) \\
% \end{tabular}
% \end{center}
% \end{table}

% \section{Default Notation}

% In an attempt to encourage standardized notation, we have included the
% notation file from the textbook, \textit{Deep Learning}
% \cite{goodfellow2016deep} available at
% \url{https://github.com/goodfeli/dlbook_notation/}.  Use of this style
% is not required and can be disabled by commenting out
% \texttt{math\_commands.tex}.


% \centerline{\bf Numbers and Arrays}
% \bgroup
% \def\arraystretch{1.5}
% \begin{tabular}{p{1in}p{3.25in}}
% $\displaystyle a$ & A scalar (integer or real)\\
% $\displaystyle \va$ & A vector\\
% $\displaystyle \mA$ & A matrix\\
% $\displaystyle \tA$ & A tensor\\
% $\displaystyle \mI_n$ & Identity matrix with $n$ rows and $n$ columns\\
% $\displaystyle \mI$ & Identity matrix with dimensionality implied by context\\
% $\displaystyle \ve^{(i)}$ & Standard basis vector $[0,\dots,0,1,0,\dots,0]$ with a 1 at position $i$\\
% $\displaystyle \text{diag}(\va)$ & A square, diagonal matrix with diagonal entries given by $\va$\\
% $\displaystyle \ra$ & A scalar random variable\\
% $\displaystyle \rva$ & A vector-valued random variable\\
% $\displaystyle \rmA$ & A matrix-valued random variable\\
% \end{tabular}
% \egroup
% \vspace{0.25cm}

% \centerline{\bf Sets and Graphs}
% \bgroup
% \def\arraystretch{1.5}

% \begin{tabular}{p{1.25in}p{3.25in}}
% $\displaystyle \sA$ & A set\\
% $\displaystyle \R$ & The set of real numbers \\
% $\displaystyle \{0, 1\}$ & The set containing 0 and 1 \\
% $\displaystyle \{0, 1, \dots, n \}$ & The set of all integers between $0$ and $n$\\
% $\displaystyle [a, b]$ & The real interval including $a$ and $b$\\
% $\displaystyle (a, b]$ & The real interval excluding $a$ but including $b$\\
% $\displaystyle \sA \backslash \sB$ & Set subtraction, i.e., the set containing the elements of $\sA$ that are not in $\sB$\\
% $\displaystyle \gG$ & A graph\\
% $\displaystyle \parents_\gG(\ervx_i)$ & The parents of $\ervx_i$ in $\gG$
% \end{tabular}
% \vspace{0.25cm}


% \centerline{\bf Indexing}
% \bgroup
% \def\arraystretch{1.5}

% \begin{tabular}{p{1.25in}p{3.25in}}
% $\displaystyle \eva_i$ & Element $i$ of vector $\va$, with indexing starting at 1 \\
% $\displaystyle \eva_{-i}$ & All elements of vector $\va$ except for element $i$ \\
% $\displaystyle \emA_{i,j}$ & Element $i, j$ of matrix $\mA$ \\
% $\displaystyle \mA_{i, :}$ & Row $i$ of matrix $\mA$ \\
% $\displaystyle \mA_{:, i}$ & Column $i$ of matrix $\mA$ \\
% $\displaystyle \etA_{i, j, k}$ & Element $(i, j, k)$ of a 3-D tensor $\tA$\\
% $\displaystyle \tA_{:, :, i}$ & 2-D slice of a 3-D tensor\\
% $\displaystyle \erva_i$ & Element $i$ of the random vector $\rva$ \\
% \end{tabular}
% \egroup
% \vspace{0.25cm}


% \centerline{\bf Calculus}
% \bgroup
% \def\arraystretch{1.5}
% \begin{tabular}{p{1.25in}p{3.25in}}
% % NOTE: the [2ex] on the next line adds extra height to that row of the table.
% % Without that command, the fraction on the first line is too tall and collides
% % with the fraction on the second line.
% $\displaystyle\frac{d y} {d x}$ & Derivative of $y$ with respect to $x$\\ [2ex]
% $\displaystyle \frac{\partial y} {\partial x} $ & Partial derivative of $y$ with respect to $x$ \\
% $\displaystyle \nabla_\vx y $ & Gradient of $y$ with respect to $\vx$ \\
% $\displaystyle \nabla_\mX y $ & Matrix derivatives of $y$ with respect to $\mX$ \\
% $\displaystyle \nabla_\tX y $ & Tensor containing derivatives of $y$ with respect to $\tX$ \\
% $\displaystyle \frac{\partial f}{\partial \vx} $ & Jacobian matrix $\mJ \in \R^{m\times n}$ of $f: \R^n \rightarrow \R^m$\\
% $\displaystyle \nabla_\vx^2 f(\vx)\text{ or }\mH( f)(\vx)$ & The Hessian matrix of $f$ at input point $\vx$\\
% $\displaystyle \int f(\vx) d\vx $ & Definite integral over the entire domain of $\vx$ \\
% $\displaystyle \int_\sS f(\vx) d\vx$ & Definite integral with respect to $\vx$ over the set $\sS$ \\
% \end{tabular}
% \egroup
% \vspace{0.25cm}

% \centerline{\bf Probability and Information Theory}
% \bgroup
% \def\arraystretch{1.5}
% \begin{tabular}{p{1.25in}p{3.25in}}
% $\displaystyle P(\ra)$ & A probability distribution over a discrete variable\\
% $\displaystyle p(\ra)$ & A probability distribution over a continuous variable, or over
% a variable whose type has not been specified\\
% $\displaystyle \ra \sim P$ & Random variable $\ra$ has distribution $P$\\% so thing on left of \sim should always be a random variable, with name beginning with \r
% $\displaystyle  \E_{\rx\sim P} [ f(x) ]\text{ or } \E f(x)$ & Expectation of $f(x)$ with respect to $P(\rx)$ \\
% $\displaystyle \Var(f(x)) $ &  Variance of $f(x)$ under $P(\rx)$ \\
% $\displaystyle \Cov(f(x),g(x)) $ & Covariance of $f(x)$ and $g(x)$ under $P(\rx)$\\
% $\displaystyle H(\rx) $ & Shannon entropy of the random variable $\rx$\\
% $\displaystyle \KL ( P \Vert Q ) $ & Kullback-Leibler divergence of P and Q \\
% $\displaystyle \mathcal{N} ( \vx ; \vmu , \mSigma)$ & Gaussian distribution %
% over $\vx$ with mean $\vmu$ and covariance $\mSigma$ \\
% \end{tabular}
% \egroup
% \vspace{0.25cm}

% \centerline{\bf Functions}
% \bgroup
% \def\arraystretch{1.5}
% \begin{tabular}{p{1.25in}p{3.25in}}
% $\displaystyle f: \sA \rightarrow \sB$ & The function $f$ with domain $\sA$ and range $\sB$\\
% $\displaystyle f \circ g $ & Composition of the functions $f$ and $g$ \\
%   $\displaystyle f(\vx ; \vtheta) $ & A function of $\vx$ parametrized by $\vtheta$.
%   (Sometimes we write $f(\vx)$ and omit the argument $\vtheta$ to lighten notation) \\
% $\displaystyle \log x$ & Natural logarithm of $x$ \\
% $\displaystyle \sigma(x)$ & Logistic sigmoid, $\displaystyle \frac{1} {1 + \exp(-x)}$ \\
% $\displaystyle \zeta(x)$ & Softplus, $\log(1 + \exp(x))$ \\
% $\displaystyle || \vx ||_p $ & $\normlp$ norm of $\vx$ \\
% $\displaystyle || \vx || $ & $\normltwo$ norm of $\vx$ \\
% $\displaystyle x^+$ & Positive part of $x$, i.e., $\max(0,x)$\\
% $\displaystyle \1_\mathrm{condition}$ & is 1 if the condition is true, 0 otherwise\\
% \end{tabular}
% \egroup
% \vspace{0.25cm}



% \section{Final instructions}
% Do not change any aspects of the formatting parameters in the style files.
% In particular, do not modify the width or length of the rectangle the text
% should fit into, and do not change font sizes (except perhaps in the
% \textsc{References} section; see below). Please note that pages should be
% numbered.

% \section{Preparing PostScript or PDF files}

% Please prepare PostScript or PDF files with paper size ``US Letter'', and
% not, for example, ``A4''. The -t
% letter option on dvips will produce US Letter files.

% Consider directly generating PDF files using \verb+pdflatex+
% (especially if you are a MiKTeX user).
% PDF figures must be substituted for EPS figures, however.

% Otherwise, please generate your PostScript and PDF files with the following commands:
% \begin{verbatim}
% dvips mypaper.dvi -t letter -Ppdf -G0 -o mypaper.ps
% ps2pdf mypaper.ps mypaper.pdf
% \end{verbatim}

% \subsection{Margins in LaTeX}

% Most of the margin problems come from figures positioned by hand using
% \verb+\special+ or other commands. We suggest using the command
% \verb+\includegraphics+
% from the graphicx package. Always specify the figure width as a multiple of
% the line width as in the example below using .eps graphics
% \begin{verbatim}
%    \usepackage[dvips]{graphicx} ...
%    \includegraphics[width=0.8\linewidth]{myfile.eps}
% \end{verbatim}
% or % Apr 2009 addition
% \begin{verbatim}
%    \usepackage[pdftex]{graphicx} ...
%    \includegraphics[width=0.8\linewidth]{myfile.pdf}
% \end{verbatim}
% for .pdf graphics.
% See section~4.4 in the graphics bundle documentation (\url{http://www.ctan.org/tex-archive/macros/latex/required/graphics/grfguide.ps})

% A number of width problems arise when LaTeX cannot properly hyphenate a
% line. Please give LaTeX hyphenation hints using the \verb+\-+ command.

% \subsubsection*{Author Contributions}
% If you'd like to, you may include  a section for author contributions as is done
% in many journals. This is optional and at the discretion of the authors.

%\subsubsection*{Acknowledgments}
%Use unnumbered third level headings for the acknowledgments. All
%acknowledgments, including those to funding agencies, go at the end of the paper.


\bibliography{iclr2024_conference}
\bibliographystyle{iclr2024_conference}

\clearpage
\appendix
\section{Video Swin Transformer for Control Tasks}
\label{appdx:vswin}

\subsection{Tackling the Absence of Causality in Video-Swin Transformer}
\label{appdx:csw-msa}

\begin{figure}[h]
    \includegraphics[width=0.8 \textwidth]{Causal-SW-MSA.png}
    \centering
    \caption{Illustration of Causal Shifted-Window Multi-Head Self-Attention (CSW-MSA).}
    \label{fig:csw_msa}
\end{figure}

In this work, we introduce a tailored attention mechanism called CSW-MSA. The incorporation of CSW-MSA seamlessly empowers the Video-Swin Transformer to excel in autoregressive control tasks, all without the need for extensive architectural adjustments. See Figure \ref{fig:csw_msa} for an illustration.

\subsection{Inductive Bias Boost: Pre-training and Shared Knowledge with Convolutional Encoder}
\label{appdx:strl-objective}

\begin{figure}[h]
    \includegraphics[width=1.0 \textwidth]{plots/castrl-single-game.png}
    \centering
    \caption{The finetuning performance of CaStRL in comparison to DT and Starformer.}
    \label{fig:castrl-single-game}
\end{figure}


Even though the Video-Swin Transformer \citep{Liu2021VideoST} relies on spatiotemporal locality as an inductive bias, in control tasks, we've observed that its performance is significantly affected by initialization. This is evident when observing the performance improvement of the CaStRL model after pretraining across various Atari environments, and then finetuning the pretrained model in \textbf{single-game settings}, as illustrated in Figure \ref{fig:castrl-single-game}.
\clearpage

\begin{algorithm}
    \caption{State Representation Learning (StRL)}
    \label{alg:strl-loss}
    \input{code/strl-loss}
\end{algorithm}


CaStRL employs VICReg \citep{Bardes2021VICRegVR} as the state representation learning objective, highlighting the necessity for two modules: a transformer module, exemplified by the Video-Swin Transformer, which possesses the capability to scale and learn spatiotemporal features, and another module, such as ResNet34, designed to compensate for the initial lack of inductive bias during the early stages of training. The pseudocode in Algorithm~\ref{alg:strl-loss} illustrates the steps involved in the StRL objective.

\clearpage
\section{Context-GPT Objectives}

\begin{algorithm}
    \caption{Context GPT Loss (For Context-Aware StRL: $\textbf{Context Type} \gets \textbf{Masked State}$)}
    \label{alg:cgpt-loss-masked-state}
    \input{code/cgpt-loss-masked-state}
\end{algorithm}


\begin{algorithm}
    \caption{Context GPT Loss (For Context-Aware StRL: $\textbf{Context Type} \gets \textbf{Masked Action}$)}
    \label{alg:cgpt-loss-masked-action}
    \input{code/cgpt-loss-masked-action}
\end{algorithm}

\section{Additional Results}

% \section{Additional Results}

\subsection{CaStRL Normalized Scores}
\label{appdx:castrl-scores}

\begin{figure}[h]
    \centering
    %\begin{subfigure}[b]{0.75\textwidth}
        \includegraphics[width=1.0 \textwidth]{plots/castrl-dqn-normalized-scores.png}
        \centering
    \caption{DQN-Normalized Scores.}
    \label{fig:castrl-dqn-normalized-scores}
\end{figure}

\begin{figure}[h]
    \centering
    %\begin{subfigure}[b]{0.75\textwidth}
        \includegraphics[width=1.0 \textwidth]{plots/castrl-human-normalized-scores.png}
        \centering
    \caption{Human-Normalized Scores | Carnival and Pooyan: \textbf{NA} (No Reference Human Scores).}
    \label{fig:castrl-human-normalized-scores}
\end{figure}

In Figures \ref{fig:castrl-human-normalized-scores} and \ref{fig:castrl-dqn-normalized-scores}, we present human-normalized scores (HNS) \citep{Toromanoff2019IsDR} and DQN-normalized scores \citep{agarwal2021deep}, respectively. Normalized scores are calculated as follows:

\begin{equation}
   \frac{\mathrm{score} - \mathrm{score}_{\mathrm{random}}}{\mathrm{score}_{\mathrm{human/dqn}} - \mathrm{score}_{\mathrm{random}}}
\end{equation}

% attention rollouts
\subsection{Visualize Attentions}
We visualize the learned representations with CaStRL using the Attention-Rollout technique \citep{abnar2020quantifying} to generate attention maps (as seen in Figure \ref{fig:castrl-attn_rollout}). 
\begin{figure}[h]
    \centering
    %\begin{subfigure}[b]{0.75\textwidth}
        \includegraphics[width=0.3 \textwidth]{plots/attn-rollouts/breakout-attn-rollout.png}
        \centering
        \includegraphics[width=0.3 \textwidth]{plots/attn-rollouts/boxing-attn-rollout.png}
        \centering
        \includegraphics[width=0.3 \textwidth]{plots/attn-rollouts/pong-attn-rollout.png}
        \centering
    \caption{Visualization of attention maps in CaStRL, extracted for the Breakout game. We highlight the movement of the ball and paddle with heat-map-style coloring.}
    \label{fig:castrl-attn_rollout}
\end{figure}

\section{Implementation Details}

For both pretraining and fine-tuning, we employed the AdamW optimizer \citep{loshchilov2017decoupled} with $\beta_{1}{=}0.9$, $\beta_{2}{=}0.999$, and $\lambda$ (weight decay) set to $0.01$. We implemented a learning rate decay using the cosine annealing method described by \citet{loshchilov2016sgdr}, where $T_{max}{=} 2000$ represents the maximum number of iterations, and the learning rate range is specified as $[\eta_{min}, \eta_{max}] \gets [7{\times}10^{-5}, 6{\times}10^{-4}]$.
\end{document}
