% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
% \usepackage[compact]{titlesec}  
% \titlespacing{\section}{-0.05mm}{-0.0mm}{-0.0mm}
% \titlespacing{\subsection}{-0.15mm}{-0.15mm}{-0.15mm}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsfonts}
\usepackage{dsfont}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{outlines}
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{multirow}
\usepackage{hyperref}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{A Trajectory is Worth Three Sentences:\\Multimodal Transformer for Offline Reinforcement Learning}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<yiqiw2@andrew.cmu.edu>?Subject=Decision Transducer Paper}{Yiqi~Wang}{}}
\author[2]{Mengdi~Xu}
\author[1]{Laixi~Shi}
\author[1]{Yuejie~Chi}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Department of Mechanical Engineering\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  
  \begin{document}

\renewcommand{\labelenumii}{\arabic{enumi}.\arabic{enumii}}
\renewcommand{\labelenumiii}{\arabic{enumi}.\arabic{enumii}.\arabic{enumiii}}
\renewcommand{\labelenumiv}{\arabic{enumi}.\arabic{enumii}.\arabic{enumiii}.\arabic{enumiv}}

\maketitle
\begin{abstract}
Transformers hold tremendous promise in solving offline reinforcement learning (RL) by formulating it as a sequence modeling problem inspired by language modeling (LM). Prior works using transformers model a sample (trajectory) of RL as one sequence analogous to a sequence of words (one sentence) in LM, despite the fact that each trajectory includes tokens from three diverse modalities: state, action, and reward, while a sentence contains words only. Rather than taking a modality-agnostic approach which uniformly models the tokens from different modalities as one sequence, we propose a multimodal sequence modeling approach in which a trajectory (one ``sentence'') of three modalities (state, action, reward) is disentangled into three unimodal ones (three ``sentences''). We investigate the correlation of different modalities during sequential decision-making and use the insights to design a multimodal transformer, named Decision Transducer (DTd). DTd outperforms prior art in offline RL on the conducted D4RL benchmarks and enjoys better sample efficiency and algorithm flexibility. Our code is made publicly available \href{https://github.com/berniewang8177/Official-codebase-for-Decision-Transducer/}{here}.

\end{abstract}

\section{Introduction}\label{sec:intro}
 

Reinforcement learning (RL) has been formulated as a sequential decision-making problem with wide applicability in areas such as strategy games \citep{ye2020mastering, vinyals2019alphastar}, robotics \citep{zeng2021transporter, agarwal2022legged}, and self-driving \citep{bojarski2016end, chen2020learning}. Often, collecting data from online interactions can be costly or risky. To address the data acquisition challenge, offline RL seeks to learn an optimal policy by leveraging a pre-collected dataset, without online interactions \citep{fujimoto2019off, kumar2019stabilizing, wu2019behavior}. Conventionally, offline RL is approached similarly to online RL via temporal difference (TD) learning \citep{sutton2018reinforcement}. While a wide literature \citep{fujimoto2019off, kumar2019stabilizing, wu2019behavior} has investigated how to leverage TD learning to solve offline RL, many of these methodologies differ in their model architecture and objectives.

Recently, transformers \citep{vaswani2017attention} have achieved remarkable success in language modeling (LM) \citep{radford2018improving, kaplan2020scaling} by using a temporal transformer \citep{radford2019language}  to learn the distributions of concepts given sequential inputs (i.e. sentences). Since samples in offline RL (i.e., trajectories) are also sequential, it is natural to formulate offline RL as a sequence modeling problem \citep{chen2021decision, janner2021offline} and leverage a temporal transformer to model the distributions of the behaviors within the pre-collected dataset, built on the architectural advances in LM. In contrast to TD methods \citep{fujimoto2019off, kostrikov2021offline}  that tackle offline RL with multiple components and objectives, a single temporal transformer with a behavior cloning objective achieves surprisingly competitive and promising results \citep{konan2022contrastive, yamagata2022q, sudhakaran2022skill, wang2022bootstrapped, xu2022prompting}.  

However, prior works with transformers \citep{janner2021offline, chen2021decision} have not examined one major difference between offline RL and LM: their sequential inputs are inherently different in terms of modality. Specifically, the transformers in LM regard a sentence as a unimodal sequence where every token belongs to one consistent modality. In contrast, offline RL involves a multimodal sequence that includes three distinct modalities: state, action, and reward (return). Inspired by recent developments in robotics where a sequential input involving observations, actions, and goals is regarded as a multimodal sequence (see more detailed discussions in the related work in Section~\ref{sec:rel work}), we hypothesize that: 

\begin{center}
\textit{When treating offline RL as a sequence modeling task, it is beneficial to take a multimodal approach}.
\end{center}

\paragraph{Offline RL via Multimodal Sequence Modeling?}
Before starting to design a multimodal transformer for offline RL, we first investigate the importance of the multimodal interactions that took place within the prior transformers (e.g. the Decision Transformer (DT) \citep{chen2021decision}). 

Our findings suggest that some interactions between modalities are more weighted than others (shown in Figure~\ref{fig:attn_stats}), according to the attention map of DT. Therefore, we capitalize on the important interactions of these modalities to design our multimodal transformer for better performance (a detailed discussion will be provided in Section~\ref{enu:findings}). The main contributions are as follows:
\begin{enumerate}
   \item To obtain heuristics for multimodal architecture design, we first quantify the cross-modal and intra-modal interactions in the sequential decision-making process (Section~\ref{modality analysis}), by aggregating and analyzing the last-layer attention scores of DT.
   
   \item Using the heuristics from multimodal quantification in Section~\ref{modality analysis}, we propose a multimodal transformer called Decision Transducer (DTd)\footnote{DTd doesn't have a ``Transducer'' objective, which is a sequence alignment objective proposed by \citet{graves2012sequence}. The ``Transducer'' here is to give credit to Transformer Transducer \citep{zhang2020transformer} and its neural biasing variant \citep{chang2021context}, which motivates DTd's architecture design.},  which outperforms prior art and enjoys better sample efficiency on the D4RL \citep{fu2020d4rl} offline RL benchmarks.

   \item Due to the multimodal design, DTd is more efficient and flexible in leveraging diverse types of task goals, such as long-term return, targeted physical locations, and the value function over states, with comparisons to prior arts (e.g., DT).
\end{enumerate}

\begin{figure}[htb]
  \centering
  \includegraphics[width=0.8\linewidth]{figures/attn_stats_crop.pdf}
  \caption{\textbf{Importance of Modality Interactions}. On the hopper domain with medium-expert dataset \citep{fu2020d4rl}, attention scores with respect to different types of modality interactions from the last layer of a Decision Transformer (DT) \citep{chen2021decision} are aggregated across one episode. After normalizing scores into percentages, it is observed that DT generally pays more attention to cross-modal (orange) interactions (60\%) compared to intra-modal (blue) interactions (40\%). Therefore, we design a multimodal transformer to exploit the modality importance discovered by DT (see Section~\ref{enu:findings} for details) to enable more effective multimodal sequence modeling.}
  \label{fig:attn_stats}
\end{figure}

\section{Related works}\label{sec:rel work}
 

\paragraph{Transformers for Offline RL.} The popular model-free DT achieves promising results on the D4RL benchmark by offering return-conditioned sequence modeling for offline RL \citep{fu2020d4rl}. Based on DT's formulation, ConDT \citep{konan2022contrastive} further improves DT's performance by introducing contrastive objectives to learn more discriminative representations. Instead of return-conditioning, other works of DT replace return by state distributions \citep{sudhakaran2022skill} or value function \citep{yamagata2022q} to tackle the sparse reward scenarios. Alternatively, popular model-based Trajectory Transformer (TT) \citep{janner2021offline} fully exploits the power of autoregressive sequence modeling by learning both the policy and the dynamic model simultaneously. Not only does such an ability facilitate look-ahead planning, but it can also be used to bootstrap TT simulated data to improve coverage \citep{wang2022bootstrapped}. 

\paragraph{Multimodal Transformers for Robotics.}
Transformers for robotics usually adopt multimodal designs because the input involves complex visual observations and sophisticated robot actions. Besides, the agent may be given abstract goals (e.g., language instruction) \citep{shridhar2022perceiver, brohan2022rt,  pashevich2021episodic, guhur2022instruction, lynch2022interactive} instead of a scalar return. The multimodal architecture enables agents to process observations, actions, and language instructions from different modalities. 


\begin{figure*}[!htb]
  \centering
  \includegraphics[width=.9\linewidth]{figures/overview_supersimplified.pdf}
  \caption{\textbf{Decision Transducer (DTd) Overview}.
  We leverage the modality importance discovered by DT's attention mechanism to develop DTd's architecture. We arrange more important modality interactions recognized by DT in higher layers of DTd and less important ones in lower layers. We list the modality involved in each layer and the operation to incorporate different modalities into the multimodal decision-making at the right side of the plot, grouped by (a), (b), (c), and (d). More connections between modality importance and DTd's choice of input modality are discussed in Section~\ref{enu:findings}.}\label{fig:overview}
\end{figure*}

\section{Preliminaries}\label{sec:prelim}
 

\paragraph{Offline Reinforcement Learning (RL).}
RL is often formulated as sequential decision-making via the framework of Markov Decision Processes (MDPs), which could be described by a tuple
$ ( \mathcal{ S, A, T, R, \gamma} )$. At each timestep $t$, an agent will experience state $s_{t} \in \mathcal{S}$ and execute an
action $a_t \in \mathcal{A}$, where $\mathcal{S}$ is the state space and $\mathcal{A}$ denotes the action space. $\mathcal{T}$ is a transition function such that an agent will be exposed to a new state at timestep $t+1$ with a probability of $ 0 \leq \mathcal{T}(s_{t+1} | s_{t}, a_{t}) \leq 1$. A reward will be given by the reward function $r_{t} = \mathcal{R}(s_t, a_t)$ for each timestep. 
Given a discount factor $\gamma \in [0,1)$ and a horizon length of $T$, the discounted return $R_t$ at timestep $t$ is defined by
$R_{t} = \sum_{t'=t}^{T} \gamma^{t'-t}r_{t'}$. The goal is to find a policy $\pi(a_t | s_t)$ that maximizes the objective $J = \mathbb{E}_{a_t \sim \pi(\cdot | s_t), s_{t+1} \sim \mathcal{T}(\cdot | s_t, a_t)} [\sum_{t=1}^{T} \gamma^{t-1}r_{t} ] $.

While online RL allows one to collect data $(s_t,a_t,r_t,s_{t+1})$ through online interactions, offline RL does not allow a policy to interact with the environment and generally learns from a fixed dataset $\mathcal{D}$ pre-collected by an inaccessible behavior policy $\pi_{b}$. In this work, we train a policy $\pi$ on a dataset $\mathcal{D}={ \{\tau_{i}\} }_{i=1}^{N}$ with a number of $N$ pre-collected trajectories. Here, each trajectory is the result of the interaction with the environment via the behavior policy $\pi_{b}$, in the form of $\tau_{i} = { \{ (s_t, a_t, r_t, s_{t+1}) \} }_{t=1}^{T}$.

\section{Decision Transducer}\label{sec:DTd}
 
In this section, we first compare our multimodal formulation to that of DTd's predecessor, DT (Section~\ref{multimodal formulation}), then quantify the importance of different modalities derived from DT's attention map (Section~\ref{modality analysis}), and finally explain the connections between the results of our multimodal quantification and the design of DTd's architecture (Section~\ref{architecture design}).

\subsection{Multimodal Sequence Modeling}\label{multimodal formulation}

According to the sequence modeling formulation proposed by DT \citep{chen2021decision}, each trajectory $\tau$ of length $T$ within an offline dataset 
$\mathcal{D}$ is transformed into a sequence $\tau = ( R_1, s_1, a_1, \dots , R_T, s_T, a_T)$ where $R_t = \sum_{t'=t}^{T} r_{t'}$ is an un-discounted return (known as return-to-go in \citet{chen2021decision}). Given the similarity between sequence modeling and language modeling, a model-free decision-making transformer such as DT only needs an autoregressive objective $\log P_{\theta}(a_t | \tau_{<t} )$ derived from LM, where $\tau_{<t} = ( R_1, s_1, a_1, \dots , R_t, s_t)$. 

However, a trajectory is inherently multimodal, including states, actions, and returns. Our formulation aligns better with the multimodality of the input space by considering a trajectory as three sequences, including state sequence $s_{1:T}$, action sequence $a_{1:T}$, and return sequence $R_{1:T}$. To facilitate multimodal decision-making, each unimodal sequence will be processed separately and fused selectively, leading to the objective $\log P_{\theta}(a_t | R_{\leq t}, s_{\leq t}, a_{<t} )$.  
 

\subsection{Multimodal quantification}\label{modality analysis}

To guide our multimodal design, a multimodal quantification on DT was conducted on medium-expert data of hopper environment \citep{fu2020d4rl}, where the DT has its last-layer attention maps logged and analyzed.

Within the attention map, there are 9 types of interactions between modalities since DT takes a tri-modal input including returns, states, and actions. After aggregating symmetric cross-modal attention scores (e.g. return-state, state-return) together, the scope of analysis is narrowed down from 9 to 6 types of interactions. It is noticed that DT pays more attention to cross-modal interactions (orange colorbar in Figure~\ref{fig:attn_stats}) than intra-modal interactions (blue colorbar in Figure~\ref{fig:attn_stats}). Following are our findings:
\begin{enumerate}\label{enu:findings}
\item State-state interaction is more important (26\%) than other intra-modal interactions during decision-making.
\item Cross-modal interactions are more important (60\%) than intra-modal interactions (40\%).
\item Different cross-modal interactions are weighted differently in DT's decision-making.
    \begin{enumerate}
        \item State-action interaction is the most salient among cross-modal interactions (23\%).
        \item Return-state (20\%) and return-action (17\%) interactions also play important roles in decision-making.
    \end{enumerate}
\end{enumerate}

Our high-level design heuristic is that less important interactions between modalities should be processed before important ones during representation learning, ensuring that the representation learning with the most important interactions involves minimum distraction.

With our findings and heuristics above, we start by placing all intra-modal interactions before cross-modal interactions (cf. (b) before (c,d) in Figure~\ref{fig:overview}) hierarchically. As the state-state interaction is identified as the most important intra-modal interaction within the multimodal input, we disentangle it from other modalities and let an ad-hoc state encoder handle it (cf. the state encoder shown in Figure~\ref{fig:overview}, row (b) ). The expectation is to refine state representation by intra-modal attention without distraction from other modalities. After intra-modal interactions are applied, we arrange less important cross-modal interaction with respect to our findings below more important cross-modal interaction, as illustrated in (c,d) in Figure~\ref{fig:overview}. As a result, all interactions between modalities within the model are ranked with respect to their importance in multimodal decision-making.

\subsection{Architecture Design of DTd}\label{architecture design}

Putting all the design heuristics together, DTd predicts actions following the procedure below: 

\begin{enumerate}
   \item \textbf{Modality encoders} refine unimodal representations via intra-modal interactions after applying modality disentanglement to the input trajectory (finding 1).
   \item \textbf{Biasing layer} learns bias-to-goal for each non-goal modality by applying cross-modal attention to the goal and non-goal modalities (findings 2 and 3.2).
   \item \textbf{Combiner} reminds the transformer about its goal by combining the bias-to-goal into the decision-making via additive fusion (findings 2 and 3.2).
   \item \textbf{Joint encoder} fuses the most fundamental modalities for the final decision-making (finding 3.1). 
\end{enumerate}
An overview of DTd is shown in Figure~\ref{fig:overview}. 
% \textbf{Data Pre-Processing} and each \textbf{Component} of the model will be explained in the following subsections.
Data pre-processing and each component of the model will be explained in the following subsections.

\paragraph{Data Pre-Processing.} We follow the pre-processing and formulation proposed by \citet{chen2021decision}. However, we denote a tri-modal trajectory in a more general way as $\tau=( G_1, s_1, a_1, \dots, G_T, s_T, a_T)$, in which we replace $R_t$ by $G_t$, denoting the desired outcome or goal of the task. In the reward-dense setting, $G_t = R_t$, but $G_t$ could also become a goal position or learned state-value function in a sparse-reward setting. Specifically, we first disentangle the tri-modal trajectory of length $T$ into three unimodal sequences as
\begin{equation}\label{eq:pre-proc}
 G_{1:T}, s_{1:T}, a_{1:T} = \tau.
\end{equation}
Then, we let modality-specific embedding layers transform tokens from different modalities into the same dimension and add 2 types of embeddings:  
 \begin{align}
h^{G} & = f_{emb}^{G}(G_{1:T}) + E^{\tau} + E^{G} ,\nonumber \\
h^{s} & = f_{emb}^{s}(s_{1:T}) + E^{\tau} + E^{s} ,\nonumber \\
h^{a} & = f_{emb}^{a}(a_{1:T}) + E^{\tau} + E^{a}. \label{eq:pre-proc2}
\end{align}
Specifically, $E^{\tau}$ is the time embedding with respect to the time horizon of the trajectory, and modality embeddings --- including $E^{G}$, $E^{s}$, and $E^{a}$ --- are used to encourage DTd to be aware of the multimodal input.

\textbf{Modality Encoder}.
After tokens from different modalities are embedded into the same dimension, we apply modality-specific encoders $f_{enc}$ and obtain
 \begin{align}\label{eq:mod-enc}
H^{G}, H^{s}, H^{a} & = f_{enc}^{G}(h^{G}), f_{enc}^{s}(h^{s}), f_{enc}^{a}(h^{a})
\end{align}
to refine representation. 
Each $f_{enc}$ is a 3-layer transformer encoder proposed by \citet{vaswani2017attention} but with layer-norm applied before the self-attention. Due to the sequence modeling formulation, DTd is autoregressive such that $f_{enc}$ takes a causal attention mask (i.e., temporal transformer) to avoid leaking future information to the model similar to \citet{chen2021decision}.  

\textbf{Biasing}.
After refining representation and encouraging intra-modal interactions with $f_{enc}$, DTd learns bias-to-goal to bias high-level decision-making towards the task's goal. Specifically, a cross-attention layer \citep{vaswani2017attention} is applied to 2 different modalities, where keys and values come from the goal modality (denoted as $H^{G}$ in Figure~\ref{fig:overview}) by applying trainable matrices $f_{k}, f_{v}$, and similarly, queries come from the non-goal modalities that need to be biased, such as states and actions ($H^{s}, H^{a}$ in Figure~\ref{fig:overview}), by applying a trainable matrix $f_{q}$, represented by:
\begin{align}
Q^{1},K^{1},V^{1} & = f_{q_1}( H^{s} ), f_{k_1}( H^{G} ),f_{v_1}(H^{G}), \nonumber\\
Q^{2},K^{2},V^{2} & = f_{q_2}( H^{a} ), f_{k_2}( H^{G} ),f_{v_2}(H^{G}) . \label{eq:bias1}
\end{align}
Finally, the bias-to-goal considering goal and non-goal modalities (state or action) will be learned by applying the attention layer on the learned queries, keys, and values as below:
\begin{align}
H_{s}^{ G } & = \text{Attention}(Q^{1},K^{1},V^{1}),\nonumber\\
H_{a}^{ G } & = \text{Attention}(Q^{2},K^{2},V^{2}). \label{eq:bias2}
\end{align}

\textbf{Combiner}.
The next step is to fuse the bias-to-goal into the non-goal modality so that the action prediction is biased toward the goal of the task. DTd leverages additive fusion within Combiner to fulfill this requirement.

The additive fusion for each non-goal modality ($H^{s}, H^{a}$) is implemented by projecting the representation and its bias-to-goal ($H^{G}_{ s}, H^{G}_{ a} $) into a 
new hidden space, as shown below
\begin{align}
H^{s'} & = \text{GELU}( \text{W}_{1}^{s} H^{s} +\text{W}_{2}^{s} H^{G}_{ s } ) \text{W}_{3}^{s}, \nonumber\\
H^{a'} & = \text{GELU}( \text{W}_{1}^{a} H^{a} +\text{W}_{2}^{a} H^{G}_{ a } ) \text{W}_{3}^{a} .\label{eq:combine}
\end{align}
We expand the dimensions of $H^{s}$, $H^{G}_{s}$, $H^{G}_{ a}$, and $H^{a}$ 2 times with linear layers $\text{W}_1, \text{W}_2$, followed by an activation function GELU \citep{hendrycks2016gaussian}. Lastly, we project the representation back to the original dimension of the model hidden state with another linear layer $\text{W}_3$. 

The reason to apply additive fusion instead of a more complicated multiplicative fusion \citep{jayakumar2020multiplicative} is that adding information has become the widely-adopted practice to remind 
transformers about something important. For example, a standard practice to help the transformer to be aware of positional information within a sequential input is by adding positional embedding \citep{vaswani2017attention}. Similarly, to help a transformer to distinguish different modalities within a sequential input, modality-type embedding is usually added to the input \citep{bao2021vlmo, kim2021vilt}. In our case, to remind high-level DTd layers about the goal of the task, we choose to fuse the bias-to-goal into the representation with an additive operation.

\textbf{Joint Encoder}.
We decide to let biased states $H^{s'}$ and biased actions $H^{a'}$ interact before DTd makes a decision (finding 3.1 in Section~\ref{enu:findings}). We first interleave 2 sequences of length $T$ from 2 combiners into 1 sequence as
\begin{align}\label{eq:join1}
% H^{src1} & = \text{Combiner}( H^{s}, \text{Biasing}(H^{s}, H^{G}) ) \\
% H^{src2} & = \text{Combiner}( H^{a}, \text{Biasing}(H^{a}, H^{G}) ) \\
H^{joint} & = { \{ (H_{t}^{s'}, H_{t}^{a'}) \} }_{t=1}^{T} .
\end{align}
Secondly, a 1-layer temporal transformer named Joint net $f_{joint\_enc}$ similar to $f_{enc}$ will be applied on the joint representation $H^{joint}$, leading to
\begin{align}\label{eq:join2}
H^{joint\_enc} & = f_{joint\_enc}(H^{joint}) .
\end{align}
The Joint net encourages the most important cross-modal interaction, namely the interaction between states and actions.

Finally, the action prediction is made on the representation belonging to states $H_{t}^{s''}$. In particular, $H_{t}^{s''}$ is disentangled from $H^{joint\_enc}$ and a prediction head $f_{pred}$ is applied on top to predict action, summarized as follows
\begin{align}
{\{ (H_{t}^{s''}, H_{t}^{a''}) \}  }_{t=1}^{T} & = H^{joint\_enc} , \nonumber \\
a_{1:T} & = f_{pred}( H_{1:T}^{s''} ) . \label{eq:join3}
\end{align}

We provide an architecture comparison table in the supplementary material to highlight the difference between DTd and DT in terms of architecture design. In short, DT's decision backbone is the joint encoder of DTd but with a tri-modal sequential input instead of a bi-modal sequence learned from 3 unimodal sequences. Besides, DTd attaches several components before the joint encoder to reflect the multimodal nature of a trajectory. In Section~\ref{ablation_choice}, we show that these components are crucial for good performance. 
\begin{table*}[h]
\centering
\caption{\textbf{D4RL Locomotion Performance}. We evaluate DTd and other offline RL transformers (DT, TT) by reporting the mean and standard deviation of normalized scores across 12 seeds (4 independent models and 3 evaluations for each). The methods reproduced by ourselves using the protocol above are highlighted with \textbf{*}. DTd achieves competitive results across environments and datasets on average. Note that TT and Diffuser require to forward the model multiple times to plan ahead for competitive performance while DTd requires 1 forward pass only without planning thanks to its multimodal designs.}\label{tab:overall d4rl}
\begin{center}
\resizebox{\textwidth}{!}{
\begin{tabular}{cc|cccccc|cc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Environment}} & \multicolumn{6}{c|}{ \textbf{No Planning}}  & \multicolumn{2}{c}{ \textbf{Planning} }  \\
 &  & \textbf{BC} &\textbf{CQL} &\textbf{IQL} &\textbf{DT*} &\textbf{DT-large*} &\textbf{DTd} (Ours)  &\textbf{TT*} &\textbf{Diffuser} \\
\midrule
%medium-replay score
\multirow{3}{*}{{Medium-Replay}}  &Hopper        &27.6    &95.0   &94.7       &55.2$\pm$18.4          &75.9$\pm$4.6       &91.2$\pm$6.8       &82.6$\pm$6.9           &\bf{96.8} \\
 &Walker2d      &36.9   &77.2    &73.9       &59.2$\pm$12.8          &62.4$\pm$11.6      &\bf{81.8$\pm$3.0}       &71.5$\pm$10.9      &61.2  \\
 &HalfCheetah   &4.3    &\bf{45.5}    &44.2       &33.3$\pm$3.1      &33.6$\pm$4.6       &41.4$\pm$0.8       &44.3$\pm$1.3           &42.2 \\
\midrule
%medium score
\multirow{3}{*}{{Medium}} &Hopper         &63.9   &58.5    &66.3       &\bf{67.8$\pm$5.8}      &62.9$\pm$11.0      &57.4$\pm$4.2       &60.0$\pm$5.1           &58.5 \\
 &Walker2d       &77.3   &72.5    &78.3       &77.8$\pm$4.4           &61.7$\pm$13.0      &78.8$\pm$3.8      &70.4$\pm$20.2          &\bf{79.7} \\
 &HalfCheetah    &43.1   &44.0    &\bf{47.4}       &42.9$\pm$0.4       &42.5$\pm$0.4       &42.7$\pm$0.3       &46.2$\pm$1.4      &44.2 \\
\midrule
%medium-expert score
\multirow{3}{*}{{Medium-Expert}} &Hopper        &79.6   &105.3   &91.5       &110.6$\pm$1.7          &95.2$\pm$16.0      &\bf{112.5$\pm$1.2} &82.2$\pm$16.8      &107.2 \\
 &Walker2d      &36.6   &108.8   &\bf{109.6} &100.6$\pm$10.8         &100.3$\pm$12.0     &109.0$\pm$0.4      &105.2$\pm$3.52      &108.4 \\
 &HalfCheetah   &59.9   &91.6    &86.7       &83.2$\pm$3.0           &77.7$\pm$9.0       &\bf{92.1$\pm$0.7}       &90.2$\pm$7.2  &79.8 \\
\midrule
\multicolumn{2}{c|}{Average} & 47.7 & 77.6 & 77.0 & 70.1 & 68.0 & \bf{78.5} & 72.5 & 75.3 \\
\bottomrule
\end{tabular}
}
\end{center}
\end{table*}

\begin{table}[h]
\centering
\caption{\textbf{Leveraging Diverse Types of Goals.} DTd can more effectively leverage state value (\textbf{-V}) or goal position (\textbf{-goal}) in the D4RL AntMaze domain compared to DT when returns based on dense rewards are not available. The scores reported are the average across 40 evaluations (4 independent models with different training seeds and 10 evaluations for each with different evaluation seeds).}\label{tab:antmaze}
\begin{adjustbox}{width=0.5\textwidth}
\begin{tabular}{c|ccccc}
\toprule
\textbf{Dataset} &\textbf{DT-goal} &\textbf{DTd-goal} (Ours) &\textbf{DT-V} &\textbf{DTd-V} (Ours) \\
\midrule
Umaze-v0    &\bf{67.5 $\pm$ 18.0}          &55.0$\pm$ 21.0     & \bf{75.0 $\pm$ 15.0}   & 67.5 $\pm$ 15.0\\
Umaze-diverse   &\bf{67.5 $\pm$ 16.4}   &57.5 $\pm$ 14.8    & 60.0 $\pm$ 18.7   & \bf{62.5 $\pm$ 21.7}\\
Medium-play    & 0.0 $\pm$ 0.0         &\bf{22.5 $\pm$ 11.0}    & 10.0 $\pm$ 7.1    & \bf{40.0 $\pm$ 8.2} \\
Medium-diverse     & 0.0 $\pm$ 0.0         &\bf{32.5 $\pm$ 13.0}    & 15.0 $\pm$ 15.0   & \bf{57.5 $\pm$ 8.3}\\
\midrule
Average     & 33.8                  &\bf{41.9}               & 40                & \bf{56.9}\\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{table}

\begin{table*}[h]
\centering
\caption{\textbf{Ablation on the architecture}. To justify our architecture design, we compare DTd to its variants by removing the cross-modal interactions introduced by the Biasing-Combiner layer within DTd everywhere (DTd-zero), left (DTd-left), or right (DTd-right). The most important finding is that all cross-modal interactions we selected are necessary for good performance (DTd vs.\ DTd-left/right). Interestingly, while DT-large has access to all cross-modal interactions, DTd-left still outperforms it on average by leveraging a limited but essential set of cross-modal interactions.}\label{tab:table_ablation_choice}
\begin{adjustbox}{width=0.9\textwidth}
\begin{tabular}{cc|cccccc}
\toprule
\textbf{Dataset} & \textbf{Environment} & \textbf{BC} & \textbf{DT-large} &\textbf{DTd-zero} &\textbf{DTd-left} &\textbf{DTd-right} &\textbf{DTd} \\
\midrule
%medium-expert score
\multirow{3}{*}{{Medium-Expert}}    & Hopper           &79.6   &95.2$\pm$16.0  & 89.8$\pm$16.8 & 108.3$\pm$6.0 & 109.4$\pm$4.4 & \bf{112.5$\pm$1.2}\\
                                    & Walker2d         &36.6   &100.3$\pm$12.0 & 107.7$\pm$0.4 & 108.8$\pm$0.2 & 108.1$\pm$0.3 & \bf{109.0$\pm$0.4}\\
                                    & HalfCheetah   &59.9   &77.7$\pm$9.0   & 58.8$\pm$0.6  & 91.2$\pm$0.8  & 91.9$\pm$0.5  & \bf{92.1$\pm$0.7}\\ 
\midrule
%medium-replay score
\multirow{3}{*}{{Medium-Replay}}    & Hopper        &27.6   &75.9$\pm$4.6   & 16.8$\pm$2.8  & 81.6$\pm$5.1 & 33.3$\pm$25.6 & \bf{91.2$\pm$6.8}\\
                                    & Walker2d      &36.9   &62.4$\pm$11.6  & 34.0$\pm$16.4 & 44.8$\pm$30.5 & 20.6$\pm$12.0 & \bf{81.8$\pm$3.0}\\
                                    & HalfCheetah   &4.3    &33.6$\pm$4.6   & 31.3$\pm$9.8  & 36.1$\pm$9.7  & 38.4$\pm$4.9  & \bf{41.4$\pm$0.8}\\
\midrule
\multicolumn{2}{c|}{Average} & 41.3 & 74.2 & 56.4 & 78.4& 67.0 & \bf{88.0}  \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{table*}

\begin{table}[h]
\centering
\caption{\textbf{Ablation on the modality order}. In the current DTd, state, goal (e.g., return), and action (S-G-A) are the input modalities from left to right. In order to meet our design heuristics, it provides states and actions as inputs to the Joint net where goals are placed in the center. Our result in Table~\ref{tab:table_ablation_order} shows that any order that fails to provide the state-action interaction required by our heuristics will lead to bad model performance. Our discussion discards one order from each pair of symmetric orders (e.g., S-G-A and A-G-S) since DTd is symmetric. More insights on the input order and heuristics are provided in Section~\ref{ablation_order}.}\label{tab:table_ablation_order}
\begin{adjustbox}{width=0.48\textwidth}
\begin{tabular}{cc|ccc}
\toprule
\textbf{Dataset} & \textbf{Environment}& \textbf{S-G-A (current)} & \textbf{S-A-G} & \textbf{A-S-G} \\
\midrule
%medium-expert score
\multirow{3}{*}{{Medium-Expert}}    & Hopper           &    \bf{112.5$\pm$1.2} & 5.1$\pm$1.4 & 5.0$\pm$1.3 \\
                                    & Walker2d         &    \bf{109.0$\pm$0.4} & 0.9$\pm$0.9 & 0.8$\pm$0.2 \\
                                    & HalfCheetah      &    \bf{92.1$\pm$0.7} & 2.1$\pm$0.03 & 2.1$\pm$0.1\\ 
\midrule    
%medium-replay score
\multirow{3}{*}{{Medium-Replay}}    & Hopper        &       \bf{91.2$\pm$6.8} & 5.3$\pm$1.4 & 4.1$\pm$0.9\\
                                    & Walker2d      &       \bf{81.8$\pm$3.0} & 1.0$\pm$0.1 & 1.0$\pm$0.1\\
                                    & HalfCheetah   &       \bf{41.4$\pm$0.8} & 2.1$\pm$0.09 & 2.25$\pm$0.1\\
\midrule
\multicolumn{2}{c|}{Average} &\bf{88.0} & 2.7 & 2.5 \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{table}

\section{Evaluations and discussions}

In this section, we evaluate the experimental performance of DTd over the offline RL D4RL benchmark \citep{fu2020d4rl} and analyze from the following perspectives:
\begin{enumerate}
   \item Effectiveness of the proposed DTd compared to DT when leveraging different types of task goals as the input, such as long-term return, value function over states, and targeted physical positions.
   \item The ablation study of the architecture components and input modalities.
   \item Pros and cons brought by the explicitly modeled goal after modality disentanglement.
\end{enumerate}

To facilitate future work and reproducibility, our code is made publicly available on GitHub\footnote{\url{https://github.com/berniewang8177/Official-codebase-for-Decision-Transducer/}}.

\subsection{Experimental Performance}\label{overall performance}

\textbf{Datasets.} We conduct experiments over MuJoCo locomotion tasks including hopper, halfcheetah, and walker2d for evaluation. For each task, we use three different levels of history datasets (i.e., medium-expert, medium, and medium-replay) in the D4RL benchmark \citep{fu2020d4rl} that are different in data collections and sizes.

\textbf{Baselines.} We compare with several kinds of baselines that are widely used in offline RL. Specifically, the baselines include 1) behavior cloning (BC, scores taken from \citet{chen2021decision}); 2) multiple state-of-the-art Temporal Difference (TD) methods such as CQL \citep{kumar2020conservative} and IQL \citep{kostrikov2021offline} (scores taken from \citet{kostrikov2021offline}); 3) DT \citep{chen2021decision}, the model-free transformer baseline, which has a similar formulation to ours but is not designed from a multimodal perspective; and 4) two planning-based methods, Trajectory Transformer (TT) \citep{janner2021offline} and Diffuser \citep{janner2022planning}, used to evaluate the competitiveness of DTd as a multimodal model-free counterpart.

DTd is more sophisticated than DT due to the multimodal design and has about 2.5M parameters whereas the original DT has about 0.7M parameters. Therefore, in order to provide a fair comparison between DT and DTd, we present DT-large, a variant of DT with a larger number of parameters. DT-large is designed to match DTd not only in terms of total parameters but also in terms of capabilities. DT-large has 4 layers (analogous to DTd's 3-layer encoder + 1 Joint net) and has 3 attention heads (analogous to the 3 1-head modality encoders of DTd). We also raise the dimension of representation to increase its total parameters. While DT in Table~\ref{tab:overall d4rl} refers to the DT with the original hyper-parameters evaluated with our protocol, DT-large is a DT with about 2.4M parameters. Detailed comparisons of hyper-parameters and training details between DT-large and DTd can be found in the supplementary materials.

All transformer-based approaches, including DT, DT-large, DTd, and TT, are evaluated by reporting average and standard deviation (std) across 4 runs, where each run has a different training seed and is evaluated with 3 different seeds. Scores are normalized by the performance of expert and random policy according to the instruction from D4RL \citep{fu2020d4rl}. During the evaluation, DTd uses the same initial $R_t$ as the DT implemented by \citet{chen2021decision}. We highlight the methods reproduced by ourselves using the protocol above with \textbf{*} in Table \ref{tab:overall d4rl}.

\textbf{Performance results.} As shown in Table~\ref{tab:overall d4rl}, DTd outperforms its modality-agnostic predecessor DT and other methods, including TD and planning-based methods. While TD methods require designing different networks, including an actor, a critic, and a target network, as well as complicated objectives \citep{fujimoto2019off, kostrikov2021offline, kumar2020conservative}, DTd only requires a uniform network architecture (stacking transformer blocks) and a simple behavior cloning objective. While a planning-based method such as TT or Diffuser requires unrolling a full model multiple times during deployment, DTd is capable of achieving the performance of model-based methods without unrolling a dynamics model, which shows its potential application in real-time decision-making problems.

\subsection{Effectiveness with Diverse Goals} 

To evaluate whether DTd could effectively leverage goals other than $R_t$, we choose the AntMaze navigation task from D4RL \citep{fu2020d4rl} and challenge DTd with other types of goals including state value, and goal position.  In AntMaze, an agent receives a reward of 1 only if it reaches the goal position within the maze and 0 in most cases. Such a sparse-reward setting makes an episodic return binary and less useful to prompt the model for action sequence generation. Therefore, leveraging other types of goals effectively in AntMaze becomes critical for good performance. 

\textbf{State Value as $G_t$.} As a first step, we evaluate the ability of DT and DTd to leverage state value by training an IQL agent on every dataset and using its state value to represent the task's goal. This concept is similar to the Q-function guided planning of TT \citep{janner2021offline} such that the state value specifies the desired outcome (goal) of the model. 

\textbf{Goal Position as $G_t$.} In addition to using a state value that provides a return-like scalar as $G_t$, we further challenge DTd and DT to condition on the goal position of the maze. Specifically, $G_t$ becomes the 2D position of the agent at timestep $t$ concatenated with the goal position. To encourage a useful hidden space, we ask DT and DTd to predict the 2D position (waypoint) 3 steps away from the current 2D position as an auxiliary task. For DT, this is implemented by using an extra prediction head to predict waypoints based on $G_t$'s representation from the last layer. For DTd, we apply the prediction head right above the modality encoder for the goal representation (not the joint encoder).

\textbf{Results.} We show the results in Table \ref{tab:antmaze}. The mean and std of the success rate across 4 runs are reported where each run is trained with different training seeds and evaluated with 10 different seeds. 
All methods are evaluated on 2 types of maze (Umaze, medium) with 3 types of datasets (Umaze-v0, play, diverse) from the D4RL. 
The DT-large and DTd variants with a value function have a suffix of \textbf{V}. The variant of DT-large and DTd with auxiliary waypoint prediction and goal position as input has a suffix of \textbf{goal}.  

When replacing $G_t$ with the concatenation of goal position and 2D position, only DTd-goal can reach the goal position and gain reward in the medium domain across 2 types of datasets, whereas DT-goal fails to solve the medium domain entirely. The advantage of DTd might be due to its architecture, which allows it to learn future-related (3 steps away waypoint) latent representation to affect the high-level decision-making while DT-goal is only able to apply such an auxiliary task at the last layer of the model, affecting the latent representation less effectively.

In Umaze tasks, DTd does not seem to offer any obvious advantages over DT. Umaze domains require a trivial U-shape solution in order to reach a goal position from a starting point, whereas DTd models require a waypoint prediction within the model (on top of the goal encoder) to affect latent representations, which may be overkill.

In the setting where the goal is represented by the state value, both DT and DTd are capable of solving the problem, but DTd-V shows better performance on average across environments and datasets. Credit should be given to the explicit representation of goals, which prompts the model to recall action sequences in an effective manner. 

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.95\linewidth]{figures/efficient_medexp.png}
  \caption{\textbf{Sample efficiency}. We plot the evaluation curve of our DTd against DT-large throughout the training across 4 runs. To reach DT-large's performance, DTd requires only 50\% or fewer of the gradient steps. In the end, DTd achieves not only better performance but also smaller variance across multiple runs and many evaluations. We provide evaluation curves for all environments in the supplementary materials.}\label{fig:efficient}
\end{figure*}

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.95\linewidth]{figures/sensitive_check.png}
  \caption{\textbf{Robustness}. To study whether DTd is robust to an out-of-distribution (OOD) goal at test time, we varied the target return (goal) at test time from in-distribution returns to OOD returns (target returns higher than the max return logged in the dataset). Notice that if the target return is no more than $1.2\times$ the max return logged in the dataset, DT and DTd can both achieve extrapolation. However, DTd is more vulnerable to unrealistically high target returns exceeding $1.2\times$ the max trajectory return logged within the dataset, since the negative effect created by the OOD return cascades into the final decision-making after the biasing layer (additive fusion).}\label{fig:sensitive}
\end{figure*}

\subsection{Ablation on the architecture}\label{ablation_choice}

In DTd's architecture, the biasing and combiner layers selectively introduce different types of cross-modal interactions, including the state-return and action-return interactions suggested by our design heuristics. We conducted ablation studies on these cross-modal interactions to better understand their role in multimodal decision-making and show the results in Table~\ref{tab:table_ablation_choice}.

We compare the following 3 variants of DTd against the complete model in this ablation study:

\begin{enumerate}
   \item \textbf{DTd-zero} has biasing layers, combiner layers, and goal encoder removed. It is a simple behavior cloning model taking bi-modal inputs including states and actions. 
   \item \textbf{DTd-left} only applies biasing and combiner layers for state representation. All modality encoders exist.
   \item \textbf{DTd-right} only applies biasing and combiner layers for action representation. All modality encoders exist.
   \item \textbf{DTd} is the complete model drawn in Figure~\ref{fig:overview}.
\end{enumerate}

The score reported in Table \ref{tab:table_ablation_choice} follows the same protocol as in Table~\ref{tab:overall d4rl}.
We selected the medium-expert dataset where DTd yields the best result. If important designs are removed from the model, we expect to observe a performance reduction. Additionally, we include the medium-replay dataset for ablation. A removal of Biasing-Combiner at either side (DTd-left/right) of DTd should result in a more significant degradation in performance compared with medium-expert. Since DTd heavily relies on bias-to-goal from Biasing-Combiner to recall suitable trajectories to fulfill the task goal, a diverse dataset with various goals will further verify the role of a Biasing-Combiner layer within the DTd.

Table~\ref{tab:table_ablation_choice} shows that DTd-zero performs worse than DTd. This observation again verifies the importance of cross-modal interaction, as discussed in Section~\ref{modality analysis}. Losing cross-modal interaction from both sides of the model makes DTd-zero perform strictly worse than DTd. Since DTd-zero still includes the most important cross-modal attention (state-action) at the Joint net, it outperforms BC on average.

When we compare DTd-left and DTd-right to DT-large as shown in Table~\ref{tab:table_ablation_choice}, we find that DT-large has no significant advantage in terms of performance (DTd-left even outperforms DT-large on average). This observation is interesting because both DTd-left and DTd-right are restricted to a limited number of cross-modal interactions yet occasionally outperform DT-large, which leverages all possible intra-modal and cross-modal interactions by self-attention at every layer. Therefore, we argue that a modality-agnostic design (e.g., DT) creates ineffective cross-modal interactions via self-attention at every layer while receiving marginal benefits in terms of performance.

As expected, leveraging all necessary cross-modal interactions (DTd) leads to the best performance compared to leveraging representations derived from restricted cross-modal interactions (DTd-left, DTd-right). Additionally, since DTd is strictly better than DTd-left and DTd-right, we believe that return-state and return-action interactions are complementary in multimodal decision-making.

\subsection{Ablation on the modality order}\label{ablation_order}

In addition to justifying the heuristics behind DTd's architecture design in Table~\ref{tab:table_ablation_choice}, we show that the design heuristics of DTd suggest the best order for DTd's modalities. In Section~\ref{modality analysis}, state-action cross-modal interactions are prioritized over less important cross-modal interactions. Therefore, the current DTd has an input order of state, goal, and action (S-G-A) so that the Joint net takes the biased state and action with respect to the goal encoder in the center. Different from the current S-G-A setting, our ablation experiment tests DTd variants without placing the goal modality in the center, including S-A-G and A-S-G. As a result, the Joint net no longer takes the state and action representations required by our heuristics. Note that our discussion below ignores one order from each pair of symmetric orders since DTd's architecture is symmetric. For example, we ignore A-G-S (i.e., S-G-A), G-A-S (i.e., S-A-G), and G-S-A (i.e., A-S-G). Specifically, we argue that an order which fails to offer a biased state and action to the Joint net will suffer from performance degradation, which is revealed in Table~\ref{tab:table_ablation_order}. While S-A-G offers a biased state and goal to the Joint net, A-S-G presents a biased action and goal to the Joint net. Neither of them presents the most important cross-modal interaction to the Joint net as S-G-A does. As a result, they both perform poorly, as we expected.


\subsection{Pros \& Cons of Disentanglement}

On medium-expert datasets, we plot 100 evaluations of DTd and DT-large for MuJoCo locomotion tasks. DTd is more sample efficient than DT, as shown in Figure~\ref{fig:efficient}. In general, DTd achieves a higher average score with a smaller variance while consuming at least 50\% fewer gradient steps. Since DTd includes cross-modal interactions only when necessary, confusion in decision-making is reduced due to disentanglement at the input. We found that DTd's sample efficiency is less evident when trajectories are of variable quality (medium-replay) or suboptimal (medium). We provide comparisons between DT-large and DTd across all environments and datasets in the supplementary material for the reader's reference.

As a consequence of modality disentanglement, DTd is more sensitive to out-of-distribution goals than DT. In MuJoCo locomotion tasks, we train and evaluate DTd and DT-large on datasets with diverse returns (medium-replay), ranging from in-distribution returns to out-of-distribution returns. The raw scores are averaged across four runs. As shown in Figure~\ref{fig:sensitive}, when the user specifies an unrealistic target return, DTd experiences a steep performance drop. The disentanglement of modalities is likely to have led to this negative result. DTd makes a multimodal decision by explicitly learning a representation of return and fusing it into other modalities. During test time, if the distribution of return is significantly different from that during training, the representation of the other two modalities will also be influenced, and the negative effect created by OOD return will cascade into the prediction. DT is robust to OOD return because it mainly makes the multimodal decision based on state-state and state-action interactions, contributing 49\% of the attention weights (namely, 26\% + 23\% in Figure~\ref{fig:attn_stats}). Since return is not the major modality involved in DT's decision-making, an OOD return has less effect on its performance.


\section{Conclusion and Future Work} 

We advocate that solving offline RL via sequence modeling may benefit from a multimodal approach. While prior works such as DT model a tri-modal trajectory as one sequence assuming every token belongs to the same modality analogous to LM, our multimodal DTd models a trajectory by disentangling it into three unimodal sequences. After investigating the importance of cross-modal interactions within DT, we use the ranking of the importance discovered by DT as our heuristic for selective cross-modal fusion within DTd. DTd not only outperforms prior transformers, TD learning, and diffusion-based approaches on the D4RL benchmark, but also enjoys sample efficiency during training and algorithm flexibility to leverage diverse types of goals including the return, state-value, and 2D goal position.

DTd is our first step towards more effective and efficient sequential decision-making leveraging a multimodal approach. We point out some future directions that are worth pursuing.

\begin{enumerate}
    \item \textbf{Automating the modality-driven architecture.} Our work ``hard-codes'' the importance of modality into the design of a multimodal architecture after the attention analysis of another model. Instead of training a model to discover the importance and leverage it with a new model, could such a process be automated and done within one model?

    % \item \textbf{Scaling DTd to other domains.} While DTd and its predecessor DT are all motivated by works in LM, LM like GPT4 \citep{openAIgpt4} has become multimodal instead of unimodal. Does  for multimodal large language model in the future?

    \item \textbf{Initializing decision-making models.} It has been shown that transformer-based RL can benefit from a pre-text task that involves text modality \citep{reid2022can}. How should we initialize a model like DTd which has many small transformer components?

\end{enumerate}


\begin{contributions} 
                        % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Y. Wang devised the idea, conducted the experiment, and drafted the paper. M. Xu and L. Shi contributed to the experiment design and paper writing. Y. Chi supervised the entire project.
     
\end{contributions}


\begin{acknowledgements} 

% will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
   This work is supported in part by NSF via CCF-2106778.
\end{acknowledgements}

\newpage
% References
\bibliography{wang_552}
\end{document}
