% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{multirow}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{A Trajectory is Worth Three Sentences:\\Multimodal Transformer for Offline Reinforcement Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<yiqiw2@andrew.cmu.edu>?Subject=Decision Transducer Paper}{Yiqi~Wang}{}}
\author[2]{Mengdi~Xu}
\author[1]{Laixi~Shi}
\author[1]{Yuejie~Chi}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Department of Mechanical Engineering\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  
  \begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle

The following information is provided to support the discussion in the paper: (A) hyperparameter and training comparisons between the Decision Transformer \citep{chen2021decision} (DT), DT-large, and our Decision Transducer (DTd); (B) evaluation curves of DT-large and DTd on the medium-expert, medium, and medium-replay datasets of the D4RL benchmark \citep{fu2020d4rl}.

\appendix
\section{Hyperparameters for DT-large and DTd}
In Table~\ref{tab:hyp}, we list the architectural differences among DTd, DT, and DT-large.

\begin{table*}[h]
\centering
\caption{\textbf{Hyperparameters}. While DT, DT-large, and DTd's Join net are all temporal transformers \citep{radford2019language} taking multimodal input, their input modalities vary according to the design.}\label{tab:hyp}
\begin{center}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|cc|ccc|ccc|cc|ccc}
\toprule
\multirow{2}{*}{\textbf{Models}} & \multirow{2}{*}{\textbf{size}} & \multirow{2}{*}{\textbf{dimension}}& \multicolumn{3}{c|}{ \textbf{Modality Encoders}}  & \multicolumn{3}{c|}{ \textbf{Biasing} } & \multicolumn{2}{c|}{ \textbf{Combiner} } & \multicolumn{3}{c}{ \textbf{Joint Encoder} }  \\
& & &layers&heads&modality  &layers&heads&modality  &layers&modality  &layers&heads&modality\\
\midrule
DTd &2.52M&128    &3&1&uni-modal     &1&1&bi-modal      &1&bi-modal  &1&2&bi-modal\\
DT &0.7M & 128 &$\setminus$&$\setminus$ &$\setminus$     &$\setminus$ &$\setminus$ & $\setminus$     &$\setminus$ &$\setminus$  &3&1&tri-modal\\
DT-large &2.41M&213    &$\setminus$&$\setminus$ &$\setminus$    &$\setminus$ &$\setminus$ & $\setminus$     &$\setminus$ &$\setminus$ &4&3&tri-modal\\
\bottomrule
\end{tabular}
}
\end{center}
\end{table*}


\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.8\linewidth]{supp_figures/lr_schedule.png}
  \caption{\textbf{Learning rate (lr) schedule}. DTd and DT-large are trained with the same number of gradient steps. However, on small datasets, DTd's lr (schedule 1) is exponentially decayed after linear warm-up, while DT-large's is not (schedule 2). Since DTd is much more sample efficient than DT, a decaying lr prevents it from overfitting the data.}\label{fig:lr}
\end{figure*}

DT-large and DTd were both trained for 25k gradient steps with a batch size of 256 before evaluation. We found that DTd learns faster than DT and may overfit the smaller datasets from the D4RL benchmark \citep{fu2020d4rl}, such as medium and medium-replay. Therefore, we use an exponentially decaying learning rate schedule for DTd on the smaller datasets, i.e., those other than medium-expert. Both DT-large and DTd have a peak learning rate of 0.0001. The learning rate curves throughout the training process are shown in Figure~\ref{fig:lr}. DT-large always uses schedule 2, provided by \citet{chen2021decision}, while DTd uses schedule 2 when the dataset is large (medium-expert) and the exponentially decaying schedule 1 when the dataset is small, to avoid potential overfitting.

\section{Evaluation Curves of DT-large and DTd}
The normalized scores on the D4RL benchmark \citep{fu2020d4rl}, across 4 runs with an independent training seed per run and 3 different evaluation seeds, are plotted for all 3 environments (hopper, walker2d, halfcheetah) and all datasets (medium-expert, medium, medium-replay). Both DT-large and DTd were trained for 25k gradient steps with a batch size of 256 and were evaluated every 250 gradient steps following the protocol described above. Curves show the average result across the 4 runs, and the shaded area corresponds to the standard deviation. As shown in Figure~\ref{fig:medexp}, DTd is more sample efficient than DT on medium-expert data across all environments, but such an advantage is not consistently observed on the medium and medium-replay datasets across the 3 environments, as shown in Figure~\ref{fig:med} and Figure~\ref{fig:medrep}.

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=1.0\linewidth]{supp_figures/efficient_medexp.png}
  \caption{Evaluation Curve on Medium-Expert Dataset.}\label{fig:medexp}
\end{figure*}

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=1.0\linewidth]{supp_figures/efficient_med.png}
  \caption{Evaluation Curve on Medium Dataset.}\label{fig:med}
\end{figure*}

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=1.0\linewidth]{supp_figures/efficient_medrep.png}
  \caption{Evaluation Curve on Medium-Replay Dataset.}\label{fig:medrep}
\end{figure*}

\bibliography{wang_552}

\end{document}
