%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{graphicx}
\usepackage{dutchcal}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{multirow}

\def\gL{{\mathcal{L}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gN{{\mathcal{N}}}
\def\gh{{\mathcal{h}}}
\def\gi{{\mathcal{i}}}
\def\gj{{\mathcal{j}}}
\def\gm{{\mathcal{m}}}
\def\gy{{\mathcal{y}}}
\def\gg{{\mathcal{g}}}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{$E(2)$-Equivariant Vision Transformer\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:rux@zju.edu.cn}{Renjun~Xu}\thanks{Contributed equally.}}
% \author[1]{Kaifan~Yang∗^*}
\author[1,2]{\href{mailto:yangkaifan@zju.edu.cn}{Kaifan~Yang$^*$}}
\author[1,2]{\href{mailto:lk2017@zju.edu.cn}{Ke~Liu$^*$\thanks{Corresponding author: Ke Liu}}}
\author[3,2]{Fengxiang~He}
% Add affiliations after the authors
\affil[1]{%
    College of Computer Science and Technology\\
    Zhejiang University%\\
%    Hangzhou, Zhejiang, China
}
\affil[2]{%
    JD Explore Academy, JD.com, Inc.
}
\affil[3]{%
    AIAI, School of Informatics, University of Edinburgh
}
  \begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle
% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

\appendix
\section{Errors in GSA-Nets}\label{Appendix:mistake}

\noindent In this part, we first review the proof of equivariance of GSA-Nets~\citep{GSA-Nets}, and then point out the mistakes in that proof when the positional encoding is defined as:
\begin{align}\nonumber
    \rho((i,\tilde{\mathcal{h}}), (j,\hat{\mathcal{h}})) = \rho^{P}(x(j)-x(i), \tilde{\mathcal{h}}^{-1}\hat{\mathcal{h}})
\end{align}
\subsection{Definitions and Notations.}

\subsubsection{Definition of Group Equivariant Self-Attention.} The group self-attention formulation $\mathcal{m}^{r}_{\mathcal{G}}[f,\rho](i,\mathcal{h})$ is $\mathcal{G}$-equivariant if and only if it satisfies
\[
    \mathcal{m}^{r}_{\mathcal{G}}[\mathcal{L}_{\mathcal{g}}[f],\rho](i,\mathcal{h})= \mathcal{L}_{\mathcal{g}}[\mathcal{m}^{r}_{\mathcal{G}}[f,\rho]](i,\mathcal{h}),\quad \forall\,\mathcal{g}\in \mathcal{G}.
\]
\subsubsection{$\mathcal{g}$-Transformed Input.} A $\mathcal{g}$-transformed input can be expressed as:
\begin{align*}
    \mathcal{L}_{\mathcal{g}}[f](\mathcal{i}, \tilde{\mathcal{h}}) = \mathcal{L}_{y}\mathcal{L}_{\bar{\mathcal{h}}}[f](\mathcal{i}, \tilde{\mathcal{h}}) = f(x^{-1}(\bar{\mathcal{h}}^{-1}(x(\mathcal{i}) - y)), \bar{\mathcal{h}}^{-1}\tilde{\mathcal{h}}),\\ 
    \mathcal{g} = (y, \bar{\mathcal{h}}),\; y \in \mathbb{R}^{d},\; \bar{\mathcal{h}} \in \mathcal{H}.
\end{align*}

\section{Proof of GE-ViT}\label{Appendix:right}

\noindent In this section, we prove that GE-ViT is group equivariant. For brevity, we also use the substitutions:
\[
    \bar{\gi} = x^{-1}(\bar{\gh}^{-1} (x(\gi) - y)) \Rightarrow \gi = x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \quad \tilde{\gh}' = \bar{\gh}^{-1}\tilde{\gh},
\]
and
\[
    \bar{\gj} = x^{-1}(\bar{\gh}^{-1} (x(\gj) - y)) \Rightarrow \gj = x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \quad \hat{\gh}' = \bar{\gh}^{-1}\hat{\gh}.
\]

The complete proof process is as follows:
\begin{align}
    \mathcal{m}&^{r}_{\mathcal{G}}\big[\mathcal{L}_{y}\mathcal{L}_{\bar{\mathcal{h}}}[f], \rho\big](\mathcal{i}, \mathcal{h}) \\ \nonumber 
    &= \varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\tilde{\gh} \in \gH}\hspace{-1.05cm}  \sum_{\qquad \quad (\gj, \hat{\gh}) \in \gN(\gi, \tilde{\gh})} \hspace{-0.6cm}\hspace{-5mm}\sigma\hspace{-0.5mm}_{\gj, \hat{\gh}}\big(\langle \varphi_{\text{qry}}^{(h)}(\gL_{y}\gL_{\bar{\gh}}[f](\gi, \tilde{\gh})), \varphi_{\text{key}}^{(h)}(\gL_{y}\gL_{\bar{\gh}}[f](\gj, \hat{\gh})\\[-1\jot] \nonumber
    &\hspace{4.1cm} + \gL_{\gh}[\rho]((\gi, \tilde{\gh}), (\gj, \hat{\gh}))) \rangle \big)
    \varphi_{\text{val}}^{(h)}(\gL_{y}\gL_{\bar{\gh}}[f](\gj, \hat{\gh})) \Big) \\
    &= \varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\tilde{\gh} \in \gH}\hspace{-1.05cm} \sum_{\qquad \quad (\gj, \hat{\gh}) \in \gN(\gi, \tilde{\gh})} \hspace{-0.6cm}\hspace{-5mm}\sigma\hspace{-0.5mm}_{\gj, \hat{\gh}}\big(\langle \varphi_{\text{qry}}^{(h)}(f(x^{-1}(\bar{\gh}^{-1}(x(\gi) - y)), \bar{\gh}^{-1}\tilde{\gh})),\\  \nonumber
    &\hspace{1cm}\varphi_{\text{key}}^{(h)}(f(x^{-1}(\bar{\gh}^{-1}(x(\gj) - y)), \bar{\gh}^{-1}\hat{\gh}) + \gL_{\gh}[\rho]((\gi, \tilde{\gh}), (\gj, \hat{\gh}))) \rangle \big)\\ \nonumber
    &\hspace{2cm}\varphi_{\text{val}}^{(h)}(f(x^{-1}(\bar{\gh}^{-1}(x(\gj) - y)), \bar{\gh}^{-1}\hat{\gh})) \Big)\\ 
    &= \varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\bar{\gh}\tilde{\gh}' \in \gH} \hspace{-4.6cm}\sum_{\qquad\qquad\qquad\qquad\qquad\qquad \quad (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')} \hspace{-4.4cm}\sigma\hspace{-0.5mm}_{x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}'}\big(\langle \varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')),\varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}') \\
    \nonumber
    &\hspace{0.6cm} + \gL_{\textcolor{red}{\gh}}[\rho]((x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \textcolor{red}{\bar{\gh}\tilde{\gh}'}),
    (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \textcolor{red}{\bar{\gh}\hat{\gh}'}))) \rangle \big)\varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big)
\end{align}
Using the definition
\[
    \rho((i,\tilde{\mathcal{h}}), (j,\hat{\mathcal{h}})) = \rho^{P}(x(j)-x(i), \tilde{\mathcal{h}}\hat{\mathcal{h}}^{-1}\tilde{\mathcal{h}})
\]
and
\[
    \mathcal{L}_{\mathcal{h}}[\rho]((i,\tilde{\mathcal{h}}),(j,\hat{\mathcal{h}})) = \rho^{P}(\mathcal{h}^{-1}(x(j)-x(i)),\mathcal{h}^{-1}(\tilde{\mathcal{h}}\hat{\mathcal{h}}^{-1}\tilde{\mathcal{h}})),
\]
the above formula can be further derived:
\begin{align}
& =\varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\bar{\gh}\tilde{\gh}' \in \gH} \hspace{-4.6cm}\sum_{\qquad\qquad\qquad\qquad\qquad\qquad \quad (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')} \hspace{-4.4cm}\sigma\hspace{-0.5mm}_{x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}'}\big(\langle \varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')), \varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}') \\[-0.5\jot]  \nonumber
&\hspace{0.35cm} + \rho^P( \gh^{-1}(\bar{\gh}x(\bar{\gj}) + y - (\bar{\gh}x(\bar{\gi}) + y)), \textcolor{red}{\gh^{-1}(\bar{\gh}\tilde{\gh}')(\bar{\gh}\hat{\gh}')^{-1}(\bar{\gh}\tilde{\gh}')} )) \rangle \big) \varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big)\\ 
& =\varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\bar{\gh}\tilde{\gh}' \in \gH} \hspace{-4.6cm}\sum_{\qquad\qquad\qquad\qquad\qquad\qquad \quad (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')} \hspace{-4.4cm}\sigma\hspace{-0.5mm}_{x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}'}\big(\langle \varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')), \varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}') \\[-0.5\jot]  \nonumber
&\hspace{0.65cm} + \rho^P( \gh^{-1}(\bar{\gh}x(\bar{\gj}) + y - (\bar{\gh}x(\bar{\gi}) + y)), \textcolor{red}{\gh^{-1}\bar{\gh}\tilde{\gh}'{\hat{\gh}}^{'-1}\tilde{\gh}'} )) \rangle \big) \varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big)\\
& = \varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\bar{\gh}\tilde{\gh}' \in \gH} \hspace{-4.6cm}\sum_{\qquad\qquad\qquad\qquad\qquad\qquad \quad (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')} \hspace{-4.4cm}\sigma\hspace{-0.5mm}_{x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}'}\big(\langle \varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')), \varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}')\\[-0.5\jot] \nonumber
&\hspace{3cm}  + \rho^P( \textcolor{red}{\gh^{-1}\bar{\gh}}(x(\bar{\gj})- x(\bar{\gi}), \textcolor{red}{\tilde{\gh}'{\hat{\gh}}^{'-1}\tilde{\gh}'}))) \rangle \big)  \varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big) \\
& = \varphi_{\text{out}}\Big( \bigcup_{h \in [H]} \sum_{\bar{\gh}\tilde{\gh}' \in \gH} \hspace{-4.6cm}\sum_{\qquad\qquad\qquad\qquad\qquad\qquad \quad (x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')} \hspace{-4.4cm}\sigma\hspace{-0.5mm}_{x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}'}\big(\langle \varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')), \varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}')  \\[-0.5\jot] \nonumber
&\hspace{3.65cm} + \gL_{\textcolor{red}{\bar{\gh}^{-1}\gh}}[\rho]((\bar{\gi}, \textcolor{red}{\tilde{\gh}'}),(\bar{\gj}, \textcolor{red}{\hat{\gh}'}))) \rangle \big)  \varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big)
\end{align}

The subsequent proof is similar to that of GSA-Nets~\citep{GSA-Nets}. For unimodular groups, the area of summation remains equal for any transformation $\gg \in \gG$, which means that:
\begin{align*} 
    \sum_{(x^{-1}(\bar{\gh}x(\bar{\gj}) + y), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi}) + y), \bar{\gh}\tilde{\gh}')}  \hspace{-2cm} [\cdot] \hspace{1.5cm} \quad=& \hspace{0.5cm} \sum_{(x^{-1}(\bar{\gh}x(\bar{\gj})), \bar{\gh}\hat{\gh}') \in \gN(x^{-1}(\bar{\gh}x(\bar{\gi})), \bar{\gh}\tilde{\gh}')}  \hspace{-1.75cm} [\cdot] \\
    =& \hspace{0.8cm} \sum_{(x^{-1}(x(\bar{\gj})), \hat{\gh}') \in \gN(x^{-1}(x(\bar{\gi})), \tilde{\gh}')}  \hspace{-1.45cm} [\cdot] \hspace{1cm} \\ 
    =& \hspace{1.9cm}\sum_{(\bar{\gj}, \hat{\gh}') \in \gN(\bar{\gi}, \tilde{\gh}')} \hspace{-0.35cm} [\cdot].
\end{align*}
Moreover, since left multiplication by $\bar{\gh}$ is a bijection on $\gH$, we have $ \sum_{\bar{\gh}\tilde{\gh}' \in \gH}[\cdot] = \sum_{\tilde{\gh}' \in \gH}[\cdot]$. Consequently, the above formula can be further simplified as:

\begin{equation}
    \begin{aligned}
\gm^{r}_{\gG}\big[\gL_{y}\gL_{\bar{\gh}}[f], \rho\big](\gi, \gh) =& \varphi_{\text{out}}\Big( \bigcup_{h \in [H]}\sum_{\tilde{\gh}' \in \gH} \sum_{ (\bar{\gj}, \hat{\gh}') \in \gN(\bar{\gi}, \tilde{\gh}')} \hspace{-0.65cm}\sigma\hspace{-0.5mm}_{\bar{\gj}, \hat{\gh}'}\big(\langle 
\varphi_{\text{qry}}^{(h)}(f(\bar{\gi}, \tilde{\gh}')), 
 \\[-1\jot]
&\hspace{0.1cm}\varphi_{\text{key}}^{(h)}(f(\bar{\gj}, \hat{\gh}')  + 
\gL_{\bar{\gh}^{-1}\gh}[\rho]((\bar{\gi}, \tilde{\gh}'),(\bar{\gj}, \hat{\gh}'))) \rangle \big)  \varphi_{\text{val}}^{(h)}(f(\bar{\gj}, \hat{\gh}')) \Big)
\\[2\jot]
=&\gm^{r}_{\gG}[f, \rho](\bar{\gi}, \bar{\gh}^{-1}\gh) \\[2\jot]
=& \gm^{r}_{\gG}[f, \rho](x^{-1}(\bar{\gh}^{-1} (x(\gi) - y)), \bar{\gh}^{-1}\gh) 
\\[2\jot]
=& \gL_{y}\gL_{\bar{\gh}}\big[\gm^{r}_{\gG}[f, \rho]\big](\gi, \gh).
    \end{aligned}
\end{equation}
From the above formula, it can be seen that: 
$$
\gm^{r}_{\gG}[\gL_{y}\gL_{\bar{\gh}}[f], \rho](\gi, \gh) = \gL_{y}\gL_{\bar{\gh}}[\gm^{r}_{\gG}[f, \rho]](\gi, \gh),
$$
which is the same as:
$$
\mathcal{m}^{r}_\mathcal{G}[\mathcal{L}_{\mathcal{g}}[f],\rho](i,\mathcal{h})= \mathcal{L}_{\mathcal{g}}[\mathcal{m}^{r}_\mathcal{G}[f,\rho]](i,\mathcal{h}),\quad\mathcal{g}\in \mathcal{G}.
$$
Therefore, with the positional encoding we proposed:
$$
    \rho((i,\tilde{\mathcal{h}}), (j,\hat{\mathcal{h}})) = \rho^{P}(x(j)-x(i), \tilde{\mathcal{h}}\hat{\mathcal{h}}^{-1}\tilde{\mathcal{h}}),
$$
the group self-attention is group equivariant.

\bibliography{xu_298}

\end{document}
